[feat] Add repository scanning and Markdown file generation
- When the user enters a repository URL and clicks the "CodeLumia Run ..." button, the repository scan starts.
- The repository is cloned, its file tree is collected, and the Markdown content is generated.
- Added a preview feature for the generated Markdown file.
- Enabling the `preview_markdown` option renders the Markdown for display.
- Enabling the `preview_plaintext` option shows the Markdown as plain text.
- Added a download link for the generated Markdown file (see the sketch after this list).
- Added a sidebar option for specifying the temporary directory (`tmp_dir`).
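The download link is built by embedding the generated Markdown in a base64-encoded `data:` URI, as in the `app.py` diff further down. A minimal, self-contained sketch of that pattern (the repository name and Markdown content here are placeholders, not values from the app):

```python
import base64

import streamlit as st

# Placeholder for the Markdown produced by the scan; in app.py this is the
# output of create_markdown_content(...).
repo_name = "Example"
markdown_content = "# << Example>>\n## Example File Tree\n"

# Encode the Markdown and expose it as a download link via a data: URI,
# so no extra file has to be written just for the download.
b64 = base64.b64encode(markdown_content.encode("utf-8")).decode("utf-8")
st.markdown(
    f'<div align="center">'
    f'<a href="data:text/markdown;base64,{b64}" download="{repo_name}.md">'
    f"Download Markdown File</a></div>",
    unsafe_allow_html=True,
)
```

Run with `streamlit run <script>.py`; clicking the link downloads the content as `Example.md`.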
[docs] Add comments to the code to improve readability
- Added comments explaining each feature.
- Organized the code structure and wrote comments so it is easier to follow.
[refactor] Improve the file-operations and Git-operations modules
- Cleaned up and refactored the code in `file_operations.py` and `git_operations.py`.
- Switched to using `/` instead of `os.sep` to keep path handling consistent (a small example follows this list).
- Made the temporary directory (`tmp_dir`) configurable to improve flexibility.
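For reference, the directory depth used by `get_file_tree()` / `process_files()` is now derived from the walked path with the expression shown in the `modules/file_operations.py` diff below. A standalone sketch (the directory names are only illustrative, and the count assumes POSIX-style paths where `os.sep == "/"`):

```python
import os

repo_path = "tmp/DeepSeek-Math"

for root in (repo_path, f"{repo_path}/images", f"{repo_path}/images/icons"):
    # Same expression as in the refactored module: the repository root
    # becomes "/" (level 1) and each nested directory adds one more level.
    level = root.replace(repo_path, "/").count(os.sep)
    print(f"{root} -> level {level}")
```

Similarly, `clone_repository()` now accepts a `tmp_dir` argument (defaulting to `./tmp`), as shown in the `modules/git_operations.py` diff.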
[chore] Update dependencies and tidy up the code
- Added a `LICENSE*` pattern to the `.CodeLumiaignore` file (see the example below).
- Updated `requirements.txt` to add the required dependencies.
- Reformatted the code to improve readability.
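The ignore entries are matched as `fnmatch` glob patterns (as in the filtering shown in the `modules/file_operations.py` diff), so the new `LICENSE*` line excludes files such as `LICENSE`, `LICENSE-CODE`, and `LICENSE-MODEL`. A quick, hypothetical check of a few names against the patterns:

```python
import fnmatch

# A few entries from .CodeLumiaignore, including the newly added LICENSE* pattern.
ignore_patterns = ["requirements.txt", "LICENSE*", "*.png"]

for name in ["LICENSE-CODE", "README.md", "requirements.txt", "logo.png"]:
    # Same test used when building the file tree: ignore the name if any pattern matches.
    ignored = any(fnmatch.fnmatch(name, pattern) for pattern in ignore_patterns)
    print(f"{name}: {'ignored' if ignored else 'kept'}")
```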
Files changed:
- .CodeLumiaignore +2 -1
- DeepSeek-Math.md +259 -0
- app.py +27 -18
- modules/file_operations.py +24 -3
- modules/git_operations.py +18 -7
- tmp/DeepSeek-Math +1 -0
--- a/.CodeLumiaignore
+++ b/.CodeLumiaignore
@@ -170,4 +170,5 @@ LICENSE
 *.png
 *.sqlite
 *.jpg
-requirements.txt
+requirements.txt
+LICENSE*
--- /dev/null
+++ b/DeepSeek-Math.md
@@ -0,0 +1,259 @@
+# << DeepSeek-Math>>
+## DeepSeek-Math File Tree
+
+```
+DeepSeek-Math/
+    cog.yaml
+    README.md
+
+```
+
+## cog.yaml
+
+```yaml
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+  python_version: "3.11"
+  python_packages:
+    - torch==2.0.1
+    - torchvision==0.15.2
+    - transformers==4.37.2
+    - accelerate==0.27.0
+    - hf_transfer
+
+# predict.py defines how predictions are run on your model
+predict: "replicate/predict.py:Predictor"
+
+```
+
+## README.md
+
+```markdown
+
+<!-- markdownlint-disable first-line-h1 -->
+<!-- markdownlint-disable html -->
+<!-- markdownlint-disable no-duplicate-header -->
+
+<div align="center">
+<img src="images/logo.svg" width="60%" alt="DeepSeek LLM" />
+</div>
+<hr>
+<div align="center">
+
+<a href="https://www.deepseek.com/" target="_blank">
+<img alt="Homepage" src="images/badge.svg" />
+</a>
+<a href="https://chat.deepseek.com/" target="_blank">
+<img alt="Chat" src="https://img.shields.io/badge/🤖%20Chat-DeepSeek%20LLM-536af5?color=536af5&logoColor=white" />
+</a>
+<a href="https://huggingface.co/deepseek-ai" target="_blank">
+<img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
+</a>
+<a href="https://replicate.com/cjwbw/deepseek-math-7b-base" target="_parent"><img src="https://replicate.com/cjwbw/deepseek-math-7b-base/badge" alt="Replicate"/></a>
+</div>
+
+<div align="center">
+
+<a href="https://discord.gg/Tc7c45Zzu5" target="_blank">
+<img alt="Discord" src="https://img.shields.io/badge/Discord-DeepSeek%20AI-7289da?logo=discord&logoColor=white&color=7289da" />
+</a>
+<a href="images/qr.jpeg" target="_blank">
+<img alt="Wechat" src="https://img.shields.io/badge/WeChat-DeepSeek%20AI-brightgreen?logo=wechat&logoColor=white" />
+</a>
+<a href="https://twitter.com/deepseek_ai" target="_blank">
+<img alt="Twitter Follow" src="https://img.shields.io/badge/Twitter-deepseek_ai-white?logo=x&logoColor=white" />
+</a>
+
+</div>
+
+<div align="center">
+
+<a href="LICENSE-CODE">
+<img alt="Code License" src="https://img.shields.io/badge/Code_License-MIT-f5de53?&color=f5de53">
+</a>
+<a href="LICENSE-MODEL">
+<img alt="Model License" src="https://img.shields.io/badge/Model_License-Model_Agreement-f5de53?&color=f5de53">
+</a>
+</div>
+
+
+<p align="center">
+<a href="#4-model-downloads">Model Download</a> |
+<a href="#2-evaluation-results">Evaluation Results</a> |
+<a href="#5-quick-start">Quick Start</a> |
+<a href="#6-license">License</a> |
+<a href="#7-citation">Citation</a>
+</p>
+
+<p align="center">
+<a href="https://arxiv.org/pdf/2402.03300.pdf"><b>Paper Link</b>👁️</a>
+</p>
+
+
+## 1. Introduction
+
+DeepSeekMath is initialized with [DeepSeek-Coder-v1.5 7B](https://huggingface.co/deepseek-ai/deepseek-coder-7b-base-v1.5) and continues pre-training on math-related tokens sourced from Common Crawl, together with natural language and code data for 500B tokens. DeepSeekMath 7B has achieved an impressive score of **51.7%** on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. For research purposes, we release [checkpoints](#4-model-downloads) of base, instruct, and RL models to the public.
+
+<p align="center">
+<img src="images/math.png" alt="table" width="70%">
+</p>
+
+## 2. Evaluation Results
+
+### DeepSeekMath-Base 7B
+
+We conduct a comprehensive assessment of the mathematical capabilities of DeepSeekMath-Base 7B, focusing on its ability to produce self-contained mathematical solutions without relying on external tools, solve math problems using tools, and conduct formal theorem proving. Beyond mathematics, we also provide a more general profile of the base model, including its performance of natural language understanding, reasoning, and programming skills.
+
+- **Mathematical problem solving with step-by-step reasoning**
+
+<p align="center">
+<img src="images/base_results_1.png" alt="table" width="70%">
+</p>
+
+- **Mathematical problem solving with tool use**
+
+<p align="center">
+<img src="images/base_results_2.png" alt="table" width="50%">
+</p>
+
+- **Natural Language Understanding, Reasoning, and Code**
+<p align="center">
+<img src="images/base_results_3.png" alt="table" width="50%">
+</p>
+
+The evaluation results from the tables above can be summarized as follows:
+- **Superior Mathematical Reasoning:** On the competition-level MATH dataset, DeepSeekMath-Base 7B outperforms existing open-source base models by more than 10% in absolute terms through few-shot chain-of-thought prompting, and also surpasses Minerva 540B.
+- **Strong Tool Use Ability:** Continuing pre-training with DeepSeekCoder-Base-7B-v1.5 enables DeepSeekMath-Base 7B to more effectively solve and prove mathematical problems by writing programs.
+- **Comparable Reasoning and Coding Performance:** DeepSeekMath-Base 7B achieves performance in reasoning and coding that is comparable to that of DeepSeekCoder-Base-7B-v1.5.
+
+### DeepSeekMath-Instruct and -RL 7B
+
+DeepSeekMath-Instruct 7B is a mathematically instructed tuning model derived from DeepSeekMath-Base 7B, while DeepSeekMath-RL 7B is trained on the foundation of DeepSeekMath-Instruct 7B, utilizing our proposed Group Relative Policy Optimization (GRPO) algorithm.
+
+We evaluate mathematical performance both without and with tool use, on 4 quantitative reasoning benchmarks in English and Chinese. As shown in Table, DeepSeekMath-Instruct 7B demonstrates strong performance of step-by-step reasoning, and DeepSeekMath-RL 7B approaches an accuracy of 60% on MATH with tool use, surpassing all existing open-source models.
+
+<p align="center">
+<img src="images/instruct_results.png" alt="table" width="50%">
+</p>
+
+
+## 3. Data Collection
+
+- Step 1: Select [OpenWebMath](https://arxiv.org/pdf/2310.06786.pdf), a collection of high-quality mathematical web texts, as our initial seed corpus for training a FastText model.
+- Step 2: Use the FastText model to retrieve mathematical web pages from the deduplicated Common Crawl database.
+- Step 3: Identify potential math-related domains through statistical analysis.
+- Step 4: Manually annotate URLs within these identified domains that are associated with mathematical content.
+- Step 5: Add web pages linked to these annotated URLs, but not yet collected, to the seed corpus. Jump to step 1 until four iterations.
+
+
+<p align="center">
+<img src="images/data_pipeline.png" alt="table" width="80%">
+</p>
+
+After four iterations of data collection, we end up with **35.5M** mathematical web pages, totaling **120B** tokens.
+
+## 4. Model Downloads
+
+We release the DeepSeekMath 7B, including base, instruct and RL models, to the public. To support a broader and more diverse range of research within both academic and commercial communities. Please **note** that the use of this model is subject to the terms outlined in [License section](#6-license). Commercial usage is permitted under these terms.
+
+### Huggingface
+
+| Model | Sequence Length | Download |
+| :----------------------- | :-------------: | :----------------------------------------------------------: |
+| DeepSeekMath-Base 7B | 4096 | 🤗 [HuggingFace](https://huggingface.co/deepseek-ai/deepseek-math-7b-base) |
+| DeepSeekMath-Instruct 7B | 4096 | 🤗 [HuggingFace](https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct) |
+| DeepSeekMath-RL 7B | 4096 | 🤗 [HuggingFace](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl) |
+
+## 5. Quick Start
+
+You can directly employ [Huggingface's Transformers](https://github.com/huggingface/transformers) for model inference.
+
+**Text Completion**
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+
+model_name = "deepseek-ai/deepseek-math-7b-base"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
+model.generation_config = GenerationConfig.from_pretrained(model_name)
+model.generation_config.pad_token_id = model.generation_config.eos_token_id
+
+text = "The integral of x^2 from 0 to 2 is"
+inputs = tokenizer(text, return_tensors="pt")
+outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
+
+result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(result)
+```
+
+**Chat Completion**
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+
+model_name = "deepseek-ai/deepseek-math-7b-instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
+model.generation_config = GenerationConfig.from_pretrained(model_name)
+model.generation_config.pad_token_id = model.generation_config.eos_token_id
+
+messages = [
+    {"role": "user", "content": "what is the integral of x^2 from 0 to 2?\nPlease reason step by step, and put your final answer within \boxed{}."}
+]
+input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+outputs = model.generate(input_tensor.to(model.device), max_new_tokens=100)
+
+result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
+print(result)
+```
+
+Avoiding the use of the provided function `apply_chat_template`, you can also interact with our model following the sample template. Note that `messages` should be replaced by your input.
+
+```
+User: {messages[0]['content']}
+
+Assistant: {messages[1]['content']}<|end▁of▁sentence|>User: {messages[2]['content']}
+
+Assistant:
+```
+
+**Note:** By default (`add_special_tokens=True`), our tokenizer automatically adds a `bos_token` (`<|begin▁of▁sentence|>`) before the input text. Additionally, since the system prompt is not compatible with this version of our models, we DO NOT RECOMMEND including the system prompt in your input.
+
+❗❗❗ **Please use chain-of-thought prompt to test DeepSeekMath-Instruct and DeepSeekMath-RL:**
+
+- English questions: **{question}\nPlease reason step by step, and put your final answer within \\boxed{}.**
+
+- Chinese questions: **{question}\n请通过逐步推理来解答问题,并把最终答案放置于\\boxed{}中。**
+
+
+## 6. License
+This code repository is licensed under the MIT License. The use of DeepSeekMath models is subject to the Model License. DeepSeekMath supports commercial use.
+
+See the [LICENSE-CODE](LICENSE-CODE) and [LICENSE-MODEL](LICENSE-MODEL) for more details.
+
+## 7. Citation
+
+```
+@misc{deepseek-math,
+  author = {Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Mingchuan Zhang, Y.K. Li, Y. Wu, Daya Guo},
+  title = {DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models},
+  journal = {CoRR},
+  volume = {abs/2402.03300},
+  year = {2024},
+  url = {https://arxiv.org/abs/2402.03300},
+}
+```
+
+
+## 8. Contact
+
+If you have any questions, please raise an issue or contact us at [[email protected]](mailto:[email protected]).
+
+```
+
--- a/app.py
+++ b/app.py
@@ -25,31 +25,40 @@ st.markdown("---")
 # リポジトリのURLを入力するテキストボックス
 repo_url = st.text_input("リポジトリのURL:")
 st.markdown("---")
-st.markdown("[Full Text](#full-text)")
+# st.markdown("[Full Text](#full-text)")
 
 # .gitignoreのパターンを編集するサイドバー
 st.sidebar.title(".CodeLumiaignore Patterns")
-ignore_patterns = st.sidebar.text_area("Enter patterns (one per line):", value="\n".join(ignore_patterns), height=
+ignore_patterns = st.sidebar.text_area("Enter patterns (one per line):", value="\n".join(ignore_patterns), height=300).split("\n")
+tmp_dir = st.sidebar.text_input('tmp_dir', './tmp')
 # 探索の最大深度を入力するテキストボックス
-max_depth = st.sidebar.number_input("探索の最大深度:", min_value=1, value=
+max_depth = st.sidebar.number_input("探索の最大深度:", min_value=1, value=1, step=1)
 
+preview_markdown = st.sidebar.checkbox('preview markdown', value=False)
+preview_plaintext = st.sidebar.checkbox('preview plaintext', value=False)
 
-if
+if st.button("CodeLumia Run ...", type="primary"):
+    if repo_url:
+        repo_name = repo_url.split("/")[-1].split(".")[0]
+        with st.status("Scaning repository...", expanded=False):
+            st.write("clone repository...")
+            repo_path = clone_repository(repo_url, repo_name, tmp_dir=tmp_dir)
+            st.write("get file tree...")
+            file_tree = get_file_tree(repo_path, ignore_patterns, max_depth)
+            st.write("create markdown content...")
+            markdown_content = create_markdown_content(repo_name, file_tree, repo_path, ignore_patterns, max_depth)
 
+        # マークダウンファイルを保存
+        save_markdown_file(repo_name, markdown_content)
 
+        # Streamlitアプリケーションの構築
+        if(preview_markdown):
+            st.markdown(markdown_content, unsafe_allow_html=True)
 
+        # ダウンロードリンクの作成
+        st.markdown(f'<div align="center"><a href="data:text/markdown;base64,{base64.b64encode(markdown_content.encode("utf-8")).decode("utf-8")}" download="{repo_name}.md">Download Markdown File</a></div>', unsafe_allow_html=True)
 
-st.markdown("# Full Text")
-st.code(markdown_content)
+        st.markdown("---")
+        if(preview_plaintext):
+            st.markdown("# Full Text")
+            st.code(markdown_content)
--- a/modules/file_operations.py
+++ b/modules/file_operations.py
@@ -1,3 +1,4 @@
+
 import os
 import fnmatch
 
@@ -7,12 +8,17 @@ def get_file_tree(repo_path, ignore_patterns, max_depth):
         # .gitignoreに一致するディレクトリを無視
        dirs[:] = [d for d in dirs if not any(fnmatch.fnmatch(d, pattern) for pattern in ignore_patterns)]
 
-        level = root.replace(repo_path, "").count(os.sep)
+        level = root.replace(repo_path, "/").count(os.sep)
+        # print(f"------------------------- max_depth : {max_depth}")
+        # print(f"dirs1:{dirs}")
+        # print(f"level:{level}")
+        # print(f"files:{files}")
         if level > max_depth:
             continue
 
         indent = " " * 4 * (level)
         file_tree += f"{indent}{os.path.basename(root)}/\n"
+
         subindent = " " * 4 * (level + 1)
         for f in files:
             # .gitignoreに一致するファイルを無視
@@ -26,7 +32,7 @@ def process_files(repo_path, ignore_patterns, max_depth):
         # .gitignoreに一致するディレクトリを無視
         dirs[:] = [d for d in dirs if not any(fnmatch.fnmatch(d, pattern) for pattern in ignore_patterns)]
 
-        level = root.replace(repo_path, "").count(os.sep)
+        level = root.replace(repo_path, "/").count(os.sep)
         if level > max_depth:
             continue
 
@@ -37,4 +43,19 @@ def process_files(repo_path, ignore_patterns, max_depth):
                 with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                     content = f.read()
                 file_contents.append((file_path.replace(f'{repo_path}/', ''), content))
-    return file_contents
+    return file_contents
+
+if __name__ == "__main__":
+
+    repo_path = "tmp/DeepSeek-Math"
+    # .gitignoreのパターンを読み込む
+    ignore_patterns = []
+    if os.path.exists(".CodeLumiaignore"):
+        with open(".CodeLumiaignore", "r") as f:
+            for line in f:
+                line = line.strip()
+                if line and not line.startswith("#"):
+                    ignore_patterns.append(line)
+    max_depth = 1
+    file_tree = get_file_tree(repo_path, ignore_patterns, max_depth)
+    print(file_tree)
--- a/modules/git_operations.py
+++ b/modules/git_operations.py
@@ -2,21 +2,32 @@ import os
 import shutil
 import time
 
+import os
+import shutil
+from git import Repo
+import time
+
+def clone_repository(repo_url, repo_name, tmp_dir="./tmp"):
     # tmpフォルダを削除
-    if os.path.exists(
+    # if os.path.exists(tmp_dir):
+    #     shutil.rmtree(tmp_dir)
 
     # tmpフォルダを作成
-    os.makedirs(
+    os.makedirs(tmp_dir, exist_ok=True)
 
     # リポジトリのクローン
-    repo_path =
+    repo_path = os.path.join(tmp_dir, repo_name)
     if os.path.exists(repo_path):
         shutil.rmtree(repo_path)
+    Repo.clone_from(repo_url, repo_path)
 
     # 一時的な遅延を追加
     time.sleep(1)
 
-    return repo_path
+    return repo_path
+
+if __name__ == "__main__":
+    repo_url = "https://github.com/deepseek-ai/DeepSeek-Math"
+    repo_name = repo_url.split("/")[-1].split(".")[0]
+    tmp_dir = "./tmp"  # 必要に応じてtmpディレクトリを指定
+    clone_repository(repo_url, repo_name, tmp_dir)
--- /dev/null
+++ b/tmp/DeepSeek-Math
@@ -0,0 +1 @@
+Subproject commit b8b0f8ce093d80bf8e9a641e44142f06d092c305