update all documentation
data/scraping_scripts/github_to_markdown_ai_docs.py
CHANGED
@@ -142,16 +142,33 @@ def download_file(file_url: str, file_path: str, retries: int = 0):
     else:
         print(f"Failed to download file after {MAX_RETRIES} retries: {e}")
 
-
-def convert_ipynb_to_md(ipynb_path: str, md_path: str):
-    with open(ipynb_path, "r", encoding="utf-8") as f:
-        notebook = nbformat.read(f, as_version=4)
-
-    exporter = MarkdownExporter()
-    markdown, _ = exporter.from_notebook_node(notebook)
-
-    with open(md_path, "w", encoding="utf-8") as f:
-        f.write(markdown)
+# def convert_ipynb_to_md(ipynb_path: str, md_path: str):
+#     with open(ipynb_path, "r", encoding="utf-8") as f:
+#         notebook = nbformat.read(f, as_version=4)
+
+#     exporter = MarkdownExporter()
+#     markdown, _ = exporter.from_notebook_node(notebook)
+
+#     with open(md_path, "w", encoding="utf-8") as f:
+#         f.write(markdown)
+
+
+def convert_ipynb_to_md(ipynb_path: str, md_path: str):
+    try:
+        with open(ipynb_path, "r", encoding="utf-8") as f:
+            notebook = nbformat.read(f, as_version=4)
+
+        exporter = MarkdownExporter()
+        markdown, _ = exporter.from_notebook_node(notebook)
+
+        with open(md_path, "w", encoding="utf-8") as f:
+            f.write(markdown)
+    except (json.JSONDecodeError, nbformat.reader.NotJSONError) as e:
+        print(f"Error converting notebook {ipynb_path}: {str(e)}")
+        print("Skipping this file and continuing with others...")
+    except Exception as e:
+        print(f"Unexpected error converting notebook {ipynb_path}: {str(e)}")
+        print("Skipping this file and continuing with others...")
 
 
 def fetch_files(api_url: str, local_dir: str):
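Why the rewrite: nbformat.read() raises json.JSONDecodeError or nbformat.reader.NotJSONError when an .ipynb file is not valid JSON (for example a Git LFS pointer or a truncated download), and previously one bad notebook could abort the whole scrape. Below is a minimal, self-contained sketch of the hardened converter in use; the walk_and_convert driver and the "docs" directory are illustrative assumptions, not part of this commit:

import json
import os

import nbformat
from nbconvert import MarkdownExporter


def convert_ipynb_to_md(ipynb_path: str, md_path: str):
    # Same approach as the function added above: convert one notebook,
    # skip files that are not valid JSON instead of aborting the run.
    try:
        with open(ipynb_path, "r", encoding="utf-8") as f:
            notebook = nbformat.read(f, as_version=4)
        markdown, _ = MarkdownExporter().from_notebook_node(notebook)
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(markdown)
    except (json.JSONDecodeError, nbformat.reader.NotJSONError) as e:
        print(f"Error converting notebook {ipynb_path}: {e}")


def walk_and_convert(root: str):
    # Hypothetical driver: convert every notebook found under `root`,
    # writing the Markdown next to the source file.
    for dirpath, _, filenames in os.walk(root):
        for name in filenames:
            if name.endswith(".ipynb"):
                src = os.path.join(dirpath, name)
                convert_ipynb_to_md(src, src[: -len(".ipynb")] + ".md")


if __name__ == "__main__":
    walk_and_convert("docs")  # assumed local checkout directory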
data/scraping_scripts/process_md_files.py
CHANGED
@@ -381,7 +381,7 @@ SOURCE_CONFIGS = {
         "url_extension": ".ipynb",
     },
     "langchain": {
-        "base_url": "https://python.langchain.com/",
+        "base_url": "https://python.langchain.com/docs/",
         "input_directory": "data/langchain_md_files",
         "output_file": "data/langchain_data.jsonl",
         "source_name": "langchain",
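The hunk above adds the missing docs/ segment to the langchain base URL. Purely as an illustration of why that matters, here is one plausible way the SOURCE_CONFIGS keys could combine into a documentation link; compose_url and the empty url_extension are assumptions, since the real URL-building code in process_md_files.py is not shown in this diff:

# Illustrative only: how keys like "base_url" and "url_extension"
# might combine into a source link.
def compose_url(config: dict, relative_md_path: str) -> str:
    stem = relative_md_path.removesuffix(".md")
    return config["base_url"] + stem + config.get("url_extension", "")


langchain_config = {
    "base_url": "https://python.langchain.com/docs/",
    "url_extension": "",  # assumption: not shown in the hunk
}

# Without the "docs/" segment the generated link would point at the
# wrong path on python.langchain.com.
print(compose_url(langchain_config, "how_to/streaming.md"))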
@@ -460,11 +460,7 @@ def should_include_file(file_path: str, config: Dict) -> bool:
 
 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(
-        encoding.encode(
-            string, disallowed_special=(encoding.special_tokens_set - {"<|endoftext|>"})
-        )
-    )
+    num_tokens = len(encoding.encode(string, disallowed_special=()))
     return num_tokens
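The token-counting change is behavioral, not cosmetic. tiktoken's encode() raises ValueError by default when the input contains special-token text, and the old call still disallowed every special token except <|endoftext|>. With disallowed_special=(), such markers are tokenized as ordinary text, so counting cannot fail on scraped content. A small sketch, using cl100k_base only as an example encoding name (the function takes encoding_name as a parameter):

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")  # example encoding name
text = "scraped docs can contain literal markers such as <|endoftext|>"

# Default behaviour (disallowed_special="all"): special-token text raises.
try:
    encoding.encode(text)
except ValueError as e:
    print(f"default encode refused the input: {e}")

# With disallowed_special=(), the marker is tokenized as plain text,
# so token counting never fails on scraped content.
print(len(encoding.encode(text, disallowed_special=())))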
requirements.txt
CHANGED
@@ -17,4 +17,5 @@ google-generativeai
 llama-index-llms-gemini
 gradio
 pymongo
-huggingface_hub
+huggingface_hub
+nbconvert
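Note: nbconvert supplies the MarkdownExporter used by the new convert_ipynb_to_md in github_to_markdown_ai_docs.py, and it pulls in nbformat, which that script uses to read notebooks.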