Commit 518ace9 · committed by omarsol
1 Parent(s): bd70582

update all documentation

data/scraping_scripts/github_to_markdown_ai_docs.py CHANGED
@@ -142,16 +142,33 @@ def download_file(file_url: str, file_path: str, retries: int = 0):
     else:
         print(f"Failed to download file after {MAX_RETRIES} retries: {e}")

-
-def convert_ipynb_to_md(ipynb_path: str, md_path: str):
-    with open(ipynb_path, "r", encoding="utf-8") as f:
-        notebook = nbformat.read(f, as_version=4)
-
-    exporter = MarkdownExporter()
-    markdown, _ = exporter.from_notebook_node(notebook)
-
-    with open(md_path, "w", encoding="utf-8") as f:
-        f.write(markdown)
+# def convert_ipynb_to_md(ipynb_path: str, md_path: str):
+#     with open(ipynb_path, "r", encoding="utf-8") as f:
+#         notebook = nbformat.read(f, as_version=4)
+
+#     exporter = MarkdownExporter()
+#     markdown, _ = exporter.from_notebook_node(notebook)
+
+#     with open(md_path, "w", encoding="utf-8") as f:
+#         f.write(markdown)
+
+
+def convert_ipynb_to_md(ipynb_path: str, md_path: str):
+    try:
+        with open(ipynb_path, "r", encoding="utf-8") as f:
+            notebook = nbformat.read(f, as_version=4)
+
+        exporter = MarkdownExporter()
+        markdown, _ = exporter.from_notebook_node(notebook)
+
+        with open(md_path, "w", encoding="utf-8") as f:
+            f.write(markdown)
+    except (json.JSONDecodeError, nbformat.reader.NotJSONError) as e:
+        print(f"Error converting notebook {ipynb_path}: {str(e)}")
+        print("Skipping this file and continuing with others...")
+    except Exception as e:
+        print(f"Unexpected error converting notebook {ipynb_path}: {str(e)}")
+        print("Skipping this file and continuing with others...")


 def fetch_files(api_url: str, local_dir: str):
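Note on the change above: scraped .ipynb files sometimes arrive truncated or as saved HTML error pages, and nbformat.read() then raises nbformat.reader.NotJSONError (a ValueError subclass) instead of returning a notebook, which previously aborted the whole scraping run. The new except clause also references json.JSONDecodeError, so the script needs `import json` at the top (not visible in this hunk). A minimal standalone sketch of the failure mode the new handler absorbs; the broken.ipynb file is fabricated here purely for illustration:

```python
# Standalone sketch (pip install nbformat): non-JSON content makes
# nbformat.read raise NotJSONError, the exact exception the new
# convert_ipynb_to_md catches so one bad file no longer stops the run.
import nbformat

with open("broken.ipynb", "w", encoding="utf-8") as f:  # fabricated corrupt download
    f.write("<html>404: Not Found</html>")

try:
    with open("broken.ipynb", "r", encoding="utf-8") as f:
        nbformat.read(f, as_version=4)
except nbformat.reader.NotJSONError as e:
    print(f"caught as expected: {e}")
```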
data/scraping_scripts/process_md_files.py CHANGED
@@ -381,7 +381,7 @@ SOURCE_CONFIGS = {
         "url_extension": ".ipynb",
     },
     "langchain": {
-        "base_url": "https://python.langchain.com/v0.2/docs/",
+        "base_url": "https://python.langchain.com/docs/",
         "input_directory": "data/langchain_md_files",
         "output_file": "data/langchain_data.jsonl",
         "source_name": "langchain",
@@ -460,11 +460,7 @@ def should_include_file(file_path: str, config: Dict) -> bool:

 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(
-        encoding.encode(
-            string, disallowed_special=(encoding.special_tokens_set - {"<|endoftext|>"})
-        )
-    )
+    num_tokens = len(encoding.encode(string, disallowed_special=()))
     return num_tokens
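This rewrite of num_tokens_from_string changes behavior, not just formatting: the old call disallowed every special token except <|endoftext|>, so scraped text containing a literal marker such as <|fim_prefix|> made encode() raise ValueError and halt processing. With disallowed_special=(), all special-token strings are treated as ordinary text, which is the right semantics for counting tokens in arbitrary scraped markdown. A standalone comparison (assumes tiktoken is installed; cl100k_base is chosen only for illustration):

```python
# Sketch contrasting the old and new disallowed_special settings
# (pip install tiktoken).
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "docs sometimes contain literal <|fim_prefix|> or <|endoftext|> markers"

# New behavior: nothing is disallowed, so special-token strings are
# encoded as ordinary text and counting never raises.
print(len(enc.encode(text, disallowed_special=())))

# Old behavior: every special token except <|endoftext|> was disallowed,
# so the stray <|fim_prefix|> above raised ValueError.
try:
    enc.encode(text, disallowed_special=(enc.special_tokens_set - {"<|endoftext|>"}))
except ValueError as e:
    print(f"old setting raised: {e}")
```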
requirements.txt CHANGED
@@ -17,4 +17,5 @@ google-generativeai
 llama-index-llms-gemini
 gradio
 pymongo
-huggingface_hub
+huggingface_hub
+nbconvert
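Note on the new pin: nbconvert provides the MarkdownExporter imported by github_to_markdown_ai_docs.py (and pulls in nbformat as a dependency), so the notebook-conversion path now installs cleanly from scratch. The huggingface_hub line shows as removed and re-added likely because the previous file lacked a trailing newline; its content is unchanged.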