Commit 518ace9 · committed by omarsol
1 Parent(s): bd70582

update all documentation

data/scraping_scripts/github_to_markdown_ai_docs.py CHANGED
@@ -142,16 +142,33 @@ def download_file(file_url: str, file_path: str, retries: int = 0):
     else:
         print(f"Failed to download file after {MAX_RETRIES} retries: {e}")

-
-def convert_ipynb_to_md(ipynb_path: str, md_path: str):
-    with open(ipynb_path, "r", encoding="utf-8") as f:
-        notebook = nbformat.read(f, as_version=4)
-
-    exporter = MarkdownExporter()
-    markdown, _ = exporter.from_notebook_node(notebook)
-
-    with open(md_path, "w", encoding="utf-8") as f:
-        f.write(markdown)
+# def convert_ipynb_to_md(ipynb_path: str, md_path: str):
+#     with open(ipynb_path, "r", encoding="utf-8") as f:
+#         notebook = nbformat.read(f, as_version=4)
+
+#     exporter = MarkdownExporter()
+#     markdown, _ = exporter.from_notebook_node(notebook)
+
+#     with open(md_path, "w", encoding="utf-8") as f:
+#         f.write(markdown)
+
+
+def convert_ipynb_to_md(ipynb_path: str, md_path: str):
+    try:
+        with open(ipynb_path, "r", encoding="utf-8") as f:
+            notebook = nbformat.read(f, as_version=4)
+
+        exporter = MarkdownExporter()
+        markdown, _ = exporter.from_notebook_node(notebook)
+
+        with open(md_path, "w", encoding="utf-8") as f:
+            f.write(markdown)
+    except (json.JSONDecodeError, nbformat.reader.NotJSONError) as e:
+        print(f"Error converting notebook {ipynb_path}: {str(e)}")
+        print("Skipping this file and continuing with others...")
+    except Exception as e:
+        print(f"Unexpected error converting notebook {ipynb_path}: {str(e)}")
+        print("Skipping this file and continuing with others...")


 def fetch_files(api_url: str, local_dir: str):
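Note on the change above: scraped .ipynb files sometimes arrive truncated or as saved HTML error pages, and nbformat.read() then raises nbformat.reader.NotJSONError (a ValueError subclass) instead of returning a notebook, which previously aborted the whole scraping run. The new except clause also references json.JSONDecodeError, so the script needs `import json` at the top (not visible in this hunk). A minimal standalone sketch of the failure mode the new handler absorbs; the broken.ipynb file is fabricated here purely for illustration:

```python
# Standalone sketch (pip install nbformat): non-JSON content makes
# nbformat.read raise NotJSONError, the exact exception the new
# convert_ipynb_to_md catches so one bad file no longer stops the run.
import nbformat

with open("broken.ipynb", "w", encoding="utf-8") as f:  # fabricated corrupt download
    f.write("<html>404: Not Found</html>")

try:
    with open("broken.ipynb", "r", encoding="utf-8") as f:
        nbformat.read(f, as_version=4)
except nbformat.reader.NotJSONError as e:
    print(f"caught as expected: {e}")
```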
data/scraping_scripts/process_md_files.py CHANGED
@@ -381,7 +381,7 @@ SOURCE_CONFIGS = {
         "url_extension": ".ipynb",
     },
     "langchain": {
-        "base_url": "https://python.langchain.com/v0.2/docs/",
+        "base_url": "https://python.langchain.com/docs/",
         "input_directory": "data/langchain_md_files",
         "output_file": "data/langchain_data.jsonl",
         "source_name": "langchain",
@@ -460,11 +460,7 @@ def should_include_file(file_path: str, config: Dict) -> bool:

 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(
-        encoding.encode(
-            string, disallowed_special=(encoding.special_tokens_set - {"<|endoftext|>"})
-        )
-    )
+    num_tokens = len(encoding.encode(string, disallowed_special=()))
     return num_tokens
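This rewrite of num_tokens_from_string changes behavior, not just formatting: the old call disallowed every special token except <|endoftext|>, so scraped text containing a literal marker such as <|fim_prefix|> made encode() raise ValueError and halt processing. With disallowed_special=(), all special-token strings are treated as ordinary text, which is the right semantics for counting tokens in arbitrary scraped markdown. A standalone comparison (assumes tiktoken is installed; cl100k_base is chosen only for illustration):

```python
# Sketch contrasting the old and new disallowed_special settings
# (pip install tiktoken).
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "docs sometimes contain literal <|fim_prefix|> or <|endoftext|> markers"

# New behavior: nothing is disallowed, so special-token strings are
# encoded as ordinary text and counting never raises.
print(len(enc.encode(text, disallowed_special=())))

# Old behavior: every special token except <|endoftext|> was disallowed,
# so the stray <|fim_prefix|> above raised ValueError.
try:
    enc.encode(text, disallowed_special=(enc.special_tokens_set - {"<|endoftext|>"}))
except ValueError as e:
    print(f"old setting raised: {e}")
```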
requirements.txt CHANGED
@@ -17,4 +17,5 @@ google-generativeai
 llama-index-llms-gemini
 gradio
 pymongo
-huggingface_hub
+huggingface_hub
+nbconvert
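Note on the new pin: nbconvert provides the MarkdownExporter imported by github_to_markdown_ai_docs.py (and pulls in nbformat as a dependency), so the notebook-conversion path now installs cleanly from scratch. The huggingface_hub line shows as removed and re-added likely because the previous file lacked a trailing newline; its content is unchanged.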