Omar Solano committed
Commit · 34b6f5e
1 Parent(s): 207bc12

update scripts
data/scraping_scripts/create_vector_stores.py CHANGED

@@ -28,6 +28,7 @@ import argparse
 import json
 import os
 import pickle
+import shutil
 from typing import Dict, List

 import chromadb
@@ -69,7 +70,7 @@ def create_docs(input_file: str) -> List[Document]:
             Document(
                 doc_id=data["doc_id"],
                 text=data["content"],
-                metadata={
+                metadata={ # type: ignore
                     "url": data["url"],
                     "title": data["name"],
                     "tokens": data["tokens"],
@@ -95,14 +96,22 @@ def create_docs(input_file: str) -> List[Document]:

 def process_source(source: str):
     config = SOURCE_CONFIGS[source]
+
     input_file = config["input_file"]
     db_name = config["db_name"]
+    db_path = f"data/{db_name}"

     print(f"Processing source: {source}")

     documents = create_docs(input_file)
     print(f"Created {len(documents)} documents")

+    # Check if the folder exists and delete it
+    if os.path.exists(db_path):
+        print(f"Existing database found at {db_path}. Deleting...")
+        shutil.rmtree(db_path)
+        print(f"Deleted existing database at {db_path}")
+
     # Create Chroma client and collection
     chroma_client = chromadb.PersistentClient(path=f"data/{db_name}")
     chroma_collection = chroma_client.create_collection(db_name)
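Why the new deletion block matters: chromadb's create_collection raises an exception when a collection with the same name already exists in the persistent store, so before this commit a second run of process_source against a leftover data/{db_name} directory would crash. Wiping the folder first makes the rebuild idempotent. A minimal sketch of the guard, with a hypothetical database name (the exact exception type varies across chromadb versions):

import os
import shutil

import chromadb

db_path = "data/example_db"  # hypothetical name for illustration

# Without this guard, a second run fails: create_collection
# raises when the collection already exists in the store.
if os.path.exists(db_path):
    shutil.rmtree(db_path)

client = chromadb.PersistentClient(path=db_path)
collection = client.create_collection("example_db")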
data/scraping_scripts/github_to_markdown_ai_docs.py CHANGED

@@ -38,8 +38,11 @@ from typing import Dict, List

 import nbformat
 import requests
+from dotenv import load_dotenv
 from nbconvert import MarkdownExporter

+load_dotenv()
+
 # Configuration for different sources
 SOURCE_CONFIGS = {
     "transformers": {
@@ -75,7 +78,7 @@ SOURCE_CONFIGS = {
 }

 # GitHub Personal Access Token (replace with your own token)
-GITHUB_TOKEN = "
+GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

 # Headers for authenticated requests
 HEADERS = {
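The token change swaps a hardcoded secret for an environment lookup: load_dotenv() reads a local .env file into the process environment, and os.getenv("GITHUB_TOKEN") then picks the token up without it ever living in source control. A minimal sketch of the pattern (the HEADERS contents below are an assumption, since the diff does not show them; "token" is the usual GitHub API auth scheme):

# .env (kept out of git, e.g. via .gitignore):
# GITHUB_TOKEN=ghp_your_personal_access_token

import os

from dotenv import load_dotenv

load_dotenv()  # populates os.environ from .env if present

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if GITHUB_TOKEN is None:
    raise RuntimeError("GITHUB_TOKEN not set; add it to .env or export it")

# Assumed header shape for authenticated GitHub API requests:
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

Note that os.getenv returns None rather than raising when the variable is missing, so a fail-fast check like the one above keeps later requests from silently going out unauthenticated.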
data/scraping_scripts/upload_dbs_to_hf.py CHANGED

@@ -15,7 +15,6 @@ The script will:

 Configuration:
 - The script is set to upload to the "towardsai-buster/test-data" dataset repository.
-- It ignores files with extensions .jsonl, .py, .txt, and .ipynb.
 - It deletes all existing files in the repository before uploading (due to delete_patterns=["*"]).
 """

@@ -30,5 +29,5 @@ api.upload_folder(
     multi_commits=True,
     multi_commits_verbose=True,
     delete_patterns=["*"],
-    ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb"],
+    ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb", "*.md", "*.pyc"],
 )
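With the docstring bullet about ignored extensions gone, the ignore_patterns argument is the single source of truth, and it now also skips Markdown and compiled-bytecode files. Combined with delete_patterns=["*"], each run clears the dataset repo and re-uploads only the files that survive the ignore list (the Chroma databases). A sketch of the resulting call; folder_path is an assumption, while repo_id comes from the script's docstring:

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="data",  # assumed local folder holding the vector stores
    repo_id="towardsai-buster/test-data",
    repo_type="dataset",
    # The multi_commits flags match the script as committed; they are
    # deprecated in newer versions of huggingface_hub:
    multi_commits=True,
    multi_commits_verbose=True,
    delete_patterns=["*"],  # clear the repo before uploading
    ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb", "*.md", "*.pyc"],
)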