Omar Solano committed on
Commit
34b6f5e
·
1 Parent(s): 207bc12

update scripts

Browse files
data/scraping_scripts/create_vector_stores.py CHANGED
@@ -28,6 +28,7 @@ import argparse
28
  import json
29
  import os
30
  import pickle
 
31
  from typing import Dict, List
32
 
33
  import chromadb
@@ -69,7 +70,7 @@ def create_docs(input_file: str) -> List[Document]:
69
  Document(
70
  doc_id=data["doc_id"],
71
  text=data["content"],
72
- metadata={
73
  "url": data["url"],
74
  "title": data["name"],
75
  "tokens": data["tokens"],
@@ -95,14 +96,22 @@ def create_docs(input_file: str) -> List[Document]:
95
 
96
  def process_source(source: str):
97
  config = SOURCE_CONFIGS[source]
 
98
  input_file = config["input_file"]
99
  db_name = config["db_name"]
 
100
 
101
  print(f"Processing source: {source}")
102
 
103
  documents = create_docs(input_file)
104
  print(f"Created {len(documents)} documents")
105
 
 
 
 
 
 
 
106
  # Create Chroma client and collection
107
  chroma_client = chromadb.PersistentClient(path=f"data/{db_name}")
108
  chroma_collection = chroma_client.create_collection(db_name)
 
28
  import json
29
  import os
30
  import pickle
31
+ import shutil
32
  from typing import Dict, List
33
 
34
  import chromadb
 
70
  Document(
71
  doc_id=data["doc_id"],
72
  text=data["content"],
73
+ metadata={ # type: ignore
74
  "url": data["url"],
75
  "title": data["name"],
76
  "tokens": data["tokens"],
 
96
 
97
  def process_source(source: str):
98
  config = SOURCE_CONFIGS[source]
99
+
100
  input_file = config["input_file"]
101
  db_name = config["db_name"]
102
+ db_path = f"data/{db_name}"
103
 
104
  print(f"Processing source: {source}")
105
 
106
  documents = create_docs(input_file)
107
  print(f"Created {len(documents)} documents")
108
 
109
+ # Check if the folder exists and delete it
110
+ if os.path.exists(db_path):
111
+ print(f"Existing database found at {db_path}. Deleting...")
112
+ shutil.rmtree(db_path)
113
+ print(f"Deleted existing database at {db_path}")
114
+
115
  # Create Chroma client and collection
116
  chroma_client = chromadb.PersistentClient(path=f"data/{db_name}")
117
  chroma_collection = chroma_client.create_collection(db_name)
data/scraping_scripts/github_to_markdown_ai_docs.py CHANGED
@@ -38,8 +38,11 @@ from typing import Dict, List
38
 
39
  import nbformat
40
  import requests
 
41
  from nbconvert import MarkdownExporter
42
 
 
 
43
  # Configuration for different sources
44
  SOURCE_CONFIGS = {
45
  "transformers": {
@@ -75,7 +78,7 @@ SOURCE_CONFIGS = {
75
  }
76
 
77
  # GitHub Personal Access Token (replace with your own token)
78
- GITHUB_TOKEN = "ghp_***REDACTED-LEAKED-TOKEN***"  # secret removed by this commit; redacted here — rotate/revoke if still active
79
 
80
  # Headers for authenticated requests
81
  HEADERS = {
 
38
 
39
  import nbformat
40
  import requests
41
+ from dotenv import load_dotenv
42
  from nbconvert import MarkdownExporter
43
 
44
+ load_dotenv()
45
+
46
  # Configuration for different sources
47
  SOURCE_CONFIGS = {
48
  "transformers": {
 
78
  }
79
 
80
  # GitHub Personal Access Token (replace with your own token)
81
+ GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
82
 
83
  # Headers for authenticated requests
84
  HEADERS = {
data/scraping_scripts/upload_dbs_to_hf.py CHANGED
@@ -15,7 +15,6 @@ The script will:
15
 
16
  Configuration:
17
  - The script is set to upload to the "towardsai-buster/test-data" dataset repository.
18
- - It ignores files with extensions .jsonl, .py, .txt, and .ipynb.
19
  - It deletes all existing files in the repository before uploading (due to delete_patterns=["*"]).
20
  """
21
 
@@ -30,5 +29,5 @@ api.upload_folder(
30
  multi_commits=True,
31
  multi_commits_verbose=True,
32
  delete_patterns=["*"],
33
- ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb"],
34
  )
 
15
 
16
  Configuration:
17
  - The script is set to upload to the "towardsai-buster/test-data" dataset repository.
 
18
  - It deletes all existing files in the repository before uploading (due to delete_patterns=["*"]).
19
  """
20
 
 
29
  multi_commits=True,
30
  multi_commits_verbose=True,
31
  delete_patterns=["*"],
32
+ ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb", "*.md", "*.pyc"],
33
  )