Omar Solano committed
Commit · 34b6f5e
1 Parent(s): 207bc12

update scripts
data/scraping_scripts/create_vector_stores.py CHANGED

@@ -28,6 +28,7 @@ import argparse
 import json
 import os
 import pickle
+import shutil
 from typing import Dict, List

 import chromadb
@@ -69,7 +70,7 @@ def create_docs(input_file: str) -> List[Document]:
             Document(
                 doc_id=data["doc_id"],
                 text=data["content"],
-                metadata={
+                metadata={ # type: ignore
                     "url": data["url"],
                     "title": data["name"],
                     "tokens": data["tokens"],
@@ -95,14 +96,22 @@ def create_docs(input_file: str) -> List[Document]:

 def process_source(source: str):
     config = SOURCE_CONFIGS[source]
+
     input_file = config["input_file"]
     db_name = config["db_name"]
+    db_path = f"data/{db_name}"

     print(f"Processing source: {source}")

     documents = create_docs(input_file)
     print(f"Created {len(documents)} documents")

+    # Check if the folder exists and delete it
+    if os.path.exists(db_path):
+        print(f"Existing database found at {db_path}. Deleting...")
+        shutil.rmtree(db_path)
+        print(f"Deleted existing database at {db_path}")
+
     # Create Chroma client and collection
     chroma_client = chromadb.PersistentClient(path=f"data/{db_name}")
     chroma_collection = chroma_client.create_collection(db_name)
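Why the new deletion block matters: chromadb's create_collection raises an exception when a collection with the same name already exists in the persistent store, so before this commit a second run of process_source against a leftover data/{db_name} directory would crash. Wiping the folder first makes the rebuild idempotent. A minimal sketch of the guard, with a hypothetical database name (the exact exception type varies across chromadb versions):

import os
import shutil

import chromadb

db_path = "data/example_db"  # hypothetical name for illustration

# Without this guard, a second run fails: create_collection
# raises when the collection already exists in the store.
if os.path.exists(db_path):
    shutil.rmtree(db_path)

client = chromadb.PersistentClient(path=db_path)
collection = client.create_collection("example_db")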
data/scraping_scripts/github_to_markdown_ai_docs.py CHANGED

@@ -38,8 +38,11 @@ from typing import Dict, List

 import nbformat
 import requests
+from dotenv import load_dotenv
 from nbconvert import MarkdownExporter

+load_dotenv()
+
 # Configuration for different sources
 SOURCE_CONFIGS = {
     "transformers": {
@@ -75,7 +78,7 @@ SOURCE_CONFIGS = {
 }

 # GitHub Personal Access Token (replace with your own token)
-GITHUB_TOKEN = "
+GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

 # Headers for authenticated requests
 HEADERS = {
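The token change swaps a hardcoded secret for an environment lookup: load_dotenv() reads a local .env file into the process environment, and os.getenv("GITHUB_TOKEN") then picks the token up without it ever living in source control. A minimal sketch of the pattern (the HEADERS contents below are an assumption, since the diff does not show them; "token" is the usual GitHub API auth scheme):

# .env (kept out of git, e.g. via .gitignore):
# GITHUB_TOKEN=ghp_your_personal_access_token

import os

from dotenv import load_dotenv

load_dotenv()  # populates os.environ from .env if present

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if GITHUB_TOKEN is None:
    raise RuntimeError("GITHUB_TOKEN not set; add it to .env or export it")

# Assumed header shape for authenticated GitHub API requests:
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

Note that os.getenv returns None rather than raising when the variable is missing, so a fail-fast check like the one above keeps later requests from silently going out unauthenticated.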
data/scraping_scripts/upload_dbs_to_hf.py CHANGED

@@ -15,7 +15,6 @@ The script will:

 Configuration:
 - The script is set to upload to the "towardsai-buster/test-data" dataset repository.
-- It ignores files with extensions .jsonl, .py, .txt, and .ipynb.
 - It deletes all existing files in the repository before uploading (due to delete_patterns=["*"]).
 """

@@ -30,5 +29,5 @@ api.upload_folder(
     multi_commits=True,
     multi_commits_verbose=True,
     delete_patterns=["*"],
-    ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb"],
+    ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb", "*.md", "*.pyc"],
 )
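With the docstring bullet about ignored extensions gone, the ignore_patterns argument is the single source of truth, and it now also skips Markdown and compiled-bytecode files. Combined with delete_patterns=["*"], each run clears the dataset repo and re-uploads only the files that survive the ignore list (the Chroma databases). A sketch of the resulting call; folder_path is an assumption, while repo_id comes from the script's docstring:

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="data",  # assumed local folder holding the vector stores
    repo_id="towardsai-buster/test-data",
    repo_type="dataset",
    # The multi_commits flags match the script as committed; they are
    # deprecated in newer versions of huggingface_hub:
    multi_commits=True,
    multi_commits_verbose=True,
    delete_patterns=["*"],  # clear the repo before uploading
    ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb", "*.md", "*.pyc"],
)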