Commit
·
1c9d91a
1
Parent(s):
f7d37d9
stop using gradio
Browse files- app.py +87 -245
- requirements.in +0 -6
- requirements.txt +0 -520
app.py
CHANGED
@@ -1,291 +1,133 @@
|
|
1 |
import asyncio
|
2 |
-
import re
|
3 |
from typing import Dict, List
|
4 |
|
5 |
import gradio as gr
|
6 |
import httpx
|
7 |
-
from cashews import cache
|
8 |
-
from huggingface_hub import ModelCard
|
9 |
|
10 |
-
|
11 |
|
12 |
-
cache.setup("mem://")
|
13 |
-
API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space"
|
14 |
-
HF_API_URL = "https://huggingface.co/api/datasets"
|
15 |
-
README_URL_TEMPLATE = "https://huggingface.co/datasets/{}/raw/main/README.md"
|
16 |
|
17 |
-
|
18 |
-
async def fetch_similar_datasets(dataset_id: str, limit: int = 10) -> List[Dict]:
|
19 |
async with httpx.AsyncClient() as client:
|
20 |
response = await client.get(
|
21 |
-
f"{API_URL}/
|
|
|
22 |
)
|
23 |
if response.status_code == 200:
|
24 |
-
|
25 |
-
# Remove the input dataset from the results
|
26 |
-
return [r for r in results if r["dataset_id"] != dataset_id][:limit]
|
27 |
return []
|
28 |
|
29 |
|
30 |
-
async def fetch_similar_datasets_by_text(query: str, limit: int =
|
31 |
-
async with httpx.AsyncClient(
|
32 |
response = await client.get(
|
33 |
-
f"{API_URL}/
|
34 |
)
|
35 |
if response.status_code == 200:
|
36 |
-
|
37 |
-
return results[:limit]
|
38 |
return []
|
39 |
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
if not results:
|
45 |
-
return "No similar datasets found."
|
46 |
-
|
47 |
-
# Fetch dataset cards and info concurrently
|
48 |
-
dataset_cards = await asyncio.gather(
|
49 |
-
*[fetch_dataset_card(result["dataset_id"]) for result in results]
|
50 |
-
)
|
51 |
-
dataset_infos = await asyncio.gather(
|
52 |
-
*[fetch_dataset_info(result["dataset_id"]) for result in results]
|
53 |
-
)
|
54 |
-
|
55 |
-
return format_results(results, dataset_cards, dataset_infos)
|
56 |
-
|
57 |
|
58 |
-
|
59 |
-
url = README_URL_TEMPLATE.format(dataset_id)
|
60 |
-
async with httpx.AsyncClient() as client:
|
61 |
-
response = await client.get(url)
|
62 |
-
return ModelCard(response.text).text if response.status_code == 200 else ""
|
63 |
-
|
64 |
-
|
65 |
-
async def fetch_dataset_info(dataset_id: str) -> Dict:
|
66 |
-
async with httpx.AsyncClient() as client:
|
67 |
-
response = await client.get(f"{HF_API_URL}/{dataset_id}")
|
68 |
-
return response.json() if response.status_code == 200 else {}
|
69 |
-
|
70 |
-
|
71 |
-
def format_results(
|
72 |
-
results: List[Dict], dataset_cards: List[str], dataset_infos: List[Dict]
|
73 |
-
) -> str:
|
74 |
-
markdown = (
|
75 |
-
"<h1 style='text-align: center;'>✨ Similar Datasets ✨</h1>\n\n"
|
76 |
-
)
|
77 |
-
for result, card, info in zip(results, dataset_cards, dataset_infos):
|
78 |
hub_id = result["dataset_id"]
|
79 |
similarity = result["similarity"]
|
|
|
80 |
url = f"https://huggingface.co/datasets/{hub_id}"
|
81 |
|
82 |
-
|
83 |
-
|
84 |
-
markdown +=
|
85 |
-
markdown += f"**Similarity Score:** {similarity:.4f}\n\n"
|
86 |
-
|
87 |
-
if info:
|
88 |
-
downloads = info.get("downloads", 0)
|
89 |
-
likes = info.get("likes", 0)
|
90 |
-
last_modified = info.get("lastModified", "N/A")
|
91 |
-
markdown += f"**Downloads:** {downloads} | **Likes:** {likes} | **Last Modified:** {last_modified}\n\n"
|
92 |
-
|
93 |
-
if card:
|
94 |
-
# Remove the title from the card content
|
95 |
-
card_without_title = re.sub(
|
96 |
-
r"^#.*\n", "", card, count=1, flags=re.MULTILINE
|
97 |
-
)
|
98 |
-
|
99 |
-
# Split the card into paragraphs
|
100 |
-
paragraphs = card_without_title.split("\n\n")
|
101 |
-
|
102 |
-
# Find the first non-empty text paragraph that's not just an image
|
103 |
-
preview = next(
|
104 |
-
(
|
105 |
-
p
|
106 |
-
for p in paragraphs
|
107 |
-
if p.strip()
|
108 |
-
and not p.strip().startswith("![")
|
109 |
-
and not p.strip().startswith("<img")
|
110 |
-
),
|
111 |
-
"No preview available.",
|
112 |
-
)
|
113 |
-
|
114 |
-
# Limit the preview to a reasonable length (e.g., 300 characters)
|
115 |
-
preview = f"{preview[:300]}..." if len(preview) > 300 else preview
|
116 |
-
|
117 |
-
# Add the preview
|
118 |
-
markdown += f"{preview}\n\n"
|
119 |
-
|
120 |
-
# Limit image size in the full dataset card
|
121 |
-
full_card = re.sub(
|
122 |
-
r'<img src="([^"]+)"',
|
123 |
-
r'<img src="\1" style="max-width: 300px; max-height: 300px;"',
|
124 |
-
card_without_title,
|
125 |
-
)
|
126 |
-
full_card = re.sub(
|
127 |
-
r"!\[([^\]]*)\]\(([^\)]+)\)",
|
128 |
-
r'<img src="\2" alt="\1" style="max-width: 300px; max-height: 300px;">',
|
129 |
-
full_card,
|
130 |
-
)
|
131 |
-
markdown += f"<details><summary>Full Dataset Card</summary>\n\n{full_card}\n\n</details>\n\n"
|
132 |
-
|
133 |
markdown += "---\n\n"
|
134 |
|
135 |
return markdown
|
136 |
|
137 |
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
dataset_cards = await asyncio.gather(
|
146 |
-
*[fetch_dataset_card(result["dataset_id"]) for result in results]
|
147 |
-
)
|
148 |
-
dataset_infos = await asyncio.gather(
|
149 |
-
*[fetch_dataset_info(result["dataset_id"]) for result in results]
|
150 |
)
|
151 |
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
f"{API_URL}/search-viewer", params={"query": query, "n": limit}
|
159 |
)
|
160 |
-
if response.status_code == 200:
|
161 |
-
results = response.json()["results"]
|
162 |
-
return format_viewer_results(results)
|
163 |
-
return "No results found."
|
164 |
-
|
165 |
-
|
166 |
-
def format_viewer_results(results: List[Dict]) -> str:
|
167 |
-
html = "<div style='height: 600px; overflow-y: auto;'>"
|
168 |
-
for result in results:
|
169 |
-
dataset_id = result["dataset_id"]
|
170 |
-
html += f"""
|
171 |
-
<div style='margin-bottom: 20px; border: 1px solid #ddd; padding: 10px;'>
|
172 |
-
<h3>{dataset_id}</h3>
|
173 |
-
<p><strong>Similarity Score:</strong> {result['similarity']:.4f}</p>
|
174 |
-
<iframe
|
175 |
-
src="https://huggingface.co/datasets/{dataset_id}/embed/viewer/default/train"
|
176 |
-
frameborder="0"
|
177 |
-
width="100%"
|
178 |
-
height="560px"
|
179 |
-
></iframe>
|
180 |
-
</div>
|
181 |
-
"""
|
182 |
-
html += "</div>"
|
183 |
-
return html
|
184 |
-
|
185 |
-
|
186 |
-
with gr.Blocks() as demo:
|
187 |
-
gr.Markdown("## 🤗 Dataset Search and Similarity")
|
188 |
-
|
189 |
-
with gr.Tabs():
|
190 |
-
with gr.TabItem("Similar Datasets"):
|
191 |
-
gr.Markdown("## 🤗 Dataset Similarity Search")
|
192 |
-
with gr.Row():
|
193 |
-
gr.Markdown(
|
194 |
-
"This Gradio app allows you to find similar datasets based on a given dataset ID or a text query. "
|
195 |
-
"Choose the search type and enter either a dataset ID or a text query to find similar datasets with previews of their dataset cards.\n\n"
|
196 |
-
"For a seamless experience on the Hugging Face website, check out the "
|
197 |
-
"[Hugging Face Similar Chrome extension](https://chromewebstore.google.com/detail/hugging-face-similar/aijelnjllajooinkcpkpbhckbghghpnl?authuser=0&hl=en). "
|
198 |
-
"This extension adds a 'Similar Datasets' section directly to Hugging Face dataset pages, "
|
199 |
-
"making it even easier to discover related datasets for your projects."
|
200 |
-
)
|
201 |
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
)
|
214 |
-
text_query = gr.Textbox(
|
215 |
-
label="Text Query (e.g., 'natural language processing dataset')",
|
216 |
-
visible=False,
|
217 |
-
)
|
218 |
-
|
219 |
-
with gr.Row():
|
220 |
-
search_btn = gr.Button("Search Similar Datasets")
|
221 |
-
max_results = gr.Slider(
|
222 |
-
minimum=1,
|
223 |
-
maximum=50,
|
224 |
-
step=1,
|
225 |
-
value=10,
|
226 |
-
label="Maximum number of results",
|
227 |
-
)
|
228 |
-
|
229 |
-
results = gr.Markdown()
|
230 |
-
|
231 |
-
def toggle_input_visibility(choice):
|
232 |
-
return gr.update(visible=choice == "Dataset ID"), gr.update(
|
233 |
-
visible=choice == "Text Query"
|
234 |
-
)
|
235 |
-
|
236 |
-
search_type.change(
|
237 |
-
toggle_input_visibility,
|
238 |
-
inputs=[search_type],
|
239 |
-
outputs=[dataset_id, text_query],
|
240 |
)
|
241 |
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
)
|
251 |
|
252 |
-
|
253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
254 |
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
"Unlike the other search methods, this search utilizes the dataset viewer embedded in most datasets to match your query. "
|
261 |
-
"This means it doesn't rely on the dataset card for matching!\n\n"
|
262 |
-
"Enter a query to find relevant datasets and preview them directly using the dataset viewer.\n\n"
|
263 |
-
"Currently, this search is using a subset of datasets and a very early version of an embedding model to match natural language queries to datasets."
|
264 |
-
"**Help us improve!** Contribute to query quality improvement by participating in our "
|
265 |
-
"[Argilla annotation task](https://huggingface.co/spaces/davanstrien/my-argilla). Your feedback helps refine search results for everyone."
|
266 |
-
)
|
267 |
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
|
|
272 |
|
273 |
-
|
274 |
-
|
275 |
-
viewer_max_results = gr.Slider(
|
276 |
-
minimum=1,
|
277 |
-
maximum=50,
|
278 |
-
step=1,
|
279 |
-
value=10,
|
280 |
-
label="Maximum number of results",
|
281 |
-
)
|
282 |
|
283 |
-
|
284 |
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
290 |
|
291 |
demo.launch()
|
|
|
1 |
import asyncio
|
|
|
2 |
from typing import Dict, List
|
3 |
|
4 |
import gradio as gr
|
5 |
import httpx
|
|
|
|
|
6 |
|
7 |
+
# Base URL of the search backend; the fetch helpers in this file append
# their endpoint paths to it.
# NOTE(review): this targets localhost — confirm that is intended for the
# deployed app and not only for local development.
API_URL = "http://localhost:8000"
|
8 |
|
|
|
|
|
|
|
|
|
9 |
|
10 |
+
async def fetch_similar_datasets(dataset_id: str, limit: int = 5) -> List[Dict]:
    """Fetch datasets similar to ``dataset_id`` from the similarity endpoint.

    Args:
        dataset_id: ID of the reference dataset, sent as the ``dataset_id``
            query parameter.
        limit: Number of results to request, sent as the ``k`` query
            parameter.

    Returns:
        The ``results`` entry of the JSON response. An empty list is returned
        when the request cannot be completed, the status code is not 200, or
        the body is not JSON containing a ``results`` key.
    """
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"{API_URL}/similarity/datasets",
                params={"dataset_id": dataset_id, "k": limit},
            )
    except httpx.HTTPError:
        # Connection failures and timeouts are treated like a non-200 reply
        # so callers get one uniform "no results" signal.
        return []

    if response.status_code != 200:
        return []

    try:
        return response.json()["results"]
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or did not have the expected shape.
        return []
|
19 |
|
20 |
|
21 |
+
async def fetch_similar_datasets_by_text(query: str, limit: int = 5) -> List[Dict]:
    """Fetch datasets matching a free-text ``query`` from the search endpoint.

    Args:
        query: Search text, sent as the ``query`` query parameter.
        limit: Number of results to request, sent as the ``k`` query
            parameter.

    Returns:
        The ``results`` entry of the JSON response. An empty list is returned
        when the request cannot be completed, the status code is not 200, or
        the body is not JSON containing a ``results`` key.
    """
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"{API_URL}/search/datasets", params={"query": query, "k": limit}
            )
    except httpx.HTTPError:
        # Connection failures and timeouts are treated like a non-200 reply
        # so callers get one uniform "no results" signal.
        return []

    if response.status_code != 200:
        return []

    try:
        return response.json()["results"]
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or did not have the expected shape.
        return []
|
29 |
|
30 |
|
31 |
+
def format_results(results: List[Dict]) -> str:
    """Render search results as a Markdown listing.

    Each result becomes a level-3 heading linking to the dataset page on
    huggingface.co, followed by its similarity score (two decimals), its
    summary, and a horizontal rule.

    Args:
        results: Result dicts. Each must provide ``dataset_id`` and a numeric
            ``similarity``; ``summary`` is optional.

    Returns:
        The concatenated Markdown, or an empty string for no results.

    Raises:
        KeyError: If a result lacks ``dataset_id`` or ``similarity``.
    """
    sections = []
    for result in results:
        hub_id = result["dataset_id"]
        similarity = result["similarity"]
        # ``or`` (rather than a .get default) also covers a summary that is
        # present but null/empty, which would otherwise render as "None".
        summary = result.get("summary") or "No summary available."
        url = f"https://huggingface.co/datasets/{hub_id}"

        sections.append(
            f"### [{hub_id}]({url})\n"
            f"*Similarity: {similarity:.2f}*\n\n"
            f"{summary}\n\n"
            "---\n\n"
        )

    # Join once instead of growing a string with repeated ``+=``.
    return "".join(sections)
|
46 |
|
47 |
|
48 |
+
# Gradio UI: choose a search method, enter a dataset ID or a text query, and
# show the formatted matches in a Markdown panel.
with gr.Blocks() as demo:
    gr.Markdown(
        """
    # 🔍 Dataset Explorer
    Find similar datasets or search by text query
    """,
        elem_classes=["center-text"],
    )

    with gr.Column(variant="panel"):
        search_type = gr.Radio(
            ["Dataset ID", "Text Query"],
            label="Search Method",
            value="Dataset ID",
            container=False,
        )

        with gr.Group():
            dataset_id = gr.Textbox(
                value="airtrain-ai/fineweb-edu-fortified",
                label="Dataset ID",
                container=False,
            )
            text_query = gr.Textbox(
                label="Text Query",
                placeholder="Enter at least 3 characters...",
                container=False,
                visible=False,
            )

        with gr.Row():
            search_btn = gr.Button("🔍 Search", size="lg")
            max_results = gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=5,
                label="Number of results",
            )

    results = gr.Markdown(elem_classes=["results-container"])

    def toggle_input_visibility(choice):
        """Return visibility updates for the ID box, query box and button.

        The dataset-ID textbox and the search button are shown only for
        "Dataset ID"; the text-query textbox only for "Text Query".
        """
        by_id = choice == "Dataset ID"
        return (
            gr.update(visible=by_id),
            gr.update(visible=choice == "Text Query"),
            gr.update(visible=by_id),
        )

    search_type.change(
        toggle_input_visibility,
        inputs=[search_type],
        outputs=[dataset_id, text_query, search_btn],
    )

    async def search_handler(search_type, dataset_id, text_query, limit):
        """Run the selected search and return Markdown for the results panel."""
        if search_type == "Dataset ID":
            found = await fetch_similar_datasets(dataset_id, limit)
        else:
            found = await fetch_similar_datasets_by_text(text_query, limit)

        if not found:
            return "No similar datasets found."

        return format_results(found)

    async def live_text_search(search_type, text_query, limit):
        """Search while the user types, once the query has 3+ characters.

        Shorter queries return ``None`` without contacting the backend.
        """
        if len(text_query) < 3:
            return None
        return await search_handler(search_type, "", text_query, limit)

    # The coroutine functions are registered directly; wrapping them in
    # ``asyncio.run`` would spin up a fresh event loop for every event.
    text_query.input(
        live_text_search,
        inputs=[search_type, text_query, max_results],
        outputs=results,
        api_name=False,
    )

    search_btn.click(
        search_handler,
        inputs=[search_type, dataset_id, text_query, max_results],
        outputs=results,
    )

demo.launch()
|
requirements.in
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
cashews
|
2 |
-
gradio
|
3 |
-
httpx
|
4 |
-
huggingface_hub
|
5 |
-
ragatouille
|
6 |
-
toolz
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
DELETED
@@ -1,520 +0,0 @@
|
|
1 |
-
# This file was autogenerated by uv via the following command:
|
2 |
-
# uv pip compile requirements.in -o requirements.txt
|
3 |
-
aiofiles==23.2.1
|
4 |
-
# via gradio
|
5 |
-
aiohappyeyeballs==2.4.0
|
6 |
-
# via aiohttp
|
7 |
-
aiohttp==3.10.5
|
8 |
-
# via
|
9 |
-
# datasets
|
10 |
-
# fsspec
|
11 |
-
# langchain
|
12 |
-
# llama-index-core
|
13 |
-
# llama-index-legacy
|
14 |
-
aiosignal==1.3.1
|
15 |
-
# via aiohttp
|
16 |
-
annotated-types==0.7.0
|
17 |
-
# via pydantic
|
18 |
-
anyio==4.4.0
|
19 |
-
# via
|
20 |
-
# gradio
|
21 |
-
# httpx
|
22 |
-
# openai
|
23 |
-
# starlette
|
24 |
-
attrs==24.2.0
|
25 |
-
# via aiohttp
|
26 |
-
beautifulsoup4==4.12.3
|
27 |
-
# via llama-index-readers-file
|
28 |
-
bitarray==2.9.2
|
29 |
-
# via colbert-ai
|
30 |
-
blinker==1.8.2
|
31 |
-
# via flask
|
32 |
-
cashews==7.3.1
|
33 |
-
# via -r requirements.in
|
34 |
-
catalogue==2.0.10
|
35 |
-
# via srsly
|
36 |
-
certifi==2024.8.30
|
37 |
-
# via
|
38 |
-
# httpcore
|
39 |
-
# httpx
|
40 |
-
# requests
|
41 |
-
charset-normalizer==3.3.2
|
42 |
-
# via requests
|
43 |
-
click==8.1.7
|
44 |
-
# via
|
45 |
-
# flask
|
46 |
-
# nltk
|
47 |
-
# typer
|
48 |
-
# uvicorn
|
49 |
-
colbert-ai==0.2.19
|
50 |
-
# via ragatouille
|
51 |
-
contourpy==1.3.0
|
52 |
-
# via matplotlib
|
53 |
-
cycler==0.12.1
|
54 |
-
# via matplotlib
|
55 |
-
dataclasses-json==0.6.7
|
56 |
-
# via
|
57 |
-
# llama-index-core
|
58 |
-
# llama-index-legacy
|
59 |
-
datasets==2.14.4
|
60 |
-
# via colbert-ai
|
61 |
-
deprecated==1.2.14
|
62 |
-
# via
|
63 |
-
# llama-index-core
|
64 |
-
# llama-index-legacy
|
65 |
-
dill==0.3.7
|
66 |
-
# via
|
67 |
-
# datasets
|
68 |
-
# multiprocess
|
69 |
-
dirtyjson==1.0.8
|
70 |
-
# via
|
71 |
-
# llama-index-core
|
72 |
-
# llama-index-legacy
|
73 |
-
distro==1.9.0
|
74 |
-
# via openai
|
75 |
-
faiss-cpu==1.8.0.post1
|
76 |
-
# via ragatouille
|
77 |
-
fast-pytorch-kmeans==0.2.0.1
|
78 |
-
# via ragatouille
|
79 |
-
fastapi==0.112.4
|
80 |
-
# via gradio
|
81 |
-
ffmpy==0.4.0
|
82 |
-
# via gradio
|
83 |
-
filelock==3.16.0
|
84 |
-
# via
|
85 |
-
# huggingface-hub
|
86 |
-
# torch
|
87 |
-
# transformers
|
88 |
-
flask==3.0.3
|
89 |
-
# via colbert-ai
|
90 |
-
fonttools==4.53.1
|
91 |
-
# via matplotlib
|
92 |
-
frozenlist==1.4.1
|
93 |
-
# via
|
94 |
-
# aiohttp
|
95 |
-
# aiosignal
|
96 |
-
fsspec==2024.9.0
|
97 |
-
# via
|
98 |
-
# datasets
|
99 |
-
# gradio-client
|
100 |
-
# huggingface-hub
|
101 |
-
# llama-index-core
|
102 |
-
# llama-index-legacy
|
103 |
-
# torch
|
104 |
-
git-python==1.0.3
|
105 |
-
# via colbert-ai
|
106 |
-
gitdb==4.0.11
|
107 |
-
# via gitpython
|
108 |
-
gitpython==3.1.43
|
109 |
-
# via git-python
|
110 |
-
gradio==4.43.0
|
111 |
-
# via -r requirements.in
|
112 |
-
gradio-client==1.3.0
|
113 |
-
# via gradio
|
114 |
-
greenlet==3.1.0
|
115 |
-
# via sqlalchemy
|
116 |
-
h11==0.14.0
|
117 |
-
# via
|
118 |
-
# httpcore
|
119 |
-
# uvicorn
|
120 |
-
httpcore==1.0.5
|
121 |
-
# via httpx
|
122 |
-
httpx==0.27.2
|
123 |
-
# via
|
124 |
-
# -r requirements.in
|
125 |
-
# gradio
|
126 |
-
# gradio-client
|
127 |
-
# langsmith
|
128 |
-
# llama-cloud
|
129 |
-
# llama-index-core
|
130 |
-
# llama-index-legacy
|
131 |
-
# openai
|
132 |
-
huggingface-hub==0.24.6
|
133 |
-
# via
|
134 |
-
# -r requirements.in
|
135 |
-
# datasets
|
136 |
-
# gradio
|
137 |
-
# gradio-client
|
138 |
-
# sentence-transformers
|
139 |
-
# tokenizers
|
140 |
-
# transformers
|
141 |
-
idna==3.8
|
142 |
-
# via
|
143 |
-
# anyio
|
144 |
-
# httpx
|
145 |
-
# requests
|
146 |
-
# yarl
|
147 |
-
importlib-resources==6.4.4
|
148 |
-
# via gradio
|
149 |
-
itsdangerous==2.2.0
|
150 |
-
# via flask
|
151 |
-
jinja2==3.1.4
|
152 |
-
# via
|
153 |
-
# flask
|
154 |
-
# gradio
|
155 |
-
# torch
|
156 |
-
jiter==0.5.0
|
157 |
-
# via openai
|
158 |
-
joblib==1.4.2
|
159 |
-
# via
|
160 |
-
# nltk
|
161 |
-
# scikit-learn
|
162 |
-
jsonpatch==1.33
|
163 |
-
# via langchain-core
|
164 |
-
jsonpointer==3.0.0
|
165 |
-
# via jsonpatch
|
166 |
-
kiwisolver==1.4.7
|
167 |
-
# via matplotlib
|
168 |
-
langchain==0.2.16
|
169 |
-
# via ragatouille
|
170 |
-
langchain-core==0.2.39
|
171 |
-
# via
|
172 |
-
# langchain
|
173 |
-
# langchain-text-splitters
|
174 |
-
# ragatouille
|
175 |
-
langchain-text-splitters==0.2.4
|
176 |
-
# via langchain
|
177 |
-
langsmith==0.1.117
|
178 |
-
# via
|
179 |
-
# langchain
|
180 |
-
# langchain-core
|
181 |
-
llama-cloud==0.0.17
|
182 |
-
# via llama-index-indices-managed-llama-cloud
|
183 |
-
llama-index==0.11.8
|
184 |
-
# via ragatouille
|
185 |
-
llama-index-agent-openai==0.3.1
|
186 |
-
# via
|
187 |
-
# llama-index
|
188 |
-
# llama-index-llms-openai
|
189 |
-
# llama-index-program-openai
|
190 |
-
llama-index-cli==0.3.1
|
191 |
-
# via llama-index
|
192 |
-
llama-index-core==0.11.8
|
193 |
-
# via
|
194 |
-
# llama-index
|
195 |
-
# llama-index-agent-openai
|
196 |
-
# llama-index-cli
|
197 |
-
# llama-index-embeddings-openai
|
198 |
-
# llama-index-indices-managed-llama-cloud
|
199 |
-
# llama-index-llms-openai
|
200 |
-
# llama-index-multi-modal-llms-openai
|
201 |
-
# llama-index-program-openai
|
202 |
-
# llama-index-question-gen-openai
|
203 |
-
# llama-index-readers-file
|
204 |
-
# llama-index-readers-llama-parse
|
205 |
-
# llama-parse
|
206 |
-
llama-index-embeddings-openai==0.2.4
|
207 |
-
# via
|
208 |
-
# llama-index
|
209 |
-
# llama-index-cli
|
210 |
-
llama-index-indices-managed-llama-cloud==0.3.0
|
211 |
-
# via llama-index
|
212 |
-
llama-index-legacy==0.9.48.post3
|
213 |
-
# via llama-index
|
214 |
-
llama-index-llms-openai==0.2.3
|
215 |
-
# via
|
216 |
-
# llama-index
|
217 |
-
# llama-index-agent-openai
|
218 |
-
# llama-index-cli
|
219 |
-
# llama-index-multi-modal-llms-openai
|
220 |
-
# llama-index-program-openai
|
221 |
-
# llama-index-question-gen-openai
|
222 |
-
llama-index-multi-modal-llms-openai==0.2.0
|
223 |
-
# via llama-index
|
224 |
-
llama-index-program-openai==0.2.0
|
225 |
-
# via
|
226 |
-
# llama-index
|
227 |
-
# llama-index-question-gen-openai
|
228 |
-
llama-index-question-gen-openai==0.2.0
|
229 |
-
# via llama-index
|
230 |
-
llama-index-readers-file==0.2.1
|
231 |
-
# via llama-index
|
232 |
-
llama-index-readers-llama-parse==0.3.0
|
233 |
-
# via llama-index
|
234 |
-
llama-parse==0.5.5
|
235 |
-
# via llama-index-readers-llama-parse
|
236 |
-
markdown-it-py==3.0.0
|
237 |
-
# via rich
|
238 |
-
markupsafe==2.1.5
|
239 |
-
# via
|
240 |
-
# gradio
|
241 |
-
# jinja2
|
242 |
-
# werkzeug
|
243 |
-
marshmallow==3.22.0
|
244 |
-
# via dataclasses-json
|
245 |
-
matplotlib==3.9.2
|
246 |
-
# via gradio
|
247 |
-
mdurl==0.1.2
|
248 |
-
# via markdown-it-py
|
249 |
-
mpmath==1.3.0
|
250 |
-
# via sympy
|
251 |
-
multidict==6.1.0
|
252 |
-
# via
|
253 |
-
# aiohttp
|
254 |
-
# yarl
|
255 |
-
multiprocess==0.70.15
|
256 |
-
# via datasets
|
257 |
-
mypy-extensions==1.0.0
|
258 |
-
# via typing-inspect
|
259 |
-
nest-asyncio==1.6.0
|
260 |
-
# via
|
261 |
-
# llama-index-core
|
262 |
-
# llama-index-legacy
|
263 |
-
networkx==3.3
|
264 |
-
# via
|
265 |
-
# llama-index-core
|
266 |
-
# llama-index-legacy
|
267 |
-
# torch
|
268 |
-
ninja==1.11.1.1
|
269 |
-
# via colbert-ai
|
270 |
-
nltk==3.9.1
|
271 |
-
# via
|
272 |
-
# llama-index
|
273 |
-
# llama-index-core
|
274 |
-
# llama-index-legacy
|
275 |
-
numpy==1.26.4
|
276 |
-
# via
|
277 |
-
# contourpy
|
278 |
-
# datasets
|
279 |
-
# faiss-cpu
|
280 |
-
# fast-pytorch-kmeans
|
281 |
-
# gradio
|
282 |
-
# langchain
|
283 |
-
# llama-index-core
|
284 |
-
# llama-index-legacy
|
285 |
-
# matplotlib
|
286 |
-
# onnx
|
287 |
-
# pandas
|
288 |
-
# pyarrow
|
289 |
-
# scikit-learn
|
290 |
-
# scipy
|
291 |
-
# sentence-transformers
|
292 |
-
# transformers
|
293 |
-
# voyager
|
294 |
-
onnx==1.16.2
|
295 |
-
# via ragatouille
|
296 |
-
openai==1.44.1
|
297 |
-
# via
|
298 |
-
# llama-index-agent-openai
|
299 |
-
# llama-index-embeddings-openai
|
300 |
-
# llama-index-legacy
|
301 |
-
# llama-index-llms-openai
|
302 |
-
orjson==3.10.7
|
303 |
-
# via
|
304 |
-
# gradio
|
305 |
-
# langsmith
|
306 |
-
packaging==24.1
|
307 |
-
# via
|
308 |
-
# datasets
|
309 |
-
# faiss-cpu
|
310 |
-
# gradio
|
311 |
-
# gradio-client
|
312 |
-
# huggingface-hub
|
313 |
-
# langchain-core
|
314 |
-
# marshmallow
|
315 |
-
# matplotlib
|
316 |
-
# transformers
|
317 |
-
pandas==2.2.2
|
318 |
-
# via
|
319 |
-
# datasets
|
320 |
-
# gradio
|
321 |
-
# llama-index-legacy
|
322 |
-
# llama-index-readers-file
|
323 |
-
pillow==10.4.0
|
324 |
-
# via
|
325 |
-
# gradio
|
326 |
-
# llama-index-core
|
327 |
-
# matplotlib
|
328 |
-
# sentence-transformers
|
329 |
-
protobuf==5.28.0
|
330 |
-
# via onnx
|
331 |
-
pyarrow==17.0.0
|
332 |
-
# via datasets
|
333 |
-
pydantic==2.9.1
|
334 |
-
# via
|
335 |
-
# fastapi
|
336 |
-
# gradio
|
337 |
-
# langchain
|
338 |
-
# langchain-core
|
339 |
-
# langsmith
|
340 |
-
# llama-cloud
|
341 |
-
# llama-index-core
|
342 |
-
# openai
|
343 |
-
pydantic-core==2.23.3
|
344 |
-
# via pydantic
|
345 |
-
pydub==0.25.1
|
346 |
-
# via gradio
|
347 |
-
pygments==2.18.0
|
348 |
-
# via rich
|
349 |
-
pynvml==11.5.3
|
350 |
-
# via fast-pytorch-kmeans
|
351 |
-
pyparsing==3.1.4
|
352 |
-
# via matplotlib
|
353 |
-
pypdf==4.3.1
|
354 |
-
# via llama-index-readers-file
|
355 |
-
python-dateutil==2.9.0.post0
|
356 |
-
# via
|
357 |
-
# matplotlib
|
358 |
-
# pandas
|
359 |
-
python-dotenv==1.0.1
|
360 |
-
# via colbert-ai
|
361 |
-
python-multipart==0.0.9
|
362 |
-
# via gradio
|
363 |
-
pytz==2024.1
|
364 |
-
# via pandas
|
365 |
-
pyyaml==6.0.2
|
366 |
-
# via
|
367 |
-
# datasets
|
368 |
-
# gradio
|
369 |
-
# huggingface-hub
|
370 |
-
# langchain
|
371 |
-
# langchain-core
|
372 |
-
# llama-index-core
|
373 |
-
# transformers
|
374 |
-
ragatouille==0.0.8.post4
|
375 |
-
# via -r requirements.in
|
376 |
-
regex==2024.7.24
|
377 |
-
# via
|
378 |
-
# nltk
|
379 |
-
# tiktoken
|
380 |
-
# transformers
|
381 |
-
requests==2.32.3
|
382 |
-
# via
|
383 |
-
# datasets
|
384 |
-
# huggingface-hub
|
385 |
-
# langchain
|
386 |
-
# langsmith
|
387 |
-
# llama-index-core
|
388 |
-
# llama-index-legacy
|
389 |
-
# tiktoken
|
390 |
-
# transformers
|
391 |
-
rich==13.8.0
|
392 |
-
# via typer
|
393 |
-
ruff==0.6.4
|
394 |
-
# via gradio
|
395 |
-
safetensors==0.4.5
|
396 |
-
# via transformers
|
397 |
-
scikit-learn==1.5.1
|
398 |
-
# via sentence-transformers
|
399 |
-
scipy==1.14.1
|
400 |
-
# via
|
401 |
-
# colbert-ai
|
402 |
-
# scikit-learn
|
403 |
-
# sentence-transformers
|
404 |
-
semantic-version==2.10.0
|
405 |
-
# via gradio
|
406 |
-
sentence-transformers==2.7.0
|
407 |
-
# via ragatouille
|
408 |
-
setuptools==74.1.2
|
409 |
-
# via torch
|
410 |
-
shellingham==1.5.4
|
411 |
-
# via typer
|
412 |
-
six==1.16.0
|
413 |
-
# via python-dateutil
|
414 |
-
smmap==5.0.1
|
415 |
-
# via gitdb
|
416 |
-
sniffio==1.3.1
|
417 |
-
# via
|
418 |
-
# anyio
|
419 |
-
# httpx
|
420 |
-
# openai
|
421 |
-
soupsieve==2.6
|
422 |
-
# via beautifulsoup4
|
423 |
-
sqlalchemy==2.0.34
|
424 |
-
# via
|
425 |
-
# langchain
|
426 |
-
# llama-index-core
|
427 |
-
# llama-index-legacy
|
428 |
-
srsly==2.4.8
|
429 |
-
# via ragatouille
|
430 |
-
starlette==0.38.5
|
431 |
-
# via fastapi
|
432 |
-
striprtf==0.0.26
|
433 |
-
# via llama-index-readers-file
|
434 |
-
sympy==1.13.2
|
435 |
-
# via torch
|
436 |
-
tenacity==8.5.0
|
437 |
-
# via
|
438 |
-
# langchain
|
439 |
-
# langchain-core
|
440 |
-
# llama-index-core
|
441 |
-
# llama-index-legacy
|
442 |
-
threadpoolctl==3.5.0
|
443 |
-
# via scikit-learn
|
444 |
-
tiktoken==0.7.0
|
445 |
-
# via
|
446 |
-
# llama-index-core
|
447 |
-
# llama-index-legacy
|
448 |
-
tokenizers==0.19.1
|
449 |
-
# via transformers
|
450 |
-
tomlkit==0.12.0
|
451 |
-
# via gradio
|
452 |
-
toolz==0.12.1
|
453 |
-
# via -r requirements.in
|
454 |
-
torch==2.4.1
|
455 |
-
# via
|
456 |
-
# fast-pytorch-kmeans
|
457 |
-
# ragatouille
|
458 |
-
# sentence-transformers
|
459 |
-
tqdm==4.66.5
|
460 |
-
# via
|
461 |
-
# colbert-ai
|
462 |
-
# datasets
|
463 |
-
# huggingface-hub
|
464 |
-
# llama-index-core
|
465 |
-
# nltk
|
466 |
-
# openai
|
467 |
-
# sentence-transformers
|
468 |
-
# transformers
|
469 |
-
transformers==4.44.2
|
470 |
-
# via
|
471 |
-
# colbert-ai
|
472 |
-
# ragatouille
|
473 |
-
# sentence-transformers
|
474 |
-
typer==0.12.5
|
475 |
-
# via gradio
|
476 |
-
typing-extensions==4.12.2
|
477 |
-
# via
|
478 |
-
# fastapi
|
479 |
-
# gradio
|
480 |
-
# gradio-client
|
481 |
-
# huggingface-hub
|
482 |
-
# langchain-core
|
483 |
-
# llama-index-core
|
484 |
-
# llama-index-legacy
|
485 |
-
# openai
|
486 |
-
# pydantic
|
487 |
-
# pydantic-core
|
488 |
-
# sqlalchemy
|
489 |
-
# torch
|
490 |
-
# typer
|
491 |
-
# typing-inspect
|
492 |
-
typing-inspect==0.9.0
|
493 |
-
# via
|
494 |
-
# dataclasses-json
|
495 |
-
# llama-index-core
|
496 |
-
# llama-index-legacy
|
497 |
-
tzdata==2024.1
|
498 |
-
# via pandas
|
499 |
-
ujson==5.10.0
|
500 |
-
# via colbert-ai
|
501 |
-
urllib3==2.2.2
|
502 |
-
# via
|
503 |
-
# gradio
|
504 |
-
# requests
|
505 |
-
uvicorn==0.30.6
|
506 |
-
# via gradio
|
507 |
-
voyager==2.0.9
|
508 |
-
# via ragatouille
|
509 |
-
websockets==12.0
|
510 |
-
# via gradio-client
|
511 |
-
werkzeug==3.0.4
|
512 |
-
# via flask
|
513 |
-
wrapt==1.16.0
|
514 |
-
# via
|
515 |
-
# deprecated
|
516 |
-
# llama-index-core
|
517 |
-
xxhash==3.5.0
|
518 |
-
# via datasets
|
519 |
-
yarl==1.11.1
|
520 |
-
# via aiohttp
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|