davanstrien (HF staff) committed
Commit 1c9d91a · 1 Parent(s): f7d37d9

stop using gradio

Files changed (3)
  1. app.py +87 -245
  2. requirements.in +0 -6
  3. requirements.txt +0 -520
app.py CHANGED
@@ -1,291 +1,133 @@
  import asyncio
- import re
  from typing import Dict, List

  import gradio as gr
  import httpx
- from cashews import cache
- from huggingface_hub import ModelCard

- from ragatouille_search import create_ragatouille_interface
+ API_URL = "http://localhost:8000"

- cache.setup("mem://")
- API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space"
- HF_API_URL = "https://huggingface.co/api/datasets"
- README_URL_TEMPLATE = "https://huggingface.co/datasets/{}/raw/main/README.md"

-
- async def fetch_similar_datasets(dataset_id: str, limit: int = 10) -> List[Dict]:
+ async def fetch_similar_datasets(dataset_id: str, limit: int = 5) -> List[Dict]:
      async with httpx.AsyncClient() as client:
          response = await client.get(
-             f"{API_URL}/similar?dataset_id={dataset_id}&n={limit + 1}"
+             f"{API_URL}/similarity/datasets",
+             params={"dataset_id": dataset_id, "k": limit},
          )
          if response.status_code == 200:
-             results = response.json()["results"]
-             # Remove the input dataset from the results
-             return [r for r in results if r["dataset_id"] != dataset_id][:limit]
+             return response.json()["results"]
          return []


- async def fetch_similar_datasets_by_text(query: str, limit: int = 10) -> List[Dict]:
-     async with httpx.AsyncClient(timeout=30) as client:
+ async def fetch_similar_datasets_by_text(query: str, limit: int = 5) -> List[Dict]:
+     async with httpx.AsyncClient() as client:
          response = await client.get(
-             f"{API_URL}/similar-text", params={"query": query, "n": limit + 1}
+             f"{API_URL}/search/datasets", params={"query": query, "k": limit}
          )
          if response.status_code == 200:
-             results = response.json()["results"]
-             return results[:limit]
+             return response.json()["results"]
          return []


- async def search_similar_datasets_by_text(query: str, limit: int = 10):
-     results = await fetch_similar_datasets_by_text(query, limit)
-
-     if not results:
-         return "No similar datasets found."
-
-     # Fetch dataset cards and info concurrently
-     dataset_cards = await asyncio.gather(
-         *[fetch_dataset_card(result["dataset_id"]) for result in results]
-     )
-     dataset_infos = await asyncio.gather(
-         *[fetch_dataset_info(result["dataset_id"]) for result in results]
-     )
-
-     return format_results(results, dataset_cards, dataset_infos)
-
+ def format_results(results: List[Dict]) -> str:
+     markdown = ""

- async def fetch_dataset_card(dataset_id: str) -> str:
-     url = README_URL_TEMPLATE.format(dataset_id)
-     async with httpx.AsyncClient() as client:
-         response = await client.get(url)
-         return ModelCard(response.text).text if response.status_code == 200 else ""
-
-
- async def fetch_dataset_info(dataset_id: str) -> Dict:
-     async with httpx.AsyncClient() as client:
-         response = await client.get(f"{HF_API_URL}/{dataset_id}")
-         return response.json() if response.status_code == 200 else {}
-
-
- def format_results(
-     results: List[Dict], dataset_cards: List[str], dataset_infos: List[Dict]
- ) -> str:
-     markdown = (
-         "<h1 style='text-align: center;'>&#x2728; Similar Datasets &#x2728;</h1>\n\n"
-     )
-     for result, card, info in zip(results, dataset_cards, dataset_infos):
+     for result in results:
          hub_id = result["dataset_id"]
          similarity = result["similarity"]
+         summary = result.get("summary", "No summary available.")
          url = f"https://huggingface.co/datasets/{hub_id}"

-         # Always use the Hub ID as the title
-         header = f"## [{hub_id}]({url})"
-         markdown += header + "\n"
-         markdown += f"**Similarity Score:** {similarity:.4f}\n\n"
-
-         if info:
-             downloads = info.get("downloads", 0)
-             likes = info.get("likes", 0)
-             last_modified = info.get("lastModified", "N/A")
-             markdown += f"**Downloads:** {downloads} | **Likes:** {likes} | **Last Modified:** {last_modified}\n\n"
-
-         if card:
-             # Remove the title from the card content
-             card_without_title = re.sub(
-                 r"^#.*\n", "", card, count=1, flags=re.MULTILINE
-             )
-
-             # Split the card into paragraphs
-             paragraphs = card_without_title.split("\n\n")
-
-             # Find the first non-empty text paragraph that's not just an image
-             preview = next(
-                 (
-                     p
-                     for p in paragraphs
-                     if p.strip()
-                     and not p.strip().startswith("![")
-                     and not p.strip().startswith("<img")
-                 ),
-                 "No preview available.",
-             )
-
-             # Limit the preview to a reasonable length (e.g., 300 characters)
-             preview = f"{preview[:300]}..." if len(preview) > 300 else preview
-
-             # Add the preview
-             markdown += f"{preview}\n\n"
-
-             # Limit image size in the full dataset card
-             full_card = re.sub(
-                 r'<img src="([^"]+)"',
-                 r'<img src="\1" style="max-width: 300px; max-height: 300px;"',
-                 card_without_title,
-             )
-             full_card = re.sub(
-                 r"!\[([^\]]*)\]\(([^\)]+)\)",
-                 r'<img src="\2" alt="\1" style="max-width: 300px; max-height: 300px;">',
-                 full_card,
-             )
-             markdown += f"<details><summary>Full Dataset Card</summary>\n\n{full_card}\n\n</details>\n\n"
-
+         markdown += f"### [{hub_id}]({url})\n"
+         markdown += f"*Similarity: {similarity:.2f}*\n\n"
+         markdown += f"{summary}\n\n"
          markdown += "---\n\n"

      return markdown


- async def search_similar_datasets(dataset_id: str, limit: int = 10):
-     results = await fetch_similar_datasets(dataset_id, limit)
-
-     if not results:
-         return "No similar datasets found."
-
-     # Fetch dataset cards and info concurrently
-     dataset_cards = await asyncio.gather(
-         *[fetch_dataset_card(result["dataset_id"]) for result in results]
-     )
-     dataset_infos = await asyncio.gather(
-         *[fetch_dataset_info(result["dataset_id"]) for result in results]
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # 🔍 Dataset Explorer
+         Find similar datasets or search by text query
+         """,
+         elem_classes=["center-text"],
      )

-     return format_results(results, dataset_cards, dataset_infos)
-
-
- async def search_viewer(query: str, limit: int = 10):
-     async with httpx.AsyncClient(timeout=30) as client:
-         response = await client.get(
-             f"{API_URL}/search-viewer", params={"query": query, "n": limit}
+     with gr.Column(variant="panel"):
+         search_type = gr.Radio(
+             ["Dataset ID", "Text Query"],
+             label="Search Method",
+             value="Dataset ID",
+             container=False,
          )
-         if response.status_code == 200:
-             results = response.json()["results"]
-             return format_viewer_results(results)
-         return "No results found."
-
-
- def format_viewer_results(results: List[Dict]) -> str:
-     html = "<div style='height: 600px; overflow-y: auto;'>"
-     for result in results:
-         dataset_id = result["dataset_id"]
-         html += f"""
-         <div style='margin-bottom: 20px; border: 1px solid #ddd; padding: 10px;'>
-             <h3>{dataset_id}</h3>
-             <p><strong>Similarity Score:</strong> {result['similarity']:.4f}</p>
-             <iframe
-                 src="https://huggingface.co/datasets/{dataset_id}/embed/viewer/default/train"
-                 frameborder="0"
-                 width="100%"
-                 height="560px"
-             ></iframe>
-         </div>
-         """
-     html += "</div>"
-     return html
-
-
- with gr.Blocks() as demo:
-     gr.Markdown("## &#129303; Dataset Search and Similarity")
-
-     with gr.Tabs():
-         with gr.TabItem("Similar Datasets"):
-             gr.Markdown("## &#129303; Dataset Similarity Search")
-             with gr.Row():
-                 gr.Markdown(
-                     "This Gradio app allows you to find similar datasets based on a given dataset ID or a text query. "
-                     "Choose the search type and enter either a dataset ID or a text query to find similar datasets with previews of their dataset cards.\n\n"
-                     "For a seamless experience on the Hugging Face website, check out the "
-                     "[Hugging Face Similar Chrome extension](https://chromewebstore.google.com/detail/hugging-face-similar/aijelnjllajooinkcpkpbhckbghghpnl?authuser=0&hl=en). "
-                     "This extension adds a 'Similar Datasets' section directly to Hugging Face dataset pages, "
-                     "making it even easier to discover related datasets for your projects."
-                 )

-             with gr.Row():
-                 search_type = gr.Radio(
-                     ["Dataset ID", "Text Query"],
-                     label="Search Type",
-                     value="Dataset ID",
-                 )
-
-             with gr.Row():
-                 dataset_id = gr.Textbox(
-                     value="airtrain-ai/fineweb-edu-fortified",
-                     label="Dataset ID (e.g., airtrain-ai/fineweb-edu-fortified)",
-                 )
-                 text_query = gr.Textbox(
-                     label="Text Query (e.g., 'natural language processing dataset')",
-                     visible=False,
-                 )
-
-             with gr.Row():
-                 search_btn = gr.Button("Search Similar Datasets")
-                 max_results = gr.Slider(
-                     minimum=1,
-                     maximum=50,
-                     step=1,
-                     value=10,
-                     label="Maximum number of results",
-                 )
-
-             results = gr.Markdown()
-
-             def toggle_input_visibility(choice):
-                 return gr.update(visible=choice == "Dataset ID"), gr.update(
-                     visible=choice == "Text Query"
-                 )
-
-             search_type.change(
-                 toggle_input_visibility,
-                 inputs=[search_type],
-                 outputs=[dataset_id, text_query],
+         with gr.Group():
+             dataset_id = gr.Textbox(
+                 value="airtrain-ai/fineweb-edu-fortified",
+                 label="Dataset ID",
+                 container=False,
+             )
+             text_query = gr.Textbox(
+                 label="Text Query",
+                 placeholder="Enter at least 3 characters...",
+                 container=False,
+                 visible=False,
              )

-             search_btn.click(
-                 lambda search_type, dataset_id, text_query, limit: asyncio.run(
-                     search_similar_datasets(dataset_id, limit)
-                     if search_type == "Dataset ID"
-                     else search_similar_datasets_by_text(text_query, limit)
-                 ),
-                 inputs=[search_type, dataset_id, text_query, max_results],
-                 outputs=results,
+         with gr.Row():
+             search_btn = gr.Button("🔍 Search", size="lg")
+             max_results = gr.Slider(
+                 minimum=1,
+                 maximum=20,
+                 step=1,
+                 value=5,
+                 label="Number of results",
              )

-         with gr.TabItem("RAGatouille Search"):
-             ragatouille_interface = create_ragatouille_interface()
+     results = gr.Markdown(elem_classes=["results-container"])
+
+     def toggle_input_visibility(choice):
+         return (
+             gr.update(visible=choice == "Dataset ID"),
+             gr.update(visible=choice == "Text Query"),
+             gr.update(visible=choice == "Dataset ID"),
+         )

-         with gr.TabItem("Search Viewer"):
-             gr.Markdown("## &#128269; Search Viewer")
-             with gr.Row():
-                 gr.Markdown(
-                     "This tab allows you to search for datasets using their dataset viewer preview! "
-                     "Unlike the other search methods, this search utilizes the dataset viewer embedded in most datasets to match your query. "
-                     "This means it doesn't rely on the dataset card for matching!\n\n"
-                     "Enter a query to find relevant datasets and preview them directly using the dataset viewer.\n\n"
-                     "Currently, this search is using a subset of datasets and a very early version of an embedding model to match natural language queries to datasets."
-                     "**Help us improve!** Contribute to query quality improvement by participating in our "
-                     "[Argilla annotation task](https://huggingface.co/spaces/davanstrien/my-argilla). Your feedback helps refine search results for everyone."
-                 )
+     search_type.change(
+         toggle_input_visibility,
+         inputs=[search_type],
+         outputs=[dataset_id, text_query, search_btn],
+     )

-             with gr.Row():
-                 viewer_query = gr.Textbox(
-                     label="Search Query", placeholder="Enter your search query here"
-                 )
+     async def search_handler(search_type, dataset_id, text_query, limit):
+         if search_type == "Dataset ID":
+             results = await fetch_similar_datasets(dataset_id, limit)
+         else:
+             results = await fetch_similar_datasets_by_text(text_query, limit)

-             with gr.Row():
-                 viewer_search_btn = gr.Button("Search")
-                 viewer_max_results = gr.Slider(
-                     minimum=1,
-                     maximum=50,
-                     step=1,
-                     value=10,
-                     label="Maximum number of results",
-                 )
+         if not results:
+             return "No similar datasets found."

-             viewer_results = gr.HTML()
+         return format_results(results)

-             viewer_search_btn.click(
-                 lambda query, limit: asyncio.run(search_viewer(query, limit)),
-                 inputs=[viewer_query, viewer_max_results],
-                 outputs=viewer_results,
-             )
+     text_query.input(
+         lambda search_type, text_query, limit: asyncio.run(
+             search_handler(search_type, "", text_query, limit)
+         )
+         if len(text_query) >= 3
+         else None,  # Only trigger after 3 characters
+         inputs=[search_type, text_query, max_results],
+         outputs=results,
+         api_name=False,
+     )
+
+     search_btn.click(
+         lambda search_type, dataset_id, text_query, limit: asyncio.run(
+             search_handler(search_type, dataset_id, text_query, limit)
+         ),
+         inputs=[search_type, dataset_id, text_query, max_results],
+         outputs=results,
+     )

  demo.launch()
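Note: a minimal sketch of how the two endpoints the rewritten app.py now depends on could be exercised directly with httpx, outside the Gradio UI. The localhost URL, endpoint paths, `k` parameter, and example inputs are taken from the diff above; the response shape (a top-level "results" list whose items carry "dataset_id", "similarity", and an optional "summary") is assumed from how format_results reads it.

import asyncio

import httpx

API_URL = "http://localhost:8000"  # assumed local API server, as in the new app.py


async def main() -> None:
    async with httpx.AsyncClient() as client:
        # Nearest-neighbour lookup for a known dataset ID
        by_id = await client.get(
            f"{API_URL}/similarity/datasets",
            params={"dataset_id": "airtrain-ai/fineweb-edu-fortified", "k": 5},
        )
        # Free-text semantic search
        by_text = await client.get(
            f"{API_URL}/search/datasets",
            params={"query": "natural language processing dataset", "k": 5},
        )
        for response in (by_id, by_text):
            if response.status_code == 200:
                for result in response.json()["results"]:
                    print(result["dataset_id"], result["similarity"])


asyncio.run(main())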
requirements.in DELETED
@@ -1,6 +0,0 @@
- cashews
- gradio
- httpx
- huggingface_hub
- ragatouille
- toolz
requirements.txt DELETED
@@ -1,520 +0,0 @@
- # This file was autogenerated by uv via the following command:
- #    uv pip compile requirements.in -o requirements.txt
- aiofiles==23.2.1
-     # via gradio
- aiohappyeyeballs==2.4.0
-     # via aiohttp
- aiohttp==3.10.5
-     # via
-     #   datasets
-     #   fsspec
-     #   langchain
-     #   llama-index-core
-     #   llama-index-legacy
- aiosignal==1.3.1
-     # via aiohttp
- annotated-types==0.7.0
-     # via pydantic
- anyio==4.4.0
-     # via
-     #   gradio
-     #   httpx
-     #   openai
-     #   starlette
- attrs==24.2.0
-     # via aiohttp
- beautifulsoup4==4.12.3
-     # via llama-index-readers-file
- bitarray==2.9.2
-     # via colbert-ai
- blinker==1.8.2
-     # via flask
- cashews==7.3.1
-     # via -r requirements.in
- catalogue==2.0.10
-     # via srsly
- certifi==2024.8.30
-     # via
-     #   httpcore
-     #   httpx
-     #   requests
- charset-normalizer==3.3.2
-     # via requests
- click==8.1.7
-     # via
-     #   flask
-     #   nltk
-     #   typer
-     #   uvicorn
- colbert-ai==0.2.19
-     # via ragatouille
- contourpy==1.3.0
-     # via matplotlib
- cycler==0.12.1
-     # via matplotlib
- dataclasses-json==0.6.7
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
- datasets==2.14.4
-     # via colbert-ai
- deprecated==1.2.14
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
- dill==0.3.7
-     # via
-     #   datasets
-     #   multiprocess
- dirtyjson==1.0.8
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
- distro==1.9.0
-     # via openai
- faiss-cpu==1.8.0.post1
-     # via ragatouille
- fast-pytorch-kmeans==0.2.0.1
-     # via ragatouille
- fastapi==0.112.4
-     # via gradio
- ffmpy==0.4.0
-     # via gradio
- filelock==3.16.0
-     # via
-     #   huggingface-hub
-     #   torch
-     #   transformers
- flask==3.0.3
-     # via colbert-ai
- fonttools==4.53.1
-     # via matplotlib
- frozenlist==1.4.1
-     # via
-     #   aiohttp
-     #   aiosignal
- fsspec==2024.9.0
-     # via
-     #   datasets
-     #   gradio-client
-     #   huggingface-hub
-     #   llama-index-core
-     #   llama-index-legacy
-     #   torch
- git-python==1.0.3
-     # via colbert-ai
- gitdb==4.0.11
-     # via gitpython
- gitpython==3.1.43
-     # via git-python
- gradio==4.43.0
-     # via -r requirements.in
- gradio-client==1.3.0
-     # via gradio
- greenlet==3.1.0
-     # via sqlalchemy
- h11==0.14.0
-     # via
-     #   httpcore
-     #   uvicorn
- httpcore==1.0.5
-     # via httpx
- httpx==0.27.2
-     # via
-     #   -r requirements.in
-     #   gradio
-     #   gradio-client
-     #   langsmith
-     #   llama-cloud
-     #   llama-index-core
-     #   llama-index-legacy
-     #   openai
- huggingface-hub==0.24.6
-     # via
-     #   -r requirements.in
-     #   datasets
-     #   gradio
-     #   gradio-client
-     #   sentence-transformers
-     #   tokenizers
-     #   transformers
- idna==3.8
-     # via
-     #   anyio
-     #   httpx
-     #   requests
-     #   yarl
- importlib-resources==6.4.4
-     # via gradio
- itsdangerous==2.2.0
-     # via flask
- jinja2==3.1.4
-     # via
-     #   flask
-     #   gradio
-     #   torch
- jiter==0.5.0
-     # via openai
- joblib==1.4.2
-     # via
-     #   nltk
-     #   scikit-learn
- jsonpatch==1.33
-     # via langchain-core
- jsonpointer==3.0.0
-     # via jsonpatch
- kiwisolver==1.4.7
-     # via matplotlib
- langchain==0.2.16
-     # via ragatouille
- langchain-core==0.2.39
-     # via
-     #   langchain
-     #   langchain-text-splitters
-     #   ragatouille
- langchain-text-splitters==0.2.4
-     # via langchain
- langsmith==0.1.117
-     # via
-     #   langchain
-     #   langchain-core
- llama-cloud==0.0.17
-     # via llama-index-indices-managed-llama-cloud
- llama-index==0.11.8
-     # via ragatouille
- llama-index-agent-openai==0.3.1
-     # via
-     #   llama-index
-     #   llama-index-llms-openai
-     #   llama-index-program-openai
- llama-index-cli==0.3.1
-     # via llama-index
- llama-index-core==0.11.8
-     # via
-     #   llama-index
-     #   llama-index-agent-openai
-     #   llama-index-cli
-     #   llama-index-embeddings-openai
-     #   llama-index-indices-managed-llama-cloud
-     #   llama-index-llms-openai
-     #   llama-index-multi-modal-llms-openai
-     #   llama-index-program-openai
-     #   llama-index-question-gen-openai
-     #   llama-index-readers-file
-     #   llama-index-readers-llama-parse
-     #   llama-parse
- llama-index-embeddings-openai==0.2.4
-     # via
-     #   llama-index
-     #   llama-index-cli
- llama-index-indices-managed-llama-cloud==0.3.0
-     # via llama-index
- llama-index-legacy==0.9.48.post3
-     # via llama-index
- llama-index-llms-openai==0.2.3
-     # via
-     #   llama-index
-     #   llama-index-agent-openai
-     #   llama-index-cli
-     #   llama-index-multi-modal-llms-openai
-     #   llama-index-program-openai
-     #   llama-index-question-gen-openai
- llama-index-multi-modal-llms-openai==0.2.0
-     # via llama-index
- llama-index-program-openai==0.2.0
-     # via
-     #   llama-index
-     #   llama-index-question-gen-openai
- llama-index-question-gen-openai==0.2.0
-     # via llama-index
- llama-index-readers-file==0.2.1
-     # via llama-index
- llama-index-readers-llama-parse==0.3.0
-     # via llama-index
- llama-parse==0.5.5
-     # via llama-index-readers-llama-parse
- markdown-it-py==3.0.0
-     # via rich
- markupsafe==2.1.5
-     # via
-     #   gradio
-     #   jinja2
-     #   werkzeug
- marshmallow==3.22.0
-     # via dataclasses-json
- matplotlib==3.9.2
-     # via gradio
- mdurl==0.1.2
-     # via markdown-it-py
- mpmath==1.3.0
-     # via sympy
- multidict==6.1.0
-     # via
-     #   aiohttp
-     #   yarl
- multiprocess==0.70.15
-     # via datasets
- mypy-extensions==1.0.0
-     # via typing-inspect
- nest-asyncio==1.6.0
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
- networkx==3.3
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
-     #   torch
- ninja==1.11.1.1
-     # via colbert-ai
- nltk==3.9.1
-     # via
-     #   llama-index
-     #   llama-index-core
-     #   llama-index-legacy
- numpy==1.26.4
-     # via
-     #   contourpy
-     #   datasets
-     #   faiss-cpu
-     #   fast-pytorch-kmeans
-     #   gradio
-     #   langchain
-     #   llama-index-core
-     #   llama-index-legacy
-     #   matplotlib
-     #   onnx
-     #   pandas
-     #   pyarrow
-     #   scikit-learn
-     #   scipy
-     #   sentence-transformers
-     #   transformers
-     #   voyager
- onnx==1.16.2
-     # via ragatouille
- openai==1.44.1
-     # via
-     #   llama-index-agent-openai
-     #   llama-index-embeddings-openai
-     #   llama-index-legacy
-     #   llama-index-llms-openai
- orjson==3.10.7
-     # via
-     #   gradio
-     #   langsmith
- packaging==24.1
-     # via
-     #   datasets
-     #   faiss-cpu
-     #   gradio
-     #   gradio-client
-     #   huggingface-hub
-     #   langchain-core
-     #   marshmallow
-     #   matplotlib
-     #   transformers
- pandas==2.2.2
-     # via
-     #   datasets
-     #   gradio
-     #   llama-index-legacy
-     #   llama-index-readers-file
- pillow==10.4.0
-     # via
-     #   gradio
-     #   llama-index-core
-     #   matplotlib
-     #   sentence-transformers
- protobuf==5.28.0
-     # via onnx
- pyarrow==17.0.0
-     # via datasets
- pydantic==2.9.1
-     # via
-     #   fastapi
-     #   gradio
-     #   langchain
-     #   langchain-core
-     #   langsmith
-     #   llama-cloud
-     #   llama-index-core
-     #   openai
- pydantic-core==2.23.3
-     # via pydantic
- pydub==0.25.1
-     # via gradio
- pygments==2.18.0
-     # via rich
- pynvml==11.5.3
-     # via fast-pytorch-kmeans
- pyparsing==3.1.4
-     # via matplotlib
- pypdf==4.3.1
-     # via llama-index-readers-file
- python-dateutil==2.9.0.post0
-     # via
-     #   matplotlib
-     #   pandas
- python-dotenv==1.0.1
-     # via colbert-ai
- python-multipart==0.0.9
-     # via gradio
- pytz==2024.1
-     # via pandas
- pyyaml==6.0.2
-     # via
-     #   datasets
-     #   gradio
-     #   huggingface-hub
-     #   langchain
-     #   langchain-core
-     #   llama-index-core
-     #   transformers
- ragatouille==0.0.8.post4
-     # via -r requirements.in
- regex==2024.7.24
-     # via
-     #   nltk
-     #   tiktoken
-     #   transformers
- requests==2.32.3
-     # via
-     #   datasets
-     #   huggingface-hub
-     #   langchain
-     #   langsmith
-     #   llama-index-core
-     #   llama-index-legacy
-     #   tiktoken
-     #   transformers
- rich==13.8.0
-     # via typer
- ruff==0.6.4
-     # via gradio
- safetensors==0.4.5
-     # via transformers
- scikit-learn==1.5.1
-     # via sentence-transformers
- scipy==1.14.1
-     # via
-     #   colbert-ai
-     #   scikit-learn
-     #   sentence-transformers
- semantic-version==2.10.0
-     # via gradio
- sentence-transformers==2.7.0
-     # via ragatouille
- setuptools==74.1.2
-     # via torch
- shellingham==1.5.4
-     # via typer
- six==1.16.0
-     # via python-dateutil
- smmap==5.0.1
-     # via gitdb
- sniffio==1.3.1
-     # via
-     #   anyio
-     #   httpx
-     #   openai
- soupsieve==2.6
-     # via beautifulsoup4
- sqlalchemy==2.0.34
-     # via
-     #   langchain
-     #   llama-index-core
-     #   llama-index-legacy
- srsly==2.4.8
-     # via ragatouille
- starlette==0.38.5
-     # via fastapi
- striprtf==0.0.26
-     # via llama-index-readers-file
- sympy==1.13.2
-     # via torch
- tenacity==8.5.0
-     # via
-     #   langchain
-     #   langchain-core
-     #   llama-index-core
-     #   llama-index-legacy
- threadpoolctl==3.5.0
-     # via scikit-learn
- tiktoken==0.7.0
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
- tokenizers==0.19.1
-     # via transformers
- tomlkit==0.12.0
-     # via gradio
- toolz==0.12.1
-     # via -r requirements.in
- torch==2.4.1
-     # via
-     #   fast-pytorch-kmeans
-     #   ragatouille
-     #   sentence-transformers
- tqdm==4.66.5
-     # via
-     #   colbert-ai
-     #   datasets
-     #   huggingface-hub
-     #   llama-index-core
-     #   nltk
-     #   openai
-     #   sentence-transformers
-     #   transformers
- transformers==4.44.2
-     # via
-     #   colbert-ai
-     #   ragatouille
-     #   sentence-transformers
- typer==0.12.5
-     # via gradio
- typing-extensions==4.12.2
-     # via
-     #   fastapi
-     #   gradio
-     #   gradio-client
-     #   huggingface-hub
-     #   langchain-core
-     #   llama-index-core
-     #   llama-index-legacy
-     #   openai
-     #   pydantic
-     #   pydantic-core
-     #   sqlalchemy
-     #   torch
-     #   typer
-     #   typing-inspect
- typing-inspect==0.9.0
-     # via
-     #   dataclasses-json
-     #   llama-index-core
-     #   llama-index-legacy
- tzdata==2024.1
-     # via pandas
- ujson==5.10.0
-     # via colbert-ai
- urllib3==2.2.2
-     # via
-     #   gradio
-     #   requests
- uvicorn==0.30.6
-     # via gradio
- voyager==2.0.9
-     # via ragatouille
- websockets==12.0
-     # via gradio-client
- werkzeug==3.0.4
-     # via flask
- wrapt==1.16.0
-     # via
-     #   deprecated
-     #   llama-index-core
- xxhash==3.5.0
-     # via datasets
- yarl==1.11.1
-     # via aiohttp