davanstrien (HF staff) committed
Commit 1c9d91a · 1 Parent(s): f7d37d9

stop using gradio

Files changed (3)
  1. app.py +87 -245
  2. requirements.in +0 -6
  3. requirements.txt +0 -520
app.py CHANGED
@@ -1,291 +1,133 @@
  import asyncio
- import re
  from typing import Dict, List

  import gradio as gr
  import httpx
- from cashews import cache
- from huggingface_hub import ModelCard

- from ragatouille_search import create_ragatouille_interface
+ API_URL = "http://localhost:8000"

- cache.setup("mem://")
- API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space"
- HF_API_URL = "https://huggingface.co/api/datasets"
- README_URL_TEMPLATE = "https://huggingface.co/datasets/{}/raw/main/README.md"

-
- async def fetch_similar_datasets(dataset_id: str, limit: int = 10) -> List[Dict]:
+ async def fetch_similar_datasets(dataset_id: str, limit: int = 5) -> List[Dict]:
      async with httpx.AsyncClient() as client:
          response = await client.get(
-             f"{API_URL}/similar?dataset_id={dataset_id}&n={limit + 1}"
+             f"{API_URL}/similarity/datasets",
+             params={"dataset_id": dataset_id, "k": limit},
          )
          if response.status_code == 200:
-             results = response.json()["results"]
-             # Remove the input dataset from the results
-             return [r for r in results if r["dataset_id"] != dataset_id][:limit]
+             return response.json()["results"]
          return []


- async def fetch_similar_datasets_by_text(query: str, limit: int = 10) -> List[Dict]:
-     async with httpx.AsyncClient(timeout=30) as client:
+ async def fetch_similar_datasets_by_text(query: str, limit: int = 5) -> List[Dict]:
+     async with httpx.AsyncClient() as client:
          response = await client.get(
-             f"{API_URL}/similar-text", params={"query": query, "n": limit + 1}
+             f"{API_URL}/search/datasets", params={"query": query, "k": limit}
          )
          if response.status_code == 200:
-             results = response.json()["results"]
-             return results[:limit]
+             return response.json()["results"]
          return []


- async def search_similar_datasets_by_text(query: str, limit: int = 10):
-     results = await fetch_similar_datasets_by_text(query, limit)
-
-     if not results:
-         return "No similar datasets found."
-
-     # Fetch dataset cards and info concurrently
-     dataset_cards = await asyncio.gather(
-         *[fetch_dataset_card(result["dataset_id"]) for result in results]
-     )
-     dataset_infos = await asyncio.gather(
-         *[fetch_dataset_info(result["dataset_id"]) for result in results]
-     )
-
-     return format_results(results, dataset_cards, dataset_infos)
-
+ def format_results(results: List[Dict]) -> str:
+     markdown = ""

- async def fetch_dataset_card(dataset_id: str) -> str:
-     url = README_URL_TEMPLATE.format(dataset_id)
-     async with httpx.AsyncClient() as client:
-         response = await client.get(url)
-         return ModelCard(response.text).text if response.status_code == 200 else ""
-
-
- async def fetch_dataset_info(dataset_id: str) -> Dict:
-     async with httpx.AsyncClient() as client:
-         response = await client.get(f"{HF_API_URL}/{dataset_id}")
-         return response.json() if response.status_code == 200 else {}
-
-
- def format_results(
-     results: List[Dict], dataset_cards: List[str], dataset_infos: List[Dict]
- ) -> str:
-     markdown = (
-         "<h1 style='text-align: center;'>&#x2728; Similar Datasets &#x2728;</h1>\n\n"
-     )
-     for result, card, info in zip(results, dataset_cards, dataset_infos):
+     for result in results:
          hub_id = result["dataset_id"]
          similarity = result["similarity"]
+         summary = result.get("summary", "No summary available.")
          url = f"https://huggingface.co/datasets/{hub_id}"

-         # Always use the Hub ID as the title
-         header = f"## [{hub_id}]({url})"
-         markdown += header + "\n"
-         markdown += f"**Similarity Score:** {similarity:.4f}\n\n"
-
-         if info:
-             downloads = info.get("downloads", 0)
-             likes = info.get("likes", 0)
-             last_modified = info.get("lastModified", "N/A")
-             markdown += f"**Downloads:** {downloads} | **Likes:** {likes} | **Last Modified:** {last_modified}\n\n"
-
-         if card:
-             # Remove the title from the card content
-             card_without_title = re.sub(
-                 r"^#.*\n", "", card, count=1, flags=re.MULTILINE
-             )
-
-             # Split the card into paragraphs
-             paragraphs = card_without_title.split("\n\n")
-
-             # Find the first non-empty text paragraph that's not just an image
-             preview = next(
-                 (
-                     p
-                     for p in paragraphs
-                     if p.strip()
-                     and not p.strip().startswith("![")
-                     and not p.strip().startswith("<img")
-                 ),
-                 "No preview available.",
-             )
-
-             # Limit the preview to a reasonable length (e.g., 300 characters)
-             preview = f"{preview[:300]}..." if len(preview) > 300 else preview
-
-             # Add the preview
-             markdown += f"{preview}\n\n"
-
-             # Limit image size in the full dataset card
-             full_card = re.sub(
-                 r'<img src="([^"]+)"',
-                 r'<img src="\1" style="max-width: 300px; max-height: 300px;"',
-                 card_without_title,
-             )
-             full_card = re.sub(
-                 r"!\[([^\]]*)\]\(([^\)]+)\)",
-                 r'<img src="\2" alt="\1" style="max-width: 300px; max-height: 300px;">',
-                 full_card,
-             )
-             markdown += f"<details><summary>Full Dataset Card</summary>\n\n{full_card}\n\n</details>\n\n"
-
+         markdown += f"### [{hub_id}]({url})\n"
+         markdown += f"*Similarity: {similarity:.2f}*\n\n"
+         markdown += f"{summary}\n\n"
          markdown += "---\n\n"

      return markdown


- async def search_similar_datasets(dataset_id: str, limit: int = 10):
-     results = await fetch_similar_datasets(dataset_id, limit)
-
-     if not results:
-         return "No similar datasets found."
-
-     # Fetch dataset cards and info concurrently
-     dataset_cards = await asyncio.gather(
-         *[fetch_dataset_card(result["dataset_id"]) for result in results]
-     )
-     dataset_infos = await asyncio.gather(
-         *[fetch_dataset_info(result["dataset_id"]) for result in results]
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # 🔍 Dataset Explorer
+         Find similar datasets or search by text query
+         """,
+         elem_classes=["center-text"],
      )

-     return format_results(results, dataset_cards, dataset_infos)
-
-
- async def search_viewer(query: str, limit: int = 10):
-     async with httpx.AsyncClient(timeout=30) as client:
-         response = await client.get(
-             f"{API_URL}/search-viewer", params={"query": query, "n": limit}
+     with gr.Column(variant="panel"):
+         search_type = gr.Radio(
+             ["Dataset ID", "Text Query"],
+             label="Search Method",
+             value="Dataset ID",
+             container=False,
          )
-         if response.status_code == 200:
-             results = response.json()["results"]
-             return format_viewer_results(results)
-         return "No results found."
-
-
- def format_viewer_results(results: List[Dict]) -> str:
-     html = "<div style='height: 600px; overflow-y: auto;'>"
-     for result in results:
-         dataset_id = result["dataset_id"]
-         html += f"""
-         <div style='margin-bottom: 20px; border: 1px solid #ddd; padding: 10px;'>
-             <h3>{dataset_id}</h3>
-             <p><strong>Similarity Score:</strong> {result['similarity']:.4f}</p>
-             <iframe
-                 src="https://huggingface.co/datasets/{dataset_id}/embed/viewer/default/train"
-                 frameborder="0"
-                 width="100%"
-                 height="560px"
-             ></iframe>
-         </div>
-         """
-     html += "</div>"
-     return html
-
-
- with gr.Blocks() as demo:
-     gr.Markdown("## &#129303; Dataset Search and Similarity")
-
-     with gr.Tabs():
-         with gr.TabItem("Similar Datasets"):
-             gr.Markdown("## &#129303; Dataset Similarity Search")
-             with gr.Row():
-                 gr.Markdown(
-                     "This Gradio app allows you to find similar datasets based on a given dataset ID or a text query. "
-                     "Choose the search type and enter either a dataset ID or a text query to find similar datasets with previews of their dataset cards.\n\n"
-                     "For a seamless experience on the Hugging Face website, check out the "
-                     "[Hugging Face Similar Chrome extension](https://chromewebstore.google.com/detail/hugging-face-similar/aijelnjllajooinkcpkpbhckbghghpnl?authuser=0&hl=en). "
-                     "This extension adds a 'Similar Datasets' section directly to Hugging Face dataset pages, "
-                     "making it even easier to discover related datasets for your projects."
-                 )

-             with gr.Row():
-                 search_type = gr.Radio(
-                     ["Dataset ID", "Text Query"],
-                     label="Search Type",
-                     value="Dataset ID",
-                 )
-
-             with gr.Row():
-                 dataset_id = gr.Textbox(
-                     value="airtrain-ai/fineweb-edu-fortified",
-                     label="Dataset ID (e.g., airtrain-ai/fineweb-edu-fortified)",
-                 )
-                 text_query = gr.Textbox(
-                     label="Text Query (e.g., 'natural language processing dataset')",
-                     visible=False,
-                 )
-
-             with gr.Row():
-                 search_btn = gr.Button("Search Similar Datasets")
-                 max_results = gr.Slider(
-                     minimum=1,
-                     maximum=50,
-                     step=1,
-                     value=10,
-                     label="Maximum number of results",
-                 )
-
-             results = gr.Markdown()
-
-             def toggle_input_visibility(choice):
-                 return gr.update(visible=choice == "Dataset ID"), gr.update(
-                     visible=choice == "Text Query"
-                 )
-
-             search_type.change(
-                 toggle_input_visibility,
-                 inputs=[search_type],
-                 outputs=[dataset_id, text_query],
+         with gr.Group():
+             dataset_id = gr.Textbox(
+                 value="airtrain-ai/fineweb-edu-fortified",
+                 label="Dataset ID",
+                 container=False,
+             )
+             text_query = gr.Textbox(
+                 label="Text Query",
+                 placeholder="Enter at least 3 characters...",
+                 container=False,
+                 visible=False,
              )

-             search_btn.click(
-                 lambda search_type, dataset_id, text_query, limit: asyncio.run(
-                     search_similar_datasets(dataset_id, limit)
-                     if search_type == "Dataset ID"
-                     else search_similar_datasets_by_text(text_query, limit)
-                 ),
-                 inputs=[search_type, dataset_id, text_query, max_results],
-                 outputs=results,
+         with gr.Row():
+             search_btn = gr.Button("🔍 Search", size="lg")
+             max_results = gr.Slider(
+                 minimum=1,
+                 maximum=20,
+                 step=1,
+                 value=5,
+                 label="Number of results",
              )

-         with gr.TabItem("RAGatouille Search"):
-             ragatouille_interface = create_ragatouille_interface()
+     results = gr.Markdown(elem_classes=["results-container"])
+
+     def toggle_input_visibility(choice):
+         return (
+             gr.update(visible=choice == "Dataset ID"),
+             gr.update(visible=choice == "Text Query"),
+             gr.update(visible=choice == "Dataset ID"),
+         )

-         with gr.TabItem("Search Viewer"):
-             gr.Markdown("## &#128269; Search Viewer")
-             with gr.Row():
-                 gr.Markdown(
-                     "This tab allows you to search for datasets using their dataset viewer preview! "
-                     "Unlike the other search methods, this search utilizes the dataset viewer embedded in most datasets to match your query. "
-                     "This means it doesn't rely on the dataset card for matching!\n\n"
-                     "Enter a query to find relevant datasets and preview them directly using the dataset viewer.\n\n"
-                     "Currently, this search is using a subset of datasets and a very early version of an embedding model to match natural language queries to datasets."
-                     "**Help us improve!** Contribute to query quality improvement by participating in our "
-                     "[Argilla annotation task](https://huggingface.co/spaces/davanstrien/my-argilla). Your feedback helps refine search results for everyone."
-                 )
+     search_type.change(
+         toggle_input_visibility,
+         inputs=[search_type],
+         outputs=[dataset_id, text_query, search_btn],
+     )

-             with gr.Row():
-                 viewer_query = gr.Textbox(
-                     label="Search Query", placeholder="Enter your search query here"
-                 )
+     async def search_handler(search_type, dataset_id, text_query, limit):
+         if search_type == "Dataset ID":
+             results = await fetch_similar_datasets(dataset_id, limit)
+         else:
+             results = await fetch_similar_datasets_by_text(text_query, limit)

-             with gr.Row():
-                 viewer_search_btn = gr.Button("Search")
-                 viewer_max_results = gr.Slider(
-                     minimum=1,
-                     maximum=50,
-                     step=1,
-                     value=10,
-                     label="Maximum number of results",
-                 )
+         if not results:
+             return "No similar datasets found."

-             viewer_results = gr.HTML()
+         return format_results(results)

-             viewer_search_btn.click(
-                 lambda query, limit: asyncio.run(search_viewer(query, limit)),
-                 inputs=[viewer_query, viewer_max_results],
-                 outputs=viewer_results,
-             )
+     text_query.input(
+         lambda search_type, text_query, limit: asyncio.run(
+             search_handler(search_type, "", text_query, limit)
+         )
+         if len(text_query) >= 3
+         else None,  # Only trigger after 3 characters
+         inputs=[search_type, text_query, max_results],
+         outputs=results,
+         api_name=False,
+     )
+
+     search_btn.click(
+         lambda search_type, dataset_id, text_query, limit: asyncio.run(
+             search_handler(search_type, dataset_id, text_query, limit)
+         ),
+         inputs=[search_type, dataset_id, text_query, max_results],
+         outputs=results,
+     )

  demo.launch()
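Note: a minimal sketch of how the two endpoints the rewritten app.py now depends on could be exercised directly with httpx, outside the Gradio UI. The localhost URL, endpoint paths, `k` parameter, and example inputs are taken from the diff above; the response shape (a top-level "results" list whose items carry "dataset_id", "similarity", and an optional "summary") is assumed from how format_results reads it.

import asyncio

import httpx

API_URL = "http://localhost:8000"  # assumed local API server, as in the new app.py


async def main() -> None:
    async with httpx.AsyncClient() as client:
        # Nearest-neighbour lookup for a known dataset ID
        by_id = await client.get(
            f"{API_URL}/similarity/datasets",
            params={"dataset_id": "airtrain-ai/fineweb-edu-fortified", "k": 5},
        )
        # Free-text semantic search
        by_text = await client.get(
            f"{API_URL}/search/datasets",
            params={"query": "natural language processing dataset", "k": 5},
        )
        for response in (by_id, by_text):
            if response.status_code == 200:
                for result in response.json()["results"]:
                    print(result["dataset_id"], result["similarity"])


asyncio.run(main())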
requirements.in DELETED
@@ -1,6 +0,0 @@
- cashews
- gradio
- httpx
- huggingface_hub
- ragatouille
- toolz
requirements.txt DELETED
@@ -1,520 +0,0 @@
- # This file was autogenerated by uv via the following command:
- #    uv pip compile requirements.in -o requirements.txt
- aiofiles==23.2.1
-     # via gradio
- aiohappyeyeballs==2.4.0
-     # via aiohttp
- aiohttp==3.10.5
-     # via
-     #   datasets
-     #   fsspec
-     #   langchain
-     #   llama-index-core
-     #   llama-index-legacy
- aiosignal==1.3.1
-     # via aiohttp
- annotated-types==0.7.0
-     # via pydantic
- anyio==4.4.0
-     # via
-     #   gradio
-     #   httpx
-     #   openai
-     #   starlette
- attrs==24.2.0
-     # via aiohttp
- beautifulsoup4==4.12.3
-     # via llama-index-readers-file
- bitarray==2.9.2
-     # via colbert-ai
- blinker==1.8.2
-     # via flask
- cashews==7.3.1
-     # via -r requirements.in
- catalogue==2.0.10
-     # via srsly
- certifi==2024.8.30
-     # via
-     #   httpcore
-     #   httpx
-     #   requests
- charset-normalizer==3.3.2
-     # via requests
- click==8.1.7
-     # via
-     #   flask
-     #   nltk
-     #   typer
-     #   uvicorn
- colbert-ai==0.2.19
-     # via ragatouille
- contourpy==1.3.0
-     # via matplotlib
- cycler==0.12.1
-     # via matplotlib
- dataclasses-json==0.6.7
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
- datasets==2.14.4
-     # via colbert-ai
- deprecated==1.2.14
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
- dill==0.3.7
-     # via
-     #   datasets
-     #   multiprocess
- dirtyjson==1.0.8
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
- distro==1.9.0
-     # via openai
- faiss-cpu==1.8.0.post1
-     # via ragatouille
- fast-pytorch-kmeans==0.2.0.1
-     # via ragatouille
- fastapi==0.112.4
-     # via gradio
- ffmpy==0.4.0
-     # via gradio
- filelock==3.16.0
-     # via
-     #   huggingface-hub
-     #   torch
-     #   transformers
- flask==3.0.3
-     # via colbert-ai
- fonttools==4.53.1
-     # via matplotlib
- frozenlist==1.4.1
-     # via
-     #   aiohttp
-     #   aiosignal
- fsspec==2024.9.0
-     # via
-     #   datasets
-     #   gradio-client
-     #   huggingface-hub
-     #   llama-index-core
-     #   llama-index-legacy
-     #   torch
- git-python==1.0.3
-     # via colbert-ai
- gitdb==4.0.11
-     # via gitpython
- gitpython==3.1.43
-     # via git-python
- gradio==4.43.0
-     # via -r requirements.in
- gradio-client==1.3.0
-     # via gradio
- greenlet==3.1.0
-     # via sqlalchemy
- h11==0.14.0
-     # via
-     #   httpcore
-     #   uvicorn
- httpcore==1.0.5
-     # via httpx
- httpx==0.27.2
-     # via
-     #   -r requirements.in
-     #   gradio
-     #   gradio-client
-     #   langsmith
-     #   llama-cloud
-     #   llama-index-core
-     #   llama-index-legacy
-     #   openai
- huggingface-hub==0.24.6
-     # via
-     #   -r requirements.in
-     #   datasets
-     #   gradio
-     #   gradio-client
-     #   sentence-transformers
-     #   tokenizers
-     #   transformers
- idna==3.8
-     # via
-     #   anyio
-     #   httpx
-     #   requests
-     #   yarl
- importlib-resources==6.4.4
-     # via gradio
- itsdangerous==2.2.0
-     # via flask
- jinja2==3.1.4
-     # via
-     #   flask
-     #   gradio
-     #   torch
- jiter==0.5.0
-     # via openai
- joblib==1.4.2
-     # via
-     #   nltk
-     #   scikit-learn
- jsonpatch==1.33
-     # via langchain-core
- jsonpointer==3.0.0
-     # via jsonpatch
- kiwisolver==1.4.7
-     # via matplotlib
- langchain==0.2.16
-     # via ragatouille
- langchain-core==0.2.39
-     # via
-     #   langchain
-     #   langchain-text-splitters
-     #   ragatouille
- langchain-text-splitters==0.2.4
-     # via langchain
- langsmith==0.1.117
-     # via
-     #   langchain
-     #   langchain-core
- llama-cloud==0.0.17
-     # via llama-index-indices-managed-llama-cloud
- llama-index==0.11.8
-     # via ragatouille
- llama-index-agent-openai==0.3.1
-     # via
-     #   llama-index
-     #   llama-index-llms-openai
-     #   llama-index-program-openai
- llama-index-cli==0.3.1
-     # via llama-index
- llama-index-core==0.11.8
-     # via
-     #   llama-index
-     #   llama-index-agent-openai
-     #   llama-index-cli
-     #   llama-index-embeddings-openai
-     #   llama-index-indices-managed-llama-cloud
-     #   llama-index-llms-openai
-     #   llama-index-multi-modal-llms-openai
-     #   llama-index-program-openai
-     #   llama-index-question-gen-openai
-     #   llama-index-readers-file
-     #   llama-index-readers-llama-parse
-     #   llama-parse
- llama-index-embeddings-openai==0.2.4
-     # via
-     #   llama-index
-     #   llama-index-cli
- llama-index-indices-managed-llama-cloud==0.3.0
-     # via llama-index
- llama-index-legacy==0.9.48.post3
-     # via llama-index
- llama-index-llms-openai==0.2.3
-     # via
-     #   llama-index
-     #   llama-index-agent-openai
-     #   llama-index-cli
-     #   llama-index-multi-modal-llms-openai
-     #   llama-index-program-openai
-     #   llama-index-question-gen-openai
- llama-index-multi-modal-llms-openai==0.2.0
-     # via llama-index
- llama-index-program-openai==0.2.0
-     # via
-     #   llama-index
-     #   llama-index-question-gen-openai
- llama-index-question-gen-openai==0.2.0
-     # via llama-index
- llama-index-readers-file==0.2.1
-     # via llama-index
- llama-index-readers-llama-parse==0.3.0
-     # via llama-index
- llama-parse==0.5.5
-     # via llama-index-readers-llama-parse
- markdown-it-py==3.0.0
-     # via rich
- markupsafe==2.1.5
-     # via
-     #   gradio
-     #   jinja2
-     #   werkzeug
- marshmallow==3.22.0
-     # via dataclasses-json
- matplotlib==3.9.2
-     # via gradio
- mdurl==0.1.2
-     # via markdown-it-py
- mpmath==1.3.0
-     # via sympy
- multidict==6.1.0
-     # via
-     #   aiohttp
-     #   yarl
- multiprocess==0.70.15
-     # via datasets
- mypy-extensions==1.0.0
-     # via typing-inspect
- nest-asyncio==1.6.0
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
- networkx==3.3
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
-     #   torch
- ninja==1.11.1.1
-     # via colbert-ai
- nltk==3.9.1
-     # via
-     #   llama-index
-     #   llama-index-core
-     #   llama-index-legacy
- numpy==1.26.4
-     # via
-     #   contourpy
-     #   datasets
-     #   faiss-cpu
-     #   fast-pytorch-kmeans
-     #   gradio
-     #   langchain
-     #   llama-index-core
-     #   llama-index-legacy
-     #   matplotlib
-     #   onnx
-     #   pandas
-     #   pyarrow
-     #   scikit-learn
-     #   scipy
-     #   sentence-transformers
-     #   transformers
-     #   voyager
- onnx==1.16.2
-     # via ragatouille
- openai==1.44.1
-     # via
-     #   llama-index-agent-openai
-     #   llama-index-embeddings-openai
-     #   llama-index-legacy
-     #   llama-index-llms-openai
- orjson==3.10.7
-     # via
-     #   gradio
-     #   langsmith
- packaging==24.1
-     # via
-     #   datasets
-     #   faiss-cpu
-     #   gradio
-     #   gradio-client
-     #   huggingface-hub
-     #   langchain-core
-     #   marshmallow
-     #   matplotlib
-     #   transformers
- pandas==2.2.2
-     # via
-     #   datasets
-     #   gradio
-     #   llama-index-legacy
-     #   llama-index-readers-file
- pillow==10.4.0
-     # via
-     #   gradio
-     #   llama-index-core
-     #   matplotlib
-     #   sentence-transformers
- protobuf==5.28.0
-     # via onnx
- pyarrow==17.0.0
-     # via datasets
- pydantic==2.9.1
-     # via
-     #   fastapi
-     #   gradio
-     #   langchain
-     #   langchain-core
-     #   langsmith
-     #   llama-cloud
-     #   llama-index-core
-     #   openai
- pydantic-core==2.23.3
-     # via pydantic
- pydub==0.25.1
-     # via gradio
- pygments==2.18.0
-     # via rich
- pynvml==11.5.3
-     # via fast-pytorch-kmeans
- pyparsing==3.1.4
-     # via matplotlib
- pypdf==4.3.1
-     # via llama-index-readers-file
- python-dateutil==2.9.0.post0
-     # via
-     #   matplotlib
-     #   pandas
- python-dotenv==1.0.1
-     # via colbert-ai
- python-multipart==0.0.9
-     # via gradio
- pytz==2024.1
-     # via pandas
- pyyaml==6.0.2
-     # via
-     #   datasets
-     #   gradio
-     #   huggingface-hub
-     #   langchain
-     #   langchain-core
-     #   llama-index-core
-     #   transformers
- ragatouille==0.0.8.post4
-     # via -r requirements.in
- regex==2024.7.24
-     # via
-     #   nltk
-     #   tiktoken
-     #   transformers
- requests==2.32.3
-     # via
-     #   datasets
-     #   huggingface-hub
-     #   langchain
-     #   langsmith
-     #   llama-index-core
-     #   llama-index-legacy
-     #   tiktoken
-     #   transformers
- rich==13.8.0
-     # via typer
- ruff==0.6.4
-     # via gradio
- safetensors==0.4.5
-     # via transformers
- scikit-learn==1.5.1
-     # via sentence-transformers
- scipy==1.14.1
-     # via
-     #   colbert-ai
-     #   scikit-learn
-     #   sentence-transformers
- semantic-version==2.10.0
-     # via gradio
- sentence-transformers==2.7.0
-     # via ragatouille
- setuptools==74.1.2
-     # via torch
- shellingham==1.5.4
-     # via typer
- six==1.16.0
-     # via python-dateutil
- smmap==5.0.1
-     # via gitdb
- sniffio==1.3.1
-     # via
-     #   anyio
-     #   httpx
-     #   openai
- soupsieve==2.6
-     # via beautifulsoup4
- sqlalchemy==2.0.34
-     # via
-     #   langchain
-     #   llama-index-core
-     #   llama-index-legacy
- srsly==2.4.8
-     # via ragatouille
- starlette==0.38.5
-     # via fastapi
- striprtf==0.0.26
-     # via llama-index-readers-file
- sympy==1.13.2
-     # via torch
- tenacity==8.5.0
-     # via
-     #   langchain
-     #   langchain-core
-     #   llama-index-core
-     #   llama-index-legacy
- threadpoolctl==3.5.0
-     # via scikit-learn
- tiktoken==0.7.0
-     # via
-     #   llama-index-core
-     #   llama-index-legacy
- tokenizers==0.19.1
-     # via transformers
- tomlkit==0.12.0
-     # via gradio
- toolz==0.12.1
-     # via -r requirements.in
- torch==2.4.1
-     # via
-     #   fast-pytorch-kmeans
-     #   ragatouille
-     #   sentence-transformers
- tqdm==4.66.5
-     # via
-     #   colbert-ai
-     #   datasets
-     #   huggingface-hub
-     #   llama-index-core
-     #   nltk
-     #   openai
-     #   sentence-transformers
-     #   transformers
- transformers==4.44.2
-     # via
-     #   colbert-ai
-     #   ragatouille
-     #   sentence-transformers
- typer==0.12.5
-     # via gradio
- typing-extensions==4.12.2
-     # via
-     #   fastapi
-     #   gradio
-     #   gradio-client
-     #   huggingface-hub
-     #   langchain-core
-     #   llama-index-core
-     #   llama-index-legacy
-     #   openai
-     #   pydantic
-     #   pydantic-core
-     #   sqlalchemy
-     #   torch
-     #   typer
-     #   typing-inspect
- typing-inspect==0.9.0
-     # via
-     #   dataclasses-json
-     #   llama-index-core
-     #   llama-index-legacy
- tzdata==2024.1
-     # via pandas
- ujson==5.10.0
-     # via colbert-ai
- urllib3==2.2.2
-     # via
-     #   gradio
-     #   requests
- uvicorn==0.30.6
-     # via gradio
- voyager==2.0.9
-     # via ragatouille
- websockets==12.0
-     # via gradio-client
- werkzeug==3.0.4
-     # via flask
- wrapt==1.16.0
-     # via
-     #   deprecated
-     #   llama-index-core
- xxhash==3.5.0
-     # via datasets
- yarl==1.11.1
-     # via aiohttp