|
import asyncio
import os
from typing import Dict, List

import gradio as gr
import httpx
|
|
|
# Base URL of the backend API that serves the similarity and search endpoints.
# Defaults to the local development server; set the API_URL environment
# variable to point at another deployment. A trailing slash is dropped so that
# endpoint paths can be appended with a single "/".
API_URL = os.environ.get("API_URL", "http://localhost:8000").rstrip("/")
|
|
|
|
|
async def fetch_similar_datasets(dataset_id: str, limit: int = 5) -> List[Dict]:
    """Look up datasets similar to ``dataset_id`` via the backend API.

    Sends a GET request to ``{API_URL}/similarity/datasets`` with the query
    parameters ``dataset_id`` and ``k`` (the value of ``limit``).

    Args:
        dataset_id: Identifier of the reference dataset.
        limit: Value sent as the ``k`` query parameter.

    Returns:
        The ``"results"`` entry of the JSON response body when the status code
        is 200; an empty list for any other status code.

    Raises:
        KeyError: If a 200 response body has no ``"results"`` key.

    Exceptions raised by httpx itself are not caught here and propagate to the
    caller.
    """
    endpoint = f"{API_URL}/similarity/datasets"
    query_params = {"dataset_id": dataset_id, "k": limit}

    async with httpx.AsyncClient() as client:
        response = await client.get(endpoint, params=query_params)

    # Any status other than 200 is reported as "no results" rather than raised.
    if response.status_code != 200:
        return []
    return response.json()["results"]
|
|
|
|
|
async def fetch_similar_datasets_by_text(query: str, limit: int = 5) -> List[Dict]:
    """Search datasets by free-text query via the backend API.

    Sends a GET request to ``{API_URL}/search/datasets`` with the query
    parameters ``query`` and ``k`` (the value of ``limit``).

    Args:
        query: Text to search for.
        limit: Value sent as the ``k`` query parameter.

    Returns:
        The ``"results"`` entry of the JSON response body when the status code
        is 200; an empty list for any other status code.

    Raises:
        KeyError: If a 200 response body has no ``"results"`` key.

    Exceptions raised by httpx itself are not caught here and propagate to the
    caller.
    """
    endpoint = f"{API_URL}/search/datasets"
    query_params = {"query": query, "k": limit}

    async with httpx.AsyncClient() as client:
        response = await client.get(endpoint, params=query_params)

    # Any status other than 200 is reported as "no results" rather than raised.
    if response.status_code != 200:
        return []
    return response.json()["results"]
|
|
|
|
|
def format_results(results: List[Dict]) -> str:
    """Render search results as a Markdown document.

    Each result becomes a level-3 heading that links to the dataset's page
    under ``https://huggingface.co/datasets/``, followed by the similarity
    score (two decimals), the summary, and a horizontal rule.

    Args:
        results: Result records. Each must provide ``"dataset_id"`` and a
            numeric ``"similarity"``; ``"summary"`` is optional and falls back
            to ``"No summary available."``.

    Returns:
        The concatenated Markdown, or an empty string when ``results`` is
        empty.

    Raises:
        KeyError: If a record lacks ``"dataset_id"`` or ``"similarity"``.
    """
    sections = []
    for result in results:
        hub_id = result["dataset_id"]
        similarity = result["similarity"]
        summary = result.get("summary", "No summary available.")
        # Canonical host name: no stray dot between ".co" and the path.
        url = f"https://huggingface.co/datasets/{hub_id}"

        sections.append(
            f"### [{hub_id}]({url})\n"
            f"*Similarity: {similarity:.2f}*\n\n"
            f"{summary}\n\n"
            "---\n\n"
        )

    # Join once instead of growing a string with repeated "+=".
    return "".join(sections)
|
|
|
|
|
# Gradio UI: layout, event wiring, and app launch.
# NOTE(review): container nesting was reconstructed from a source whose
# indentation had been flattened — confirm the layout matches the intent.
with gr.Blocks() as demo:
    # NOTE(review): the "π" in this heading and on the search button looks like
    # a mis-decoded emoji — confirm the intended character.
    gr.Markdown(
        """
        # π Dataset Explorer
        Find similar datasets or search by text query
        """,
        elem_classes=["center-text"],
    )

    with gr.Column(variant="panel"):
        search_type = gr.Radio(
            ["Dataset ID", "Text Query"],
            label="Search Method",
            value="Dataset ID",
            container=False,
        )

        # Two alternative inputs; the text box starts hidden because the
        # default search method is "Dataset ID".
        with gr.Group():
            dataset_id = gr.Textbox(
                value="airtrain-ai/fineweb-edu-fortified",
                label="Dataset ID",
                container=False,
            )
            text_query = gr.Textbox(
                label="Text Query",
                placeholder="Enter at least 3 characters...",
                container=False,
                visible=False,
            )

        with gr.Row():
            search_btn = gr.Button("π Search", size="lg")
            max_results = gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=5,
                label="Number of results",
            )

    results = gr.Markdown(elem_classes=["results-container"])

    def toggle_input_visibility(choice):
        """Return visibility updates for the ID box, the text box, and the button.

        The search button is shown only for "Dataset ID" searches; text
        searches are triggered by typing into the text box instead.
        """
        by_id = choice == "Dataset ID"
        by_text = choice == "Text Query"
        # Order must match outputs=[dataset_id, text_query, search_btn] below.
        return (
            gr.update(visible=by_id),
            gr.update(visible=by_text),
            gr.update(visible=by_id),
        )

    search_type.change(
        toggle_input_visibility,
        inputs=[search_type],
        outputs=[dataset_id, text_query, search_btn],
    )

    async def search_handler(mode, ds_id, query, limit):
        """Run the search for the selected method and return Markdown output.

        Args:
            mode: Selected search method; "Dataset ID" uses the similarity
                lookup, anything else uses the text search.
            ds_id: Dataset identifier, used only for "Dataset ID" searches.
            query: Free-text query, used only for text searches.
            limit: Maximum number of results to request.

        Returns:
            Formatted Markdown for the results, or a fixed "not found" message
            when the search returns nothing.
        """
        lookup = (
            fetch_similar_datasets(ds_id, limit)
            if mode == "Dataset ID"
            else fetch_similar_datasets_by_text(query, limit)
        )
        hits = await lookup
        return format_results(hits) if hits else "No similar datasets found."

    # NOTE(review): Gradio can run coroutine handlers directly; asyncio.run is
    # kept in both handlers below to preserve the current behavior.
    def live_text_search(mode, query, limit):
        """Search as the user types, once the query has at least 3 characters."""
        if len(query) < 3:
            # Too short to search: return None as the output value.
            return None
        return asyncio.run(search_handler(mode, "", query, limit))

    text_query.input(
        live_text_search,
        inputs=[search_type, text_query, max_results],
        outputs=results,
        api_name=False,
    )

    search_btn.click(
        lambda mode, ds_id, query, limit: asyncio.run(
            search_handler(mode, ds_id, query, limit)
        ),
        inputs=[search_type, dataset_id, text_query, max_results],
        outputs=results,
    )

demo.launch()
|
|