"""Gradio front end for exploring datasets through a similarity-search API.

The UI offers two search modes: lookup by dataset ID (triggered by a button)
and free-text search (triggered live while typing). Both modes query the HTTP
service at ``API_URL`` and render the hits as Markdown.
"""

import asyncio
from typing import Dict, List

import gradio as gr
import httpx

API_URL = "http://localhost:8000"

# Live text search only fires once the query has at least this many characters.
MIN_QUERY_LENGTH = 3


async def _fetch_results(path: str, params: Dict) -> List[Dict]:
    """GET ``API_URL + path`` and return the ``"results"`` list of the JSON body.

    Any non-200 response yields an empty list. Transport errors raised by
    httpx are not caught here and propagate to the caller.
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(f"{API_URL}{path}", params=params)
    if response.status_code == 200:
        return response.json()["results"]
    return []


async def fetch_similar_datasets(dataset_id: str, limit: int = 5) -> List[Dict]:
    """Return up to ``limit`` datasets similar to ``dataset_id``.

    Queries ``/similarity/datasets`` with ``limit`` sent as the ``k``
    parameter; returns ``[]`` when the API does not answer with HTTP 200.
    """
    return await _fetch_results(
        "/similarity/datasets", {"dataset_id": dataset_id, "k": limit}
    )


async def fetch_similar_datasets_by_text(query: str, limit: int = 5) -> List[Dict]:
    """Return up to ``limit`` datasets matching the free-text ``query``.

    Queries ``/search/datasets`` with ``limit`` sent as the ``k`` parameter;
    returns ``[]`` when the API does not answer with HTTP 200.
    """
    return await _fetch_results("/search/datasets", {"query": query, "k": limit})


def format_results(results: List[Dict]) -> str:
    """Render search hits as a Markdown document.

    Each hit must provide ``dataset_id`` and a numeric ``similarity``;
    ``summary`` is optional and falls back to a placeholder sentence. Every
    hit becomes a linked heading, its similarity score, its summary, and a
    horizontal rule. An empty ``results`` list produces an empty string.
    """
    sections = []
    for result in results:
        hub_id = result["dataset_id"]
        similarity = result["similarity"]
        summary = result.get("summary", "No summary available.")
        url = f"https://huggingface.co/datasets/{hub_id}"
        sections.append(
            f"### [{hub_id}]({url})\n"
            f"*Similarity: {similarity:.2f}*\n\n"
            f"{summary}\n\n"
            "---\n\n"
        )
    return "".join(sections)


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🔍 Dataset Explorer
        Find similar datasets or search by text query
        """,
        elem_classes=["center-text"],
    )

    with gr.Column(variant="panel"):
        search_type = gr.Radio(
            ["Dataset ID", "Text Query"],
            label="Search Method",
            value="Dataset ID",
            container=False,
        )

        with gr.Group():
            dataset_id = gr.Textbox(
                value="airtrain-ai/fineweb-edu-fortified",
                label="Dataset ID",
                container=False,
            )
            text_query = gr.Textbox(
                label="Text Query",
                placeholder="Enter at least 3 characters...",
                container=False,
                visible=False,
            )

        with gr.Row():
            search_btn = gr.Button("🔍 Search", size="lg")
            max_results = gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=5,
                label="Number of results",
            )

    results = gr.Markdown(elem_classes=["results-container"])

    def toggle_input_visibility(choice):
        """Show only the widgets that belong to the selected search method.

        Returns updates for (dataset_id, text_query, search_btn). The search
        button is shown only in "Dataset ID" mode because text queries are
        run live from the textbox's input event.
        """
        by_id = choice == "Dataset ID"
        return (
            gr.update(visible=by_id),
            gr.update(visible=choice == "Text Query"),
            gr.update(visible=by_id),
        )

    search_type.change(
        toggle_input_visibility,
        inputs=[search_type],
        outputs=[dataset_id, text_query, search_btn],
    )

    async def search_handler(search_type, dataset_id, text_query, limit):
        """Run the search for the selected method and return Markdown output.

        Returns a fixed "nothing found" message when the API yields no hits.
        """
        if search_type == "Dataset ID":
            matches = await fetch_similar_datasets(dataset_id, limit)
        else:
            matches = await fetch_similar_datasets_by_text(text_query, limit)

        if not matches:
            return "No similar datasets found."
        return format_results(matches)

    async def live_text_search(search_type, text_query, limit):
        """Search while the user types, once the query is long enough.

        Returns ``None`` for queries shorter than ``MIN_QUERY_LENGTH``.
        """
        if len(text_query) < MIN_QUERY_LENGTH:
            return None
        return await search_handler(search_type, "", text_query, limit)

    # The handlers are coroutine functions and are registered directly, so no
    # per-event asyncio.run() wrapper (and no extra event loop) is needed.
    text_query.input(
        live_text_search,
        inputs=[search_type, text_query, max_results],
        outputs=results,
        api_name=False,
    )

    search_btn.click(
        search_handler,
        inputs=[search_type, dataset_id, text_query, max_results],
        outputs=results,
    )

demo.launch()