File size: 3,949 Bytes
6ff6cb6 84bfe38 6ff6cb6 2834fe9 1c9d91a 13dd954 1c9d91a 6ff6cb6 2834fe9 1c9d91a 2834fe9 87a91f0 1c9d91a 87a91f0 13dd954 1c9d91a 2834fe9 1c9d91a eb9f45f 1c9d91a eb9f45f 1c9d91a eb9f45f 1c9d91a 6ff6cb6 1c9d91a 24f13dd 6ff6cb6 1c9d91a 6ff6cb6 dba982b 6ff6cb6 13dd954 1c9d91a 84bfe38 13dd954 1c9d91a 2834fe9 dd2978a 1c9d91a dd2978a eb9f45f 1c9d91a dd2978a eb9f45f 1c9d91a 13dd954 1c9d91a 2834fe9 1c9d91a 2834fe9 1c9d91a 2834fe9 1c9d91a 2834fe9 1c9d91a 2834fe9 13dd954 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import asyncio
from typing import Dict, List
import gradio as gr
import httpx
API_URL = "http://localhost:8000"
async def fetch_similar_datasets(dataset_id: str, limit: int = 5) -> List[Dict]:
    """Look up datasets similar to a given dataset via the backend API.

    Issues a GET request to ``{API_URL}/similarity/datasets``.

    Args:
        dataset_id: Sent to the API as the ``dataset_id`` query parameter.
        limit: Sent to the API as the ``k`` query parameter.

    Returns:
        The value stored under ``"results"`` in the JSON body when the
        response status is 200; an empty list for any other status.
    """
    endpoint = f"{API_URL}/similarity/datasets"
    query_params = {"dataset_id": dataset_id, "k": limit}
    async with httpx.AsyncClient() as http:
        reply = await http.get(endpoint, params=query_params)
        # Anything other than a plain 200 is treated as "nothing found".
        if reply.status_code != 200:
            return []
        return reply.json()["results"]
async def fetch_similar_datasets_by_text(query: str, limit: int = 5) -> List[Dict]:
    """Search datasets by free-text query via the backend API.

    Issues a GET request to ``{API_URL}/search/datasets``.

    Args:
        query: Sent to the API as the ``query`` query parameter.
        limit: Sent to the API as the ``k`` query parameter.

    Returns:
        The value stored under ``"results"`` in the JSON body when the
        response status is 200; an empty list for any other status.
    """
    endpoint = f"{API_URL}/search/datasets"
    query_params = {"query": query, "k": limit}
    async with httpx.AsyncClient() as http:
        reply = await http.get(endpoint, params=query_params)
        # Anything other than a plain 200 is treated as "nothing found".
        if reply.status_code != 200:
            return []
        return reply.json()["results"]
def format_results(results: List[Dict]) -> str:
    """Render search results as a Markdown document.

    Each result becomes a level-3 heading linking to the dataset page,
    followed by its similarity score (two decimals), its summary, and a
    horizontal rule.

    Args:
        results: Result dicts. Each must contain ``"dataset_id"`` and
            ``"similarity"``; ``"summary"`` is optional.

    Returns:
        The concatenated Markdown, or an empty string when ``results`` is
        empty.

    Raises:
        KeyError: If a result lacks ``"dataset_id"`` or ``"similarity"``.
    """
    sections = []
    for result in results:
        hub_id = result["dataset_id"]
        similarity = result["similarity"]
        # Fall back to the placeholder for a missing, None, or empty summary
        # so the page never shows a literal "None" or a blank paragraph.
        summary = result.get("summary") or "No summary available."
        # Host is "huggingface.co" (the previous literal carried a stray
        # trailing dot: "huggingface.co./datasets/...").
        url = f"https://huggingface.co/datasets/{hub_id}"
        sections.append(
            f"### [{hub_id}]({url})\n"
            f"*Similarity: {similarity:.2f}*\n\n"
            f"{summary}\n\n"
            "---\n\n"
        )
    # Join once instead of growing a string with += inside the loop.
    return "".join(sections)
# Gradio UI: pick a search method, enter a dataset ID or a text query, and show
# the matching datasets as Markdown.
# NOTE(review): the source this was recovered from had its indentation
# flattened; the layout nesting below is reconstructed — confirm against the
# intended design.
# NOTE(review): the leading "π" in the heading and on the button looks like a
# mis-decoded emoji. The literals are kept as found; confirm the intended glyph.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# π Dataset Explorer
Find similar datasets or search by text query
""",
        elem_classes=["center-text"],
    )
    with gr.Column(variant="panel"):
        search_type = gr.Radio(
            ["Dataset ID", "Text Query"],
            label="Search Method",
            value="Dataset ID",
            container=False,
        )
        with gr.Group():
            dataset_id = gr.Textbox(
                value="airtrain-ai/fineweb-edu-fortified",
                label="Dataset ID",
                container=False,
            )
            # Hidden until the "Text Query" method is selected.
            text_query = gr.Textbox(
                label="Text Query",
                placeholder="Enter at least 3 characters...",
                container=False,
                visible=False,
            )
        with gr.Row():
            search_btn = gr.Button("π Search", size="lg")
            max_results = gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=5,
                label="Number of results",
            )
    results = gr.Markdown(elem_classes=["results-container"])

    def toggle_input_visibility(choice):
        """Return visibility updates for (dataset_id, text_query, search_btn).

        The dataset-ID box and the search button are shown only for the
        "Dataset ID" method; the text box only for "Text Query" (which
        searches on input instead of on click).
        """
        by_id = choice == "Dataset ID"
        return (
            gr.update(visible=by_id),
            gr.update(visible=choice == "Text Query"),
            gr.update(visible=by_id),
        )

    search_type.change(
        toggle_input_visibility,
        inputs=[search_type],
        outputs=[dataset_id, text_query, search_btn],
    )

    async def search_handler(search_type, dataset_id, text_query, limit):
        """Run the selected search and return Markdown for the results panel.

        Args:
            search_type: "Dataset ID" selects the similarity lookup; any
                other value selects the text search.
            dataset_id: Dataset identifier used for the similarity lookup.
            text_query: Free-text query used for the text search.
            limit: Maximum number of results requested from the API.

        Returns:
            Formatted Markdown, a "no results" message, or an error message
            when the backend request fails at the transport level.
        """
        try:
            if search_type == "Dataset ID":
                found = await fetch_similar_datasets(dataset_id, limit)
            else:
                found = await fetch_similar_datasets_by_text(text_query, limit)
        except httpx.HTTPError:
            # Show a readable message instead of letting a connection
            # failure or timeout bubble up as a raw handler error.
            return "Could not reach the search service. Please try again later."
        if not found:
            return "No similar datasets found."
        return format_results(found)

    async def search_as_you_type(search_type, text_query, limit):
        """Run a text search once at least 3 characters have been typed."""
        if len(text_query) < 3:
            # NOTE(review): returning None presumably blanks the results
            # panel (same as before); return gr.update() instead if earlier
            # results should stay visible — confirm intended UX.
            return None
        return await search_handler(search_type, "", text_query, limit)

    # Coroutine functions are registered directly: Gradio awaits them itself,
    # so there is no need to start a fresh event loop with asyncio.run() on
    # every event.
    text_query.input(
        search_as_you_type,
        inputs=[search_type, text_query, max_results],
        outputs=results,
        api_name=False,
    )
    # NOTE(review): the auto-derived API endpoint name for this listener may
    # now follow the function name rather than the former lambda — confirm no
    # client depends on the old name.
    search_btn.click(
        search_handler,
        inputs=[search_type, dataset_id, text_query, max_results],
        outputs=results,
    )
demo.launch()
|