"""Gradio app for finding similar datasets by dataset ID or free-text query."""

import asyncio
from typing import Dict, List

import gradio as gr
import httpx

# Base URL of the HTTP API that the fetch_* helpers in this file send their
# GET requests to (the endpoint path is appended to it).
API_URL = "http://localhost:8000"


async def fetch_similar_datasets(dataset_id: str, limit: int = 5) -> List[Dict]:
    """Query the API for datasets similar to ``dataset_id``.

    Issues ``GET {API_URL}/similarity/datasets`` with ``dataset_id`` and
    ``k=limit`` as query parameters.

    Args:
        dataset_id: Identifier of the reference dataset.
        limit: Value sent as the ``k`` query parameter.

    Returns:
        The ``"results"`` entry of the JSON response body when the server
        answers with HTTP 200; an empty list for any other status code.
    """
    endpoint = f"{API_URL}/similarity/datasets"
    query_params = {"dataset_id": dataset_id, "k": limit}

    async with httpx.AsyncClient() as http:
        reply = await http.get(endpoint, params=query_params)
        # Any non-200 answer is reported to the caller as "no results".
        if reply.status_code != 200:
            return []
        return reply.json()["results"]


async def fetch_similar_datasets_by_text(query: str, limit: int = 5) -> List[Dict]:
    """Query the API for datasets matching a free-text ``query``.

    Issues ``GET {API_URL}/search/datasets`` with ``query`` and ``k=limit``
    as query parameters.

    Args:
        query: Text to search for.
        limit: Value sent as the ``k`` query parameter.

    Returns:
        The ``"results"`` entry of the JSON response body when the server
        answers with HTTP 200; an empty list for any other status code.
    """
    endpoint = f"{API_URL}/search/datasets"
    query_params = {"query": query, "k": limit}

    async with httpx.AsyncClient() as http:
        reply = await http.get(endpoint, params=query_params)
        # Any non-200 answer is reported to the caller as "no results".
        if reply.status_code != 200:
            return []
        return reply.json()["results"]


def format_results(results: List[Dict]) -> str:
    """Render search results as a Markdown string.

    Each result becomes a level-3 heading that links to the dataset's page
    on the Hugging Face Hub, followed by the similarity score (two decimal
    places), the summary text, and a horizontal rule.

    Args:
        results: Result dicts. Each must contain ``"dataset_id"`` and a
            numeric ``"similarity"``; ``"summary"`` is optional and falls
            back to ``"No summary available."``.

    Returns:
        The concatenated Markdown for all results, or ``""`` when
        ``results`` is empty.

    Raises:
        KeyError: If a result lacks ``"dataset_id"`` or ``"similarity"``.
    """
    sections = []

    for result in results:
        hub_id = result["dataset_id"]
        similarity = result["similarity"]
        summary = result.get("summary", "No summary available.")
        # The host previously carried a stray trailing dot ("huggingface.co.").
        url = f"https://huggingface.co/datasets/{hub_id}"

        sections.append(
            f"### [{hub_id}]({url})\n"
            f"*Similarity: {similarity:.2f}*\n\n"
            f"{summary}\n\n"
            "---\n\n"
        )

    # Join once instead of growing a string with repeated "+=".
    return "".join(sections)


# Build the UI: a header, a search-method selector, the two alternative input
# boxes, a search button with a result-count slider, and a Markdown pane that
# displays the formatted results.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🔍 Dataset Explorer
        Find similar datasets or search by text query
        """,
        elem_classes=["center-text"],
    )

    with gr.Column(variant="panel"):
        search_type = gr.Radio(
            ["Dataset ID", "Text Query"],
            label="Search Method",
            value="Dataset ID",
            container=False,
        )

        with gr.Group():
            dataset_id = gr.Textbox(
                value="airtrain-ai/fineweb-edu-fortified",
                label="Dataset ID",
                container=False,
            )
            # Hidden initially because the default search method is "Dataset ID".
            text_query = gr.Textbox(
                label="Text Query",
                placeholder="Enter at least 3 characters...",
                container=False,
                visible=False,
            )

        with gr.Row():
            search_btn = gr.Button("🔍 Search", size="lg")
            max_results = gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=5,
                label="Number of results",
            )

    results = gr.Markdown(elem_classes=["results-container"])

    def toggle_input_visibility(choice):
        """Return visibility updates for (dataset_id, text_query, search_btn).

        The dataset-ID box and the Search button are shown only for the
        "Dataset ID" method; the text-query box only for "Text Query"
        (text queries are dispatched by the ``text_query.input`` event below).
        """
        return (
            gr.update(visible=choice == "Dataset ID"),
            gr.update(visible=choice == "Text Query"),
            gr.update(visible=choice == "Dataset ID"),
        )

    search_type.change(
        toggle_input_visibility,
        inputs=[search_type],
        outputs=[dataset_id, text_query, search_btn],
    )

    async def search_handler(search_type, dataset_id, text_query, limit):
        """Run the search selected by ``search_type`` and return Markdown.

        Uses ``fetch_similar_datasets`` for "Dataset ID" and
        ``fetch_similar_datasets_by_text`` for any other value. Returns a
        fixed message when the fetch yields no results.
        """
        # Named `matches` so it does not shadow the outer `results` component.
        if search_type == "Dataset ID":
            matches = await fetch_similar_datasets(dataset_id, limit)
        else:
            matches = await fetch_similar_datasets_by_text(text_query, limit)

        if not matches:
            return "No similar datasets found."

        return format_results(matches)

    # Search as the user types. `search_handler` is a coroutine function, so
    # the synchronous lambda drives it to completion with `asyncio.run`.
    # Queries shorter than 3 characters skip the request and return None.
    # NOTE(review): presumably a None output blanks the results pane; confirm
    # against Gradio's Markdown component behaviour.
    text_query.input(
        lambda search_type, text_query, limit: asyncio.run(
            search_handler(search_type, "", text_query, limit)
        )
        if len(text_query) >= 3
        else None,  # Only trigger after 3 characters
        inputs=[search_type, text_query, max_results],
        outputs=results,
        api_name=False,
    )

    # Explicit search via the button; same coroutine, driven the same way.
    search_btn.click(
        lambda search_type, dataset_id, text_query, limit: asyncio.run(
            search_handler(search_type, dataset_id, text_query, limit)
        ),
        inputs=[search_type, dataset_id, text_query, max_results],
        outputs=results,
    )

demo.launch()