lhoestq HF staff committed on
Commit
e6ec5b6
·
1 Parent(s): 067985f

add dataset page

Browse files
Files changed (1) hide show
  1. app.py +116 -50
app.py CHANGED
@@ -1,7 +1,9 @@
 
1
  from functools import partial
2
  from typing import Iterator
3
 
4
  import gradio as gr
 
5
  from huggingface_hub import InferenceClient
6
 
7
 
@@ -12,21 +14,38 @@ GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
12
  "A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
13
  "Generate a list of 10 names of quality dataset that don't exist but sound plausible and would "
14
  "be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
15
- "Give each dataset descriptive tags/keywords and use the following format:\n1. DatasetName (tag1, tag2, tag3)"
16
  )
17
 
 
 
 
 
 
 
 
 
 
18
 
19
- def stream_reponse(msg: str) -> Iterator[str]:
20
- for message in client.chat_completion(
21
- messages=[{"role": "user", "content": msg}],
22
- max_tokens=500,
23
- stream=True,
24
- ):
25
- yield message.choices[0].delta.content
 
 
 
 
 
 
 
 
26
 
27
 
28
  def gen_datasets(search_query: str) -> Iterator[str]:
29
- search_query = search_query if search_query.strip() else "topic classification"
30
  generated_text = ""
31
  for token in stream_reponse(GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=search_query)):
32
  generated_text += token
@@ -35,19 +54,33 @@ def gen_datasets(search_query: str) -> Iterator[str]:
35
  yield generated_text.strip()
36
  print("-----\n\n" + generated_text)
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  NB_ITEMS_PER_PAGE = 10
39
 
40
  default_output = """
41
- 1. NewsArticleCollection (BreakingNews, VietnameseNewsTrends, CountrySpecificTopics)
42
- 2. ScienceJournalDataset (AstrophysicsTrends, EcologyPatterns, QuantumMechanicsInsights)
43
- 3. TechnologyReviewDB (TechInnovationSurges, MobileDevicesAnalysis, CybersecurityBreachStudies)
44
- 4. BusinessWeeklyReports (MarketTrends, E-commerceGrowth, CorporateChangeDynamics)
45
- 5. HealthResearchArchive (PandemicPatterns, DiseaseOutbreakInferences, WellnessTrends)
46
- 6. SportsDataCorpus (ExerciseRoutineShifts, ProfessionalLeagueShifts, InjuryImpactAnalysis)
47
- 7. EducationSectorStatistics (OnlineEducationAdoption, CurriculumImpactStudies, TeacherTrainingAmendments)
48
- 8. CinemaCritiqueBank (FilmGenreRotation, HollywoodProductionImpacts, GlobalEntertainmentSurveys)
49
- 9. CulturalShiftSamples (FoodCuisineEvolution, SocialMediaInfluence, ArtTrendsEvolution)
50
- 10. LocalLifestyleSections (UrbanAgricultureInfluence, EcoFriendlyLiving, SustainableTransportationTrends)
51
  """.strip().split("\n")
52
  assert len(default_output) == NB_ITEMS_PER_PAGE
53
 
@@ -138,9 +171,20 @@ def search_datasets(search_query):
138
  yield output_values
139
 
140
 
141
- def show_dataset(*buttons_values, i):
142
  dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
143
- return f"TODO: show {dataset_name=}, {tags=}"
 
 
 
 
 
 
 
 
 
 
 
144
 
145
 
146
  with gr.Blocks(css=css) as demo:
@@ -148,33 +192,55 @@ with gr.Blocks(css=css) as demo:
148
  "# 🤗 Infinite Dataset Hub\n\n"
149
  f"_powered by [{model_id}](https://huggingface.co/{model_id})_"
150
  )
151
- with gr.Row():
152
- with gr.Column(scale=4, min_width=0):
153
- pass
154
- with gr.Column(scale=9):
155
- search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets", show_label=False, container=False)
156
- with gr.Column(min_width=64):
157
- search_button = gr.Button("🔍", variant="primary")
158
- with gr.Column(scale=4, min_width=0):
159
- pass
160
- outputs = []
161
- outputs.append(gr.Markdown())
162
- with gr.Row():
163
- with gr.Column(scale=4, min_width=0):
164
- pass
165
- with gr.Column(scale=10):
166
- buttons = []
167
- for i in range(10):
168
- line = default_output[i]
169
- dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
170
- with gr.Group(elem_classes="buttonsGroup"):
171
- top = gr.Button(dataset_name, elem_classes="topButton")
172
- bottom = gr.Button(tags, elem_classes="bottomButton")
173
- buttons += [top, bottom]
174
- top.click(partial(show_dataset, i=i), inputs=buttons, outputs=outputs)
175
- bottom.click(partial(show_dataset, i=i), inputs=buttons, outputs=outputs)
176
- with gr.Column(scale=4, min_width=0):
177
- pass
178
- search_bar.submit(search_datasets, inputs=search_bar, outputs=buttons)
179
- search_button.click(search_datasets, inputs=search_bar, outputs=buttons)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  demo.launch()
 
1
+ import time
2
  from functools import partial
3
  from typing import Iterator
4
 
5
  import gradio as gr
6
+ import requests.exceptions
7
  from huggingface_hub import InferenceClient
8
 
9
 
 
14
  "A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
15
  "Generate a list of 10 names of quality dataset that don't exist but sound plausible and would "
16
  "be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
17
+ "Every dataset should be about '{search_query}' and have descriptive tags/keywords including the ML task name associated to the dataset (classification, regression, anomaly detection, etc.). Use the following format:\n1. DatasetName1 (tag1, tag2, tag3)\n1. DatasetName2 (tag1, tag2, tag3)"
18
  )
19
 
20
# Prompt template for generating a dataset preview (description + first CSV rows)
# from a search query, a dataset name, and its tags.
# FIX: removed duplicated word ("and and") in the instructions.
GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS = (
    "A ML practitioner is looking for a dataset CSV after the query '{search_query}'. "
    "Generate the first 5 rows of a plausible and quality CSV for the dataset '{dataset_name}'. "
    "You can get inspiration from related keywords '{tags}' but most importantly the dataset should correspond to the query '{search_query}'. "
    "Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts). "
    "Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**."
)

# Fallback query used when the user submits an empty search.
default_query = "various datasets on many different subjects and topics, from classification to language modeling, from science to sport to finance to news"
29
 
30
+
31
def stream_reponse(msg: str, max_tokens=500) -> Iterator[str]:
    """Stream chat-completion tokens for ``msg``, retrying up to 3 times on connection errors.

    NOTE: the function name keeps the existing "reponse" typo because callers
    elsewhere in the file use it.

    Args:
        msg: The user prompt to send to the inference client.
        max_tokens: Maximum number of tokens to generate.

    Yields:
        Token deltas as they arrive from the streaming chat completion.
    """
    for _ in range(3):
        try:
            for message in client.chat_completion(
                messages=[{"role": "user", "content": msg}],
                max_tokens=max_tokens,
                stream=True,
            ):
                yield message.choices[0].delta.content
        except requests.exceptions.ConnectionError as e:
            # BUG FIX: the original `print(e + "...")` raised TypeError
            # (an exception cannot be concatenated with str), which killed
            # the retry loop. Format the message instead.
            print(f"{e}\n\nRetrying in 1sec")
            # NOTE(review): if tokens were already yielded before the error,
            # the retry re-streams from the start, so consumers may see
            # duplicated text — confirm whether that is acceptable.
            time.sleep(1)
            continue
        break
45
 
46
 
47
  def gen_datasets(search_query: str) -> Iterator[str]:
48
+ search_query = search_query if search_query.strip() else default_query
49
  generated_text = ""
50
  for token in stream_reponse(GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=search_query)):
51
  generated_text += token
 
54
  yield generated_text.strip()
55
  print("-----\n\n" + generated_text)
56
 
57
+
58
def gen_dataset_content(search_query: str, dataset_name: str, tags: str) -> Iterator[str]:
    """Stream a generated dataset preview (description + CSV sample) for one dataset.

    Falls back to ``default_query`` when the search query is blank. Yields the
    accumulated text after every token so the UI can render progressively.
    """
    if not search_query.strip():
        search_query = default_query
    prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
        search_query=search_query,
        dataset_name=dataset_name,
        tags=tags,
    )
    content = ""
    for chunk in stream_reponse(prompt, max_tokens=1500):
        content += chunk
        yield content
    print("-----\n\n" + content)
69
+
70
+
71
NB_ITEMS_PER_PAGE = 10

# Placeholder results shown before the first search: one "N. Name (tag, tag, tag)"
# entry per line. The UI parses each line into a dataset-name button and a tags button.
default_output = """
1. NewsEventsPredict (classification, media, trend)
2. FinancialForecast (economy, stocks, regression)
3. HealthMonitor (science, real-time, anomaly detection)
4. SportsAnalysis (classification, performance, player tracking)
5. SciLiteracyTools (language modeling, science literacy, text classification)
6. RetailSalesAnalyzer (consumer behavior, sales trend, segmentation)
7. SocialSentimentEcho (social media, emotion analysis, clustering)
8. NewsEventTracker (classification, public awareness, topical clustering)
9. HealthVitalSigns (anomaly detection, biometrics, prediction)
10. GameStockPredict (classification, finance, sports contingency)
""".strip().split("\n")
# Sanity check as an explicit raise so it still runs under `python -O`
# (a bare `assert` would be stripped).
if len(default_output) != NB_ITEMS_PER_PAGE:
    raise ValueError(f"default_output must have {NB_ITEMS_PER_PAGE} entries, got {len(default_output)}")
86
 
 
171
  yield output_values
172
 
173
 
174
def show_dataset(search_query, *buttons_values, i):
    """Switch to the dataset page and stream the generated preview for button pair ``i``.

    ``buttons_values`` holds interleaved (name, tags) values for all 10 cards;
    pair ``i`` lives at indices 2*i and 2*i+1.
    """
    dataset_name, tags = buttons_values[2 * i], buttons_values[2 * i + 1]
    title = f"# {dataset_name}\n\n tags: {tags}\n\n _Note: This is an AI-generated dataset so its content may be inaccurate or false_"
    # First update: hide the search column, show the dataset column, clear the content.
    yield gr.Column(visible=False), gr.Column(visible=True), title, ""
    # Then stream content updates, leaving visibility unchanged.
    for partial_content in gen_dataset_content(search_query=search_query, dataset_name=dataset_name, tags=tags):
        yield gr.Column(), gr.Column(), title, partial_content
180
+
181
+
182
def show_search_page():
    """Return visibility updates that show the search page and hide the dataset page."""
    search_page_update = gr.Column(visible=True)
    dataset_page_update = gr.Column(visible=False)
    return search_page_update, dataset_page_update
184
+
185
+
186
def generate_full_dataset():
    """Placeholder handler for the "Generate Full Dataset" button.

    Raises:
        gr.Error: always — surfaces a user-visible "not implemented" message in the UI.
    """
    raise gr.Error("Not implemented yet sorry !")
188
 
189
 
190
  with gr.Blocks(css=css) as demo:
 
192
  "# 🤗 Infinite Dataset Hub\n\n"
193
  f"_powered by [{model_id}](https://huggingface.co/{model_id})_"
194
  )
195
+ with gr.Column() as search_page:
196
+ with gr.Row():
197
+ with gr.Column(scale=4, min_width=0):
198
+ pass
199
+ with gr.Column(scale=9):
200
+ search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets", show_label=False, container=False)
201
+ with gr.Column(min_width=64):
202
+ search_button = gr.Button("🔍", variant="primary")
203
+ with gr.Column(scale=4, min_width=0):
204
+ pass
205
+ inputs = [search_bar]
206
+ show_dataset_outputs = [search_page]
207
+ with gr.Row():
208
+ with gr.Column(scale=4, min_width=0):
209
+ pass
210
+ with gr.Column(scale=10):
211
+ buttons = []
212
+ for i in range(10):
213
+ line = default_output[i]
214
+ dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
215
+ with gr.Group(elem_classes="buttonsGroup"):
216
+ top = gr.Button(dataset_name, elem_classes="topButton")
217
+ bottom = gr.Button(tags, elem_classes="bottomButton")
218
+ buttons += [top, bottom]
219
+ top.click(partial(show_dataset, i=i), inputs=inputs, outputs=show_dataset_outputs)
220
+ bottom.click(partial(show_dataset, i=i), inputs=inputs, outputs=show_dataset_outputs)
221
+ inputs += buttons
222
+ with gr.Column(scale=4, min_width=0):
223
+ pass
224
+ search_bar.submit(search_datasets, inputs=search_bar, outputs=buttons)
225
+ search_button.click(search_datasets, inputs=search_bar, outputs=buttons)
226
+ with gr.Column(visible=False) as dataset_page:
227
+ with gr.Row():
228
+ with gr.Column(scale=4, min_width=0):
229
+ pass
230
+ with gr.Column(scale=10):
231
+ dataset_title = gr.Markdown()
232
+ dataset_content = gr.Markdown()
233
+ with gr.Row():
234
+ with gr.Column(scale=4, min_width=0):
235
+ pass
236
+ with gr.Column():
237
+ generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
238
+ generate_full_dataset_button.click(generate_full_dataset)
239
+ back_button = gr.Button("< Back", size="sm")
240
+ back_button.click(show_search_page, inputs=[], outputs=[search_page, dataset_page])
241
+ with gr.Column(scale=4, min_width=0):
242
+ pass
243
+ with gr.Column(scale=4, min_width=0):
244
+ pass
245
+ show_dataset_outputs += [dataset_page, dataset_title, dataset_content]
246
  demo.launch()