lhoestq HF staff committed on
Commit
e6ec5b6
·
1 Parent(s): 067985f

add dataset page

Browse files
Files changed (1) hide show
  1. app.py +116 -50
app.py CHANGED
@@ -1,7 +1,9 @@
 
1
  from functools import partial
2
  from typing import Iterator
3
 
4
  import gradio as gr
 
5
  from huggingface_hub import InferenceClient
6
 
7
 
@@ -12,21 +14,38 @@ GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
12
  "A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
13
  "Generate a list of 10 names of quality dataset that don't exist but sound plausible and would "
14
  "be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
15
- "Give each dataset descriptive tags/keywords and use the following format:\n1. DatasetName (tag1, tag2, tag3)"
16
  )
17
 
 
 
 
 
 
 
 
 
 
18
 
19
- def stream_reponse(msg: str) -> Iterator[str]:
20
- for message in client.chat_completion(
21
- messages=[{"role": "user", "content": msg}],
22
- max_tokens=500,
23
- stream=True,
24
- ):
25
- yield message.choices[0].delta.content
 
 
 
 
 
 
 
 
26
 
27
 
28
  def gen_datasets(search_query: str) -> Iterator[str]:
29
- search_query = search_query if search_query.strip() else "topic classification"
30
  generated_text = ""
31
  for token in stream_reponse(GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=search_query)):
32
  generated_text += token
@@ -35,19 +54,33 @@ def gen_datasets(search_query: str) -> Iterator[str]:
35
  yield generated_text.strip()
36
  print("-----\n\n" + generated_text)
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  NB_ITEMS_PER_PAGE = 10
39
 
40
  default_output = """
41
- 1. NewsArticleCollection (BreakingNews, VietnameseNewsTrends, CountrySpecificTopics)
42
- 2. ScienceJournalDataset (AstrophysicsTrends, EcologyPatterns, QuantumMechanicsInsights)
43
- 3. TechnologyReviewDB (TechInnovationSurges, MobileDevicesAnalysis, CybersecurityBreachStudies)
44
- 4. BusinessWeeklyReports (MarketTrends, E-commerceGrowth, CorporateChangeDynamics)
45
- 5. HealthResearchArchive (PandemicPatterns, DiseaseOutbreakInferences, WellnessTrends)
46
- 6. SportsDataCorpus (ExerciseRoutineShifts, ProfessionalLeagueShifts, InjuryImpactAnalysis)
47
- 7. EducationSectorStatistics (OnlineEducationAdoption, CurriculumImpactStudies, TeacherTrainingAmendments)
48
- 8. CinemaCritiqueBank (FilmGenreRotation, HollywoodProductionImpacts, GlobalEntertainmentSurveys)
49
- 9. CulturalShiftSamples (FoodCuisineEvolution, SocialMediaInfluence, ArtTrendsEvolution)
50
- 10. LocalLifestyleSections (UrbanAgricultureInfluence, EcoFriendlyLiving, SustainableTransportationTrends)
51
  """.strip().split("\n")
52
  assert len(default_output) == NB_ITEMS_PER_PAGE
53
 
@@ -138,9 +171,20 @@ def search_datasets(search_query):
138
  yield output_values
139
 
140
 
141
- def show_dataset(*buttons_values, i):
142
  dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
143
- return f"TODO: show {dataset_name=}, {tags=}"
 
 
 
 
 
 
 
 
 
 
 
144
 
145
 
146
  with gr.Blocks(css=css) as demo:
@@ -148,33 +192,55 @@ with gr.Blocks(css=css) as demo:
148
  "# 🤗 Infinite Dataset Hub\n\n"
149
  f"_powered by [{model_id}](https://huggingface.co/{model_id})_"
150
  )
151
- with gr.Row():
152
- with gr.Column(scale=4, min_width=0):
153
- pass
154
- with gr.Column(scale=9):
155
- search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets", show_label=False, container=False)
156
- with gr.Column(min_width=64):
157
- search_button = gr.Button("🔍", variant="primary")
158
- with gr.Column(scale=4, min_width=0):
159
- pass
160
- outputs = []
161
- outputs.append(gr.Markdown())
162
- with gr.Row():
163
- with gr.Column(scale=4, min_width=0):
164
- pass
165
- with gr.Column(scale=10):
166
- buttons = []
167
- for i in range(10):
168
- line = default_output[i]
169
- dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
170
- with gr.Group(elem_classes="buttonsGroup"):
171
- top = gr.Button(dataset_name, elem_classes="topButton")
172
- bottom = gr.Button(tags, elem_classes="bottomButton")
173
- buttons += [top, bottom]
174
- top.click(partial(show_dataset, i=i), inputs=buttons, outputs=outputs)
175
- bottom.click(partial(show_dataset, i=i), inputs=buttons, outputs=outputs)
176
- with gr.Column(scale=4, min_width=0):
177
- pass
178
- search_bar.submit(search_datasets, inputs=search_bar, outputs=buttons)
179
- search_button.click(search_datasets, inputs=search_bar, outputs=buttons)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  demo.launch()
 
1
+ import time
2
  from functools import partial
3
  from typing import Iterator
4
 
5
  import gradio as gr
6
+ import requests.exceptions
7
  from huggingface_hub import InferenceClient
8
 
9
 
 
14
  "A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
15
  "Generate a list of 10 names of quality dataset that don't exist but sound plausible and would "
16
  "be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
17
+ "Every dataset should be about '{search_query}' and have descriptive tags/keywords including the ML task name associated to the dataset (classification, regression, anomaly detection, etc.). Use the following format:\n1. DatasetName1 (tag1, tag2, tag3)\n1. DatasetName2 (tag1, tag2, tag3)"
18
  )
19
 
20
# Prompt template for generating a dataset preview (description + first CSV rows)
# from a search query, a dataset name, and its tags.
# FIX: removed duplicated word ("and and") in the instructions.
GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS = (
    "A ML practitioner is looking for a dataset CSV after the query '{search_query}'. "
    "Generate the first 5 rows of a plausible and quality CSV for the dataset '{dataset_name}'. "
    "You can get inspiration from related keywords '{tags}' but most importantly the dataset should correspond to the query '{search_query}'. "
    "Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts). "
    "Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**."
)

# Fallback query used when the user submits an empty search.
default_query = "various datasets on many different subjects and topics, from classification to language modeling, from science to sport to finance to news"
29
 
30
+
31
def stream_reponse(msg: str, max_tokens=500) -> Iterator[str]:
    """Stream chat-completion tokens for ``msg``, retrying up to 3 times on connection errors.

    NOTE: the function name keeps the existing "reponse" typo because callers
    elsewhere in the file use it.

    Args:
        msg: The user prompt to send to the inference client.
        max_tokens: Maximum number of tokens to generate.

    Yields:
        Token deltas as they arrive from the streaming chat completion.
    """
    for _ in range(3):
        try:
            for message in client.chat_completion(
                messages=[{"role": "user", "content": msg}],
                max_tokens=max_tokens,
                stream=True,
            ):
                yield message.choices[0].delta.content
        except requests.exceptions.ConnectionError as e:
            # BUG FIX: the original `print(e + "...")` raised TypeError
            # (an exception cannot be concatenated with str), which killed
            # the retry loop. Format the message instead.
            print(f"{e}\n\nRetrying in 1sec")
            # NOTE(review): if tokens were already yielded before the error,
            # the retry re-streams from the start, so consumers may see
            # duplicated text — confirm whether that is acceptable.
            time.sleep(1)
            continue
        break
45
 
46
 
47
  def gen_datasets(search_query: str) -> Iterator[str]:
48
+ search_query = search_query if search_query.strip() else default_query
49
  generated_text = ""
50
  for token in stream_reponse(GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=search_query)):
51
  generated_text += token
 
54
  yield generated_text.strip()
55
  print("-----\n\n" + generated_text)
56
 
57
+
58
def gen_dataset_content(search_query: str, dataset_name: str, tags: str) -> Iterator[str]:
    """Stream a generated dataset preview (description + CSV sample) for one dataset.

    Falls back to ``default_query`` when the search query is blank. Yields the
    accumulated text after every token so the UI can render progressively.
    """
    if not search_query.strip():
        search_query = default_query
    prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
        search_query=search_query,
        dataset_name=dataset_name,
        tags=tags,
    )
    content = ""
    for chunk in stream_reponse(prompt, max_tokens=1500):
        content += chunk
        yield content
    print("-----\n\n" + content)
69
+
70
+
71
NB_ITEMS_PER_PAGE = 10

# Placeholder results shown before the first search: one "N. Name (tag, tag, tag)"
# entry per line. The UI parses each line into a dataset-name button and a tags button.
default_output = """
1. NewsEventsPredict (classification, media, trend)
2. FinancialForecast (economy, stocks, regression)
3. HealthMonitor (science, real-time, anomaly detection)
4. SportsAnalysis (classification, performance, player tracking)
5. SciLiteracyTools (language modeling, science literacy, text classification)
6. RetailSalesAnalyzer (consumer behavior, sales trend, segmentation)
7. SocialSentimentEcho (social media, emotion analysis, clustering)
8. NewsEventTracker (classification, public awareness, topical clustering)
9. HealthVitalSigns (anomaly detection, biometrics, prediction)
10. GameStockPredict (classification, finance, sports contingency)
""".strip().split("\n")
# Sanity check as an explicit raise so it still runs under `python -O`
# (a bare `assert` would be stripped).
if len(default_output) != NB_ITEMS_PER_PAGE:
    raise ValueError(f"default_output must have {NB_ITEMS_PER_PAGE} entries, got {len(default_output)}")
86
 
 
171
  yield output_values
172
 
173
 
174
def show_dataset(search_query, *buttons_values, i):
    """Switch to the dataset page and stream the generated preview for button pair ``i``.

    ``buttons_values`` holds interleaved (name, tags) values for all 10 cards;
    pair ``i`` lives at indices 2*i and 2*i+1.
    """
    dataset_name, tags = buttons_values[2 * i], buttons_values[2 * i + 1]
    title = f"# {dataset_name}\n\n tags: {tags}\n\n _Note: This is an AI-generated dataset so its content may be inaccurate or false_"
    # First update: hide the search column, show the dataset column, clear the content.
    yield gr.Column(visible=False), gr.Column(visible=True), title, ""
    # Then stream content updates, leaving visibility unchanged.
    for partial_content in gen_dataset_content(search_query=search_query, dataset_name=dataset_name, tags=tags):
        yield gr.Column(), gr.Column(), title, partial_content
180
+
181
+
182
def show_search_page():
    """Return visibility updates that show the search page and hide the dataset page."""
    search_page_update = gr.Column(visible=True)
    dataset_page_update = gr.Column(visible=False)
    return search_page_update, dataset_page_update
184
+
185
+
186
def generate_full_dataset():
    """Placeholder handler for the "Generate Full Dataset" button.

    Raises:
        gr.Error: always — surfaces a user-visible "not implemented" message in the UI.
    """
    raise gr.Error("Not implemented yet sorry !")
188
 
189
 
190
  with gr.Blocks(css=css) as demo:
 
192
  "# 🤗 Infinite Dataset Hub\n\n"
193
  f"_powered by [{model_id}](https://huggingface.co/{model_id})_"
194
  )
195
+ with gr.Column() as search_page:
196
+ with gr.Row():
197
+ with gr.Column(scale=4, min_width=0):
198
+ pass
199
+ with gr.Column(scale=9):
200
+ search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets", show_label=False, container=False)
201
+ with gr.Column(min_width=64):
202
+ search_button = gr.Button("🔍", variant="primary")
203
+ with gr.Column(scale=4, min_width=0):
204
+ pass
205
+ inputs = [search_bar]
206
+ show_dataset_outputs = [search_page]
207
+ with gr.Row():
208
+ with gr.Column(scale=4, min_width=0):
209
+ pass
210
+ with gr.Column(scale=10):
211
+ buttons = []
212
+ for i in range(10):
213
+ line = default_output[i]
214
+ dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
215
+ with gr.Group(elem_classes="buttonsGroup"):
216
+ top = gr.Button(dataset_name, elem_classes="topButton")
217
+ bottom = gr.Button(tags, elem_classes="bottomButton")
218
+ buttons += [top, bottom]
219
+ top.click(partial(show_dataset, i=i), inputs=inputs, outputs=show_dataset_outputs)
220
+ bottom.click(partial(show_dataset, i=i), inputs=inputs, outputs=show_dataset_outputs)
221
+ inputs += buttons
222
+ with gr.Column(scale=4, min_width=0):
223
+ pass
224
+ search_bar.submit(search_datasets, inputs=search_bar, outputs=buttons)
225
+ search_button.click(search_datasets, inputs=search_bar, outputs=buttons)
226
+ with gr.Column(visible=False) as dataset_page:
227
+ with gr.Row():
228
+ with gr.Column(scale=4, min_width=0):
229
+ pass
230
+ with gr.Column(scale=10):
231
+ dataset_title = gr.Markdown()
232
+ dataset_content = gr.Markdown()
233
+ with gr.Row():
234
+ with gr.Column(scale=4, min_width=0):
235
+ pass
236
+ with gr.Column():
237
+ generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
238
+ generate_full_dataset_button.click(generate_full_dataset)
239
+ back_button = gr.Button("< Back", size="sm")
240
+ back_button.click(show_search_page, inputs=[], outputs=[search_page, dataset_page])
241
+ with gr.Column(scale=4, min_width=0):
242
+ pass
243
+ with gr.Column(scale=4, min_width=0):
244
+ pass
245
+ show_dataset_outputs += [dataset_page, dataset_title, dataset_content]
246
  demo.launch()