polinaeterna HF staff committed on
Commit
373e797
·
1 Parent(s): 6fae90e

add feature dropdown

Browse files
Files changed (1) hide show
  1. app.py +21 -22
app.py CHANGED
@@ -83,14 +83,7 @@ def plot_and_df(texts, preds):
83
 
84
  @spaces.GPU
85
  def run_quality_check(dataset, config, split, column, batch_size, num_examples):
86
- # info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
87
- # if "error" in info_resp:
88
- # yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),
89
- # return
90
- # config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
91
- # split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
92
- # iter(info_resp["dataset_info"][config]["splits"]))
93
- logging.info(f"Fetching data for {dataset} {config} {split}")
94
  try:
95
  data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
96
  except pl.exceptions.ComputeError:
@@ -244,7 +237,6 @@ with gr.Blocks() as demo:
244
  label="Hub Dataset ID",
245
  placeholder="Search for dataset id on Huggingface",
246
  search_type="dataset",
247
- # value="fka/awesome-chatgpt-prompts",
248
  )
249
  subset_dropdown = gr.Dropdown(info="Subset", show_label=False, visible=False)
250
  split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
@@ -263,40 +255,47 @@ with gr.Blocks() as demo:
263
  """
264
  return gr.HTML(value=html_code)
265
 
 
 
266
  def _resolve_dataset_selection(dataset: str, default_subset: str, default_split: str):
267
  if "/" not in dataset.strip().strip("/"):
268
  return {
269
  subset_dropdown: gr.Dropdown(visible=False),
270
  split_dropdown: gr.Dropdown(visible=False),
 
271
  }
272
  info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
273
  if "error" in info_resp:
274
  return {
275
  subset_dropdown: gr.Dropdown(visible=False),
276
  split_dropdown: gr.Dropdown(visible=False),
 
277
  }
278
  subsets: list[str] = list(info_resp["dataset_info"])
279
  subset = default_subset if default_subset in subsets else subsets[0]
280
  splits: list[str] = info_resp["dataset_info"][subset]["splits"]
281
  split = default_split if default_split in splits else splits[0]
 
 
282
  return {
283
  subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
284
  split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
 
285
  }
286
 
287
- @dataset_name.change(inputs=[dataset_name], outputs=[subset_dropdown, split_dropdown])
288
  def show_input_from_subset_dropdown(dataset: str) -> dict:
289
  return _resolve_dataset_selection(dataset, default_subset="default", default_split="train")
290
 
291
- @subset_dropdown.change(inputs=[dataset_name, subset_dropdown], outputs=[subset_dropdown, split_dropdown])
292
  def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
293
  return _resolve_dataset_selection(dataset, default_subset=subset, default_split="train")
294
 
295
- @split_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown], outputs=[subset_dropdown, split_dropdown])
296
  def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
297
  return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split)
298
 
299
- text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
300
 
301
  gr.Markdown("## Run nvidia quality classifier")
302
  batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size (set this to smaller value if this space crashes.)")
@@ -317,17 +316,17 @@ with gr.Blocks() as demo:
317
  texts_df = gr.DataFrame(visible=False)
318
  gr_check_btn.click(
319
  run_quality_check,
320
- inputs=[dataset_name, subset_dropdown, split_dropdown, text_column, batch_size, num_examples],
321
  outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
322
  )
323
 
324
- gr.Markdown("""## Compute text quality measures
325
- * proportion of non-ascii characters
326
- * #TODO""")
327
- gr_ascii_btn = gr.Button("Data measures")
328
- non_ascii_hist = gr.Plot()
329
-
330
- gr_ascii_btn.click(non_ascii_check, inputs=[texts_df, text_column], outputs=[non_ascii_hist])
331
 
332
  gr.Markdown("## Explore toxicity")
333
  checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
@@ -338,7 +337,7 @@ with gr.Blocks() as demo:
338
  toxicity_df = gr.DataFrame()
339
  gr_toxicity_btn.click(
340
  call_perspective_api,
341
- inputs=[texts_df, text_column, checkbox],
342
  outputs=[toxicity_progress_bar, toxicity_hist, toxicity_df]
343
  )
344
 
 
83
 
84
  @spaces.GPU
85
  def run_quality_check(dataset, config, split, column, batch_size, num_examples):
86
+ logging.info(f"Fetching data for {dataset=} {config=} {split=} {column=}")
 
 
 
 
 
 
 
87
  try:
88
  data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
89
  except pl.exceptions.ComputeError:
 
237
  label="Hub Dataset ID",
238
  placeholder="Search for dataset id on Huggingface",
239
  search_type="dataset",
 
240
  )
241
  subset_dropdown = gr.Dropdown(info="Subset", show_label=False, visible=False)
242
  split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
 
255
  """
256
  return gr.HTML(value=html_code)
257
 
258
+ text_column_dropdown = gr.Dropdown(label="Text column name", info="Text colum name to check (only non-nested texts are supported)")
259
+
260
  def _resolve_dataset_selection(dataset: str, default_subset: str, default_split: str):
261
  if "/" not in dataset.strip().strip("/"):
262
  return {
263
  subset_dropdown: gr.Dropdown(visible=False),
264
  split_dropdown: gr.Dropdown(visible=False),
265
+ text_column_dropdown: gr.Dropdown(info="Text colum name to check (only non-nested texts are supported)"),
266
  }
267
  info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
268
  if "error" in info_resp:
269
  return {
270
  subset_dropdown: gr.Dropdown(visible=False),
271
  split_dropdown: gr.Dropdown(visible=False),
272
+ text_column_dropdown: gr.Dropdown(label="Text column name", info="Text colum name to check (only non-nested texts are supported)")
273
  }
274
  subsets: list[str] = list(info_resp["dataset_info"])
275
  subset = default_subset if default_subset in subsets else subsets[0]
276
  splits: list[str] = info_resp["dataset_info"][subset]["splits"]
277
  split = default_split if default_split in splits else splits[0]
278
+ features = info_resp["dataset_info"][subset]["features"]
279
+ text_features = [feature_name for feature_name, feature in features.items() if isinstance(feature, dict) and feature.get("dtype") == "string"] # and feature.get("_type") == "Value"]
280
  return {
281
  subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
282
  split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
283
+ text_column_dropdown: gr.Dropdown(choices=text_features, label="Text column name", info="Text colum name to check (only non-nested texts are supported)"),
284
  }
285
 
286
+ @dataset_name.change(inputs=[dataset_name], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
287
  def show_input_from_subset_dropdown(dataset: str) -> dict:
288
  return _resolve_dataset_selection(dataset, default_subset="default", default_split="train")
289
 
290
+ @subset_dropdown.change(inputs=[dataset_name, subset_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
291
  def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
292
  return _resolve_dataset_selection(dataset, default_subset=subset, default_split="train")
293
 
294
+ @split_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
295
  def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
296
  return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split)
297
 
298
+ # text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
299
 
300
  gr.Markdown("## Run nvidia quality classifier")
301
  batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size (set this to smaller value if this space crashes.)")
 
316
  texts_df = gr.DataFrame(visible=False)
317
  gr_check_btn.click(
318
  run_quality_check,
319
+ inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, batch_size, num_examples],
320
  outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
321
  )
322
 
323
+ # gr.Markdown("""## Compute text quality measures
324
+ # * proportion of non-ascii characters
325
+ # * #TODO""")
326
+ # gr_ascii_btn = gr.Button("Data measures")
327
+ # non_ascii_hist = gr.Plot()
328
+ #
329
+ # gr_ascii_btn.click(non_ascii_check, inputs=[texts_df, text_column], outputs=[non_ascii_hist])
330
 
331
  gr.Markdown("## Explore toxicity")
332
  checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
 
337
  toxicity_df = gr.DataFrame()
338
  gr_toxicity_btn.click(
339
  call_perspective_api,
340
+ inputs=[texts_df, text_column_dropdown, checkbox],
341
  outputs=[toxicity_progress_bar, toxicity_hist, toxicity_df]
342
  )
343