polinaeterna HF staff committed on
Commit
8d6975b
·
1 Parent(s): 2bd0078
Files changed (1) hide show
  1. app.py +16 -14
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import polars as pl
3
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
4
  import torch
5
- import spaces
6
  from torch import nn
7
  from transformers import AutoModel, AutoTokenizer, AutoConfig
8
  from huggingface_hub import PyTorchModelHubMixin
@@ -31,7 +31,7 @@ model = QualityModel.from_pretrained("nvidia/quality-classifier-deberta").to(dev
31
  model.eval()
32
 
33
 
34
- @spaces.GPU
35
  def predict(texts: list[str]):
36
  inputs = tokenizer(
37
  texts, return_tensors="pt", padding="longest", truncation=True
@@ -46,26 +46,26 @@ def predict(texts: list[str]):
46
 
47
  def run_quality_check(dataset, column, n_samples):
48
  config = "default"
49
- data = pl.read_parquet(f"hf://datasets/{dataset}@parquet~/{config}/train/0000.parquet", columns=[column])
50
- texts = data[column].tolist()
51
  predictions = predict(texts[:n_samples])
52
- return pd.DataFrame({"quality": predictions}).value_counts()
53
 
54
 
55
  with gr.Blocks() as demo:
56
  gr.Markdown("# 💫 Dataset Quality Checker 💫")
57
- gr_dataset_name = HuggingfaceHubSearch(
58
  label="Hub Dataset ID",
59
  placeholder="Search for dataset id on Huggingface",
60
  search_type="dataset",
61
  value="fka/awesome-chatgpt-prompts",
62
  )
63
- dataset_name = HuggingfaceHubSearch(
64
- label="Hub Dataset ID",
65
- placeholder="Search for dataset id on Huggingface",
66
- search_type="dataset",
67
- value="HuggingFaceFW/fineweb",
68
- )
69
  # config_name = "default" # TODO: user input
70
  @gr.render(inputs=dataset_name)
71
  def embed(name):
@@ -82,6 +82,8 @@ with gr.Blocks() as demo:
82
  n_samples = gr.Number(label="Num first samples to run check")
83
  gr_check_btn = gr.Button("Check Dataset")
84
  # plot = gr.BarPlot()
85
- df = gr.DataFrame(visible=False)
86
  gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, n_samples], outputs=[df])
87
- gr.BarPlot(df)
 
 
 
2
  import polars as pl
3
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
4
  import torch
5
+ # import spaces
6
  from torch import nn
7
  from transformers import AutoModel, AutoTokenizer, AutoConfig
8
  from huggingface_hub import PyTorchModelHubMixin
 
31
  model.eval()
32
 
33
 
34
+ # @spaces.GPU
35
  def predict(texts: list[str]):
36
  inputs = tokenizer(
37
  texts, return_tensors="pt", padding="longest", truncation=True
 
46
 
47
  def run_quality_check(dataset, column, n_samples):
48
  config = "default"
49
+ data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
50
+ texts = data[column].to_list()
51
  predictions = predict(texts[:n_samples])
52
+ return pd.DataFrame({"quality": predictions})
53
 
54
 
55
  with gr.Blocks() as demo:
56
  gr.Markdown("# 💫 Dataset Quality Checker 💫")
57
+ dataset_name = HuggingfaceHubSearch(
58
  label="Hub Dataset ID",
59
  placeholder="Search for dataset id on Huggingface",
60
  search_type="dataset",
61
  value="fka/awesome-chatgpt-prompts",
62
  )
63
+ # dataset_name = HuggingfaceHubSearch(
64
+ # label="Hub Dataset ID",
65
+ # placeholder="Search for dataset id on Huggingface",
66
+ # search_type="dataset",
67
+ # value="HuggingFaceFW/fineweb",
68
+ # )
69
  # config_name = "default" # TODO: user input
70
  @gr.render(inputs=dataset_name)
71
  def embed(name):
 
82
  n_samples = gr.Number(label="Num first samples to run check")
83
  gr_check_btn = gr.Button("Check Dataset")
84
  # plot = gr.BarPlot()
85
+ df = gr.DataFrame()
86
  gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, n_samples], outputs=[df])
87
+ # gr.BarPlot(df)
88
+
89
+ demo.launch()