Spaces: Running on Zero

Commit 8d6975b · 1 Parent(s): 2bd0078
update
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import polars as pl
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 import torch
-import spaces
+# import spaces
 from torch import nn
 from transformers import AutoModel, AutoTokenizer, AutoConfig
 from huggingface_hub import PyTorchModelHubMixin
@@ -31,7 +31,7 @@ model = QualityModel.from_pretrained("nvidia/quality-classifier-deberta").to(dev
 model.eval()
 
 
-@spaces.GPU
+# @spaces.GPU
 def predict(texts: list[str]):
     inputs = tokenizer(
         texts, return_tensors="pt", padding="longest", truncation=True
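This hunk pairs with the import change above: both `import spaces` and the `@spaces.GPU` decorator on `predict` are commented out, so inference runs on whatever device the script already selected instead of inside a ZeroGPU-allocated call. For reference, a minimal sketch of keeping the same file runnable with or without the `spaces` package follows; it is not what this commit does, and the `gpu_decorator` fallback name is made up for illustration.

import torch

try:
    import spaces  # available on Hugging Face Spaces; provides the ZeroGPU decorator
    gpu_decorator = spaces.GPU
except ImportError:
    # Local or CPU-only run: substitute a decorator that does nothing,
    # so the decorated function below stays unchanged.
    def gpu_decorator(func=None, **kwargs):
        if func is None:
            return lambda f: f
        return func

device = "cuda" if torch.cuda.is_available() else "cpu"

@gpu_decorator
def predict(texts: list[str]):
    ...  # tokenizer/model call as in app.py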
@@ -46,26 +46,26 @@ def predict(texts: list[str]):
 
 def run_quality_check(dataset, column, n_samples):
     config = "default"
-    data = pl.read_parquet(f"hf://datasets/{dataset}
-    texts = data[column].
+    data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
+    texts = data[column].to_list()
     predictions = predict(texts[:n_samples])
-    return pd.DataFrame({"quality": predictions})
+    return pd.DataFrame({"quality": predictions})
 
 
 with gr.Blocks() as demo:
     gr.Markdown("# 💫 Dataset Quality Checker 💫")
-
+    dataset_name = HuggingfaceHubSearch(
         label="Hub Dataset ID",
         placeholder="Search for dataset id on Huggingface",
         search_type="dataset",
         value="fka/awesome-chatgpt-prompts",
     )
-    dataset_name = HuggingfaceHubSearch(
-
-
-
-
-    )
+    # dataset_name = HuggingfaceHubSearch(
+    #     label="Hub Dataset ID",
+    #     placeholder="Search for dataset id on Huggingface",
+    #     search_type="dataset",
+    #     value="HuggingFaceFW/fineweb",
+    # )
     # config_name = "default" # TODO: user input
     @gr.render(inputs=dataset_name)
     def embed(name):
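The rewritten `run_quality_check` reads only the requested column from the first shard of the dataset's auto-converted Parquet export (the `refs/convert/parquet` branch, written with the `@~parquet` revision shorthand) directly from the Hub with Polars, converts it to a Python list, and returns the predictions as a pandas DataFrame (the `pd` alias is assumed to be imported elsewhere in app.py). A standalone sketch of just the loading step is below; the dataset and column values are examples, not taken from the commit, and datasets with several shards or without a `train` split would need a different path.

import polars as pl

dataset = "fka/awesome-chatgpt-prompts"  # default value of the search box above
config = "default"
column = "prompt"                        # example column name, chosen for illustration

# Read only the requested column from the first shard of the auto-converted Parquet data.
data = pl.read_parquet(
    f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet",
    columns=[column],
)
texts = data[column].to_list()
print(len(texts), texts[:2])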
@@ -82,6 +82,8 @@ with gr.Blocks() as demo:
         n_samples = gr.Number(label="Num first samples to run check")
         gr_check_btn = gr.Button("Check Dataset")
         # plot = gr.BarPlot()
-        df = gr.DataFrame(
+        df = gr.DataFrame()
         gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, n_samples], outputs=[df])
-    gr.BarPlot(df)
+        # gr.BarPlot(df)
+
+demo.launch()
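This last hunk completes the UI wiring: `df` becomes a plain `gr.DataFrame()` output component, the `gr.BarPlot(df)` line is commented out, and `demo.launch()` is added at module level. The click event follows the usual Blocks pattern: the button calls `run_quality_check` with the three input components and writes the returned table into `df`. A self-contained sketch of that pattern, using plain Textboxes and a stub check function in place of the Space's real components and classifier:

import gradio as gr
import pandas as pd

def run_quality_check_stub(dataset, column, n_samples):
    # Stand-in for the DeBERTa quality classifier: label every sampled row "Medium".
    return pd.DataFrame({"quality": ["Medium"] * int(n_samples)})

with gr.Blocks() as demo:
    dataset_name = gr.Textbox(label="Hub Dataset ID", value="fka/awesome-chatgpt-prompts")
    text_column = gr.Textbox(label="Text column", value="prompt")
    n_samples = gr.Number(label="Num first samples to run check", value=5)
    gr_check_btn = gr.Button("Check Dataset")
    df = gr.DataFrame()
    # Clicking the button runs the check and fills the DataFrame component with its return value.
    gr_check_btn.click(
        run_quality_check_stub,
        inputs=[dataset_name, text_column, n_samples],
        outputs=[df],
    )

demo.launch()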