Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
b858233
1
Parent(s):
4bc0ae7
fetch parquet filename via dataset-viewer api
Browse files
app.py
CHANGED
@@ -81,20 +81,28 @@ def plot_and_df(texts, preds):
|
|
81 |
)
|
82 |
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
@spaces.GPU
|
85 |
def run_quality_check(dataset, config, split, column, batch_size, num_examples):
|
86 |
logging.info(f"Fetching data for {dataset=} {config=} {split=} {column=}")
|
87 |
try:
|
88 |
-
|
89 |
-
except
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
logging.info("Data fetched.")
|
99 |
|
100 |
data_sample = data.sample(num_examples, seed=16) if data.shape[0] > num_examples else data
|
|
|
81 |
)
|
82 |
|
83 |
|
84 |
+
def get_first_parquet_filename(dataset, config, split):
    """Return the path of the first parquet file listed for a dataset split.

    Queries the dataset-viewer ``/parquet`` endpoint for ``dataset`` and
    ``config``, takes the first listed file whose ``split`` field equals
    ``split``, and returns the last three "/"-separated segments of its URL
    joined back together with "/".

    Raises:
        ValueError: if the response contains an ``"error"`` key, or if no
            parquet file is listed for the requested split.
    """
    from urllib.parse import urlencode

    # Encode the query so values containing "&", "#", spaces, etc. cannot
    # corrupt the request.
    query = urlencode({"dataset": dataset, "config": config})
    parquet_resp = session.get(
        f"https://datasets-server.huggingface.co/parquet?{query}", timeout=3
    ).json()
    if "error" in parquet_resp:
        raise ValueError(parquet_resp["error"])

    # next() with a default avoids an opaque "list index out of range" when
    # the split has no parquet files.
    first_parquet_file = next(
        (file for file in parquet_resp["parquet_files"] if file["split"] == split),
        None,
    )
    if first_parquet_file is None:
        raise ValueError(
            f"No parquet file found for {dataset=} {config=} {split=}"
        )
    return "/".join(first_parquet_file["url"].split("/")[-3:])
|
90 |
+
|
91 |
+
|
92 |
@spaces.GPU
|
93 |
def run_quality_check(dataset, config, split, column, batch_size, num_examples):
|
94 |
logging.info(f"Fetching data for {dataset=} {config=} {split=} {column=}")
|
95 |
try:
|
96 |
+
filename = get_first_parquet_filename(dataset, config, split)
|
97 |
+
except Exception as error:
|
98 |
+
yield f"β {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),
|
99 |
+
return
|
100 |
+
|
101 |
+
try:
|
102 |
+
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column])
|
103 |
+
except Exception as error:
|
104 |
+
yield f"β {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),
|
105 |
+
return
|
106 |
logging.info("Data fetched.")
|
107 |
|
108 |
data_sample = data.sample(num_examples, seed=16) if data.shape[0] > num_examples else data
|