Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -19,7 +19,7 @@ from transformers import AutoModel, AutoTokenizer, AutoConfig
|
|
19 |
from tqdm import tqdm
|
20 |
|
21 |
|
22 |
-
logging.basicConfig(level=logging.
|
23 |
|
24 |
|
25 |
session = requests.Session()
|
@@ -90,6 +90,7 @@ def run_quality_check(dataset, column, batch_size, num_examples):
|
|
90 |
config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
|
91 |
split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
|
92 |
iter(info_resp["dataset_info"][config]["splits"]))
|
|
|
93 |
try:
|
94 |
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
|
95 |
except pl.exceptions.ComputeError:
|
@@ -101,6 +102,7 @@ def run_quality_check(dataset, column, batch_size, num_examples):
|
|
101 |
except Exception as error:
|
102 |
yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),
|
103 |
return
|
|
|
104 |
|
105 |
texts = [text[:10000] for text in data[column].to_list()]
|
106 |
# texts_sample = data.sample(100, shuffle=True, seed=16).to_pandas()
|
|
|
19 |
from tqdm import tqdm
|
20 |
|
21 |
|
22 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
23 |
|
24 |
|
25 |
session = requests.Session()
|
|
|
90 |
config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
|
91 |
split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
|
92 |
iter(info_resp["dataset_info"][config]["splits"]))
|
93 |
+
logging.info(f"Fetching data for {dataset} {config} {split}")
|
94 |
try:
|
95 |
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
|
96 |
except pl.exceptions.ComputeError:
|
|
|
102 |
except Exception as error:
|
103 |
yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),
|
104 |
return
|
105 |
+
logging.info("Data fetched.")
|
106 |
|
107 |
texts = [text[:10000] for text in data[column].to_list()]
|
108 |
# texts_sample = data.sample(100, shuffle=True, seed=16).to_pandas()
|