Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
•
bae8f11
1
Parent(s):
b5f94b5
refactor
Browse files
- prep_viewer_data.py +3 -0
prep_viewer_data.py
CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import list_datasets
|
|
7 |
from tqdm import tqdm
|
8 |
from tqdm.asyncio import tqdm_asyncio
|
9 |
|
|
|
10 |
# Initialize the HTTP client
|
11 |
client = httpx.AsyncClient(timeout=60, http2=True)
|
12 |
|
@@ -115,6 +116,8 @@ async def prep_data(sample_size=200_000, min_likes=1):
|
|
115 |
df = pl.read_parquet(
|
116 |
"hf://datasets/davanstrien/dataset-viewer-descriptions-processed/data/train-00000-of-00001.parquet"
|
117 |
)
|
|
|
|
|
118 |
in_train_or_test = set(df["dataset_id"].unique().to_list())
|
119 |
|
120 |
# Get all datasets
|
|
|
7 |
from tqdm import tqdm
|
8 |
from tqdm.asyncio import tqdm_asyncio
|
9 |
|
10 |
+
|
11 |
# Initialize the HTTP client
|
12 |
client = httpx.AsyncClient(timeout=60, http2=True)
|
13 |
|
|
|
116 |
df = pl.read_parquet(
|
117 |
"hf://datasets/davanstrien/dataset-viewer-descriptions-processed/data/train-00000-of-00001.parquet"
|
118 |
)
|
119 |
+
# Remove datasets that are already in the train or test set; this filter can be removed later once the model works okay.
|
120 |
+
|
121 |
in_train_or_test = set(df["dataset_id"].unique().to_list())
|
122 |
|
123 |
# Get all datasets
|