Richard Guo committed
Commit: 19a3899 · 1 Parent(s): 81aaa4e

limit datum upload to 30k

Files changed: build_map.py (+13 -3)

build_map.py CHANGED
@@ -115,7 +115,8 @@ def upload_dataset_to_atlas(dataset_dict,
                             indexed_field = None,
                             modality=None,
                             organization_name=None,
-                            wait_for_map=True):
+                            wait_for_map=True,
+                            datum_limit=30000):
 
     if modality is None:
         modality = "text"
@@ -124,7 +125,7 @@ def upload_dataset_to_atlas(dataset_dict,
     unique_id_field_name = "atlas_datum_id"
 
     if project_name is None:
-        project_name = dataset_dict["name"].replace("/", "--")
+        project_name = dataset_dict["name"].replace("/", "--") + "--hf-atlas-map"
 
     desc = f"Config: {dataset_dict['config']}"
 
@@ -169,13 +170,22 @@ def upload_dataset_to_atlas(dataset_dict,
     batch_size = 1000
     batched_texts = []
 
+    allow_upload = True
+
     for split in dataset_dict["splits"]:
 
+        if not allow_upload:
+            break
+
         dataset = load_dataset(dataset_dict["name"], dataset_dict["config"], split = split, streaming=True)
 
         for i, ex in tqdm(enumerate(dataset)):
             if i % 10000 == 0:
                 time.sleep(2)
+            if i == datum_limit:
+                print("Datum upload limited to 30,000 points. Stopping upload...")
+                allow_upload = False
+                break
 
             data_to_add = {"split": split, unique_id_field_name: f"{split}_{i}"}
 
@@ -245,4 +255,4 @@ if __name__ == "__main__":
     project_name = "huggingface_auto_upload_test-dolly-15k"
 
     dataset_dict = load_dataset_and_metadata(dataset_name)
-    print(upload_dataset_to_atlas(dataset_dict, project_name=project_name))
+    print(upload_dataset_to_atlas(dataset_dict, project_name=project_name))
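For reference, a minimal stand-alone sketch of the capping pattern this commit introduces. The names datum_limit and allow_upload come from the diff above; iter_limited, load_split, and the in-memory fake_splits are hypothetical stand-ins for the streamed Hugging Face dataset, not code from the repository.

# Minimal sketch of the capped-upload loop (assumption: illustration only,
# not build_map.py itself). `datum_limit` and `allow_upload` mirror the diff;
# the data source is a hypothetical stand-in for the streamed splits.
def iter_limited(splits, load_split, datum_limit=30000):
    """Yield (split, index, example) and stop everything once any split hits datum_limit."""
    allow_upload = True
    for split in splits:
        if not allow_upload:
            break
        for i, ex in enumerate(load_split(split)):
            if i == datum_limit:
                print(f"Datum upload limited to {datum_limit:,} points. Stopping upload...")
                allow_upload = False
                break
            yield split, i, ex

# Usage with a small in-memory stand-in (datum_limit lowered to 3 for the demo):
fake_splits = {"train": range(5), "test": range(5)}
for split, i, ex in iter_limited(fake_splits, lambda s: fake_splits[s], datum_limit=3):
    print(split, i, ex)

As in the diff, the counter resets per split, but hitting the limit in any one split also stops the remaining splits via the allow_upload flag.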