Richard Guo committed
Commit 19a3899 · 1 Parent(s): 81aaa4e

limit datum upload to 30k

Files changed (1)
  1. build_map.py +13 -3
build_map.py CHANGED
@@ -115,7 +115,8 @@ def upload_dataset_to_atlas(dataset_dict,
                             indexed_field = None,
                             modality=None,
                             organization_name=None,
-                            wait_for_map=True):
+                            wait_for_map=True,
+                            datum_limit=30000):
 
     if modality is None:
         modality = "text"
@@ -124,7 +125,7 @@ def upload_dataset_to_atlas(dataset_dict,
     unique_id_field_name = "atlas_datum_id"
 
     if project_name is None:
-        project_name = dataset_dict["name"].replace("/", "--")
+        project_name = dataset_dict["name"].replace("/", "--") + "--hf-atlas-map"
 
     desc = f"Config: {dataset_dict['config']}"
 
@@ -169,13 +170,22 @@ def upload_dataset_to_atlas(dataset_dict,
     batch_size = 1000
     batched_texts = []
 
+    allow_upload = True
+
     for split in dataset_dict["splits"]:
 
+        if not allow_upload:
+            break
+
         dataset = load_dataset(dataset_dict["name"], dataset_dict["config"], split = split, streaming=True)
 
         for i, ex in tqdm(enumerate(dataset)):
             if i % 10000 == 0:
                 time.sleep(2)
+            if i == datum_limit:
+                print("Datum upload limited to 30,000 points. Stopping upload...")
+                allow_upload = False
+                break
 
             data_to_add = {"split": split, unique_id_field_name: f"{split}_{i}"}
 
@@ -245,4 +255,4 @@ if __name__ == "__main__":
     project_name = "huggingface_auto_upload_test-dolly-15k"
 
     dataset_dict = load_dataset_and_metadata(dataset_name)
-    print(upload_dataset_to_atlas(dataset_dict, project_name=project_name))
+    print(upload_dataset_to_atlas(dataset_dict, project_name=project_name))
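
Taken out of context, the early-exit pattern this commit adds looks like the sketch below. datum_limit, allow_upload, and the per-split loop mirror the diff; stream_split is a hypothetical stand-in for the streaming load_dataset(...) call, and the counter stands in for the batching into batched_texts that build_map.py actually does. One deliberate difference: the sketch interpolates datum_limit into the stop message, whereas the diff hardcodes "30,000 points".

from itertools import count

def stream_split(split):
    # Hypothetical stand-in for load_dataset(name, config, split=split,
    # streaming=True): yields an endless stream of example dicts.
    for i in count():
        yield {"split": split, "idx": i}

def capped_upload(splits, datum_limit=30000):
    # Mirrors the logic added in this commit: enumerate() restarts at 0 for
    # each split, so the check is per split, but the first split that reaches
    # the limit flips allow_upload and aborts the remaining splits too.
    allow_upload = True
    uploaded = 0
    for split in splits:
        if not allow_upload:
            break
        for i, ex in enumerate(stream_split(split)):
            if i == datum_limit:
                print(f"Datum upload limited to {datum_limit:,} points. Stopping upload...")
                allow_upload = False
                break
            uploaded += 1  # stand-in for appending ex to batched_texts
    return uploaded

print(capped_upload(["train", "validation"], datum_limit=5))  # stops at 5, skips "validation"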
 
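
With the new keyword argument in place, a caller can override the 30,000 default; the __main__ block above keeps the default by omitting it. A hypothetical invocation, reusing the dataset_name and project_name variables already defined in __main__ (the 5,000 cap is an arbitrary illustration):

dataset_dict = load_dataset_and_metadata(dataset_name)
print(upload_dataset_to_atlas(dataset_dict,
                              project_name=project_name,
                              datum_limit=5000))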