Spaces:

rguo123
/

atlas-map

Running

Richard Guo committed on Jul 11, 2023

Commit

a2483b1

1 Parent(s): de5bc26

use sample instead of head

Files changed (1) hide show

build_map.py CHANGED Viewed

@@ -18,6 +18,7 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
     sample = pd.DataFrame(dataset.shuffle(seed=42).take(n_samples))
     features = dataset.features
     numeric_fields = []
     string_fields = []
     bool_fields = []
@@ -71,6 +72,16 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
         else:
             uncategorized_fields.append(field)
     return features, \
            numeric_fields, \
@@ -80,7 +91,8 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
            label_fields, \
            categorical_fields, \
            datetime_fields, \
-           uncategorized_fields
 def load_dataset_and_metadata(dataset_name,
@@ -139,21 +151,11 @@ def upload_dataset_to_atlas(dataset_dict,
     label_fields, \
     categorical_fields, \
     datetime_fields, \
-    uncategorized_fields = get_datum_fields(dataset_dict)
-    # return longest string field from 5 samples
-    head = dataset_dict["head"]
     if indexed_field is None:
-        longest_length = 0
-        for field in string_fields:
-            length = 0
-            for i in range(len(head)):
-                if head[field][i]:
-                    length += len(str(head[field][i]).split())
-            if length > longest_length:
-                longest_length = length
-                indexed_field = field
     topic_label_field = None
     if modality == "embedding":

     sample = pd.DataFrame(dataset.shuffle(seed=42).take(n_samples))
     features = dataset.features
+    indexable_field = None
     numeric_fields = []
     string_fields = []
     bool_fields = []
         else:
             uncategorized_fields.append(field)
+    longest_length = 0
+    for field in string_fields:
+        length = 0
+        for i in range(len(sample)):
+            if sample[field][i]:
+                length += len(str(sample[field][i]).split())
+        if length > longest_length:
+            longest_length = length
+            indexable_field = field
     return features, \
            numeric_fields, \
            label_fields, \
            categorical_fields, \
            datetime_fields, \
+           uncategorized_fields, \
+           indexable_field
 def load_dataset_and_metadata(dataset_name,
     label_fields, \
     categorical_fields, \
     datetime_fields, \
+    uncategorized_fields, \
+    indexable_field = get_datum_fields(dataset_dict)
     if indexed_field is None:
+        indexed_field = indexable_field
     topic_label_field = None
     if modality == "embedding":