Richard Guo committed on
Commit
a2483b1
·
1 Parent(s): de5bc26

use sample instead of head

Browse files
Files changed (1) hide show
  1. build_map.py +16 -14
build_map.py CHANGED
@@ -18,6 +18,7 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
18
  sample = pd.DataFrame(dataset.shuffle(seed=42).take(n_samples))
19
  features = dataset.features
20
 
 
21
  numeric_fields = []
22
  string_fields = []
23
  bool_fields = []
@@ -71,6 +72,16 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
71
 
72
  else:
73
  uncategorized_fields.append(field)
 
 
 
 
 
 
 
 
 
 
74
 
75
  return features, \
76
  numeric_fields, \
@@ -80,7 +91,8 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
80
  label_fields, \
81
  categorical_fields, \
82
  datetime_fields, \
83
- uncategorized_fields
 
84
 
85
 
86
  def load_dataset_and_metadata(dataset_name,
@@ -139,21 +151,11 @@ def upload_dataset_to_atlas(dataset_dict,
139
  label_fields, \
140
  categorical_fields, \
141
  datetime_fields, \
142
- uncategorized_fields = get_datum_fields(dataset_dict)
143
-
144
 
145
- # return longest string field from 5 samples
146
- head = dataset_dict["head"]
147
  if indexed_field is None:
148
- longest_length = 0
149
- for field in string_fields:
150
- length = 0
151
- for i in range(len(head)):
152
- if head[field][i]:
153
- length += len(str(head[field][i]).split())
154
- if length > longest_length:
155
- longest_length = length
156
- indexed_field = field
157
 
158
  topic_label_field = None
159
  if modality == "embedding":
 
18
  sample = pd.DataFrame(dataset.shuffle(seed=42).take(n_samples))
19
  features = dataset.features
20
 
21
+ indexable_field = None
22
  numeric_fields = []
23
  string_fields = []
24
  bool_fields = []
 
72
 
73
  else:
74
  uncategorized_fields.append(field)
75
+
76
+ longest_length = 0
77
+ for field in string_fields:
78
+ length = 0
79
+ for i in range(len(sample)):
80
+ if sample[field][i]:
81
+ length += len(str(sample[field][i]).split())
82
+ if length > longest_length:
83
+ longest_length = length
84
+ indexable_field = field
85
 
86
  return features, \
87
  numeric_fields, \
 
91
  label_fields, \
92
  categorical_fields, \
93
  datetime_fields, \
94
+ uncategorized_fields, \
95
+ indexable_field
96
 
97
 
98
  def load_dataset_and_metadata(dataset_name,
 
151
  label_fields, \
152
  categorical_fields, \
153
  datetime_fields, \
154
+ uncategorized_fields, \
155
+ indexable_field = get_datum_fields(dataset_dict)
156
 
 
 
157
  if indexed_field is None:
158
+ indexed_field = indexable_field
 
 
 
 
 
 
 
 
159
 
160
  topic_label_field = None
161
  if modality == "embedding":