Richard Guo
committed on
Commit
·
a2483b1
1
Parent(s):
de5bc26
use sample instead of head
Browse files - build_map.py +16 -14
build_map.py
CHANGED
@@ -18,6 +18,7 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
|
|
18 |
sample = pd.DataFrame(dataset.shuffle(seed=42).take(n_samples))
|
19 |
features = dataset.features
|
20 |
|
|
|
21 |
numeric_fields = []
|
22 |
string_fields = []
|
23 |
bool_fields = []
|
@@ -71,6 +72,16 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
|
|
71 |
|
72 |
else:
|
73 |
uncategorized_fields.append(field)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
return features, \
|
76 |
numeric_fields, \
|
@@ -80,7 +91,8 @@ def get_datum_fields(dataset_dict, n_samples = 100, unique_cutoff=20):
|
|
80 |
label_fields, \
|
81 |
categorical_fields, \
|
82 |
datetime_fields, \
|
83 |
-
uncategorized_fields
|
|
|
84 |
|
85 |
|
86 |
def load_dataset_and_metadata(dataset_name,
|
@@ -139,21 +151,11 @@ def upload_dataset_to_atlas(dataset_dict,
|
|
139 |
label_fields, \
|
140 |
categorical_fields, \
|
141 |
datetime_fields, \
|
142 |
-
uncategorized_fields
|
143 |
-
|
144 |
|
145 |
-
# return longest string field from 5 samples
|
146 |
-
head = dataset_dict["head"]
|
147 |
if indexed_field is None:
|
148 |
-
|
149 |
-
for field in string_fields:
|
150 |
-
length = 0
|
151 |
-
for i in range(len(head)):
|
152 |
-
if head[field][i]:
|
153 |
-
length += len(str(head[field][i]).split())
|
154 |
-
if length > longest_length:
|
155 |
-
longest_length = length
|
156 |
-
indexed_field = field
|
157 |
|
158 |
topic_label_field = None
|
159 |
if modality == "embedding":
|
|
|
18 |
sample = pd.DataFrame(dataset.shuffle(seed=42).take(n_samples))
|
19 |
features = dataset.features
|
20 |
|
21 |
+
indexable_field = None
|
22 |
numeric_fields = []
|
23 |
string_fields = []
|
24 |
bool_fields = []
|
|
|
72 |
|
73 |
else:
|
74 |
uncategorized_fields.append(field)
|
75 |
+
|
76 |
+
longest_length = 0
|
77 |
+
for field in string_fields:
|
78 |
+
length = 0
|
79 |
+
for i in range(len(sample)):
|
80 |
+
if sample[field][i]:
|
81 |
+
length += len(str(sample[field][i]).split())
|
82 |
+
if length > longest_length:
|
83 |
+
longest_length = length
|
84 |
+
indexable_field = field
|
85 |
|
86 |
return features, \
|
87 |
numeric_fields, \
|
|
|
91 |
label_fields, \
|
92 |
categorical_fields, \
|
93 |
datetime_fields, \
|
94 |
+
uncategorized_fields, \
|
95 |
+
indexable_field
|
96 |
|
97 |
|
98 |
def load_dataset_and_metadata(dataset_name,
|
|
|
151 |
label_fields, \
|
152 |
categorical_fields, \
|
153 |
datetime_fields, \
|
154 |
+
uncategorized_fields, \
|
155 |
+
indexable_field = get_datum_fields(dataset_dict)
|
156 |
|
|
|
|
|
157 |
if indexed_field is None:
|
158 |
+
indexed_field = indexable_field
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
topic_label_field = None
|
161 |
if modality == "embedding":
|