lewtun HF staff committed on
Commit
5b19fc7
·
1 Parent(s): 7a3b085

Parse metadata

Browse files
Files changed (2) hide show
  1. app.py +65 -26
  2. utils.py +11 -4
app.py CHANGED
@@ -8,7 +8,8 @@ from datasets import get_dataset_config_names
8
  from dotenv import load_dotenv
9
  from huggingface_hub import list_datasets
10
 
11
- from utils import get_compatible_models, get_metadata, http_get, http_post
 
12
 
13
  if Path(".env").is_file():
14
  load_dotenv(".env")
@@ -29,6 +30,9 @@ TASK_TO_ID = {
29
  "summarization": 8,
30
  }
31
 
 
 
 
32
  ###########
33
  ### APP ###
34
  ###########
@@ -61,7 +65,11 @@ if metadata is None:
61
 
62
  with st.expander("Advanced configuration"):
63
  ## Select task
64
- selected_task = st.selectbox("Select a task", list(TASK_TO_ID.keys()))
 
 
 
 
65
  ### Select config
66
  configs = get_dataset_config_names(selected_dataset)
67
  selected_config = st.selectbox("Select a config", configs)
@@ -75,29 +83,25 @@ with st.expander("Advanced configuration"):
75
  if split["config"] == selected_config:
76
  split_names.append(split["split"])
77
 
78
- selected_split = st.selectbox("Select a split", split_names) # , index=split_names.index(eval_split))
 
 
 
 
79
 
80
- ## Show columns
81
  rows_resp = http_get(
82
  path="/rows",
83
  domain="https://datasets-preview.huggingface.tech",
84
  params={"dataset": selected_dataset, "config": selected_config, "split": selected_split},
85
  ).json()
86
  col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
87
- # splits = metadata[0]["splits"]
88
- # split_names = list(splits.values())
89
- # eval_split = splits.get("eval_split", split_names[0])
90
-
91
- # selected_split = st.selectbox("Select a split", split_names, index=split_names.index(eval_split))
92
-
93
- # TODO: add a function to handle the mapping task <--> column mapping
94
- # col_mapping = metadata[0]["col_mapping"]
95
- # col_names = list(col_mapping.keys())
96
 
97
  st.markdown("**Map your data columns**")
98
  col1, col2 = st.columns(2)
99
 
100
  # TODO: find a better way to layout these items
 
101
  col_mapping = {}
102
  if selected_task in ["binary_classification", "multi_class_classification"]:
103
  with col1:
@@ -108,9 +112,15 @@ with st.expander("Advanced configuration"):
108
  st.text("")
109
  st.markdown("`target` column")
110
  with col2:
111
- text_col = st.selectbox("This column should contain the text you want to classify", col_names)
 
 
 
 
112
  target_col = st.selectbox(
113
- "This column should contain the labels you want to assign to the text", col_names
 
 
114
  )
115
  col_mapping[text_col] = "text"
116
  col_mapping[target_col] = "target"
@@ -127,9 +137,12 @@ with st.expander("Advanced configuration"):
127
  tokens_col = st.selectbox(
128
  "This column should contain the parts of the text (as an array of tokens) you want to assign labels to",
129
  col_names,
 
130
  )
131
  tags_col = st.selectbox(
132
- "This column should contain the labels to associate to each part of the text", col_names
 
 
133
  )
134
  col_mapping[tokens_col] = "tokens"
135
  col_mapping[tags_col] = "tags"
@@ -143,9 +156,15 @@ with st.expander("Advanced configuration"):
143
  st.text("")
144
  st.markdown("`target` column")
145
  with col2:
146
- text_col = st.selectbox("This column should contain the text you want to translate", col_names)
 
 
 
 
147
  target_col = st.selectbox(
148
- "This column should contain an example translation of the source text", col_names
 
 
149
  )
150
  col_mapping[text_col] = "source"
151
  col_mapping[target_col] = "target"
@@ -159,8 +178,16 @@ with st.expander("Advanced configuration"):
159
  st.text("")
160
  st.markdown("`target` column")
161
  with col2:
162
- text_col = st.selectbox("This column should contain the text you want to summarize", col_names)
163
- target_col = st.selectbox("This column should contain an example summarization of the text", col_names)
 
 
 
 
 
 
 
 
164
  col_mapping[text_col] = "text"
165
  col_mapping[target_col] = "target"
166
 
@@ -183,16 +210,29 @@ with st.expander("Advanced configuration"):
183
  st.text("")
184
  st.markdown("`answers.answer_start` column")
185
  with col2:
186
- context_col = st.selectbox("This column should contain the question's context", col_names)
 
 
 
 
187
  question_col = st.selectbox(
188
- "This column should contain the question to be answered, given the context", col_names
 
 
189
  )
190
  answers_text_col = st.selectbox(
191
- "This column should contain example answers to the question, extracted from the context", col_names
 
 
 
 
192
  )
193
  answers_start_col = st.selectbox(
194
  "This column should contain the indices in the context of the first character of each answers.text",
195
  col_names,
 
 
 
196
  )
197
  col_mapping[context_col] = "context"
198
  col_mapping[question_col] = "question"
@@ -203,9 +243,8 @@ with st.form(key="form"):
203
 
204
  compatible_models = get_compatible_models(selected_task, selected_dataset)
205
 
206
- selected_models = st.multiselect(
207
- "Select the models you wish to evaluate", compatible_models
208
- )
209
  submit_button = st.form_submit_button("Make submission")
210
 
211
  if submit_button:
 
8
  from dotenv import load_dotenv
9
  from huggingface_hub import list_datasets
10
 
11
+ from utils import (get_compatible_models, get_key, get_metadata, http_get,
12
+ http_post)
13
 
14
  if Path(".env").is_file():
15
  load_dotenv(".env")
 
30
  "summarization": 8,
31
  }
32
 
33
+ supported_tasks = list(TASK_TO_ID.keys())
34
+
35
+
36
  ###########
37
  ### APP ###
38
  ###########
 
65
 
66
  with st.expander("Advanced configuration"):
67
  ## Select task
68
+ selected_task = st.selectbox(
69
+ "Select a task",
70
+ supported_tasks,
71
+ index=supported_tasks.index(metadata[0]["task_id"]) if metadata is not None else 0,
72
+ )
73
  ### Select config
74
  configs = get_dataset_config_names(selected_dataset)
75
  selected_config = st.selectbox("Select a config", configs)
 
83
  if split["config"] == selected_config:
84
  split_names.append(split["split"])
85
 
86
+ selected_split = st.selectbox(
87
+ "Select a split",
88
+ split_names,
89
+ index=split_names.index(metadata[0]["splits"]["eval_split"]) if metadata is not None else 0,
90
+ )
91
 
92
+ ## Select columns
93
  rows_resp = http_get(
94
  path="/rows",
95
  domain="https://datasets-preview.huggingface.tech",
96
  params={"dataset": selected_dataset, "config": selected_config, "split": selected_split},
97
  ).json()
98
  col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
 
 
 
 
 
 
 
 
 
99
 
100
  st.markdown("**Map your data columns**")
101
  col1, col2 = st.columns(2)
102
 
103
  # TODO: find a better way to layout these items
104
+ # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
105
  col_mapping = {}
106
  if selected_task in ["binary_classification", "multi_class_classification"]:
107
  with col1:
 
112
  st.text("")
113
  st.markdown("`target` column")
114
  with col2:
115
+ text_col = st.selectbox(
116
+ "This column should contain the text you want to classify",
117
+ col_names,
118
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "text")) if metadata is not None else 0,
119
+ )
120
  target_col = st.selectbox(
121
+ "This column should contain the labels you want to assign to the text",
122
+ col_names,
123
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
124
  )
125
  col_mapping[text_col] = "text"
126
  col_mapping[target_col] = "target"
 
137
  tokens_col = st.selectbox(
138
  "This column should contain the parts of the text (as an array of tokens) you want to assign labels to",
139
  col_names,
140
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "tokens")) if metadata is not None else 0,
141
  )
142
  tags_col = st.selectbox(
143
+ "This column should contain the labels to associate to each part of the text",
144
+ col_names,
145
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "tags")) if metadata is not None else 0,
146
  )
147
  col_mapping[tokens_col] = "tokens"
148
  col_mapping[tags_col] = "tags"
 
156
  st.text("")
157
  st.markdown("`target` column")
158
  with col2:
159
+ text_col = st.selectbox(
160
+ "This column should contain the text you want to translate",
161
+ col_names,
162
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "source")) if metadata is not None else 0,
163
+ )
164
  target_col = st.selectbox(
165
+ "This column should contain an example translation of the source text",
166
+ col_names,
167
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
168
  )
169
  col_mapping[text_col] = "source"
170
  col_mapping[target_col] = "target"
 
178
  st.text("")
179
  st.markdown("`target` column")
180
  with col2:
181
+ text_col = st.selectbox(
182
+ "This column should contain the text you want to summarize",
183
+ col_names,
184
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "text")) if metadata is not None else 0,
185
+ )
186
+ target_col = st.selectbox(
187
+ "This column should contain an example summarization of the text",
188
+ col_names,
189
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
190
+ )
191
  col_mapping[text_col] = "text"
192
  col_mapping[target_col] = "target"
193
 
 
210
  st.text("")
211
  st.markdown("`answers.answer_start` column")
212
  with col2:
213
+ context_col = st.selectbox(
214
+ "This column should contain the question's context",
215
+ col_names,
216
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "context")) if metadata is not None else 0,
217
+ )
218
  question_col = st.selectbox(
219
+ "This column should contain the question to be answered, given the context",
220
+ col_names,
221
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "question")) if metadata is not None else 0,
222
  )
223
  answers_text_col = st.selectbox(
224
+ "This column should contain example answers to the question, extracted from the context",
225
+ col_names,
226
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "answers.text"))
227
+ if metadata is not None
228
+ else 0,
229
  )
230
  answers_start_col = st.selectbox(
231
  "This column should contain the indices in the context of the first character of each answers.text",
232
  col_names,
233
+ index=col_names.index(get_key(metadata[0]["col_mapping"], "answers.answer_start"))
234
+ if metadata is not None
235
+ else 0,
236
  )
237
  col_mapping[context_col] = "context"
238
  col_mapping[question_col] = "question"
 
243
 
244
  compatible_models = get_compatible_models(selected_task, selected_dataset)
245
 
246
+ selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models)
247
+ print("Selected models:", selected_models)
 
248
  submit_button = st.form_submit_button("Make submission")
249
 
250
  if submit_button:
utils.py CHANGED
@@ -48,10 +48,9 @@ def http_get(path: str, domain: str, token: str = None, params: dict = None) ->
48
 
49
 
50
  def get_metadata(dataset_name: str) -> Union[Dict, None]:
51
- filt = DatasetFilter(dataset_name=dataset_name)
52
- data = api.list_datasets(filter=filt, full=True)
53
- if data[0].cardData is not None and "train-eval-index" in data[0].cardData.keys():
54
- return data[0].cardData["train-eval-index"]
55
  else:
56
  return None
57
 
@@ -63,3 +62,11 @@ def get_compatible_models(task, dataset_name):
63
  )
64
  compatible_models = api.list_models(filter=filt)
65
  return [model.modelId for model in compatible_models]
 
 
 
 
 
 
 
 
 
48
 
49
 
50
  def get_metadata(dataset_name: str) -> Union[Dict, None]:
51
+ data = requests.get(f"https://huggingface.co/api/datasets/{dataset_name}").json()
52
+ if data["cardData"] is not None and "train-eval-index" in data["cardData"].keys():
53
+ return data["cardData"]["train-eval-index"]
 
54
  else:
55
  return None
56
 
 
62
  )
63
  compatible_models = api.list_models(filter=filt)
64
  return [model.modelId for model in compatible_models]
65
+
66
+
67
def get_key(col_mapping, val, default="key doesn't exist"):
    """Return the first key in ``col_mapping`` whose value equals ``val``.

    Args:
        col_mapping: Mapping to search; it is walked with ``.items()``.
        val: Value to look for, compared with ``==``.
        default: Returned when no value matches. It defaults to the legacy
            sentinel string so existing two-argument callers are unaffected;
            pass e.g. ``None`` to tell a miss apart from a real key that
            happens to be named like the sentinel.

    Returns:
        The first matching key in the mapping's iteration order, or
        ``default`` if no value equals ``val``.
    """
    # next() stops at the first match, mirroring the early return of a loop.
    return next((key for key, value in col_mapping.items() if val == value), default)