yuanshengni commited on
Commit
6ed5ca9
·
1 Parent(s): af4a677
Files changed (5) hide show
  1. README.md +8 -1
  2. app.py +45 -336
  3. leaderboard/results.csv +25 -0
  4. requirements.txt +2 -0
  5. utils.py +49 -0
README.md CHANGED
@@ -10,4 +10,11 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
10
  license: mit
11
  ---
12
 
13
+ # TheoremQA Leaderboard
14
+
15
+ ## Space Description
16
+
17
+ - **Repository:** [TheoremQA](https://github.com/wenhuchen/TheoremQA)
18
+ - **Paper:** [2305.12524]
19
+ (https://arxiv.org/abs/2305.12524)
20
+ <!-- - **Point of Contact:** -->
app.py CHANGED
@@ -1,380 +1,89 @@
1
- __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
2
- import os
3
-
4
  import gradio as gr
5
  import pandas as pd
6
- import json
7
- import tempfile
8
-
9
- from constants import *
10
- from huggingface_hub import Repository
11
- HF_TOKEN = os.environ.get("HF_TOKEN")
12
-
13
- global data_component, filter_component
14
-
15
-
16
- def upload_file(files):
17
- file_paths = [file.name for file in files]
18
- return file_paths
19
-
20
- def add_new_eval(
21
- input_file,
22
- model_name_textbox: str,
23
- revision_name_textbox: str,
24
- model_link: str,
25
- ):
26
- if input_file is None:
27
- return "Error! Empty file!"
28
-
29
- upload_data=json.loads(input_file)
30
- submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
31
- submission_repo.git_pull()
32
- shutil.copyfile(CSV_DIR, os.path.join(SUBMISSION_NAME, f"{input_file}"))
33
-
34
- csv_data = pd.read_csv(CSV_DIR)
35
-
36
- if revision_name_textbox == '':
37
- col = csv_data.shape[0]
38
- model_name = model_name_textbox
39
- else:
40
- model_name = revision_name_textbox
41
- model_name_list = csv_data['Model Name (clickable)']
42
- name_list = [name.split(']')[0][1:] for name in model_name_list]
43
- if revision_name_textbox not in name_list:
44
- col = csv_data.shape[0]
45
- else:
46
- col = name_list.index(revision_name_textbox)
47
-
48
- if model_link == '':
49
- model_name = model_name # no url
50
- else:
51
- model_name = '[' + model_name + '](' + model_link + ')'
52
-
53
- # add new data
54
- new_data = [
55
- model_name
56
- ]
57
- for key in TASK_INFO:
58
- if key in upload_data:
59
- new_data.append(upload_data[key][0])
60
- else:
61
- new_data.append(0)
62
- csv_data.loc[col] = new_data
63
- csv_data = csv_data.to_csv(CSV_DIR, index=False)
64
- submission_repo.push_to_hub()
65
- return 0
66
-
67
- def get_normalized_df(df):
68
- # final_score = df.drop('name', axis=1).sum(axis=1)
69
- # df.insert(1, 'Overall Score', final_score)
70
- normalize_df = df.copy().fillna(0.0)
71
- for column in normalize_df.columns[1:]:
72
- min_val = NORMALIZE_DIC[column]['Min']
73
- max_val = NORMALIZE_DIC[column]['Max']
74
- normalize_df[column] = (normalize_df[column] - min_val) / (max_val - min_val)
75
- return normalize_df
76
-
77
- def calculate_selected_score(df, selected_columns):
78
- # selected_score = df[selected_columns].sum(axis=1)
79
- selected_QUALITY = [i for i in selected_columns if i in QUALITY_LIST]
80
- selected_SEMANTIC = [i for i in selected_columns if i in SEMANTIC_LIST]
81
- selected_quality_score = df[selected_QUALITY].sum(axis=1)/sum([DIM_WEIGHT[i] for i in selected_QUALITY])
82
- selected_semantic_score = df[selected_SEMANTIC].sum(axis=1)/sum([DIM_WEIGHT[i] for i in selected_SEMANTIC ])
83
- if selected_quality_score.isna().any().any() and selected_semantic_score.isna().any().any():
84
- selected_score = (selected_quality_score * QUALITY_WEIGHT + selected_semantic_score * SEMANTIC_WEIGHT) / (QUALITY_WEIGHT + SEMANTIC_WEIGHT)
85
- return selected_score.fillna(0.0)
86
- if selected_quality_score.isna().any().any():
87
- return selected_semantic_score
88
- if selected_semantic_score.isna().any().any():
89
- return selected_quality_score
90
- # print(selected_semantic_score,selected_quality_score )
91
- selected_score = (selected_quality_score * QUALITY_WEIGHT + selected_semantic_score * SEMANTIC_WEIGHT) / (QUALITY_WEIGHT + SEMANTIC_WEIGHT)
92
- return selected_score.fillna(0.0)
93
-
94
- def get_final_score(df, selected_columns):
95
- normalize_df = get_normalized_df(df)
96
- #final_score = normalize_df.drop('name', axis=1).sum(axis=1)
97
- for name in normalize_df.drop('Model Name (clickable)', axis=1):
98
- normalize_df[name] = normalize_df[name]*DIM_WEIGHT[name]
99
- quality_score = normalize_df[QUALITY_LIST].sum(axis=1)/sum([DIM_WEIGHT[i] for i in QUALITY_LIST])
100
- semantic_score = normalize_df[SEMANTIC_LIST].sum(axis=1)/sum([DIM_WEIGHT[i] for i in SEMANTIC_LIST ])
101
- final_score = (quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT) / (QUALITY_WEIGHT + SEMANTIC_WEIGHT)
102
- if 'Total Score' in df:
103
- df['Total Score'] = final_score
104
- else:
105
- df.insert(1, 'Total Score', final_score)
106
- if 'Semantic Score' in df:
107
- df['Semantic Score'] = semantic_score
108
- else:
109
- df.insert(2, 'Semantic Score', semantic_score)
110
- if 'Quality Score' in df:
111
- df['Quality Score'] = quality_score
112
- else:
113
- df.insert(3, 'Quality Score', quality_score)
114
- selected_score = calculate_selected_score(normalize_df, selected_columns)
115
- if 'Selected Score' in df:
116
- df['Selected Score'] = selected_score
117
- else:
118
- df.insert(1, 'Selected Score', selected_score)
119
- return df
120
-
121
-
122
- def get_final_score_quality(df, selected_columns):
123
- normalize_df = get_normalized_df(df)
124
- for name in normalize_df.drop('Model Name (clickable)', axis=1):
125
- normalize_df[name] = normalize_df[name]*DIM_WEIGHT[name]
126
- quality_score = normalize_df[QUALITY_TAB].sum(axis=1) / sum([DIM_WEIGHT[i] for i in QUALITY_TAB])
127
-
128
- if 'Quality Score' in df:
129
- df['Quality Score'] = quality_score
130
- else:
131
- df.insert(1, 'Quality Score', quality_score)
132
- # selected_score = normalize_df[selected_columns].sum(axis=1) / len(selected_columns)
133
- selected_score = normalize_df[selected_columns].sum(axis=1)/sum([DIM_WEIGHT[i] for i in selected_columns])
134
- if 'Selected Score' in df:
135
- df['Selected Score'] = selected_score
136
- else:
137
- df.insert(1, 'Selected Score', selected_score)
138
- return df
139
-
140
- def get_baseline_df():
141
- submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
142
- submission_repo.git_pull()
143
- df = pd.read_csv(CSV_DIR)
144
- df = get_final_score(df, checkbox_group.value)
145
- df = df.sort_values(by="Selected Score", ascending=False)
146
- present_columns = MODEL_INFO + checkbox_group.value
147
- df = df[present_columns]
148
- df = convert_scores_to_percentage(df)
149
- return df
150
-
151
- def get_baseline_df_quality():
152
- submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
153
- submission_repo.git_pull()
154
- df = pd.read_csv(QUALITY_DIR)
155
- df = get_final_score_quality(df, checkbox_group_quality.value)
156
- df = df.sort_values(by="Selected Score", ascending=False)
157
- present_columns = MODEL_INFO_TAB_QUALITY + checkbox_group_quality.value
158
- df = df[present_columns]
159
- df = convert_scores_to_percentage(df)
160
- return df
161
-
162
- def get_all_df(selected_columns, dir=CSV_DIR):
163
- submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
164
- submission_repo.git_pull()
165
- df = pd.read_csv(dir)
166
- df = get_final_score(df, selected_columns)
167
- df = df.sort_values(by="Selected Score", ascending=False)
168
- return df
169
-
170
- def get_all_df_quality(selected_columns, dir=QUALITY_DIR):
171
- submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
172
- submission_repo.git_pull()
173
- df = pd.read_csv(dir)
174
- df = get_final_score_quality(df, selected_columns)
175
- df = df.sort_values(by="Selected Score", ascending=False)
176
- return df
177
-
178
-
179
- def convert_scores_to_percentage(df):
180
- # 对DataFrame中的每一列(除了'name'列)进行操作
181
- for column in df.columns[1:]: # 假设第一列是'name'
182
- df[column] = round(df[column] * 100,2) # 将分数转换为百分数
183
- df[column] = df[column].astype(str) + '%'
184
- return df
185
-
186
- def choose_all_quailty():
187
- return gr.update(value=QUALITY_LIST)
188
-
189
- def choose_all_semantic():
190
- return gr.update(value=SEMANTIC_LIST)
191
-
192
- def disable_all():
193
- return gr.update(value=[])
194
-
195
- def enable_all():
196
- return gr.update(value=TASK_INFO)
197
-
198
- def on_filter_model_size_method_change(selected_columns):
199
- updated_data = get_all_df(selected_columns, CSV_DIR)
200
- #print(updated_data)
201
- # columns:
202
- selected_columns = [item for item in TASK_INFO if item in selected_columns]
203
- present_columns = MODEL_INFO + selected_columns
204
- updated_data = updated_data[present_columns]
205
- updated_data = updated_data.sort_values(by="Selected Score", ascending=False)
206
- updated_data = convert_scores_to_percentage(updated_data)
207
- updated_headers = present_columns
208
- update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
209
- # print(updated_data,present_columns,update_datatype)
210
- filter_component = gr.components.Dataframe(
211
- value=updated_data,
212
- headers=updated_headers,
213
- type="pandas",
214
- datatype=update_datatype,
215
- interactive=False,
216
- visible=True,
217
- )
218
- return filter_component#.value
219
-
220
- def on_filter_model_size_method_change_quality(selected_columns):
221
- updated_data = get_all_df_quality(selected_columns, QUALITY_DIR)
222
- #print(updated_data)
223
- # columns:
224
- selected_columns = [item for item in QUALITY_TAB if item in selected_columns]
225
- present_columns = MODEL_INFO_TAB_QUALITY + selected_columns
226
- updated_data = updated_data[present_columns]
227
- updated_data = updated_data.sort_values(by="Selected Score", ascending=False)
228
- updated_data = convert_scores_to_percentage(updated_data)
229
- updated_headers = present_columns
230
- update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
231
- # print(updated_data,present_columns,update_datatype)
232
- filter_component = gr.components.Dataframe(
233
- value=updated_data,
234
- headers=updated_headers,
235
- type="pandas",
236
- datatype=update_datatype,
237
- interactive=False,
238
- visible=True,
239
- )
240
- return filter_component#.value
241
 
 
242
 
243
  block = gr.Blocks()
244
 
245
-
246
  with block:
247
  gr.Markdown(
248
  LEADERBORAD_INTRODUCTION
249
  )
250
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
251
  # Table 0
252
- with gr.TabItem("📊 VBench", elem_id="vbench-tab-table", id=1):
253
  with gr.Row():
254
  with gr.Accordion("Citation", open=False):
255
  citation_button = gr.Textbox(
256
  value=CITATION_BUTTON_TEXT,
257
  label=CITATION_BUTTON_LABEL,
258
  elem_id="citation-button",
259
- lines=10,
260
  )
261
-
262
  gr.Markdown(
263
  TABLE_INTRODUCTION
264
  )
265
- with gr.Row():
266
- with gr.Column(scale=0.2):
267
- choosen_q = gr.Button("Select Quality Dimensions")
268
- choosen_s = gr.Button("Select Semantic Dimensions")
269
- # enable_b = gr.Button("Select All")
270
- disable_b = gr.Button("Deselect All")
271
 
272
- with gr.Column(scale=0.8):
273
- # selection for column part:
274
- checkbox_group = gr.CheckboxGroup(
275
- choices=TASK_INFO,
276
- value=DEFAULT_INFO,
277
- label="Evaluation Dimension",
278
- interactive=True,
279
- )
280
-
281
- data_component = gr.components.Dataframe(
282
- value=get_baseline_df,
283
  headers=COLUMN_NAMES,
284
  type="pandas",
285
  datatype=DATA_TITILE_TYPE,
286
  interactive=False,
287
  visible=True,
288
  )
289
-
290
- choosen_q.click(choose_all_quailty, inputs=None, outputs=[checkbox_group]).then(fn=on_filter_model_size_method_change, inputs=[ checkbox_group], outputs=data_component)
291
- choosen_s.click(choose_all_semantic, inputs=None, outputs=[checkbox_group]).then(fn=on_filter_model_size_method_change, inputs=[ checkbox_group], outputs=data_component)
292
- # enable_b.click(enable_all, inputs=None, outputs=[checkbox_group]).then(fn=on_filter_model_size_method_change, inputs=[ checkbox_group], outputs=data_component)
293
- disable_b.click(disable_all, inputs=None, outputs=[checkbox_group]).then(fn=on_filter_model_size_method_change, inputs=[ checkbox_group], outputs=data_component)
294
- checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[ checkbox_group], outputs=data_component)
295
 
296
- with gr.TabItem("Video Quaity", elem_id="vbench-tab-table", id=2):
297
- with gr.Accordion("INSTRUCTION", open=False):
298
- citation_button = gr.Textbox(
299
- value=QUALITY_CLAIM_TEXT,
300
- label="",
301
- elem_id="quality-button",
302
- lines=2,
303
- )
304
- with gr.Row():
305
- with gr.Column(scale=1.0):
306
- # selection for column part:
307
- checkbox_group_quality = gr.CheckboxGroup(
308
- choices=QUALITY_TAB,
309
- value=QUALITY_TAB,
310
- label="Evaluation Quality Dimension",
311
- interactive=True,
312
- )
313
-
314
- data_component_quality = gr.components.Dataframe(
315
- value=get_baseline_df_quality,
316
- headers=COLUMN_NAMES_QUALITY,
317
- type="pandas",
318
- datatype=DATA_TITILE_TYPE,
319
- interactive=False,
320
- visible=True,
321
- )
322
-
323
- checkbox_group_quality.change(fn=on_filter_model_size_method_change_quality, inputs=[checkbox_group_quality], outputs=data_component_quality)
324
-
325
- # table 2
326
- with gr.TabItem("📝 About", elem_id="mvbench-tab-table", id=3):
327
  gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
328
 
329
  # table 3
330
- with gr.TabItem("🚀 Submit here! ", elem_id="mvbench-tab-table", id=4):
331
- gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
332
 
333
- with gr.Row():
334
- gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
335
 
336
- with gr.Row():
337
- gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
338
 
339
- with gr.Row():
340
- with gr.Column():
341
- model_name_textbox = gr.Textbox(
342
- label="Model name", placeholder="LaVie"
343
- )
344
- revision_name_textbox = gr.Textbox(
345
- label="Revision Model Name", placeholder="LaVie"
346
- )
347
 
348
- with gr.Column():
349
- model_link = gr.Textbox(
350
- label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
351
- )
352
 
353
 
354
- with gr.Column():
355
 
356
- input_file = gr.components.File(label = "Click to Upload a json File", file_count="single", type='binary')
357
- submit_button = gr.Button("Submit Eval")
358
 
359
- submission_result = gr.Markdown()
360
- submit_button.click(
361
- add_new_eval,
362
- inputs = [
363
- input_file,
364
- model_name_textbox,
365
- revision_name_textbox,
366
- model_link,
367
- ],
368
- )
369
-
370
-
371
- def refresh_data():
372
- value1 = get_baseline_df()
373
- return value1
374
-
375
- with gr.Row():
376
- data_run = gr.Button("Refresh")
377
- data_run.click(on_filter_model_size_method_change, inputs=[checkbox_group], outputs=data_component)
378
 
379
 
380
  block.launch()
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ from utils import *
5
 
6
  block = gr.Blocks()
7
 
 
8
  with block:
9
  gr.Markdown(
10
  LEADERBORAD_INTRODUCTION
11
  )
12
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
13
  # Table 0
14
+ with gr.TabItem("📊 TheoremQA", elem_id="theoremqa-tab-table1", id=1):
15
  with gr.Row():
16
  with gr.Accordion("Citation", open=False):
17
  citation_button = gr.Textbox(
18
  value=CITATION_BUTTON_TEXT,
19
  label=CITATION_BUTTON_LABEL,
20
  elem_id="citation-button",
 
21
  )
 
22
  gr.Markdown(
23
  TABLE_INTRODUCTION
24
  )
 
 
 
 
 
 
25
 
26
+ gr.components.Dataframe(
27
+ value=pd.read_csv(CSV_DIR),
 
 
 
 
 
 
 
 
 
28
  headers=COLUMN_NAMES,
29
  type="pandas",
30
  datatype=DATA_TITILE_TYPE,
31
  interactive=False,
32
  visible=True,
33
  )
 
 
 
 
 
 
34
 
35
+ with gr.TabItem("📝 About", elem_id="theoremqa-tab-table2", id=2):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
37
 
38
  # table 3
39
+ # with gr.TabItem("🚀 Submit here! ", elem_id="mtheoremqa-tab-table", id=3):
40
+ # gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
41
 
42
+ # with gr.Row():
43
+ # gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
44
 
45
+ # with gr.Row():
46
+ # gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
47
 
48
+ # with gr.Row():
49
+ # with gr.Column():
50
+ # model_name_textbox = gr.Textbox(
51
+ # label="Model name", placeholder="LaVie"
52
+ # )
53
+ # revision_name_textbox = gr.Textbox(
54
+ # label="Revision Model Name", placeholder="LaVie"
55
+ # )
56
 
57
+ # with gr.Column():
58
+ # model_link = gr.Textbox(
59
+ # label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
60
+ # )
61
 
62
 
63
+ # with gr.Column():
64
 
65
+ # input_file = gr.components.File(label = "Click to Upload a json File", file_count="single", type='binary')
66
+ # submit_button = gr.Button("Submit Eval")
67
 
68
+ # submission_result = gr.Markdown()
69
+ # submit_button.click(
70
+ # add_new_eval,
71
+ # inputs = [
72
+ # input_file,
73
+ # model_name_textbox,
74
+ # revision_name_textbox,
75
+ # model_link,
76
+ # ],
77
+ # )
78
+
79
+
80
+ # def refresh_data():
81
+ # value1 = get_baseline_df()
82
+ # return value1
83
+
84
+ # with gr.Row():
85
+ # data_run = gr.Button("Refresh")
86
+ # data_run.click(on_filter_model_size_method_change, inputs=[checkbox_group], outputs=data_component)
87
 
88
 
89
  block.launch()
leaderboard/results.csv ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model (CoT),TheoremQA,MATH,GSM
2
+ Mistral-v0.2-base,19.2,10.2,36.2
3
+ Mixtral-7x8B-base,23.2,22.1,58.4
4
+ Qwen-1.5-7B,14.2,13.3,54.1
5
+ Qwen-1.5-14B,14,25.2,61.6
6
+ Qwen-1.5-72B,29.3,35.1,77.6
7
+ Yi-6B,12,5.8,32.6
8
+ Yi-34B,23.2,15.9,67.9
9
+ ChatGLM3-6B,11.3,25.7,72.3
10
+ Gemma-7B,21.5,24.3,46.4
11
+ LLaMA-2-13B,10.9,5,29.6
12
+ LLeMMA-7B,17.2,18,36.4
13
+ LLeMMA-34B,21.1,25,71.9
14
+ InternLM2-7B,7.8,20.2,70.8
15
+ InternLM2-20B,19.5,25.5,76.1
16
+ Deepseek-7B,15.7,6.4,17.4
17
+ Deepseek-67B,25.3,15.9,66.5
18
+ GPT-4-0409,0,69.2,94.5
19
+ InternLM-Math-20B,17.1,37.7,82.9
20
+ Deepseek-Math-7B,27.1,36.2,64.2
21
+ Deepseek-Math-7B-Instruct,23.7,46.8,82.9
22
+ WizardMath-7B-1.1,11.7,33,83.2
23
+ MetaMath-Mistral-7B,16.5,28.2,77.7
24
+ Abel-7B-002,19.3,29.5,83.2
25
+ OpenMath-Mistral-7B,13.1,44.5,80.2
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio==3.23.0
2
+ pandas==2.0.0
utils.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL_INFO = [
2
+ "Model Name (clickable)",
3
+ "TheoremQA",
4
+ "MATH",
5
+ "GSM",
6
+ ]
7
+
8
+ MODEL_INFO_TAB_QUALITY = [
9
+ "Model Name (clickable)",
10
+ "Quality Score",
11
+ "Selected Score"
12
+ ]
13
+
14
+
15
+ DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number']
16
+
17
+ CSV_DIR = "./leaderboard/results.csv"
18
+
19
+ COLUMN_NAMES = MODEL_INFO
20
+
21
+ LEADERBORAD_INTRODUCTION = """# TheoremQA Leaderboard
22
+
23
+ *"Which Model is better on STEM QA?"*
24
+ 🏆 Welcome to the **TheoremQA** leaderboard! 🎦 *A Theorem-driven Question Answering dataset* (**EMNLP 2023**)
25
+ <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
26
+ <a href='https://arxiv.org/abs/2305.12524'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
27
+ <a href='https://github.com/TIGER-AI-Lab/TheoremQA'><img src='https://img.shields.io/badge/TheoremQA-Website-green?logo=googlechrome&logoColor=green'></a>
28
+ <a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FTheoremQA-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'/></a>
29
+ </div>
30
+
31
+ We propose the first question-answering dataset driven by STEM theorems. We annotated 800 QA pairs covering 350+ theorems spanning Math, EE&CS, Physics and Finance. The dataset was collected by human experts and is of very high quality. We provide the dataset as a new benchmark to test the limits of large language models in applying theorems to solve challenging university-level questions.
32
+
33
+ Please follow the instructions in [TheoremQA](https://github.com/TIGER-AI-Lab/TheoremQA) to use the dataset.
34
+ """
35
+
36
+ TABLE_INTRODUCTION = """
37
+ """
38
+
39
+ LEADERBORAD_INFO = """
40
+ TheoremQA, a comprehensive benchmark suite for video generative models. We design a comprehensive and hierarchical Evaluation Dimension Suite to decompose "video generation quality" into multiple well-defined dimensions to facilitate fine-grained and objective evaluation. For each dimension and each content category, we carefully design a Prompt Suite as test cases, and sample Generated Videos from a set of video generation models. For each evaluation dimension, we specifically design an Evaluation Method Suite, which uses carefully crafted method or designated pipeline for automatic objective evaluation. We also conduct Human Preference Annotation for the generated videos for each dimension, and show that TheoremQA evaluation results are well aligned with human perceptions. TheoremQA can provide valuable insights from multiple perspectives.
41
+ """
42
+
43
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
44
+ CITATION_BUTTON_TEXT = r"""@inproceedings{chen2023theoremqa,
45
+ title={Theoremqa: A theorem-driven question answering dataset},
46
+ author={Chen, Wenhu and Yin, Ming and Ku, Max and Lu, Pan and Wan, Yixin and Ma, Xueguang and Xu, Jianyu and Wang, Xinyi and Xia, Tony},
47
+ booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
48
+ year={2023}
49
+ }"""