yuanshengni committed
Commit 6ed5ca9 · 1 Parent(s): af4a677
init

Files changed:
- README.md +8 -1
- app.py +45 -336
- leaderboard/results.csv +25 -0
- requirements.txt +2 -0
- utils.py +49 -0
README.md CHANGED
@@ -10,4 +10,11 @@ pinned: false
 license: mit
 ---
 
-
+
+# TheoremQA Leaderboard
+
+## Space Description
+
+- **Repository:** [TheoremQA](https://github.com/wenhuchen/TheoremQA)
+- **Paper:** [2305.12524]
+(https://arxiv.org/abs/2305.12524)
+<!-- - **Point of Contact:** -->
app.py CHANGED
@@ -1,380 +1,89 @@
-__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
-import os
-
 import gradio as gr
 import pandas as pd
-import json
-import tempfile
-
-from constants import *
-from huggingface_hub import Repository
-HF_TOKEN = os.environ.get("HF_TOKEN")
-
-global data_component, filter_component
-
-
-def upload_file(files):
-    file_paths = [file.name for file in files]
-    return file_paths
-
-def add_new_eval(
-    input_file,
-    model_name_textbox: str,
-    revision_name_textbox: str,
-    model_link: str,
-):
-    if input_file is None:
-        return "Error! Empty file!"
-
-    upload_data=json.loads(input_file)
-    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
-    submission_repo.git_pull()
-    shutil.copyfile(CSV_DIR, os.path.join(SUBMISSION_NAME, f"{input_file}"))
-
-    csv_data = pd.read_csv(CSV_DIR)
-
-    if revision_name_textbox == '':
-        col = csv_data.shape[0]
-        model_name = model_name_textbox
-    else:
-        model_name = revision_name_textbox
-        model_name_list = csv_data['Model Name (clickable)']
-        name_list = [name.split(']')[0][1:] for name in model_name_list]
-        if revision_name_textbox not in name_list:
-            col = csv_data.shape[0]
-        else:
-            col = name_list.index(revision_name_textbox)
-
-    if model_link == '':
-        model_name = model_name # no url
-    else:
-        model_name = '[' + model_name + '](' + model_link + ')'
-
-    # add new data
-    new_data = [
-        model_name
-    ]
-    for key in TASK_INFO:
-        if key in upload_data:
-            new_data.append(upload_data[key][0])
-        else:
-            new_data.append(0)
-    csv_data.loc[col] = new_data
-    csv_data = csv_data.to_csv(CSV_DIR, index=False)
-    submission_repo.push_to_hub()
-    return 0
-
-def get_normalized_df(df):
-    # final_score = df.drop('name', axis=1).sum(axis=1)
-    # df.insert(1, 'Overall Score', final_score)
-    normalize_df = df.copy().fillna(0.0)
-    for column in normalize_df.columns[1:]:
-        min_val = NORMALIZE_DIC[column]['Min']
-        max_val = NORMALIZE_DIC[column]['Max']
-        normalize_df[column] = (normalize_df[column] - min_val) / (max_val - min_val)
-    return normalize_df
-
-def calculate_selected_score(df, selected_columns):
-    # selected_score = df[selected_columns].sum(axis=1)
-    selected_QUALITY = [i for i in selected_columns if i in QUALITY_LIST]
-    selected_SEMANTIC = [i for i in selected_columns if i in SEMANTIC_LIST]
-    selected_quality_score = df[selected_QUALITY].sum(axis=1)/sum([DIM_WEIGHT[i] for i in selected_QUALITY])
-    selected_semantic_score = df[selected_SEMANTIC].sum(axis=1)/sum([DIM_WEIGHT[i] for i in selected_SEMANTIC ])
-    if selected_quality_score.isna().any().any() and selected_semantic_score.isna().any().any():
-        selected_score = (selected_quality_score * QUALITY_WEIGHT + selected_semantic_score * SEMANTIC_WEIGHT) / (QUALITY_WEIGHT + SEMANTIC_WEIGHT)
-        return selected_score.fillna(0.0)
-    if selected_quality_score.isna().any().any():
-        return selected_semantic_score
-    if selected_semantic_score.isna().any().any():
-        return selected_quality_score
-    # print(selected_semantic_score,selected_quality_score )
-    selected_score = (selected_quality_score * QUALITY_WEIGHT + selected_semantic_score * SEMANTIC_WEIGHT) / (QUALITY_WEIGHT + SEMANTIC_WEIGHT)
-    return selected_score.fillna(0.0)
-
-def get_final_score(df, selected_columns):
-    normalize_df = get_normalized_df(df)
-    #final_score = normalize_df.drop('name', axis=1).sum(axis=1)
-    for name in normalize_df.drop('Model Name (clickable)', axis=1):
-        normalize_df[name] = normalize_df[name]*DIM_WEIGHT[name]
-    quality_score = normalize_df[QUALITY_LIST].sum(axis=1)/sum([DIM_WEIGHT[i] for i in QUALITY_LIST])
-    semantic_score = normalize_df[SEMANTIC_LIST].sum(axis=1)/sum([DIM_WEIGHT[i] for i in SEMANTIC_LIST ])
-    final_score = (quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT) / (QUALITY_WEIGHT + SEMANTIC_WEIGHT)
-    if 'Total Score' in df:
-        df['Total Score'] = final_score
-    else:
-        df.insert(1, 'Total Score', final_score)
-    if 'Semantic Score' in df:
-        df['Semantic Score'] = semantic_score
-    else:
-        df.insert(2, 'Semantic Score', semantic_score)
-    if 'Quality Score' in df:
-        df['Quality Score'] = quality_score
-    else:
-        df.insert(3, 'Quality Score', quality_score)
-    selected_score = calculate_selected_score(normalize_df, selected_columns)
-    if 'Selected Score' in df:
-        df['Selected Score'] = selected_score
-    else:
-        df.insert(1, 'Selected Score', selected_score)
-    return df
-
-
-def get_final_score_quality(df, selected_columns):
-    normalize_df = get_normalized_df(df)
-    for name in normalize_df.drop('Model Name (clickable)', axis=1):
-        normalize_df[name] = normalize_df[name]*DIM_WEIGHT[name]
-    quality_score = normalize_df[QUALITY_TAB].sum(axis=1) / sum([DIM_WEIGHT[i] for i in QUALITY_TAB])
-
-    if 'Quality Score' in df:
-        df['Quality Score'] = quality_score
-    else:
-        df.insert(1, 'Quality Score', quality_score)
-    # selected_score = normalize_df[selected_columns].sum(axis=1) / len(selected_columns)
-    selected_score = normalize_df[selected_columns].sum(axis=1)/sum([DIM_WEIGHT[i] for i in selected_columns])
-    if 'Selected Score' in df:
-        df['Selected Score'] = selected_score
-    else:
-        df.insert(1, 'Selected Score', selected_score)
-    return df
-
-def get_baseline_df():
-    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
-    submission_repo.git_pull()
-    df = pd.read_csv(CSV_DIR)
-    df = get_final_score(df, checkbox_group.value)
-    df = df.sort_values(by="Selected Score", ascending=False)
-    present_columns = MODEL_INFO + checkbox_group.value
-    df = df[present_columns]
-    df = convert_scores_to_percentage(df)
-    return df
-
-def get_baseline_df_quality():
-    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
-    submission_repo.git_pull()
-    df = pd.read_csv(QUALITY_DIR)
-    df = get_final_score_quality(df, checkbox_group_quality.value)
-    df = df.sort_values(by="Selected Score", ascending=False)
-    present_columns = MODEL_INFO_TAB_QUALITY + checkbox_group_quality.value
-    df = df[present_columns]
-    df = convert_scores_to_percentage(df)
-    return df
-
-def get_all_df(selected_columns, dir=CSV_DIR):
-    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
-    submission_repo.git_pull()
-    df = pd.read_csv(dir)
-    df = get_final_score(df, selected_columns)
-    df = df.sort_values(by="Selected Score", ascending=False)
-    return df
-
-def get_all_df_quality(selected_columns, dir=QUALITY_DIR):
-    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
-    submission_repo.git_pull()
-    df = pd.read_csv(dir)
-    df = get_final_score_quality(df, selected_columns)
-    df = df.sort_values(by="Selected Score", ascending=False)
-    return df
-
-
-def convert_scores_to_percentage(df):
-    # operate on every column of the DataFrame except the 'name' column
-    for column in df.columns[1:]:  # assumes the first column is 'name'
-        df[column] = round(df[column] * 100,2) # convert the score to a percentage
-        df[column] = df[column].astype(str) + '%'
-    return df
-
-def choose_all_quailty():
-    return gr.update(value=QUALITY_LIST)
-
-def choose_all_semantic():
-    return gr.update(value=SEMANTIC_LIST)
-
-def disable_all():
-    return gr.update(value=[])
-
-def enable_all():
-    return gr.update(value=TASK_INFO)
-
-def on_filter_model_size_method_change(selected_columns):
-    updated_data = get_all_df(selected_columns, CSV_DIR)
-    #print(updated_data)
-    # columns:
-    selected_columns = [item for item in TASK_INFO if item in selected_columns]
-    present_columns = MODEL_INFO + selected_columns
-    updated_data = updated_data[present_columns]
-    updated_data = updated_data.sort_values(by="Selected Score", ascending=False)
-    updated_data = convert_scores_to_percentage(updated_data)
-    updated_headers = present_columns
-    update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
-    # print(updated_data,present_columns,update_datatype)
-    filter_component = gr.components.Dataframe(
-        value=updated_data,
-        headers=updated_headers,
-        type="pandas",
-        datatype=update_datatype,
-        interactive=False,
-        visible=True,
-        )
-    return filter_component#.value
-
-def on_filter_model_size_method_change_quality(selected_columns):
-    updated_data = get_all_df_quality(selected_columns, QUALITY_DIR)
-    #print(updated_data)
-    # columns:
-    selected_columns = [item for item in QUALITY_TAB if item in selected_columns]
-    present_columns = MODEL_INFO_TAB_QUALITY + selected_columns
-    updated_data = updated_data[present_columns]
-    updated_data = updated_data.sort_values(by="Selected Score", ascending=False)
-    updated_data = convert_scores_to_percentage(updated_data)
-    updated_headers = present_columns
-    update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
-    # print(updated_data,present_columns,update_datatype)
-    filter_component = gr.components.Dataframe(
-        value=updated_data,
-        headers=updated_headers,
-        type="pandas",
-        datatype=update_datatype,
-        interactive=False,
-        visible=True,
-        )
-    return filter_component#.value
 
+from utils import *
 
 block = gr.Blocks()
 
-
 with block:
     gr.Markdown(
         LEADERBORAD_INTRODUCTION
     )
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # Table 0
-        with gr.TabItem("📊
+        with gr.TabItem("📊 TheoremQA", elem_id="theoremqa-tab-table1", id=1):
             with gr.Row():
                 with gr.Accordion("Citation", open=False):
                     citation_button = gr.Textbox(
                         value=CITATION_BUTTON_TEXT,
                         label=CITATION_BUTTON_LABEL,
                         elem_id="citation-button",
-                        lines=10,
                     )
-
             gr.Markdown(
                 TABLE_INTRODUCTION
             )
-            with gr.Row():
-                with gr.Column(scale=0.2):
-                    choosen_q = gr.Button("Select Quality Dimensions")
-                    choosen_s = gr.Button("Select Semantic Dimensions")
-                    # enable_b = gr.Button("Select All")
-                    disable_b = gr.Button("Deselect All")
 
-
-
-            checkbox_group = gr.CheckboxGroup(
-                choices=TASK_INFO,
-                value=DEFAULT_INFO,
-                label="Evaluation Dimension",
-                interactive=True,
-            )
-
-            data_component = gr.components.Dataframe(
-                value=get_baseline_df,
+            gr.components.Dataframe(
+                value=pd.read_csv(CSV_DIR),
                 headers=COLUMN_NAMES,
                 type="pandas",
                 datatype=DATA_TITILE_TYPE,
                 interactive=False,
                 visible=True,
             )
-
-            choosen_q.click(choose_all_quailty, inputs=None, outputs=[checkbox_group]).then(fn=on_filter_model_size_method_change, inputs=[ checkbox_group], outputs=data_component)
-            choosen_s.click(choose_all_semantic, inputs=None, outputs=[checkbox_group]).then(fn=on_filter_model_size_method_change, inputs=[ checkbox_group], outputs=data_component)
-            # enable_b.click(enable_all, inputs=None, outputs=[checkbox_group]).then(fn=on_filter_model_size_method_change, inputs=[ checkbox_group], outputs=data_component)
-            disable_b.click(disable_all, inputs=None, outputs=[checkbox_group]).then(fn=on_filter_model_size_method_change, inputs=[ checkbox_group], outputs=data_component)
-            checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[ checkbox_group], outputs=data_component)
 
-        with gr.TabItem("
-            with gr.Accordion("INSTRUCTION", open=False):
-                citation_button = gr.Textbox(
-                    value=QUALITY_CLAIM_TEXT,
-                    label="",
-                    elem_id="quality-button",
-                    lines=2,
-                )
-            with gr.Row():
-                with gr.Column(scale=1.0):
-                    # selection for column part:
-                    checkbox_group_quality = gr.CheckboxGroup(
-                        choices=QUALITY_TAB,
-                        value=QUALITY_TAB,
-                        label="Evaluation Quality Dimension",
-                        interactive=True,
-                    )
-
-            data_component_quality = gr.components.Dataframe(
-                value=get_baseline_df_quality,
-                headers=COLUMN_NAMES_QUALITY,
-                type="pandas",
-                datatype=DATA_TITILE_TYPE,
-                interactive=False,
-                visible=True,
-            )
-
-            checkbox_group_quality.change(fn=on_filter_model_size_method_change_quality, inputs=[checkbox_group_quality], outputs=data_component_quality)
-
-        # table 2
-        with gr.TabItem("📝 About", elem_id="mvbench-tab-table", id=3):
+        with gr.TabItem("📝 About", elem_id="theoremqa-tab-table2", id=2):
             gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
 
         # table 3
-        with gr.TabItem("🚀 Submit here! ", elem_id="
-
+        # with gr.TabItem("🚀 Submit here! ", elem_id="mtheoremqa-tab-table", id=3):
+        #     gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
 
-
-
+        #     with gr.Row():
+        #         gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
 
-
-
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
 
-
-
-
-
-
-
-
-
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(
+        #                 label="Model name", placeholder="LaVie"
+        #             )
+        #             revision_name_textbox = gr.Textbox(
+        #                 label="Revision Model Name", placeholder="LaVie"
+        #             )
 
-
-
-
-
+        #         with gr.Column():
+        #             model_link = gr.Textbox(
+        #                 label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
+        #             )
 
 
-                with gr.Column():
+        #         with gr.Column():
 
-
-
+        #             input_file = gr.components.File(label = "Click to Upload a json File", file_count="single", type='binary')
+        #             submit_button = gr.Button("Submit Eval")
 
-
-
-
-
-
-
-
-
-
-
-
-
-        def refresh_data():
-
-
-
-        with gr.Row():
-
-
+        #             submission_result = gr.Markdown()
+        #             submit_button.click(
+        #                 add_new_eval,
+        #                 inputs = [
+        #                     input_file,
+        #                     model_name_textbox,
+        #                     revision_name_textbox,
+        #                     model_link,
+        #                 ],
+        #             )
+
+
+        # def refresh_data():
+        #     value1 = get_baseline_df()
+        #     return value1
+
+        # with gr.Row():
+        #     data_run = gr.Button("Refresh")
+        #     data_run.click(on_filter_model_size_method_change, inputs=[checkbox_group], outputs=data_component)
 
 
 block.launch()
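The new `app.py` drops the score normalization, weighting, and checkbox filtering entirely and renders `leaderboard/results.csv` as a static table. If the table should also be ranked before display, a couple of pandas lines would cover it; the sketch below is not part of this commit and only assumes the CSV schema added in `leaderboard/results.csv`.

```python
# Sketch only (not in this commit): rank the static leaderboard table by the
# TheoremQA column before handing it to gr.components.Dataframe.
import pandas as pd

CSV_DIR = "./leaderboard/results.csv"  # same path utils.py defines

df = pd.read_csv(CSV_DIR)
df = df.sort_values(by="TheoremQA", ascending=False).reset_index(drop=True)
# `df` could then be passed as the `value=` of the Dataframe component.
print(df.head())
```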
leaderboard/results.csv ADDED
@@ -0,0 +1,25 @@
+Model (CoT),TheoremQA,MATH,GSM
+Mistral-v0.2-base,19.2,10.2,36.2
+Mixtral-7x8B-base,23.2,22.1,58.4
+Qwen-1.5-7B,14.2,13.3,54.1
+Qwen-1.5-14B,14,25.2,61.6
+Qwen-1.5-72B,29.3,35.1,77.6
+Yi-6B,12,5.8,32.6
+Yi-34B,23.2,15.9,67.9
+ChatGLM3-6B,11.3,25.7,72.3
+Gemma-7B,21.5,24.3,46.4
+LLaMA-2-13B,10.9,5,29.6
+LLeMMA-7B,17.2,18,36.4
+LLeMMA-34B,21.1,25,71.9
+InternLM2-7B,7.8,20.2,70.8
+InternLM2-20B,19.5,25.5,76.1
+Deepseek-7B,15.7,6.4,17.4
+Deepseek-67B,25.3,15.9,66.5
+GPT-4-0409,0,69.2,94.5
+InternLM-Math-20B,17.1,37.7,82.9
+Deepseek-Math-7B,27.1,36.2,64.2
+Deepseek-Math-7B-Instruct,23.7,46.8,82.9
+WizardMath-7B-1.1,11.7,33,83.2
+MetaMath-Mistral-7B,16.5,28.2,77.7
+Abel-7B-002,19.3,29.5,83.2
+OpenMath-Mistral-7B,13.1,44.5,80.2
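With the in-app submission form commented out, new entries presumably have to be added to this CSV directly. A hypothetical helper for doing that consistently with the `Model (CoT),TheoremQA,MATH,GSM` schema is sketched below; the function name, model name, and score values are placeholders, not part of the repo.

```python
# Hypothetical helper (not in the repo): append one row to leaderboard/results.csv.
import pandas as pd

CSV_DIR = "./leaderboard/results.csv"

def add_result(model: str, theoremqa: float, math: float, gsm: float) -> None:
    df = pd.read_csv(CSV_DIR)
    # Column order must match the header: Model (CoT), TheoremQA, MATH, GSM.
    df.loc[len(df)] = [model, theoremqa, math, gsm]
    df.to_csv(CSV_DIR, index=False)

add_result("Example-7B", 20.0, 25.0, 60.0)  # placeholder values
```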
requirements.txt ADDED
@@ -0,0 +1,2 @@
+gradio==3.23.0
+pandas==2.0.0
utils.py ADDED
@@ -0,0 +1,49 @@
+MODEL_INFO = [
+    "Model Name (clickable)",
+    "TheoremQA",
+    "MATH",
+    "GSM",
+]
+
+MODEL_INFO_TAB_QUALITY = [
+    "Model Name (clickable)",
+    "Quality Score",
+    "Selected Score"
+]
+
+
+DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number']
+
+CSV_DIR = "./leaderboard/results.csv"
+
+COLUMN_NAMES = MODEL_INFO
+
+LEADERBORAD_INTRODUCTION = """# TheoremQA Leaderboard
+
+*"Which model is better at STEM QA?"*
+🏆 Welcome to the leaderboard of **TheoremQA**! 🎦 *A theorem-driven question answering dataset* (**EMNLP 2023**)
+<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
+    <a href='https://arxiv.org/abs/2305.12524'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
+    <a href='https://github.com/TIGER-AI-Lab/TheoremQA'><img src='https://img.shields.io/badge/TheoremQA-Website-green?logo=googlechrome&logoColor=green'></a>
+    <a href="https://hits.seeyoufarm.com"><img src="https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FTheoremQA-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false"/></a>
+</div>
+
+We propose the first question-answering dataset driven by STEM theorems. We annotated 800 QA pairs covering 350+ theorems spanning Math, EE&CS, Physics and Finance. The dataset was collected by human experts with careful quality control, and serves as a benchmark that tests the limits of large language models in applying theorems to solve challenging university-level questions.
+
+Please follow the instructions in [TheoremQA](https://github.com/TIGER-AI-Lab/TheoremQA) to evaluate your model.
+"""
+
+TABLE_INTRODUCTION = """
+"""
+
+LEADERBORAD_INFO = """
+TheoremQA is a theorem-driven question-answering benchmark for large language models. It contains 800 expert-annotated QA pairs covering 350+ theorems spanning Math, EE&CS, Physics and Finance, and evaluates how well models can apply theorems to solve challenging university-level questions.
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@inproceedings{chen2023theoremqa,
+    title={TheoremQA: A Theorem-driven Question Answering Dataset},
+    author={Chen, Wenhu and Yin, Ming and Ku, Max and Lu, Pan and Wan, Yixin and Ma, Xueguang and Xu, Jianyu and Wang, Xinyi and Xia, Tony},
+    booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
+    year={2023}
+}"""
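`LEADERBORAD_INTRODUCTION` above points readers to the TheoremQA GitHub repository for evaluation instructions. For orientation only, here is a rough sketch of pulling the benchmark and walking over it; the Hub dataset id, split name, and field names (`Question`, `Answer`) are assumptions not stated in this commit, so defer to the TheoremQA repo for the authoritative pipeline.

```python
# Rough sketch, assuming the dataset is published on the Hugging Face Hub as
# "TIGER-Lab/TheoremQA" with a "test" split and "Question"/"Answer" fields.
from datasets import load_dataset

ds = load_dataset("TIGER-Lab/TheoremQA", split="test")

for example in ds.select(range(3)):
    question = example["Question"]
    reference = example["Answer"]
    # A model under evaluation would generate a chain-of-thought answer here,
    # and the extracted final answer would be scored against `reference`.
    print(question[:80], "->", reference)
```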