Spaces:

Bias-Leaderboard
/

leaderboard

Sleeping

App Files Community

meg-huggingface committed on Jan 23, 2024

Commit

595e24c

1 Parent(s): b266265

Read evals code

Browse files

Files changed (4) hide show

app.py +95 -93
src/about.py +6 -5
src/envs.py +3 -3
src/leaderboard/read_evals.py +2 -2

app.py CHANGED Viewed

@@ -140,8 +140,8 @@ def filter_models(
     return filtered_df
-ui = gr.Blocks(css=custom_css)
-with ui:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
@@ -150,36 +150,46 @@ with ui:
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
                     with gr.Row():
-                        with gr.Column(scale=1):
-                            with gr.Row():
-                                with gr.Column():
-                                    shown_columns = gr.CheckboxGroup(
-                                        choices=[
-                                            c.name
-                                            for c in fields(AutoEvalColumn)
-                                            if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced and not c.dummy
-                                        ],
-                                        value=[
-                                            c.name
-                                            for c in fields(AutoEvalColumn)
-                                            if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced
-                                        ],
-                                        label="Select metrics to show",
-                                        elem_id="column-select",
-                                        interactive=True,
-                                    )
-                        with gr.Column(scale=3):
                             for c in fields(AutoEvalColumn):
                                 if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced and not c.dummy:
-                                    gr.Markdown("**" + c.name + "**. " + c.cite, elem_classes="markdown-text")
-                    with gr.Row():
-                        with gr.Accordion("Advanced options [WIP]", open=False):
                             shown_columns_advanced = gr.CheckboxGroup(
                                 choices=[
                                     c.name
@@ -198,30 +208,20 @@ with ui:
                             deleted_models_visibility = gr.Checkbox(
                                 value=False, label="Show gated/private/deleted models", interactive=True, visible=True,
                             )
-                with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Select model types to include",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Select precision levels to include",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Select model sizes (in billions of parameters) to include",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
@@ -277,45 +277,6 @@ with ui:
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
@@ -363,6 +324,47 @@ with ui:
                 submission_result,
             )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
@@ -375,6 +377,6 @@ with ui:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
 scheduler.start()
-ui.queue(default_concurrency_limit=40).launch()

     return filtered_df
+demo = gr.Blocks(css=custom_css)
+with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
+                        with gr.Column():
+                                shown_columns = gr.CheckboxGroup(
+                                    choices=[
+                                        c.name
+                                        for c in fields(AutoEvalColumn)
+                                        if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced and not c.dummy
+                                    ],
+                                    value=[
+                                        c.name
+                                        for c in fields(AutoEvalColumn)
+                                        if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced
+                                    ],
+                                    label="Select metrics to show",
+                                    elem_id="column-select",
+                                    interactive=True,
+                                )
                     with gr.Row():
+                        with gr.Column():
                             for c in fields(AutoEvalColumn):
                                 if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced and not c.dummy:
+                                    with gr.Row():
+                                        gr.Markdown("**" + c.name + "**. " + c.cite)
+                with gr.Column(min_width=320):
+                    #with gr.Box(elem_id="box-filter"):
+                    filter_columns_precision = gr.CheckboxGroup(
+                        label="Select precision levels to include",
+                        choices=[i.value.name for i in Precision],
+                        value=[i.value.name for i in Precision],
+                        interactive=True,
+                        elem_id="filter-columns-precision",
+                    )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Select model sizes (in billions of parameters) to include",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )
+                    #with gr.Row():
+                    with gr.Accordion("Advanced options [WIP]", open=False):
                             shown_columns_advanced = gr.CheckboxGroup(
                                 choices=[
                                     c.name
                             deleted_models_visibility = gr.Checkbox(
                                 value=False, label="Show gated/private/deleted models", interactive=True, visible=True,
                             )
+                            filter_columns_type = gr.CheckboxGroup(
+                                label="Select model types to include",
+                                choices=[t.to_str() for t in ModelType],
+                                value=[t.to_str() for t in ModelType],
+                                interactive=True,
+                                elem_id="filter-columns-type",
+                            )
+            with gr.Row():
+                search_bar = gr.Textbox(
+                    placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                    show_label=False,
+                    elem_id="search-bar",
+                )
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
                 submission_result,
             )
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+        with gr.TabItem("Submission Status", elem_id="llm-benchmark-tab-table", id=4):
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.add_job(launch_backend, "interval", seconds=100)
 scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()

src/about.py CHANGED Viewed

@@ -15,8 +15,9 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("toxigen", "acc", "Toxicity (lower is better)", cite="_ToxiGen: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection._ Hartvigsen et al., ACL 2022.")
-    task1 = Task("truthfulqa_gen", "acc", "Truthful QA", cite="TODO")
-    task2 = Task("crows_pairs_english", "acc", "CrowS-Pairs English", cite="TODO")
     #task2 = Task("anli_r1", "acc", "ANLI", cite="_Adversarial NLI: A New Benchmark for Natural Language Understanding._ Nie et al., ACL 2020.")
     #task3 = Task("logiqa", "acc_norm", "LogiQA", cite="_LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning_. Liu et al.,  IJCAI 2020.")
@@ -36,9 +37,9 @@ LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 ## Reproducibility
-To reproduce the toxicity results, here is the command you can run:
-```python main.py --model=hf-causal-experimental --model_args="pretrained=<your_model>,use_accelerate=True" --tasks=toxigen --batch_size=1 --output_path=<output_path>```
 """
@@ -69,7 +70,7 @@ When we add extra information about models to the leaderboard, it will be automa
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("toxigen", "acc", "Toxicity (lower is better)", cite="_ToxiGen: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection._ Hartvigsen et al., ACL 2022.")
+    task1 = Task("truthfulqa_gen", "bleurt_acc", "Truthful QA", cite="_TruthfulQA: Measuring How Models Mimic Human Falsehoods._  Lin et al., ACL 2022.")
+    # https://aclanthology.org/2020.emnlp-main.154/
+    task2 = Task("crows_pairs_english", "pct_stereotype", "CrowS-Pairs English", cite="_CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models._ Nangia et al., EMNLP 2020.")
     #task2 = Task("anli_r1", "acc", "ANLI", cite="_Adversarial NLI: A New Benchmark for Natural Language Understanding._ Nie et al., ACL 2020.")
     #task3 = Task("logiqa", "acc_norm", "LogiQA", cite="_LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning_. Liu et al.,  IJCAI 2020.")
 ## How it works
 ## Reproducibility
+To reproduce the toxicity results, here is the command you can run using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the EleutherAI LM Evaluation Harness:
+```python main.py --model=hf-causal-experimental --model_args="pretrained=<your_model>,use_accelerate=True" --tasks=<task> --batch_size=1 --output_path=<output_path>```
 """
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally. See About tab for exact command.
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

src/envs.py CHANGED Viewed

@@ -20,11 +20,11 @@ LIMIT = None # 20
 # Define some input/output variables.
 # Don't forget to create a results and requests Dataset for your org
 # Leaderboard Space
-REPO_ID = f"{OWNER}/leaderboard"
 # Leaderboard input Dataset
-QUEUE_REPO = f"{OWNER}/requests"
 # Leaderboard output Dataset
-RESULTS_REPO = f"{OWNER}/results"
 # If you setup a cache, set HF_HOME.
 CACHE_PATH=os.getenv("HF_HOME", ".")

 # Define some input/output variables.
 # Don't forget to create a results and requests Dataset for your org
 # Leaderboard Space
+REPO_ID = f"{OWNER}/leaderboard-backend"
 # Leaderboard input Dataset
+QUEUE_REPO = f"{OWNER}/requests-tmp"
 # Leaderboard output Dataset
+RESULTS_REPO = f"{OWNER}/results-tmp"
 # If you setup a cache, set HF_HOME.
 CACHE_PATH=os.getenv("HF_HOME", ".")

src/leaderboard/read_evals.py CHANGED Viewed

@@ -71,7 +71,7 @@ class EvalResult:
         results = {}
         for task in Tasks:
             print("Looking at task:")
-            print(task)
             try:
                 task = task.value
             except Exception as e:
@@ -169,7 +169,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     model_result_filepaths = []
     print("Getting raw eval results from:")
-    print(os.walk(results_path))
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):

         results = {}
         for task in Tasks:
             print("Looking at task:")
+            print(task.value)
             try:
                 task = task.value
             except Exception as e:
     model_result_filepaths = []
     print("Getting raw eval results from:")
+    print(results_path)
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):