import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for open-r1 Models</h1>"""

DESCRIPTION = f"""
Evaluation of open-r1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""

BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]


def get_leaderboard_df():
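    """Build a leaderboard DataFrame from the JSON result files under `eval_results/`.

    Each row is a model/revision (plus evaluation date) and each column is a benchmark score.
    """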
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Parse filepaths to get unique models
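    # Assumed directory layout, based on the path indices used below:
    #   eval_results/<org>/<model>/<revision>/<task>/<results file>.json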
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = Path(filepath).parts
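        # The filename stem is assumed to end in a timestamp; trimming its last three
        # characters yields the date key used to distinguish evaluation runs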
        date = filepath.stem.split("_")[-1][:-3]
        model_revision = "_".join(path_parts[1:4]) + "_" + date
        task = path_parts[4]
        df.loc[model_revision, "Date"] = date

        with open(filepath, "r") as file:
            data = json.load(file)
            # Skip benchmarks that we don't want to include in the leaderboard
            if task.lower() in BENCHMARKS_TO_SKIP:
                continue
            # MixEval doesn't have a results key, so we need to get the overall score
            if task.lower() in ["mixeval", "mixeval_hard"]:
                value = data["overall score (final score)"]
                df.loc[model_revision, f"{task}"] = value
            else:
                first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
                # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
                if task.lower() == "truthfulqa":
                    value = data["results"][first_result_key]["truthfulqa_mc2"]
                    df.loc[model_revision, task] = float(value)
                # IFEval has several metrics, but we report their average, as in the Llama 3 paper
                elif task.lower() == "ifeval":
                    values = 0.0
                    for metric in [
                        "prompt_level_loose",
                        "prompt_level_strict",
                        "inst_level_strict",
                        "inst_level_loose",
                    ]:
                        values += data["results"][first_result_key][f"{metric}_acc"]
                    value = values / 4
                    df.loc[model_revision, f"{task}"] = float(value)
                # MMLU has several metrics but we report just the average one
                elif task.lower() == "mmlu":
                    value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
                    df.loc[model_revision, task] = float(value)
                # HellaSwag and ARC report acc_norm
                elif task.lower() in ["hellaswag", "arc"]:
                    value = data["results"][first_result_key]["acc_norm"]
                    df.loc[model_revision, task] = float(value)
                # BBH has several metrics but we report just the average one
                elif task.lower() == "bbh":
                    if "all" in data["results"]:
                        value = data["results"]["all"]["acc"]
                    else:
                        value = -100
                    df.loc[model_revision, task] = float(value)
                # AGIEval reports acc_norm
                elif task.lower() == "agieval":
                    value = data["results"]["all"]["acc_norm"]
                    df.loc[model_revision, task] = float(value)
                # MATH reports qem
                elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
                    value = data["results"]["all"]["qem"]
                    df.loc[model_revision, task] = float(value)
                # For mini_math we report 5 metrics, one for each level, storing each one as a separate column in the dataframe
                elif task.lower() in ["mini_math_v2"]:
                    for k, v in data["results"].items():
                        if k != "all":
                            level = k.split("|")[1].split(":")[-1]
                            value = v["qem"]
                            df.loc[model_revision, f"{task}_{level}"] = value
                # For PoT we report N metrics, one for each prompt, storing each one as a separate column in the dataframe
                elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
                    for k, v in data["results"].items():
                        if k != "all" and "_average" not in k:
                            version = k.split("|")[1].split(":")[-1]
                            value = v["qem"] if "qem" in v else v["score"]
                            df.loc[model_revision, f"{task}_{version}"] = value
                # For kaggle_tora, accuracy is reported as a percentage, so we need to divide by 100
                elif task.lower() in [
                    "aimo_tora_eval_kaggle_medium",
                    "aimo_tora_eval_kaggle_hard",
                    "aimo_kaggle_fast_eval_hard",
                    "aimo_kaggle_tora_medium",
                    "aimo_kaggle_tora_hard",
                    "aimo_kaggle_tora_medium_extended",
                    "aimo_kaggle_tora_hard_extended",
                    "aimo_math_integer_lvl4",
                    "aimo_math_integer_lvl5",
                ]:
                    for k, v in data["results"].items():
                        value = float(v["qem"]) / 100.0
                        df.loc[model_revision, f"{task}"] = value
                # For AlpacaEval we report the base win rate and the length-controlled one
                elif task.lower() == "alpaca_eval":
                    value = data["results"][first_result_key]["win_rate"]
                    df.loc[model_revision, "Alpaca_eval"] = value / 100.0
                    value = data["results"][first_result_key]["length_controlled_winrate"]
                    df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
                else:
                    first_metric_key = next(
                        iter(data["results"][first_result_key])
                    )  # gets the first key in the first result
                    value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
                    df.loc[model_revision, task] = float(value)

    # Drop rows where every entry is NaN
    df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])

    # Trim mini_math column names
    df.columns = [c.replace("_level_", "_l") for c in df.columns]

    # Trim AIMO column names
    df.columns = [c.replace("aimo_", "") for c in df.columns]

    df = df.reset_index().rename(columns={"index": "Model"}).round(4)
    # Strip off date from model name
    df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])

    return df


leaderboard_df = get_leaderboard_df()


def agg_df(df, agg: str = "max"):
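    """Aggregate duplicate runs per model with `agg` (min/max/mean), insert an Average column, and scale scores to percentages."""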
    df = df.copy()
    # Drop date and aggregate results by model name
    df = df.drop("Date", axis=1).groupby("Model").agg(agg).reset_index()

    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))

    # Convert all values to percentage
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    return df


# Function to update the table based on search query
def filter_and_search(cols: list[str], search_query: str, agg: str):
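    """Filter the aggregated leaderboard by the selected columns and a `;`-separated search query over model names."""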
    df = leaderboard_df
    df = agg_df(df, agg)
    if len(search_query) > 0:
        search_terms = search_query.split(";")
        search_terms = [term.strip().lower() for term in search_terms]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
        # Drop any columns which are all NaN
        df = df.dropna(how="all", axis=1)
    if len(cols) > 0:
        index_cols = list(leaderboard_df.columns[:1])
        new_cols = index_cols + cols
        df = df.copy()[new_cols]
        # Drop rows where all selected benchmark columns are NaN
        df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
        # Recompute average
        df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    return df


demo = gr.Blocks()
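# Assemble the Gradio UI: title, description, search box, aggregation selector,
# column filter, and the leaderboard table, all wired to refresh on each control change.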

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
            agg = gr.Radio(
                ["min", "max", "mean"],
                value="max",
                label="Aggregation",
                info="How to aggregate results for each model",
            )
        with gr.Row():
            cols_bar = gr.CheckboxGroup(
                choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
                show_label=False,
                info="Select columns to display",
            )
        with gr.Group():
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                wrap=True,
                height=1000,
                column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
            )

    cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
    agg.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
    search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])

demo.launch()