Commit a53938b · committed by lewtun (HF staff) · Parent(s): d4faf91

Files changed (2):

1. a.ipynb +168 -0
2. app.py +16 -7
a.ipynb ADDED
@@ -0,0 +1,168 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import gradio as gr\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_leaderboard_df():\n",
+    "    filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n",
+    "\n",
+    "    # Parse filepaths to get unique models\n",
+    "    models = set()\n",
+    "    for filepath in filepaths:\n",
+    "        path_parts = Path(filepath).parts\n",
+    "        model_revision = \"_\".join(path_parts[1:4])\n",
+    "        models.add(model_revision)\n",
+    "\n",
+    "    # Initialize DataFrame\n",
+    "    df = pd.DataFrame(index=list(models))\n",
+    "\n",
+    "    # Extract data from each file and populate the DataFrame\n",
+    "    for filepath in filepaths:\n",
+    "        path_parts = Path(filepath).parts\n",
+    "        model_revision = \"_\".join(path_parts[1:4])\n",
+    "        task = path_parts[4].capitalize()\n",
+    "        # Extract timestamp from filepath\n",
+    "        timestamp = filepath.stem.split(\"_\")[-1][:-3]\n",
+    "        df.loc[model_revision, \"Timestamp\"] = timestamp\n",
+    "\n",
+    "        with open(filepath, \"r\") as file:\n",
+    "            data = json.load(file)\n",
+    "            first_result_key = next(iter(data[\"results\"]))  # gets the first key in 'results'\n",
+    "            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n",
+    "            if task.lower() == \"truthfulqa\":\n",
+    "                value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n",
+    "            else:\n",
+    "                first_metric_key = next(iter(data[\"results\"][first_result_key]))  # gets the first key in the first result\n",
+    "                value = data[\"results\"][first_result_key][first_metric_key]  # gets the value of the first metric\n",
+    "            df.loc[model_revision, task] = value\n",
+    "\n",
+    "    df.insert(loc=0, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n",
+    "    df = df.sort_values(by=[\"Average\"], ascending=False)\n",
+    "    df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(3)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = get_leaderboard_df()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Model</th>\n",
+       "      <th>Timestamp</th>\n",
+       "      <th>Average</th>\n",
+       "      <th>Truthfulqa</th>\n",
+       "      <th>Winogrande</th>\n",
+       "      <th>Gsm8k</th>\n",
+       "      <th>Hellaswag</th>\n",
+       "      <th>Arc</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Qwen_Qwen1.5-0.5B-Chat_main</td>\n",
+       "      <td>2024-02-28T07-35-58.803</td>\n",
+       "      <td>0.296</td>\n",
+       "      <td>0.271</td>\n",
+       "      <td>0.519</td>\n",
+       "      <td>0.039</td>\n",
+       "      <td>0.363</td>\n",
+       "      <td>0.287</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                         Model                Timestamp  Average  Truthfulqa  \\\n",
+       "0  Qwen_Qwen1.5-0.5B-Chat_main  2024-02-28T07-35-58.803    0.296       0.271   \n",
+       "\n",
+       "   Winogrande  Gsm8k  Hellaswag    Arc  \n",
+       "0       0.519  0.039      0.363  0.287  "
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "hf",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
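For reference, a minimal sketch of the directory layout that the parsing in `get_leaderboard_df` appears to assume. The concrete file name below is hypothetical, inferred from the slicing indices; the model name and timestamp match the notebook output above.

```python
from pathlib import Path

# Assumed layout: eval_results/<org>/<model>/<revision>/<task>/<prefix>_<timestamp>.json
filepath = Path(
    "eval_results/Qwen/Qwen1.5-0.5B-Chat/main/truthfulqa/"
    "results_2024-02-28T07-35-58.803123.json"  # hypothetical file name
)

parts = filepath.parts
model_revision = "_".join(parts[1:4])  # 'Qwen_Qwen1.5-0.5B-Chat_main'
task = parts[4].capitalize()           # 'Truthfulqa'
# stem ends in the timestamp; [:-3] trims the last three fractional-second
# digits, yielding '2024-02-28T07-35-58.803' as shown in the notebook output
timestamp = filepath.stem.split("_")[-1][:-3]
print(model_revision, task, timestamp)
```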
app.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
 TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""
 
 DESCRIPTION = f"""
-Evaluation of H4 models across a diverse range of benchmarks from Eleuther's [LLM evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).
+Evaluation of H4 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval).
 """
 
 
@@ -18,7 +18,7 @@ def get_leaderboard_df():
     models = set()
     for filepath in filepaths:
         path_parts = Path(filepath).parts
-        model_revision = "_".join(path_parts[1:4])  # Adjust indices based on your path structure
+        model_revision = "_".join(path_parts[1:4])
         models.add(model_revision)
 
     # Initialize DataFrame
@@ -27,17 +27,26 @@ def get_leaderboard_df():
     # Extract data from each file and populate the DataFrame
     for filepath in filepaths:
         path_parts = Path(filepath).parts
-        model_revision = "_".join(path_parts[1:4])  # Adjust indices based on your path structure
-        task = Path(filepath).stem.split("_")[-1].capitalize()  # gets 'mmlu' from the filename
+        model_revision = "_".join(path_parts[1:4])
+        task = path_parts[4].capitalize()
+        # Extract timestamp from filepath
+        timestamp = filepath.stem.split("_")[-1][:-3]
+        df.loc[model_revision, "Timestamp"] = timestamp
 
         with open(filepath, "r") as file:
             data = json.load(file)
             first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
-            first_metric_key = next(iter(data["results"][first_result_key]))  # gets the first key in the first result
-            value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
+            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
+            if task.lower() == "truthfulqa":
+                value = data["results"][first_result_key]["truthfulqa_mc2"]
+            else:
+                first_metric_key = next(
+                    iter(data["results"][first_result_key])
+                )  # gets the first key in the first result
+                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
             df.loc[model_revision, task] = value
 
-    df.insert(loc=0, column="Average", value=df.mean(axis=1))
+    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
     df = df.sort_values(by=["Average"], ascending=False)
     df = df.reset_index().rename(columns={"index": "Model"}).round(3)
     return df
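A quick sketch of why the averaging now passes `numeric_only=True`: the newly added `Timestamp` column holds strings, and averaging over mixed dtypes either warns or raises depending on the pandas version. The values below are taken from the notebook output above; the rounded average matches the leaderboard row.

```python
import pandas as pd

# One leaderboard row, as in the notebook output above
df = pd.DataFrame(
    {
        "Timestamp": ["2024-02-28T07-35-58.803"],  # string column: must be excluded from the mean
        "Truthfulqa": [0.271],
        "Winogrande": [0.519],
        "Gsm8k": [0.039],
        "Hellaswag": [0.363],
        "Arc": [0.287],
    },
    index=["Qwen_Qwen1.5-0.5B-Chat_main"],
)

# numeric_only=True skips the Timestamp strings instead of failing on them
df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
print(df["Average"].round(3))  # 0.296
```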