Update space
Browse files
- app.py +43 -12
- src/display/utils.py +4 -0
- src/leaderboard/read_evals.py +4 -0
- src/populate.py +2 -2
app.py
CHANGED
@@ -104,7 +104,8 @@ def init_leaderboard(dataframe):
 # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
 # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
 # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
-model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
+# model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
+model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
 # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
 
 
@@ -131,17 +132,33 @@ with demo:
     gr.HTML(TITLE)
     gr.HTML(SUB_TITLE)
     gr.HTML(EXTERNAL_LINKS)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    # gr.HTML('<p style="font-size:15px;">This is a larger text using HTML in Markdown.</p>')
+    INTRODUCTION_TEXT_FONT_SIZE = 16
+    INTRODUCTION_TEXT = (
+        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+        '<strong>Decentralized Arena</strong> automates, scales, and accelerates "<a href="https://lmarena.ai/">Chatbot Arena</a>" '
+        'for large language model (LLM) evaluation across diverse, fine-grained dimensions, '
+        'such as mathematics (algebra, geometry, probability), logical reasoning, social reasoning, biology, chemistry, and more. '
+        'The evaluation is decentralized and democratic, with all participating LLMs assessing each other to ensure unbiased and fair results. '
+        'With a 95% correlation to Chatbot Arena\'s overall rankings, the system is fully transparent and reproducible.'
+        '</p>'
+        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+        'We actively invite <b>model developers</b> to participate and expedite their benchmarking efforts '
+        'and encourage <b>data stakeholders</b> to freely define and evaluate dimensions of interest for their own objectives.'
+        '</p>'
+    )
+    gr.HTML(INTRODUCTION_TEXT)
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
         with gr.TabItem("π Overview", elem_id="llm-benchmark-tab-table", id=0):
 
             DESCRIPTION_TEXT = """
-Total #models: 53 (Last updated: 2024-10-
+Total #models: 53 (Last updated: 2024-10-09)
 
-This page
-(
+This page provides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks.
+(Missing values are due to slow or problematic model responses, which will be fixed soon.)
 """
             gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
 
@@ -158,6 +175,7 @@ with demo:
                     AutoEvalColumn.rank_reason_logical.name,
                     AutoEvalColumn.rank_reason_social.name,
                     AutoEvalColumn.rank_chemistry.name,
+                    AutoEvalColumn.rank_cpp.name,
                 ],
                 rank_col=[],
             )
@@ -374,19 +392,31 @@ with demo:
 """
                 gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
 
-            with gr.TabItem("
-
-
-
-
+            with gr.TabItem("β C++", elem_id="cpp_subtab", id=0, elem_classes="subtab"):
+
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.rank_cpp.name,
+                            AutoEvalColumn.model.name,
+                            AutoEvalColumn.score_cpp.name,
+                            # AutoEvalColumn.sd_cpp.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+                        ],
+                        rank_col=[AutoEvalColumn.rank_cpp.name],
+                    )
+                )
 
-            with gr.TabItem("
+            with gr.TabItem("π Python", elem_id="python_subtab", id=1, elem_classes="subtab"):
                 CURRENT_TEXT = """
 # Coming soon!
 """
                 gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
 
-            with gr.TabItem("
+            with gr.TabItem("β Java", elem_id="java_subtab", id=2, elem_classes="subtab"):
                 CURRENT_TEXT = """
 # Coming soon!
 """
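The Python and Java subtabs added above are placeholders for now. Once score, standard-deviation, and rank columns exist for those dimensions, wiring them up would presumably mirror the C++ block. A hypothetical sketch only: `rank_python` and `score_python` are not defined in `AutoEvalColumn` yet, and the snippet assumes it is dropped inside the existing `with gr.Tabs(...)` block.

```python
# Hypothetical follow-up mirroring the C++ subtab above;
# rank_python / score_python are assumed future columns, not part of this change.
with gr.TabItem("Python", elem_id="python_subtab", id=1, elem_classes="subtab"):
    leaderboard = overall_leaderboard(
        get_model_leaderboard_df(
            model_result_path,
            benchmark_cols=[
                AutoEvalColumn.rank_python.name,   # assumed future column
                AutoEvalColumn.model.name,
                AutoEvalColumn.score_python.name,  # assumed future column
                AutoEvalColumn.license.name,
                AutoEvalColumn.organization.name,
                AutoEvalColumn.knowledge_cutoff.name,
            ],
            rank_col=[AutoEvalColumn.rank_python.name],  # sort by Python rank, drop models without it
        )
    )
```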
@@ -395,6 +425,7 @@ with demo:
 
 
 
+
         with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=6):
             ABOUT_TEXT = """
 # About Us
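`overall_leaderboard` and `init_leaderboard` are defined elsewhere in app.py and are not shown in this diff. As a rough mental model only, not the Space's actual implementation, a stand-in that simply renders the ranked DataFrame returned by `get_model_leaderboard_df` could look like this:

```python
import gradio as gr
import pandas as pd

def overall_leaderboard(df: pd.DataFrame):
    # Stand-in only: the real helper presumably configures column datatypes,
    # search, and styling; here we just render the already-ranked table.
    return gr.Dataframe(value=df, interactive=False, wrap=True)
```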
src/display/utils.py
CHANGED
@@ -89,6 +89,10 @@ auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_fa
 auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
 auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])
 
+auto_eval_column_dict.append(["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))])
+auto_eval_column_dict.append(["sd_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (C++)", "number", True))])
+auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])
+
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
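`ColumnContent` itself is not part of this diff. Assuming the usual leaderboard-template shape (display name, type, shown-by-default flag, plus optional hidden/never_hidden flags), the three new entries register the display names that app.py and populate.py later refer to: "Score (C++)", "Std dev (C++)", "Rank (C++)". A minimal self-contained sketch of that assumption:

```python
from dataclasses import dataclass, field

@dataclass
class ColumnContent:
    # Assumed field order, matching the positional calls above.
    name: str                   # display name, becomes the DataFrame column header
    type: str                   # "number", "str", "markdown", ...
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))])
auto_eval_column_dict.append(["sd_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (C++)", "number", True))])
auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])

# Each entry pairs an attribute name with a display name; the rest of the code base
# uses the display names, e.g. AutoEvalColumn.rank_cpp.name -> "Rank (C++)".
for attr_name, _, spec in auto_eval_column_dict:
    print(attr_name, "->", spec.default_factory().name)
```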
src/leaderboard/read_evals.py
CHANGED
@@ -189,6 +189,10 @@ class ModelResult:
             AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
             AutoEvalColumn.rank_chemistry.name: self.results.get("Chemistry").get("Rank", None) if self.results.get("Chemistry") else None,
 
+            AutoEvalColumn.score_cpp.name: self.results.get("CPP").get("Average Score", None) if self.results.get("CPP") else None,
+            AutoEvalColumn.sd_cpp.name: self.results.get("CPP").get("Standard Deviation", None) if self.results.get("CPP") else None,
+            AutoEvalColumn.rank_cpp.name: self.results.get("CPP").get("Rank", None) if self.results.get("CPP") else None,
+
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.organization.name: self.org,
             AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
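The guarded `self.results.get("CPP") ... if ... else None` pattern keeps models that have not been evaluated on C++ yet from raising errors. A sketch of the assumed per-model results shape (key names taken from the lines above; values are dummies) and the equivalent lookup:

```python
# Assumed shape of one model's self.results entry; the "CPP" block mirrors the
# existing per-dimension blocks. Values below are dummies for illustration.
results = {
    "Chemistry": {"Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 1},
    "CPP": {"Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 1},
}

def read_metric(results: dict, dimension: str, key: str):
    # Equivalent of the guarded one-liners above: None if the dimension is missing.
    block = results.get(dimension)
    return block.get(key, None) if block else None

print(read_metric(results, "CPP", "Rank"))     # 1
print(read_metric(results, "Python", "Rank"))  # None: dimension not evaluated yet
```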
src/populate.py
CHANGED
@@ -24,7 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
     if rank_col: # if there is one col in rank_col, sort by that column and remove NaN values
         df = df.dropna(subset=benchmark_cols)
         df = df.sort_values(by=[rank_col[0]], ascending=True)
-        # print(rank_col)
+        # print(rank_col, benchmark_cols)
     else:
         # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
         avg_rank = df.iloc[:, 1:].mean(axis=1)
@@ -43,7 +43,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
             # print(col)
             # if 'Std dev' in col or 'Score' in col:
             if 'Std dev' in col or 'Score' in col:
-                if "Chemistry" in col:
+                if "Chemistry" in col or "C++" in col:
                     df[col] = (df[col]).map('{:.2f}'.format)
                 else:
                     df[col] = (df[col]*100).map('{:.2f}'.format)
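The widened condition means C++ scores, like Chemistry scores, are only rounded for display, while the other dimensions' scores are presumably stored as fractions and shown as percentages. A toy illustration of the two branches (dummy values, pandas assumed):

```python
import pandas as pd

# Dummy frame: "Score (Math)" stands in for a fractional score shown as a percentage,
# "Score (C++)" for a score assumed to already be on its display scale.
df = pd.DataFrame({"Score (Math)": [0.8123], "Score (C++)": [7.456]})

for col in df.columns:
    if "Std dev" in col or "Score" in col:
        if "Chemistry" in col or "C++" in col:
            df[col] = df[col].map("{:.2f}".format)            # 7.456  -> "7.46"
        else:
            df[col] = (df[col] * 100).map("{:.2f}".format)    # 0.8123 -> "81.23"

print(df)
```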