Lisa Dunlap
committed on
Commit
·
df2a130
1
Parent(s):
a2fadac
moved buttons back to tab
Browse files
app.py
CHANGED
@@ -31,42 +31,44 @@ We've collected over **500,000** human preference votes to rank LLMs with the El
|
|
31 |
return leaderboard_md
|
32 |
|
33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
# def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
|
|
|
35 |
# total_votes = sum(arena_df["num_battles"]) // 2
|
36 |
-
#
|
37 |
-
# total_code_votes = sum(arena_chinese_df["num_battles"]) // 2
|
38 |
-
# total_code_models = len(arena_chinese_df)
|
39 |
# total_long_votes = sum(arena_long_df["num_battles"]) // 2
|
40 |
-
# total_long_models = len(arena_long_df)
|
41 |
# total_english_votes = sum(arena_english_df["num_battles"]) // 2
|
42 |
-
# total_english_models = len(arena_english_df)
|
43 |
|
|
|
44 |
# leaderboard_md = f"""
|
45 |
-
#
|
|
|
|
|
|
|
|
|
46 |
|
47 |
-
# Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
|
48 |
# """
|
49 |
-
# return leaderboard_md
|
50 |
-
|
51 |
-
def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
|
52 |
-
# Calculate totals for each arena
|
53 |
-
total_votes = sum(arena_df["num_battles"]) // 2
|
54 |
-
total_chinese_votes = sum(arena_chinese_df["num_battles"]) // 2
|
55 |
-
total_long_votes = sum(arena_long_df["num_battles"]) // 2
|
56 |
-
total_english_votes = sum(arena_english_df["num_battles"]) // 2
|
57 |
-
|
58 |
-
# Constructing the markdown table
|
59 |
-
leaderboard_md = f"""
|
60 |
-
Last updated: March 29, 2024.
|
61 |
-
| | **Total** | English | Chinese | Long Context |
|
62 |
-
| :-------------- | :-----------------------: | :-----------------------: | :-----------------------: | :-----------------------: |
|
63 |
-
| # Votes | **{"{:,}".format(total_votes)}** | {"{:,}".format(total_english_votes)} | {"{:,}".format(total_chinese_votes)} | {"{:,}".format(total_long_votes)} |
|
64 |
-
| # Models | **{len(arena_df)}** | {len(arena_english_df)}| {len(arena_chinese_df)} | {len(arena_long_df)} |
|
65 |
|
66 |
-
|
67 |
-
"""
|
68 |
-
|
69 |
-
return leaderboard_md
|
70 |
|
71 |
|
72 |
|
@@ -231,9 +233,9 @@ def get_full_table(arena_df, model_table_df):
|
|
231 |
|
232 |
def create_ranking_str(ranking, ranking_difference):
|
233 |
if ranking_difference > 0:
|
234 |
-
return f"{int(ranking)} (\u2191
|
235 |
elif ranking_difference < 0:
|
236 |
-
return f"{int(ranking)} (\u2193
|
237 |
else:
|
238 |
return f"{int(ranking)}"
|
239 |
|
@@ -293,12 +295,17 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
|
|
293 |
print(f"{model_key} - {e}")
|
294 |
return values
|
295 |
|
296 |
-
def
|
|
|
297 |
p1 = elo_subset_results["win_fraction_heatmap"]
|
298 |
p2 = elo_subset_results["battle_count_heatmap"]
|
299 |
p3 = elo_subset_results["bootstrap_elo_rating"]
|
300 |
p4 = elo_subset_results["average_win_rate_bar"]
|
301 |
-
|
|
|
|
|
|
|
|
|
302 |
|
303 |
|
304 |
def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
|
@@ -325,6 +332,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
|
|
325 |
default_md = make_default_md(arena_df, elo_results)
|
326 |
|
327 |
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
|
|
|
328 |
if leaderboard_table_file:
|
329 |
data = load_leaderboard_table_csv(leaderboard_table_file)
|
330 |
model_table_df = pd.DataFrame(data)
|
@@ -333,17 +341,21 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
|
|
333 |
# arena table
|
334 |
arena_table_vals = get_arena_table(arena_df, model_table_df)
|
335 |
with gr.Tab("Arena Elo", id=0):
|
336 |
-
md = make_arena_leaderboard_md(arena_df
|
337 |
-
gr.Markdown(md, elem_id="leaderboard_markdown")
|
338 |
with gr.Row():
|
339 |
overall_rating = gr.Button("Overall")
|
340 |
-
update_overall_rating_df = lambda _: get_arena_table(arena_df, model_table_df)
|
|
|
341 |
english_rating = gr.Button("English")
|
342 |
-
update_english_rating_df = lambda
|
|
|
343 |
chinese_rating = gr.Button("Chinese")
|
344 |
-
update_chinese_rating_df = lambda
|
|
|
345 |
long_context_rating = gr.Button("Long Context")
|
346 |
-
update_long_context_rating_df = lambda
|
|
|
347 |
elo_display_df = gr.Dataframe(
|
348 |
headers=[
|
349 |
"Rank",
|
@@ -368,14 +380,14 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
|
|
368 |
value=arena_table_vals,
|
369 |
elem_id="arena_leaderboard_dataframe",
|
370 |
height=700,
|
371 |
-
column_widths=[70, 190,
|
372 |
wrap=True,
|
373 |
)
|
374 |
# Setup the button click action
|
375 |
-
overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=elo_display_df)
|
376 |
-
english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=elo_display_df)
|
377 |
-
chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=elo_display_df)
|
378 |
-
long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=elo_display_df)
|
379 |
|
380 |
with gr.Tab("Full Leaderboard", id=1):
|
381 |
md = make_full_leaderboard_md(elo_results)
|
@@ -418,22 +430,12 @@ See Figure 3 below for visualization of the confidence intervals.
|
|
418 |
leader_component_values[:] = [default_md, p1, p2, p3, p4]
|
419 |
|
420 |
if show_plot:
|
421 |
-
gr.Markdown(
|
422 |
f"""## More Statistics for Chatbot Arena\n
|
423 |
-
|
424 |
-
You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
|
425 |
""",
|
426 |
elem_id="leaderboard_markdown"
|
427 |
)
|
428 |
-
with gr.Row():
|
429 |
-
overall_plots = gr.Button("Overall")
|
430 |
-
update_overall_plots = lambda _: get_plots(elo_results)
|
431 |
-
english_plots = gr.Button("English")
|
432 |
-
update_english_plot = lambda _: get_plots(elo_english_results)
|
433 |
-
chinese_plots = gr.Button("Chinese")
|
434 |
-
update_chinese_plot = lambda _: get_plots(elo_chinese_results)
|
435 |
-
long_context_plots = gr.Button("Long Context")
|
436 |
-
update_long_context_plot = lambda _: get_plots(elo_long_results)
|
437 |
with gr.Row():
|
438 |
with gr.Column():
|
439 |
gr.Markdown(
|
@@ -457,10 +459,10 @@ You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12
|
|
457 |
)
|
458 |
plot_4 = gr.Plot(p4, show_label=False)
|
459 |
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
|
465 |
gr.Markdown(acknowledgment_md)
|
466 |
|
@@ -494,7 +496,7 @@ block_css = """
|
|
494 |
|
495 |
#arena_leaderboard_dataframe td {
|
496 |
line-height: 0.15em;
|
497 |
-
font-size:
|
498 |
}
|
499 |
#arena_leaderboard_dataframe th {
|
500 |
font-size: 20px;
|
@@ -503,7 +505,7 @@ block_css = """
|
|
503 |
|
504 |
#full_leaderboard_dataframe td {
|
505 |
line-height: 0.15em;
|
506 |
-
font-size:
|
507 |
}
|
508 |
#full_leaderboard_dataframe th {
|
509 |
font-size: 20px;
|
@@ -538,12 +540,10 @@ We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a1
|
|
538 |
|
539 |
def build_demo(elo_results_file, leaderboard_table_file):
|
540 |
text_size = gr.themes.sizes.text_lg
|
541 |
-
print("text_size", text_size)
|
542 |
|
543 |
with gr.Blocks(
|
544 |
title="Chatbot Arena Leaderboard",
|
545 |
-
# theme=gr.themes.
|
546 |
-
# theme='reilnuud/polite',
|
547 |
theme = gr.themes.Base.load("theme.json"),
|
548 |
css=block_css,
|
549 |
) as demo:
|
|
|
31 |
return leaderboard_md
|
32 |
|
33 |
|
34 |
+
def make_arena_leaderboard_md(arena_df, arena_subset_df=None, name="Overall"):
    """Build the markdown header shown above the Arena Elo table.

    Args:
        arena_df: Table for the overall arena. Its ``"num_battles"`` column is
            summed and its length is reported as the total model count.
        arena_subset_df: Optional table for one input category, read the same
            way as ``arena_df``. When ``None``, no per-category figures are
            added to the header.
        name: Category label used in the per-category figures and in the
            "currently viewing" sentence.

    Returns:
        The markdown text as a single string.

    Note:
        Reads the module-level ``notebook_url`` for the notebook link.
    """
    # The summed "num_battles" column is halved before display
    # (presumably each battle is counted once per participant — TODO confirm).
    total_votes = sum(arena_df["num_battles"]) // 2
    total_models = len(arena_df)

    # NOTE(review): separator literal copied as it appears in this listing;
    # the upstream file may spell it with HTML non-breaking-space entities —
    # confirm against the real app.py before relying on it.
    space = "   "

    if arena_subset_df is None:
        vote_str = ""
    else:
        total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
        total_subset_models = len(arena_subset_df)
        vote_str = (
            f"{space} {name} #models: **{total_subset_models}**."
            f"{space} {name} #votes: **{total_subset_votes:,}**."
        )

    # The ``:,`` format spec inserts thousands separators, exactly like the
    # ``"{:,}".format(...)`` calls it replaces.
    leaderboard_md = f"""
Total #models: **{total_models}**.{space} Total #votes: **{total_votes:,}**.{vote_str}{space} Last updated: March 29, 2024.

Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).

**NEW!** Click the buttons below to view the ELO leaderboard and stats for different input categories. You are currently viewing **{name}** inputs.
"""
    return leaderboard_md
|
52 |
+
|
53 |
# def make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df):
|
54 |
+
# # Calculate totals for each arena
|
55 |
# total_votes = sum(arena_df["num_battles"]) // 2
|
56 |
+
# total_chinese_votes = sum(arena_chinese_df["num_battles"]) // 2
|
|
|
|
|
57 |
# total_long_votes = sum(arena_long_df["num_battles"]) // 2
|
|
|
58 |
# total_english_votes = sum(arena_english_df["num_battles"]) // 2
|
|
|
59 |
|
60 |
+
# # Constructing the markdown table
|
61 |
# leaderboard_md = f"""
|
62 |
+
# Last updated: March 29, 2024.
|
63 |
+
# | | **Total** | English | Chinese | Long Context |
|
64 |
+
# | :-------------- | :-----------------------: | :-----------------------: | :-----------------------: | :-----------------------: |
|
65 |
+
# | # Votes | **{"{:,}".format(total_votes)}** | {"{:,}".format(total_english_votes)} | {"{:,}".format(total_chinese_votes)} | {"{:,}".format(total_long_votes)} |
|
66 |
+
# | # Models | **{len(arena_df)}** | {len(arena_english_df)}| {len(arena_chinese_df)} | {len(arena_long_df)} |
|
67 |
|
68 |
+
# Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! You can find code to recreate these tables and plots in this [notebook]({notebook_url}).
|
69 |
# """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
+
# return leaderboard_md
|
|
|
|
|
|
|
72 |
|
73 |
|
74 |
|
|
|
233 |
|
234 |
def create_ranking_str(ranking, ranking_difference):
    """Format a rank, annotated with the direction and size of its change.

    Args:
        ranking: Numeric rank; truncated to an integer with ``int``.
        ranking_difference: Signed rank movement. A positive value is shown
            with an up arrow, a negative value with a down arrow, each
            followed by the integer magnitude of the movement.

    Returns:
        ``"<rank> (<up-arrow><n>)"`` for a positive difference,
        ``"<rank> (<down-arrow><n>)"`` for a negative one, and the bare
        ``"<rank>"`` otherwise.
    """
    if ranking_difference > 0:
        return f"{int(ranking)} (\u2191{int(ranking_difference)})"
    if ranking_difference < 0:
        # Negate so the magnitude is printed without a minus sign.
        return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
    return f"{int(ranking)}"
|
241 |
|
|
|
295 |
print(f"{model_key} - {e}")
|
296 |
return values
|
297 |
|
298 |
+
def update_leaderboard_and_plots(button, arena_df, model_table_df, arena_subset_df, elo_subset_results):
    """Collect the table rows, plots and markdown for one category button.

    Args:
        button: Label of the selected category. It is interpolated into the
            "More Statistics" heading and passed on as ``name`` to
            ``make_arena_leaderboard_md``.
        arena_df: Overall arena table, forwarded to both helpers.
        model_table_df: Model table, forwarded to ``get_arena_table``.
        arena_subset_df: Category-specific arena table, forwarded to both
            helpers; the "Overall" handler in this listing passes ``None``.
        elo_subset_results: Mapping that provides the four plot objects under
            the keys ``"win_fraction_heatmap"``, ``"battle_count_heatmap"``,
            ``"bootstrap_elo_rating"`` and ``"average_win_rate_bar"``.

    Returns:
        A 7-tuple ``(arena_values, p1, p2, p3, p4, more_stats_md,
        leaderboard_md)``. The order matters: the click handlers list their
        output components in this same order.
    """
    arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df)

    # Pull the four plots in the order the UI displays them.
    plot_keys = (
        "win_fraction_heatmap",
        "battle_count_heatmap",
        "bootstrap_elo_rating",
        "average_win_rate_bar",
    )
    p1, p2, p3, p4 = (elo_subset_results[key] for key in plot_keys)

    more_stats_md = f"""## More Statistics for Chatbot Arena ({button})\n
You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
"""
    leaderboard_md = make_arena_leaderboard_md(arena_df, arena_subset_df, name=button)
    return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
|
309 |
|
310 |
|
311 |
def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
|
|
|
332 |
default_md = make_default_md(arena_df, elo_results)
|
333 |
|
334 |
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
|
335 |
+
# md = make_arena_leaderboard_md(arena_df, arena_chinese_df, arena_long_df, arena_english_df)
|
336 |
if leaderboard_table_file:
|
337 |
data = load_leaderboard_table_csv(leaderboard_table_file)
|
338 |
model_table_df = pd.DataFrame(data)
|
|
|
341 |
# arena table
|
342 |
arena_table_vals = get_arena_table(arena_df, model_table_df)
|
343 |
with gr.Tab("Arena Elo", id=0):
|
344 |
+
md = make_arena_leaderboard_md(arena_df)
|
345 |
+
leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
|
346 |
with gr.Row():
|
347 |
overall_rating = gr.Button("Overall")
|
348 |
+
# update_overall_rating_df = lambda _: get_arena_table(arena_df, model_table_df)
|
349 |
+
update_overall_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, None, elo_results)
|
350 |
english_rating = gr.Button("English")
|
351 |
+
update_english_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_english_df, elo_english_results)
|
352 |
+
# update_english_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_english_df)
|
353 |
chinese_rating = gr.Button("Chinese")
|
354 |
+
update_chinese_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_chinese_df, elo_chinese_results)
|
355 |
+
# update_chinese_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_chinese_df)
|
356 |
long_context_rating = gr.Button("Long Context")
|
357 |
+
update_long_context_rating_df = lambda x: update_leaderboard_and_plots(x, arena_df, model_table_df, arena_long_df, elo_long_results)
|
358 |
+
# update_long_context_rating_df = lambda _: get_arena_table(arena_df, model_table_df, arena_long_df)
|
359 |
elo_display_df = gr.Dataframe(
|
360 |
headers=[
|
361 |
"Rank",
|
|
|
380 |
value=arena_table_vals,
|
381 |
elem_id="arena_leaderboard_dataframe",
|
382 |
height=700,
|
383 |
+
column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
|
384 |
wrap=True,
|
385 |
)
|
386 |
# Setup the button click action
|
387 |
+
# overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=elo_display_df)
|
388 |
+
# english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=elo_display_df)
|
389 |
+
# chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=elo_display_df)
|
390 |
+
# long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=elo_display_df)
|
391 |
|
392 |
with gr.Tab("Full Leaderboard", id=1):
|
393 |
md = make_full_leaderboard_md(elo_results)
|
|
|
430 |
leader_component_values[:] = [default_md, p1, p2, p3, p4]
|
431 |
|
432 |
if show_plot:
|
433 |
+
more_stats_md = gr.Markdown(
|
434 |
f"""## More Statistics for Chatbot Arena\n
|
435 |
+
You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
|
|
|
436 |
""",
|
437 |
elem_id="leaderboard_markdown"
|
438 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
439 |
with gr.Row():
|
440 |
with gr.Column():
|
441 |
gr.Markdown(
|
|
|
459 |
)
|
460 |
plot_4 = gr.Plot(p4, show_label=False)
|
461 |
|
462 |
+
overall_rating.click(fn=update_overall_rating_df, inputs=overall_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
|
463 |
+
english_rating.click(fn=update_english_rating_df, inputs=english_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
|
464 |
+
chinese_rating.click(fn=update_chinese_rating_df, inputs=chinese_rating ,outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
|
465 |
+
long_context_rating.click(fn=update_long_context_rating_df, inputs=long_context_rating, outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, leaderboard_markdown])
|
466 |
|
467 |
gr.Markdown(acknowledgment_md)
|
468 |
|
|
|
496 |
|
497 |
#arena_leaderboard_dataframe td {
|
498 |
line-height: 0.15em;
|
499 |
+
font-size: 18px;
|
500 |
}
|
501 |
#arena_leaderboard_dataframe th {
|
502 |
font-size: 20px;
|
|
|
505 |
|
506 |
#full_leaderboard_dataframe td {
|
507 |
line-height: 0.15em;
|
508 |
+
font-size: 18px;
|
509 |
}
|
510 |
#full_leaderboard_dataframe th {
|
511 |
font-size: 20px;
|
|
|
540 |
|
541 |
def build_demo(elo_results_file, leaderboard_table_file):
|
542 |
text_size = gr.themes.sizes.text_lg
|
|
|
543 |
|
544 |
with gr.Blocks(
|
545 |
title="Chatbot Arena Leaderboard",
|
546 |
+
# theme=gr.themes.Base(text_size=text_size),
|
|
|
547 |
theme = gr.themes.Base.load("theme.json"),
|
548 |
css=block_css,
|
549 |
) as demo:
|