weichiang committed on
Commit
99bab3b
·
1 Parent(s): 13ecd9b
Files changed (1) hide show
  1. app.py +50 -27
app.py CHANGED
@@ -39,7 +39,7 @@ def make_arena_leaderboard_md(arena_df):
39
  leaderboard_md = f"""
40
  Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: March 29, 2024.
41
 
42
- **NEW!** View ELO leaderboard and stats for different input categories.
43
  """
44
  return leaderboard_md
45
 
@@ -49,8 +49,8 @@ def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"
49
  space = "   "
50
  total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
51
  total_subset_models = len(arena_subset_df)
52
- leaderboard_md = f"""### {name} Question Coverage
53
- #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)**.{space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**.{space}
54
  """
55
  return leaderboard_md
56
 
@@ -259,10 +259,14 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
259
  arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
260
  arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
261
  # join arena_df and arena_subset_df on index
262
- arena_df = arena_subset_df.join(arena_df["final_ranking_no_tie"], rsuffix="_global", how="inner")
263
- # arena_df = arena_subset_df.join(arena_df["rating"], rsuffix="_global", how="inner")
264
- arena_df["ranking_difference"] = arena_df["final_ranking_no_tie_global"] - arena_df["final_ranking_no_tie"]
265
- # arena_df['ranking_difference'] = arena_df['rating_global'] - arena_df['rating']
 
 
 
 
266
  arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
267
  arena_df["final_ranking"] = arena_df.apply(lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]), axis=1)
268
 
@@ -310,7 +314,25 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
310
  print(f"{model_key} - {e}")
311
  return values
312
 
313
- key_to_category_name = {"full": "Total", "coding": "Coding", "long": "Long Conversation", "english": "English", "chinese": "Chinese"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
  def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
316
  arena_dfs = {}
@@ -328,12 +350,12 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
328
  arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
329
  category_elo_results[key_to_category_name[k]] = elo_results[k]
330
 
331
- p1 = category_elo_results["Total"]["win_fraction_heatmap"]
332
- p2 = category_elo_results["Total"]["battle_count_heatmap"]
333
- p3 = category_elo_results["Total"]["bootstrap_elo_rating"]
334
- p4 = category_elo_results["Total"]["average_win_rate_bar"]
335
- arena_df = arena_dfs["Total"]
336
- default_md = make_default_md(arena_df, category_elo_results["Total"])
337
 
338
  md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
339
  if leaderboard_table_file:
@@ -347,9 +369,10 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
347
  md = make_arena_leaderboard_md(arena_df)
348
  leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
349
  with gr.Row():
350
- category_dropdown = gr.Dropdown(choices=list(arena_dfs.keys()), label="Category", value="Total")
351
- default_category_details = make_category_arena_leaderboard_md(arena_df, arena_df, name="Toal")
352
- with gr.Column(variant="panel"):
 
353
  category_deets = gr.Markdown(default_category_details, elem_id="category_deets")
354
 
355
  elo_display_df = gr.Dataframe(
@@ -364,7 +387,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
364
  "Knowledge Cutoff",
365
  ],
366
  datatype=[
367
- "str",
368
  "markdown",
369
  "number",
370
  "str",
@@ -449,7 +472,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
449
  pass
450
 
451
  def update_leaderboard_df(arena_table_vals):
452
- elo_datarame = pd.DataFrame(arena_table_vals, columns=[ "Rank", "Δ", "🤖 Model", "⭐ Arena Elo", "📊 95% CI", "🗳️ Votes", "Organization", "License", "Knowledge Cutoff"])
453
 
454
  # goal: color the rows based on the rank with styler
455
  def highlight_max(s):
@@ -459,20 +482,20 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
459
  def highlight_rank_max(s):
460
  return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
461
 
462
- return elo_datarame.style.apply(highlight_max, subset=["Rank"]).apply(highlight_rank_max, subset=["Δ"])
463
 
464
  def update_leaderboard_and_plots(category):
465
  arena_subset_df = arena_dfs[category]
466
  arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
467
  elo_subset_results = category_elo_results[category]
468
- arena_df = arena_dfs["Total"]
469
- arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df = arena_subset_df if category != "Total" else None)
470
- if category != "Total":
471
  arena_values = update_leaderboard_df(arena_values)
472
  arena_values = gr.Dataframe(
473
  headers=[
474
  "Rank",
475
- "Δ",
476
  "🤖 Model",
477
  "⭐ Arena Elo",
478
  "📊 95% CI",
@@ -482,7 +505,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
482
  "Knowledge Cutoff",
483
  ],
484
  datatype=[
485
- "str",
486
  "number",
487
  "markdown",
488
  "number",
@@ -495,7 +518,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
495
  value=arena_values,
496
  elem_id="arena_leaderboard_dataframe",
497
  height=700,
498
- column_widths=[50, 50, 190, 110, 100, 90, 160, 150, 140],
499
  wrap=True,
500
  )
501
  else:
@@ -511,7 +534,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
511
  "Knowledge Cutoff",
512
  ],
513
  datatype=[
514
- "str",
515
  "markdown",
516
  "number",
517
  "str",
 
39
  leaderboard_md = f"""
40
  Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: March 29, 2024.
41
 
42
+ 📣 **NEW!** View leaderboard for different categories (e.g., coding, long user query)!
43
  """
44
  return leaderboard_md
45
 
 
49
  space = "&nbsp;&nbsp;&nbsp;"
50
  total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
51
  total_subset_models = len(arena_subset_df)
52
+ leaderboard_md = f"""### {cat_name_to_explanation[name]}
53
+ #### [Coverage] {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
54
  """
55
  return leaderboard_md
56
 
 
259
  arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
260
  arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
261
  # join arena_df and arena_subset_df on index
262
+ arena_df = arena_subset_df.join(arena_df["final_ranking"], rsuffix="_global", how="inner")
263
+ arena_df["ranking_difference"] = arena_df["final_ranking_global"] - arena_df["final_ranking"]
264
+
265
+ # no tie version
266
+ # arena_df = arena_subset_df.join(arena_df["final_ranking_no_tie"], rsuffix="_global", how="inner")
267
+ # arena_df["ranking_difference"] = arena_df["final_ranking_no_tie_global"] - arena_df["final_ranking_no_tie"]
268
+
269
+ arena_df = arena_df.sort_values(by=["rating"], ascending=False)
270
  arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
271
  arena_df["final_ranking"] = arena_df.apply(lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]), axis=1)
272
 
 
314
  print(f"{model_key} - {e}")
315
  return values
316
 
317
+ key_to_category_name = {
318
+ "full": "Overall",
319
+ "coding": "Coding",
320
+ "long_user": "Longer query",
321
+ "english": "English",
322
+ "chinese": "Chinese",
323
+ "no_tie": "Exclude Ties",
324
+ "no_short": "Exclude Short",
325
+ }
326
+ cat_name_to_explanation = {
327
+ "Overall": "Overall Questions",
328
+ "Coding": "Coding: whether conversation contains code snippets",
329
+ "Longer query": "Longer user query (>= 500 tokens)",
330
+ "English": "English Prompts",
331
+ "Chinese": "Chinese Prompts",
332
+ "Exclude Ties": "Exclude Ties and Bothbad",
333
+ "Exclude Short": "User Query >= 5 tokens",
334
+ }
335
+
336
 
337
  def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
338
  arena_dfs = {}
 
350
  arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
351
  category_elo_results[key_to_category_name[k]] = elo_results[k]
352
 
353
+ p1 = category_elo_results["Overall"]["win_fraction_heatmap"]
354
+ p2 = category_elo_results["Overall"]["battle_count_heatmap"]
355
+ p3 = category_elo_results["Overall"]["bootstrap_elo_rating"]
356
+ p4 = category_elo_results["Overall"]["average_win_rate_bar"]
357
+ arena_df = arena_dfs["Overall"]
358
+ default_md = make_default_md(arena_df, category_elo_results["Overall"])
359
 
360
  md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
361
  if leaderboard_table_file:
 
369
  md = make_arena_leaderboard_md(arena_df)
370
  leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
371
  with gr.Row():
372
+ with gr.Column(scale=2):
373
+ category_dropdown = gr.Dropdown(choices=list(arena_dfs.keys()), label="Category", value="Overall")
374
+ default_category_details = make_category_arena_leaderboard_md(arena_df, arena_df, name="Overall")
375
+ with gr.Column(scale=4, variant="panel"):
376
  category_deets = gr.Markdown(default_category_details, elem_id="category_deets")
377
 
378
  elo_display_df = gr.Dataframe(
 
387
  "Knowledge Cutoff",
388
  ],
389
  datatype=[
390
+ "number",
391
  "markdown",
392
  "number",
393
  "str",
 
472
  pass
473
 
474
  def update_leaderboard_df(arena_table_vals):
475
+ elo_datarame = pd.DataFrame(arena_table_vals, columns=[ "Rank", "Delta", "🤖 Model", "⭐ Arena Elo", "📊 95% CI", "🗳️ Votes", "Organization", "License", "Knowledge Cutoff"])
476
 
477
  # goal: color the rows based on the rank with styler
478
  def highlight_max(s):
 
482
  def highlight_rank_max(s):
483
  return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
484
 
485
+ return elo_datarame.style.apply(highlight_max, subset=["Rank"]).apply(highlight_rank_max, subset=["Delta"])
486
 
487
  def update_leaderboard_and_plots(category):
488
  arena_subset_df = arena_dfs[category]
489
  arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
490
  elo_subset_results = category_elo_results[category]
491
+ arena_df = arena_dfs["Overall"]
492
+ arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df = arena_subset_df if category != "Overall" else None)
493
+ if category != "Overall":
494
  arena_values = update_leaderboard_df(arena_values)
495
  arena_values = gr.Dataframe(
496
  headers=[
497
  "Rank",
498
+ "Delta",
499
  "🤖 Model",
500
  "⭐ Arena Elo",
501
  "📊 95% CI",
 
505
  "Knowledge Cutoff",
506
  ],
507
  datatype=[
508
+ "number",
509
  "number",
510
  "markdown",
511
  "number",
 
518
  value=arena_values,
519
  elem_id="arena_leaderboard_dataframe",
520
  height=700,
521
+ column_widths=[60, 70, 190, 110, 100, 90, 160, 150, 140],
522
  wrap=True,
523
  )
524
  else:
 
534
  "Knowledge Cutoff",
535
  ],
536
  datatype=[
537
+ "number",
538
  "markdown",
539
  "number",
540
  "str",