yzabc007 committed
Commit 3d466ff · 1 Parent(s): 29f697b

Update space

app.py CHANGED
@@ -104,7 +104,8 @@ def init_leaderboard(dataframe):
  # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
  # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
  # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
- model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
+ # model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
+ model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)

@@ -131,17 +132,33 @@ with demo:
      gr.HTML(TITLE)
      gr.HTML(SUB_TITLE)
      gr.HTML(EXTERNAL_LINKS)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+     # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+     # gr.HTML('<p style="font-size:15px;">This is a larger text using HTML in Markdown.</p>')
+     INTRODUCTION_TEXT_FONT_SIZE = 16
+     INTRODUCTION_TEXT = (
+         f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+         '<strong>Decentralized Arena</strong> automates, scales, and accelerates "<a href="https://lmarena.ai/">Chatbot Arena</a>" '
+         'for large language model (LLM) evaluation across diverse, fine-grained dimensions, '
+         'such as mathematics (algebra, geometry, probability), logical reasoning, social reasoning, biology, chemistry, and more. '
+         'The evaluation is decentralized and democratic, with all participating LLMs assessing each other to ensure unbiased and fair results. '
+         'With a 95% correlation to Chatbot Arena\'s overall rankings, the system is fully transparent and reproducible.'
+         '</p>'
+         f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+         'We actively invite <b>model developers</b> to participate and expedite their benchmarking efforts '
+         'and encourage <b>data stakeholders</b> to freely define and evaluate dimensions of interest for their own objectives.'
+         '</p>'
+     )
+     gr.HTML(INTRODUCTION_TEXT)

      with gr.Tabs(elem_classes="tab-buttons") as tabs:

          with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):

              DESCRIPTION_TEXT = """
-             Total #models: 53 (Last updated: 2024-10-08)
+             Total #models: 53 (Last updated: 2024-10-09)

-             This page provids a comprehensive overview of model ranks across various dimensions. Models are sorted based on their averaged rank across all dimensions.
-             (Some missing values are due to the slow or problemtic model responses, and we will update the leaderboard once we have the complete results.)
+             This page provides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks.
+             (Missing values are due to slow or problematic model responses, which will be fixed soon.)
              """
              gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

@@ -158,6 +175,7 @@ with demo:
                  AutoEvalColumn.rank_reason_logical.name,
                  AutoEvalColumn.rank_reason_social.name,
                  AutoEvalColumn.rank_chemistry.name,
+                 AutoEvalColumn.rank_cpp.name,
              ],
              rank_col=[],
          )

@@ -374,19 +392,31 @@ with demo:
              """
              gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

-         with gr.TabItem("🐍 Python", elem_id="python_subtab", id=0, elem_classes="subtab"):
-             CURRENT_TEXT = """
-             # Coming soon!
-             """
-             gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
+         with gr.TabItem("➕ C++", elem_id="cpp_subtab", id=0, elem_classes="subtab"):
+
+             leaderboard = overall_leaderboard(
+                 get_model_leaderboard_df(
+                     model_result_path,
+                     benchmark_cols=[
+                         AutoEvalColumn.rank_cpp.name,
+                         AutoEvalColumn.model.name,
+                         AutoEvalColumn.score_cpp.name,
+                         # AutoEvalColumn.sd_cpp.name,
+                         AutoEvalColumn.license.name,
+                         AutoEvalColumn.organization.name,
+                         AutoEvalColumn.knowledge_cutoff.name,
+                     ],
+                     rank_col=[AutoEvalColumn.rank_cpp.name],
+                 )
+             )

-         with gr.TabItem("☕ Java", elem_id="java_subtab", id=1, elem_classes="subtab"):
+         with gr.TabItem("🐍 Python", elem_id="python_subtab", id=1, elem_classes="subtab"):
              CURRENT_TEXT = """
              # Coming soon!
              """
              gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

-         with gr.TabItem("➕ C++", elem_id="cpp_subtab", id=2, elem_classes="subtab"):
+         with gr.TabItem("☕ Java", elem_id="java_subtab", id=2, elem_classes="subtab"):
              CURRENT_TEXT = """
              # Coming soon!
              """

@@ -395,6 +425,7 @@ with demo:



+
          with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
              ABOUT_TEXT = """
              # About Us
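With rank_col=[] (the Overview tab), get_model_leaderboard_df in src/populate.py falls back to averaging the per-dimension rank columns, so adding AutoEvalColumn.rank_cpp.name to benchmark_cols folds the new C++ ranking into the overall ordering. A minimal sketch of that averaging path with illustrative data (assuming, for illustration only, that the first column is the model name; what populate.py does with avg_rank beyond this is not shown in the diff):

import pandas as pd

# Illustrative frame: first column is the model name, the rest are per-dimension ranks.
df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "Rank (Chemistry)": [1, 3, 2],
    "Rank (C++)": [2, 1, 3],
})

# Mirror of the rank_col=[] branch: average every rank column after the model name,
# then order models by that average (lower is better).
avg_rank = df.iloc[:, 1:].mean(axis=1)
df = df.assign(**{"Average Rank": avg_rank}).sort_values("Average Rank", ascending=True)
print(df)  # model-a (1.5), model-b (2.0), model-c (2.5)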
src/display/utils.py CHANGED
@@ -89,6 +89,10 @@ auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_fa
  auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
  auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])

+ auto_eval_column_dict.append(["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))])
+ auto_eval_column_dict.append(["sd_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (C++)", "number", True))])
+ auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])
+
  for task in Tasks:
      auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
      auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
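The three new *_cpp entries only become usable as AutoEvalColumn.score_cpp / sd_cpp / rank_cpp once auto_eval_column_dict is materialized into the AutoEvalColumn accessor, which this hunk does not show. A minimal, self-contained sketch of that wiring, assuming the common leaderboard-template pattern (the ColumnContent definition and the make_dataclass call here are assumptions, not part of this diff):

from dataclasses import dataclass, field, make_dataclass

# Assumed shape of ColumnContent, for illustration only.
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])

# Materialize the accumulated columns into a frozen dataclass instance, so other modules
# can refer to AutoEvalColumn.rank_cpp.name instead of raw header strings.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)()
print(AutoEvalColumn.rank_cpp.name)  # -> Rank (C++)

Under that assumption, AutoEvalColumn.rank_cpp.name resolves to the "Rank (C++)" header that app.py passes to get_model_leaderboard_df.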
src/leaderboard/read_evals.py CHANGED
@@ -189,6 +189,10 @@ class ModelResult:
      AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
      AutoEvalColumn.rank_chemistry.name: self.results.get("Chemistry").get("Rank", None) if self.results.get("Chemistry") else None,

+     AutoEvalColumn.score_cpp.name: self.results.get("CPP").get("Average Score", None) if self.results.get("CPP") else None,
+     AutoEvalColumn.sd_cpp.name: self.results.get("CPP").get("Standard Deviation", None) if self.results.get("CPP") else None,
+     AutoEvalColumn.rank_cpp.name: self.results.get("CPP").get("Rank", None) if self.results.get("CPP") else None,
+
      AutoEvalColumn.license.name: self.license,
      AutoEvalColumn.organization.name: self.org,
      AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
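These lookups assume each model entry in the results JSON now carries a "CPP" block with the same keys as the other dimensions ("Average Score", "Standard Deviation", "Rank"). A small sketch of that shape and of the None-guard, with purely illustrative numbers:

# Illustrative per-model results entry (values are placeholders, not real leaderboard data).
results = {
    "Chemistry": {"Average Score": 8.12, "Standard Deviation": 0.31, "Rank": 4},
    "CPP": {"Average Score": 7.94, "Standard Deviation": 0.29, "Rank": 6},
}

# Mirrors the guarded lookups above: a model with no "CPP" block simply gets None
# for all three C++ columns instead of raising an AttributeError.
cpp = results.get("CPP")
score_cpp = cpp.get("Average Score", None) if cpp else None
sd_cpp = cpp.get("Standard Deviation", None) if cpp else None
rank_cpp = cpp.get("Rank", None) if cpp else None
print(score_cpp, sd_cpp, rank_cpp)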
src/populate.py CHANGED
@@ -24,7 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
  if rank_col: # if there is one col in rank_col, sort by that column and remove NaN values
      df = df.dropna(subset=benchmark_cols)
      df = df.sort_values(by=[rank_col[0]], ascending=True)
-     # print(rank_col)
+     # print(rank_col, benchmark_cols)
  else:
      # when rank_col is empty, sort by averaging all the benchmark columns except the first one
      avg_rank = df.iloc[:, 1:].mean(axis=1)

@@ -43,7 +43,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
      # print(col)
      # if 'Std dev' in col or 'Score' in col:
      if 'Std dev' in col or 'Score' in col:
-         if "Chemistry" in col:
+         if "Chemistry" in col or "C++" in col:
              df[col] = (df[col]).map('{:.2f}'.format)
          else:
              df[col] = (df[col]*100).map('{:.2f}'.format)
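A short, self-contained illustration of what the widened branch does: Chemistry and C++ score/std-dev columns are formatted on their native scale, while the remaining score columns are first rescaled to percentages. The column names and values below are illustrative only:

import pandas as pd

# One column covered by the Chemistry/C++ branch, one by the default branch (names are made up).
df = pd.DataFrame({
    "Score (C++)": [7.9412],
    "Score (Algebra)": [0.9123],
})

for col in df.columns:
    if 'Std dev' in col or 'Score' in col:
        if "Chemistry" in col or "C++" in col:
            df[col] = (df[col]).map('{:.2f}'.format)        # keep native scale -> "7.94"
        else:
            df[col] = (df[col] * 100).map('{:.2f}'.format)  # rescale to percent -> "91.23"

print(df)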