Tong Chen committed on
Commit
675f454
β€’
1 Parent(s): 7617cab

upload results

Browse files
Files changed (2) hide show
  1. app.py +23 -122
  2. results.csv +27 -0
app.py CHANGED
@@ -57,137 +57,38 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
57
  pending_eval_queue_df,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
90
 
 
 
 
 
 
91
 
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("πŸ… LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
  with gr.Row():
145
- gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
  )
190
-
191
  with gr.Row():
192
  with gr.Accordion("πŸ“™ Citation", open=False):
193
  citation_button = gr.Textbox(
 
57
  pending_eval_queue_df,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
def load_data(path="results.csv"):
    """Load the leaderboard results table from a CSV file.

    Parameters
    ----------
    path : str, optional
        Path to the results CSV file. Defaults to "results.csv" so
        existing zero-argument callers keep their behavior.

    Returns
    -------
    pandas.DataFrame
        The parsed results table, one row per (model, decoding) entry.
    """
    # Local import preserved from the original: keeps pandas off the
    # module-import path until the data is actually needed.
    import pandas as pd
    return pd.read_csv(path)
66
 
67
  demo = gr.Blocks(css=custom_css)
68
  with demo:
69
  gr.HTML(TITLE)
70
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
71
 
72
+ with gr.Tabs():
73
+ with gr.TabItem("Leaderboard"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  with gr.Row():
75
+ version_dropdown = gr.Dropdown(
76
+ choices=["All"],
77
+ label="πŸ”„ Select Models",
78
+ value="All",
79
+ )
80
+ model_dropdown = gr.Dropdown(
81
+ choices=["All"],
82
+ label="πŸ”„ Select Inference Method",
83
+ value="All",
84
+ )
85
+
86
+ leaderboard_table = gr.components.Dataframe(
87
+ value=load_data(),
88
+ interactive=True,
89
+ visible=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  )
91
+
92
  with gr.Row():
93
  with gr.Accordion("πŸ“™ Citation", open=False):
94
  citation_button = gr.Textbox(
results.csv ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ copyright,model,decoding,count_verbatim,count_nonverbatim,count_qa,score_rouge_l,score_rouge_l>=0.8,events_recall,events_recall>=5,char_recall,char_recall>=3,score_f1,verbatim_quality_score,nonverbatim_quality_score
2
+ yes,gpt-35-turbo,greedy,2274,1770,589,0.1840055085,2.022867194,0.5316384181,1.355932203,0.1836158192,1.412429379,36.09021864,3.49648197,4.340112994
3
+ yes,gpt-4-turbo,greedy,2274,1730,589,0.1579340721,0.4397537379,0.8560693642,2.947976879,0.4398843931,4.50867052,41.9088674,3.940193492,4.667052023
4
+ yes,llama2-13b-chat-hf,greedy,2274,1770,589,0.1259847233,0,0.2271186441,0.2259887006,0.106779661,0.5649717514,17.18724202,3.922163588,4.201694915
5
+ yes,llama2-13b-hf,greedy,2274,1770,589,0.1487607591,0.08795074758,0.3169491525,0.3389830508,0.2542372881,2.033898305,20.94628725,2.514951627,3.020903955
6
+ yes,llama2-13b-hf,memfree,2274,1770,589,0.1470236149,0,0.3146892655,0.3389830508,0.2446327684,2.033898305,20.94628725,2.551890941,3.049717514
7
+ yes,llama2-13b-hf,system,2274,1770,589,0.1489617194,0.04397537379,0.3367231638,0.4519774011,0.2423728814,2.033898305,19.79607546,2.576077397,3.123163842
8
+ yes,llama2-13b-vicuna,greedy,2274,1770,589,0.1419072826,0.08795074758,0.4824858757,0.4519774011,0.186440678,1.412429379,16.18073258,3.649956025,4.176271186
9
+ yes,llama2-70b,greedy,2274,1770,589,0.198620695,2.418645558,0.8903954802,4.011299435,0.7485875706,10.33898305,30.1233411,2.803869833,3.346892655
10
+ yes,llama2-70b,memfree,2274,1770,589,0.1751059825,0.3078276165,0.8903954802,3.84180791,0.7519774011,10.9039548,30.1233411,2.76121372,3.341242938
11
+ yes,llama2-70b,system,2274,1770,589,0.2003116714,2.594547054,0.9146892655,4.745762712,0.7734463277,11.52542373,29.9026073,2.750659631,3.355932203
12
+ yes,llama2-70b-chat,greedy,2274,1770,589,0.1468163176,0.1319261214,0.4016949153,0.7344632768,0.1644067797,1.129943503,21.19026512,3.627968338,4.156497175
13
+ yes,llama2-7b,greedy,2274,1770,589,0.1435181828,0.1319261214,0.297740113,0.2259887006,0.213559322,1.694915254,15.34213034,2.380386983,2.85819209
14
+ yes,llama3-70b,greedy,2274,1770,589,0.3042579322,10.51011434,1.176836158,6.892655367,1.076271186,15.59322034,39.98167839,2.708003518,3.206779661
15
+ yes,llama3-70b,memfree,2274,1770,589,0.2223630689,0.6156552331,1.215254237,7.231638418,1.074576271,15.53672316,39.98167839,2.666666667,3.196045198
16
+ yes,llama3-70b,system,2274,1770,589,0.308713149,11.03781882,1.15480226,5.93220339,1.038418079,15.02824859,39.92419107,2.736147757,3.276271186
17
+ yes,llama3-70b-instruct,greedy,2274,1770,589,0.1338709257,0.219876869,0.4344632768,1.242937853,0.4276836158,4.237288136,30.20756187,3.237906772,4.404519774
18
+ yes,llama3-8b,greedy,2274,1770,589,0.1498831988,0.1759014952,0.6531073446,2.316384181,0.3949152542,4.463276836,18.60886463,2.576956904,2.737288136
19
+ yes,mistral-7b,greedy,2274,1770,589,0.1477934219,0.1319261214,0.3576271186,0.395480226,0.2299435028,1.920903955,18.71270589,2.279683377,2.796045198
20
+ yes,mixtral-8x7b,greedy,2274,1770,589,0.1561011593,0.9674582234,0.5344632768,1.299435028,0.5536723164,6.949152542,23.32206589,2.981530343,3.537853107
21
+ yes,mixtral-8x7b-instruct,greedy,2274,1770,589,0.1316051003,0.08795074758,0.7254237288,1.97740113,0.3474576271,2.937853107,21.27583658,3.420844327,4.255932203
22
+ yes,tulu2-13b,greedy,2274,1770,589,0.1306359884,0,0.506779661,0.6214689266,0.2474576271,1.581920904,17.89834888,2.931838171,4.005649718
23
+ yes,tulu2-13b-dpo,greedy,2274,1770,589,0.1383527819,0.08795074758,0.618079096,1.525423729,0.288700565,1.751412429,17.32643669,3.448988566,4.196610169
24
+ yes,tulu2-70b,greedy,2274,1770,589,0.1678018825,1.011433597,0.7706214689,2.824858757,0.4395480226,4.632768362,28.32230484,2.921723835,4.019774011
25
+ yes,tulu2-70b,memfree,2274,1770,589,0.1578867035,0.08795074758,0.7734463277,2.881355932,0.4293785311,4.406779661,28.32230484,2.91292876,4.044632768
26
+ yes,tulu2-70b,system,2274,1770,589,0.1675028708,0.7475813544,0.7559322034,2.033898305,0.3875706215,3.276836158,28.30660473,3.044415128,4.116949153
27
+ yes,tulu2-70b-dpo,greedy,2274,1770,589,0.1738150265,0.3518029903,0.7689265537,2.146892655,0.3672316384,3.389830508,28.836381,3.480650836,4.363276836