vtrv.vls committed on
Commit
1639c46
•
1 Parent(s): f9d1508
Files changed (3)
  1. app.py +63 -4
  2. constants.py +315 -0
  3. test.md +1 -0
app.py CHANGED
@@ -1,10 +1,69 @@
- import gradio as gr
- from utils import generate
+ import gradio
+ import argparse
  import os

+ from utils import generate
+ from constants import css, js_code, js_light
+
+ MERA_table = None
+
  def gen(content):
      res = generate(content, 'auth_token.json')
      return res

- demo = gr.Interface(fn=gen, inputs="text", outputs="text")
- demo.launch()
+ def tab_arena():
+     arena = gradio.Interface(fn=gen, inputs="text", outputs="text")
+     arena.render()  # render into the surrounding Blocks; launching here would start a second app
+
+ with open("test.md", "r") as f:
+     TEST_MD = f.read()
+
+
+ def build_demo():
+     # global original_dfs, available_models, gpt4t_dfs, haiku_dfs, llama_dfs
+
+     with gradio.Blocks(theme=gradio.themes.Soft(), css=css, js=js_light) as demo:
+         # gradio.HTML(BANNER, elem_id="banner")
+         # gradio.Markdown(HEADER_MD.replace("{model_num}", str(len(original_dfs["-1"]))), elem_classes="markdown-text")
+
+         with gradio.Tabs(elem_classes="tab-buttons") as tabs:
+             with gradio.TabItem("🐼 MERA leaderboard", elem_id="od-benchmark-tab-table", id=0):
+                 gradio.Markdown(TEST_MD, elem_classes="markdown-text-details")
+                 # _tab_leaderboard()
+
+             with gradio.TabItem("🆚 SBS by categories and criteria", elem_id="od-benchmark-tab-table", id=1):
+                 gradio.Markdown(TEST_MD, elem_classes="markdown-text-details")
+
+             with gradio.TabItem("🥊 Model arena", elem_id="od-benchmark-tab-table", id=2):
+                 tab_arena()
+                 # _tab_explore()
+
+             with gradio.TabItem("💪 About MERA", elem_id="od-benchmark-tab-table", id=3):
+                 gradio.Markdown(TEST_MD, elem_classes="markdown-text")
+                 # gradio.Markdown(f"Last updated on **{LAST_UPDATED}** | [Link to V1-legacy](https://huggingface.co/spaces/allenai/WildBench-V1-legacy)", elem_classes="markdown-text-small")
+
+         # with gradio.Row():
+         #     with gradio.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
+         #         gradio.Textbox(
+         #             value=CITATION_TEXT,
+         #             lines=7,
+         #             label="Copy the BibTeX snippet to cite this source",
+         #             elem_id="citation-button",
+         #             show_copy_button=True)
+
+     return demo
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--share", action="store_true")
+     # parser.add_argument("--bench_table", help="Path to MERA table", default="data_dir/MERA_jun2024.jsonl")
+     args = parser.parse_args()
+     # data_load(args.result_file)
+     # TYPES = ["number", "markdown", "number"]
+     demo = build_demo()
+     demo.launch(share=args.share, height=3000, width="110%")
+
+ # demo = gradio.Interface(fn=gen, inputs="text", outputs="text")
+ # demo.launch()
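app.py imports `generate` from a `utils` module that is not touched by this commit, so its contract has to be inferred: it takes the prompt text plus a path to a JSON credential file (`auth_token.json`) and returns the model's reply as a string. A minimal sketch of that assumed shape follows; the endpoint URL, payload fields, and JSON key names are placeholders, not the Space's actual API:

```python
# Hypothetical sketch of utils.generate; not part of this commit.
import json

import requests  # assumption: the helper talks to an HTTP inference API

API_URL = "https://example.com/api/generate"  # placeholder endpoint


def generate(content: str, auth_token_path: str) -> str:
    # Read the credential from the JSON file referenced in app.py.
    with open(auth_token_path, "r") as f:
        token = json.load(f)["token"]  # assumed key name

    resp = requests.post(
        API_URL,
        headers={"Authorization": f"Bearer {token}"},
        json={"prompt": content},  # assumed payload shape
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["text"]  # assumed response field
```

With the `--share` flag registered in the argparse block, the app starts locally with `python app.py` and exposes a public link with `python app.py --share`.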
constants.py ADDED
@@ -0,0 +1,315 @@
+ from pathlib import Path
+ from collections import OrderedDict
+
+ # DEFAULT_K = "∞"
+ DEFAULT_K = "1500"
+
+ banner_url = "https://allenai.github.io/WildBench/gray_banner.png"  # hosted in the same repo
+ BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 800px;"> </div>'
+
+ TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </h1> </body> </html>"
+
+ WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
+
+ CITATION_TEXT = """@misc{wildbench2024,
+     title = {WildBench: Benchmarking Language Models with Challenging Tasks from Real Users in the Wild},
+     author = {Bill Yuchen Lin and Yuntian Deng and Khyathi Chandu and Faeze Brahman and Abhilasha Ravichander and Valentina Pyatkin and Ronan Le Bras and Yejin Choi},
+     year = 2024,
+     url = {https://huggingface.co/spaces/allenai/WildBench},
+ }
+ """
+
+ # column_names is an ordered dict: raw result keys -> display labels.
+
+ REWARD_MIX_COLUMN = "🆚 Reward-Mix (Avg)"
+ MACRO_COLUMN = "🆚 Reward (Macro)"
+
+ column_names = OrderedDict({
+     "model_name": "Model",
+     "WB_score": "💯 WB Score",
+     "WB_score.task_macro": "💯 Score Macro",
+     # "Arena Elo (hard) - 2024-05-20": "LMSYS Elo",
+     "Arena Elo (hard-en) - 2024-06-06": "LMSYS Elo",
+     "Arena-Hard v0.1": "Arena-Hard",
+     "AE2.0 LC": "AE2-LCWR",
+     "AE2.0": "AE2-WR",
+     "#chars": "Length",
+     "Length": "Len",
+     "task_macro_reward": "🆚 Task-Macro",
+     # "elo overall": "Overall Elo",
+     # 'Others': 'Misc',
+     # "average": "Task-Avg Elo",
+     # f"mixture_of_rewards.K={K}": "🆚 🎯 Reward-Mix",
+     # f"gpt4t_reward.K={K}": "🆚 GPT4T",
+     # f"haiku_reward.K={K}": "🆚 Haiku",
+     # f"llama_reward.K={K}": "🆚 Llama2",
+ })
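`column_names` maps raw result keys to display labels; nothing in this commit applies it yet, but the natural use is renaming a results dataframe before it reaches the UI, then reordering it with `ORDERED_COLUMN_NAMES` (defined just below). A minimal sketch, where pandas and the JSONL path (borrowed from the commented `--bench_table` default in app.py) are both assumptions:

```python
# Hypothetical wiring of column_names; this commit only defines the mapping.
import pandas as pd

from constants import ORDERED_COLUMN_NAMES, column_names

# Path taken from the commented --bench_table default in app.py.
df = pd.read_json("data_dir/MERA_jun2024.jsonl", lines=True)

# Rename raw result keys to their display labels.
df = df.rename(columns=dict(column_names))

# Keep only the display columns that are present, in leaderboard order.
df = df[[c for c in ORDERED_COLUMN_NAMES if c in df.columns]]
```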
+
+ LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), the reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
+ """
+
+ LEADERBOARD_REMARKS_MAIN = """
+ **WB Reward**: for each pairwise comparison, the reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
+ The baseline models are GPT4-Turbo, Haiku, and Llama2-70B; Mix is the average of the three.
+ **WB Score** scores each model individually based on checklists.
+ The evaluator is GPT-4-Turbo.
+ """
+
+ LENGTH_MARGIN_DESC_MD = """To mitigate length bias, we count it as a **Tie** when A is only **slightly** better than B but is longer than B by more than K chars.
+
+ 🔒 for closed LLMs; 🚨 for newly added models.
+ """
+
+ RANKING_COLUMN = REWARD_MIX_COLUMN
+
+ ORDERED_COLUMN_NAMES = [
+     "Model",
+     MACRO_COLUMN,
+     "💯 Score Macro",
+     REWARD_MIX_COLUMN,
+     # "💯 WB Score",
+     "🆚 🎯 GPT4T",
+     "🆚 🎯 Haiku",
+     "🆚 🎯 Llama",
+     "LMSYS Elo",
+     "Arena-Hard",
+     "AE2-LCWR",
+     # "AE2-WR",
+     "Len",
+ ]
+
+ all_task_types_raw = [
+     'Information seeking',
+     'Coding & Debugging',
+     'Math',
+     'Data Analysis',
+     'Planning',
+     'Reasoning',
+     'Creative Writing',
+     'Editing',
+     'Role playing',
+     'Advice seeking',
+     'Brainstorming',
+     # 'Others'
+ ]
+
+ all_task_types = ['Creative Tasks', 'Planning & Reasoning', 'Math & Data Analysis', 'Information/Advice seeking', 'Coding & Debugging']
+
+ TASK_NAME_MAPPING_RAW = {
+     'Information seeking': 'InfoSek',
+     'Creative Writing': 'CrtWrt',
+     'Coding & Debugging': 'Code',
+     'Reasoning': 'Reason',
+     'Editing': 'Edit',
+     'Math': 'Math',
+     'Planning': 'Plan',
+     'Brainstorming': 'Brnstrm',
+     'Role playing': 'RolPly',
+     'Advice seeking': 'AdvSek',
+     'Data Analysis': 'DataAna',
+ }
+
+ TASK_NAME_MAPPING = {
+     'Planning & Reasoning': '💭 Reason & Plan',
+     'Math & Data Analysis': '📊 Math & Data',
+     'Coding & Debugging': '💻 Code & Debug',
+     'Creative Tasks': '📝 Creative',
+     'Information/Advice seeking': 'ℹ️ Info Seek',
+ }
+
+ # Force the light theme on page load (passed as js= to gradio.Blocks).
+ js_light = """
+ function refresh() {
+     const url = new URL(window.location);
+
+     if (url.searchParams.get('__theme') !== 'light') {
+         url.searchParams.set('__theme', 'light');
+         window.location.href = url.href;
+     }
+ }
+ """
+
+ # Scroll every chat bubble back to the top, staggered by 100 ms per bubble.
+ js_code = """
+ function scroll_top() {
+     console.log("Hello from Gradio!");
+     const bubbles = document.querySelectorAll('.bubble-wrap');
+     bubbles.forEach((bubble, index) => {
+         setTimeout(() => {
+             bubble.scrollTop = 0;
+         }, index * 100); // delay of 100 ms between bubbles
+     });
+ }
+ """
+
+ TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding & Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)"
+
+ css = """
+ code {
+     font-size: large;
+ }
+ footer {visibility: hidden}
+ .top-left-LP{
+     margin-top: 6px;
+     margin-left: 5px;
+ }
+ .no_margin{
+     margin: 0px;
+     padding: 0px;
+ }
+ .markdown-text{font-size: 14pt}
+ .markdown-text-small{font-size: 13pt}
+ .markdown-text-tiny{font-size: 12pt}
+ .markdown-text-tiny-red{
+     font-size: 12pt;
+     color: red;
+     background-color: yellow;
+     font-weight: bold;
+ }
+ th {
+     text-align: center;
+     font-size: 17px; /* adjust as needed */
+ }
+ td {
+     font-size: 15px; /* adjust as needed */
+     text-align: center;
+ }
+
+ .sample_button{
+     border: 1px solid #000000;
+     border-radius: 5px;
+     padding: 5px;
+     font-size: 15pt;
+     font-weight: bold;
+     margin: 5px;
+ }
+
+ .chat-common{
+     height: auto;
+     max-height: 400px;
+     min-height: 100px;
+ }
+ .chat-specific{
+     height: auto;
+     max-height: 600px;
+     min-height: 200px;
+ }
+ #od-benchmark-tab-table-button{
+     font-size: 15pt;
+     font-weight: bold;
+ }
+
+ .btn_boderline{
+     border: 1px solid #000000;
+     border-radius: 5px;
+     padding: 5px;
+     margin: 5px;
+     font-size: 15pt;
+     font-weight: bold;
+ }
+
+ .btn_boderline_next{
+     border: 0.1px solid #000000;
+     border-radius: 5px;
+     padding: 5px;
+     margin: 5px;
+     font-size: 15pt;
+     font-weight: bold;
+ }
+
+ .btn_boderline_gray{
+     border: 0.5px solid gray;
+     border-radius: 5px;
+     padding: 5px;
+     margin: 5px;
+     font-size: 15pt;
+     font-style: italic;
+ }
+ .btn_boderline_selected{
+     border: 2px solid purple;
+     background-color: #f2f2f2;
+     border-radius: 5px;
+     padding: 5px;
+     margin: 5px;
+     font-size: 15pt;
+     font-weight: bold;
+ }
+ .accordion-label button span{
+     font-size: 14pt;
+     font-weight: bold;
+ }
+
+ #show-task-categorized span{
+     font-size: 13pt;
+     font-weight: bold;
+ }
+
+ #show-open-source-models span{
+     font-size: 13pt;
+     font-weight: bold;
+ }
+
+ #select-models span{
+     font-size: 10pt;
+ }
+
+ #select-tasks span{
+     font-size: 10pt;
+ }
+
+ .markdown-text-details{
+     margin: 10px;
+     padding: 10px;
+ }
+
+ button.selected[role="tab"][aria-selected="true"] {
+     font-size: 18px;
+     font-weight: bold;
+ }
+
+ #od-benchmark-tab-table-ablation-button {
+     font-size: larger;
+ }
+
+ .plotly-plot{
+     height: auto;
+     max-height: 600px;
+     min-height: 600px;
+ }
+
+ #length-margin-radio{
+     font-size: 10pt;
+     padding: 0px;
+     margin: 0px;
+ }
+
+ #show-task-categorized{
+     font-size: 12pt;
+     font-weight: bold;
+ }
+
+ #show-open-source-models{
+     font-size: 12pt;
+     font-weight: bold;
+ }
+ """
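For context on how these hooks attach: a JS function passed as `js=` to `gradio.Blocks` runs once on page load, which is how `js_light` forces the light theme in `build_demo`; `js_code` is not referenced anywhere yet and would need to be bound to a specific event. A minimal sketch with a hypothetical button (the button itself is not in this commit):

```python
import gradio

from constants import css, js_code, js_light

with gradio.Blocks(css=css, js=js_light) as demo:  # js_light runs on load
    btn = gradio.Button("Scroll chats to top")  # hypothetical control
    # fn=None with js= runs the snippet purely client-side.
    btn.click(None, None, None, js=js_code)

demo.launch()
```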
test.md ADDED
@@ -0,0 +1 @@
+ ## TEST