Spaces:
AIR-Bench
/
Restarting on CPU Upgrade

nan hanhainebula committed on
Commit
0785fe4
·
verified ·
1 Parent(s): 30f9433

feat-add-v2405 (#26)

Browse files

- update part code for v24.05 (257f64d0bddc9d6bac7df327345dac25bebc92c8)
- feat: add v2024.05 (2bce3f3b3285bf8e6b4ab0920c2c52eb43c2cd2d)


Co-authored-by: Jianlyu Chen <[email protected]>

Files changed (5) hide show
  1. app.py +266 -266
  2. requirements.txt +1 -0
  3. src/about.py +3 -3
  4. src/benchmarks.py +2 -62
  5. src/envs.py +1 -1
app.py CHANGED
@@ -131,303 +131,303 @@ with demo:
131
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
132
 
133
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
134
- with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
135
  with gr.Row():
136
- with gr.Column(min_width=320):
137
- # select domain
138
- with gr.Row():
139
- selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
140
- # select language
141
- with gr.Row():
142
- selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
143
-
144
- with gr.Column():
145
- with gr.Row():
146
- selected_version = get_version_dropdown()
147
- # select the metric
148
- selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
149
- with gr.Row():
150
- show_anonymous = get_anonymous_checkbox()
151
- with gr.Row():
152
- show_revision_and_timestamp = get_revision_and_ts_checkbox()
153
- with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
154
- with gr.TabItem("Retrieval + Reranking", id=10):
155
- with gr.Row():
156
- # search retrieval models
157
- with gr.Column():
158
- search_bar = get_search_bar()
159
- # select reranking models
160
- with gr.Column():
161
- selected_rerankings = get_reranking_dropdown(reranking_models)
162
- leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
163
- # Dummy leaderboard for handling the case when the user uses backspace key
164
- hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
165
-
166
- set_listeners(
167
- "qa",
168
- leaderboard_table,
169
- hidden_leaderboard_table_for_search,
170
- search_bar,
171
- selected_domains,
172
- selected_langs,
173
- selected_rerankings,
174
- show_anonymous,
175
- show_revision_and_timestamp,
176
- )
177
-
178
- # set metric listener
179
- selected_metric.change(
180
- update_metric_qa,
181
- [
182
- selected_metric,
183
  selected_domains,
184
  selected_langs,
185
  selected_rerankings,
186
- search_bar,
187
  show_anonymous,
188
  show_revision_and_timestamp,
189
- ],
190
- leaderboard_table,
191
- queue=True
192
- )
193
- with gr.TabItem("Retrieval Only", id=11):
194
- with gr.Row():
195
- with gr.Column(scale=1):
196
- search_bar_retriever = get_search_bar()
197
- with gr.Column(scale=1):
198
- selected_noreranker = get_noreranking_dropdown()
199
- lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
200
- lb_df_retriever = reset_rank(lb_df_retriever)
201
- lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
202
- # Dummy leaderboard for handling the case when the user uses backspace key
203
- hidden_lb_df_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
204
- hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
205
- hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)
206
-
207
- set_listeners(
208
- "qa",
209
- lb_table_retriever,
210
- hidden_lb_table_retriever,
211
- search_bar_retriever,
212
- selected_domains,
213
- selected_langs,
214
- selected_noreranker,
215
- show_anonymous,
216
- show_revision_and_timestamp,
217
- )
218
-
219
- # set metric listener
220
- selected_metric.change(
221
- update_metric_qa,
222
- [
223
- selected_metric,
 
224
  selected_domains,
225
  selected_langs,
226
  selected_noreranker,
227
- search_bar_retriever,
228
  show_anonymous,
229
  show_revision_and_timestamp,
230
- ],
231
- lb_table_retriever,
232
- queue=True
233
- )
234
- with gr.TabItem("Reranking Only", id=12):
235
- lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
236
- lb_df_reranker = reset_rank(lb_df_reranker)
237
- reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
238
- with gr.Row():
239
- with gr.Column(scale=1):
240
- selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
241
- with gr.Column(scale=1):
242
- search_bar_reranker = gr.Textbox(show_label=False, visible=False)
243
- lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
244
- hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
245
- hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
246
- hidden_lb_table_reranker = get_leaderboard_table(
247
- hidden_lb_df_reranker, types_qa, visible=False
248
- )
249
-
250
- set_listeners(
251
- "qa",
252
- lb_table_reranker,
253
- hidden_lb_table_reranker,
254
- search_bar_reranker,
255
- selected_domains,
256
- selected_langs,
257
- selected_rerankings_reranker,
258
- show_anonymous,
259
- show_revision_and_timestamp,
260
- )
261
- # set metric listener
262
- selected_metric.change(
263
- update_metric_qa,
264
- [
265
- selected_metric,
 
 
266
  selected_domains,
267
  selected_langs,
268
  selected_rerankings_reranker,
269
- search_bar_reranker,
270
  show_anonymous,
271
  show_revision_and_timestamp,
272
- ],
273
- lb_table_reranker,
274
- queue=True
275
- )
276
- with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
277
- with gr.Row():
278
- with gr.Column(min_width=320):
279
- # select domain
280
- with gr.Row():
281
- selected_domains = get_domain_dropdown(DOMAIN_COLS_LONG_DOC, DOMAIN_COLS_LONG_DOC)
282
- # select language
283
- with gr.Row():
284
- selected_langs = get_language_dropdown(
285
- LANG_COLS_LONG_DOC, LANG_COLS_LONG_DOC
286
  )
287
- with gr.Column():
288
- with gr.Row():
289
- selected_version = get_version_dropdown()
290
- # select the metric
291
- with gr.Row():
292
- selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_LONG_DOC)
293
- with gr.Row():
294
- show_anonymous = get_anonymous_checkbox()
295
- with gr.Row():
296
- show_revision_and_timestamp = get_revision_and_ts_checkbox()
297
- with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
298
- with gr.TabItem("Retrieval + Reranking", id=20):
299
- with gr.Row():
300
- with gr.Column():
301
- search_bar = get_search_bar()
302
- # select reranking model
303
- with gr.Column():
304
- selected_rerankings = get_reranking_dropdown(reranking_models)
305
-
306
- lb_table = get_leaderboard_table(
307
- leaderboard_df_long_doc, types_long_doc
308
- )
309
-
310
- # Dummy leaderboard for handling the case when the user uses backspace key
311
- hidden_lb_table_for_search = get_leaderboard_table(
312
- original_df_long_doc, types_long_doc, visible=False
313
- )
314
-
315
- set_listeners(
316
- "long-doc",
317
- lb_table,
318
- hidden_lb_table_for_search,
319
- search_bar,
320
- selected_domains,
321
- selected_langs,
322
- selected_rerankings,
323
- show_anonymous,
324
- show_revision_and_timestamp,
325
- )
326
-
327
- # set metric listener
328
- selected_metric.change(
329
- update_metric_long_doc,
330
- [
331
- selected_metric,
 
 
 
 
 
 
 
 
 
 
 
 
332
  selected_domains,
333
  selected_langs,
334
  selected_rerankings,
335
- search_bar,
336
  show_anonymous,
337
- show_revision_and_timestamp
338
- ],
339
- lb_table,
340
- queue=True
341
- )
342
- with gr.TabItem("Retrieval Only", id=21):
343
- with gr.Row():
344
- with gr.Column(scale=1):
345
- search_bar_retriever = get_search_bar()
346
- with gr.Column(scale=1):
347
- selected_noreranker = get_noreranking_dropdown()
348
- lb_df_retriever_long_doc = leaderboard_df_long_doc[
349
- leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
350
- ]
351
- lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
352
- hidden_lb_db_retriever_long_doc = original_df_long_doc[
353
- original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
354
- ]
355
- hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
356
- lb_table_retriever_long_doc = get_leaderboard_table(
357
- lb_df_retriever_long_doc, types_long_doc)
358
- hidden_lb_table_retriever_long_doc = get_leaderboard_table(
359
- hidden_lb_db_retriever_long_doc, types_long_doc, visible=False
360
- )
361
-
362
- set_listeners(
363
- "long-doc",
364
- lb_table_retriever_long_doc,
365
- hidden_lb_table_retriever_long_doc,
366
- search_bar_retriever,
367
- selected_domains,
368
- selected_langs,
369
- selected_noreranker,
370
- show_anonymous,
371
- show_revision_and_timestamp,
372
- )
373
-
374
- selected_metric.change(
375
- update_metric_long_doc,
376
- [
377
- selected_metric,
 
 
378
  selected_domains,
379
  selected_langs,
380
  selected_noreranker,
381
- search_bar_retriever,
382
  show_anonymous,
383
  show_revision_and_timestamp,
384
- ],
385
- lb_table_retriever_long_doc,
386
- queue=True
387
- )
388
- with gr.TabItem("Reranking Only", id=22):
389
- lb_df_reranker_ldoc = leaderboard_df_long_doc[
390
- leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
391
- ]
392
- lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
393
- reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
394
- with gr.Row():
395
- with gr.Column(scale=1):
396
- selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
397
- with gr.Column(scale=1):
398
- search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
399
- lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
400
- hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
401
- hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
402
- hidden_lb_table_reranker_ldoc = get_leaderboard_table(
403
- hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
404
- )
405
-
406
- set_listeners(
407
- "long-doc",
408
- lb_table_reranker_ldoc,
409
- hidden_lb_table_reranker_ldoc,
410
- search_bar_reranker_ldoc,
411
- selected_domains,
412
- selected_langs,
413
- selected_rerankings_reranker_ldoc,
414
- show_anonymous,
415
- show_revision_and_timestamp,
416
- )
417
- selected_metric.change(
418
- update_metric_long_doc,
419
- [
420
- selected_metric,
 
 
421
  selected_domains,
422
  selected_langs,
423
  selected_rerankings_reranker_ldoc,
424
- search_bar_reranker_ldoc,
425
  show_anonymous,
426
  show_revision_and_timestamp,
427
- ],
428
- lb_table_reranker_ldoc,
429
- queue=True
430
- )
 
 
 
 
 
 
 
 
 
 
 
431
 
432
  with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
433
  with gr.Column():
 
131
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
132
 
133
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
134
+ with gr.TabItem("Results", elem_id="results-tab-table"):
135
  with gr.Row():
136
+ selected_version = get_version_dropdown()
137
+
138
+ with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
139
+ with gr.Row():
140
+ with gr.Column(min_width=320):
141
+ # select domain
142
+ with gr.Row():
143
+ selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
144
+ # select language
145
+ with gr.Row():
146
+ selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
147
+
148
+ with gr.Column():
149
+ # select the metric
150
+ selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
151
+ with gr.Row():
152
+ show_anonymous = get_anonymous_checkbox()
153
+ with gr.Row():
154
+ show_revision_and_timestamp = get_revision_and_ts_checkbox()
155
+ with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
156
+ with gr.TabItem("Retrieval + Reranking", id=10):
157
+ with gr.Row():
158
+ # search retrieval models
159
+ with gr.Column():
160
+ search_bar = get_search_bar()
161
+ # select reranking models
162
+ with gr.Column():
163
+ selected_rerankings = get_reranking_dropdown(reranking_models)
164
+ leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
165
+ # Dummy leaderboard for handling the case when the user uses backspace key
166
+ hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
167
+
168
+ set_listeners(
169
+ "qa",
170
+ leaderboard_table,
171
+ hidden_leaderboard_table_for_search,
172
+ search_bar,
 
 
 
 
 
 
 
 
 
 
173
  selected_domains,
174
  selected_langs,
175
  selected_rerankings,
 
176
  show_anonymous,
177
  show_revision_and_timestamp,
178
+ )
179
+
180
+ # set metric listener
181
+ selected_metric.change(
182
+ update_metric_qa,
183
+ [
184
+ selected_metric,
185
+ selected_domains,
186
+ selected_langs,
187
+ selected_rerankings,
188
+ search_bar,
189
+ show_anonymous,
190
+ show_revision_and_timestamp,
191
+ ],
192
+ leaderboard_table,
193
+ queue=True
194
+ )
195
+ with gr.TabItem("Retrieval Only", id=11):
196
+ with gr.Row():
197
+ with gr.Column(scale=1):
198
+ search_bar_retriever = get_search_bar()
199
+ with gr.Column(scale=1):
200
+ selected_noreranker = get_noreranking_dropdown()
201
+ lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
202
+ lb_df_retriever = reset_rank(lb_df_retriever)
203
+ lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
204
+ # Dummy leaderboard for handling the case when the user uses backspace key
205
+ hidden_lb_df_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
206
+ hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
207
+ hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)
208
+
209
+ set_listeners(
210
+ "qa",
211
+ lb_table_retriever,
212
+ hidden_lb_table_retriever,
213
+ search_bar_retriever,
214
  selected_domains,
215
  selected_langs,
216
  selected_noreranker,
 
217
  show_anonymous,
218
  show_revision_and_timestamp,
219
+ )
220
+
221
+ # set metric listener
222
+ selected_metric.change(
223
+ update_metric_qa,
224
+ [
225
+ selected_metric,
226
+ selected_domains,
227
+ selected_langs,
228
+ selected_noreranker,
229
+ search_bar_retriever,
230
+ show_anonymous,
231
+ show_revision_and_timestamp,
232
+ ],
233
+ lb_table_retriever,
234
+ queue=True
235
+ )
236
+ with gr.TabItem("Reranking Only", id=12):
237
+ lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
238
+ lb_df_reranker = reset_rank(lb_df_reranker)
239
+ reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
240
+ with gr.Row():
241
+ with gr.Column(scale=1):
242
+ selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
243
+ with gr.Column(scale=1):
244
+ search_bar_reranker = gr.Textbox(show_label=False, visible=False)
245
+ lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
246
+ hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
247
+ hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
248
+ hidden_lb_table_reranker = get_leaderboard_table(
249
+ hidden_lb_df_reranker, types_qa, visible=False
250
+ )
251
+
252
+ set_listeners(
253
+ "qa",
254
+ lb_table_reranker,
255
+ hidden_lb_table_reranker,
256
+ search_bar_reranker,
257
  selected_domains,
258
  selected_langs,
259
  selected_rerankings_reranker,
 
260
  show_anonymous,
261
  show_revision_and_timestamp,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  )
263
+ # set metric listener
264
+ selected_metric.change(
265
+ update_metric_qa,
266
+ [
267
+ selected_metric,
268
+ selected_domains,
269
+ selected_langs,
270
+ selected_rerankings_reranker,
271
+ search_bar_reranker,
272
+ show_anonymous,
273
+ show_revision_and_timestamp,
274
+ ],
275
+ lb_table_reranker,
276
+ queue=True
277
+ )
278
+ with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
279
+ with gr.Row():
280
+ with gr.Column(min_width=320):
281
+ # select domain
282
+ with gr.Row():
283
+ selected_domains = get_domain_dropdown(DOMAIN_COLS_LONG_DOC, DOMAIN_COLS_LONG_DOC)
284
+ # select language
285
+ with gr.Row():
286
+ selected_langs = get_language_dropdown(
287
+ LANG_COLS_LONG_DOC, LANG_COLS_LONG_DOC
288
+ )
289
+ with gr.Column():
290
+ # select the metric
291
+ with gr.Row():
292
+ selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_LONG_DOC)
293
+ with gr.Row():
294
+ show_anonymous = get_anonymous_checkbox()
295
+ with gr.Row():
296
+ show_revision_and_timestamp = get_revision_and_ts_checkbox()
297
+ with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
298
+ with gr.TabItem("Retrieval + Reranking", id=20):
299
+ with gr.Row():
300
+ with gr.Column():
301
+ search_bar = get_search_bar()
302
+ # select reranking model
303
+ with gr.Column():
304
+ selected_rerankings = get_reranking_dropdown(reranking_models)
305
+
306
+ lb_table = get_leaderboard_table(
307
+ leaderboard_df_long_doc, types_long_doc
308
+ )
309
+
310
+ # Dummy leaderboard for handling the case when the user uses backspace key
311
+ hidden_lb_table_for_search = get_leaderboard_table(
312
+ original_df_long_doc, types_long_doc, visible=False
313
+ )
314
+
315
+ set_listeners(
316
+ "long-doc",
317
+ lb_table,
318
+ hidden_lb_table_for_search,
319
+ search_bar,
320
  selected_domains,
321
  selected_langs,
322
  selected_rerankings,
 
323
  show_anonymous,
324
+ show_revision_and_timestamp,
325
+ )
326
+
327
+ # set metric listener
328
+ selected_metric.change(
329
+ update_metric_long_doc,
330
+ [
331
+ selected_metric,
332
+ selected_domains,
333
+ selected_langs,
334
+ selected_rerankings,
335
+ search_bar,
336
+ show_anonymous,
337
+ show_revision_and_timestamp
338
+ ],
339
+ lb_table,
340
+ queue=True
341
+ )
342
+ with gr.TabItem("Retrieval Only", id=21):
343
+ with gr.Row():
344
+ with gr.Column(scale=1):
345
+ search_bar_retriever = get_search_bar()
346
+ with gr.Column(scale=1):
347
+ selected_noreranker = get_noreranking_dropdown()
348
+ lb_df_retriever_long_doc = leaderboard_df_long_doc[
349
+ leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
350
+ ]
351
+ lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
352
+ hidden_lb_db_retriever_long_doc = original_df_long_doc[
353
+ original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
354
+ ]
355
+ hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
356
+ lb_table_retriever_long_doc = get_leaderboard_table(
357
+ lb_df_retriever_long_doc, types_long_doc)
358
+ hidden_lb_table_retriever_long_doc = get_leaderboard_table(
359
+ hidden_lb_db_retriever_long_doc, types_long_doc, visible=False
360
+ )
361
+
362
+ set_listeners(
363
+ "long-doc",
364
+ lb_table_retriever_long_doc,
365
+ hidden_lb_table_retriever_long_doc,
366
+ search_bar_retriever,
367
  selected_domains,
368
  selected_langs,
369
  selected_noreranker,
 
370
  show_anonymous,
371
  show_revision_and_timestamp,
372
+ )
373
+
374
+ selected_metric.change(
375
+ update_metric_long_doc,
376
+ [
377
+ selected_metric,
378
+ selected_domains,
379
+ selected_langs,
380
+ selected_noreranker,
381
+ search_bar_retriever,
382
+ show_anonymous,
383
+ show_revision_and_timestamp,
384
+ ],
385
+ lb_table_retriever_long_doc,
386
+ queue=True
387
+ )
388
+ with gr.TabItem("Reranking Only", id=22):
389
+ lb_df_reranker_ldoc = leaderboard_df_long_doc[
390
+ leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
391
+ ]
392
+ lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
393
+ reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
394
+ with gr.Row():
395
+ with gr.Column(scale=1):
396
+ selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
397
+ with gr.Column(scale=1):
398
+ search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
399
+ lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
400
+ hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
401
+ hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
402
+ hidden_lb_table_reranker_ldoc = get_leaderboard_table(
403
+ hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
404
+ )
405
+
406
+ set_listeners(
407
+ "long-doc",
408
+ lb_table_reranker_ldoc,
409
+ hidden_lb_table_reranker_ldoc,
410
+ search_bar_reranker_ldoc,
411
  selected_domains,
412
  selected_langs,
413
  selected_rerankings_reranker_ldoc,
 
414
  show_anonymous,
415
  show_revision_and_timestamp,
416
+ )
417
+ selected_metric.change(
418
+ update_metric_long_doc,
419
+ [
420
+ selected_metric,
421
+ selected_domains,
422
+ selected_langs,
423
+ selected_rerankings_reranker_ldoc,
424
+ search_bar_reranker_ldoc,
425
+ show_anonymous,
426
+ show_revision_and_timestamp,
427
+ ],
428
+ lb_table_reranker_ldoc,
429
+ queue=True
430
+ )
431
 
432
  with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
433
  with gr.Column():
requirements.txt CHANGED
@@ -12,3 +12,4 @@ requests>=2.31.0
12
  tqdm>=4.65.0
13
  accelerate>=0.24.1
14
  socksio>=1.0.0
 
 
12
  tqdm>=4.65.0
13
  accelerate>=0.24.1
14
  socksio>=1.0.0
15
+ air-benchmark>=0.0.4
src/about.py CHANGED
@@ -1,6 +1,6 @@
1
  # Your leaderboard name
2
  TITLE = """<h1 align="center" id="space-title">AIR-Bench: Automated Heterogeneous Information Retrieval Benchmark
3
- (v0.0.3) </h1>"""
4
 
5
  # What does your leaderboard evaluate?
6
  INTRODUCTION_TEXT = """
@@ -17,14 +17,14 @@ BENCHMARKS_TEXT = f"""
17
  - A: Yes, we plan to release new datasets on regular basis. However, the update frequency is to be decided.
18
 
19
  - Q: As you are using models to do the quality control when generating the data, is it biased to the models that are used?
20
- - A: Yes, the results is biased to the chosen models. However, we believe the datasets labeled by human are also biased to the human's preference. The key point to verify is whether the model's bias is consistent with the human's. We use our approach to generate test data using the well established MSMARCO datasets. We benchmark different models' performances using the generated dataset and the human-label DEV dataset. Comparing the ranking of different models on these two datasets, we observe the spearman correlation between them is 0.8211 (p-value=5e-5). This indicates that the models' perference is well aligned with the human. Please refer to [here](https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/available_evaluation_results.md#consistency-with-ms-marco) for details
21
 
22
  """
23
 
24
  EVALUATION_QUEUE_TEXT = """
25
  ## Check out the submission steps at [our GitHub repo](https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/submit_to_leaderboard.md)
26
 
27
- ## You can find the **STATUS of Your Submission** at the [Backend Space](https://huggingface.co/spaces/AIR-Bench/leaderboard_backend).
28
 
29
  - If the status is **✔️ Success**, then you can find your results at the [Leaderboard Space](https://huggingface.co/spaces/AIR-Bench/leaderboard) in no more than one hour.
30
  - If the status is **❌ Failed**, please check your submission steps and try again. If you have any questions, please feel free to open an issue [here](https://github.com/AIR-Bench/AIR-Bench/issues/new).
 
1
  # Your leaderboard name
2
  TITLE = """<h1 align="center" id="space-title">AIR-Bench: Automated Heterogeneous Information Retrieval Benchmark
3
+ (v0.1.0.dev) </h1>"""
4
 
5
  # What does your leaderboard evaluate?
6
  INTRODUCTION_TEXT = """
 
17
  - A: Yes, we plan to release new datasets on regular basis. However, the update frequency is to be decided.
18
 
19
  - Q: As you are using models to do the quality control when generating the data, is it biased to the models that are used?
20
+ - A: Yes, the results is biased to the chosen models. However, we believe the datasets labeled by human are also biased to the human's preference. The key point to verify is whether the model's bias is consistent with the human's. We use our approach to generate test data using the well established MSMARCO datasets. We benchmark different models' performances using the generated dataset and the human-label DEV dataset. Comparing the ranking of different models on these two datasets, we observe the spearman correlation between them is 0.8211 (p-value=5e-5). This indicates that the models' perference is well aligned with the human. Please refer to [here](https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/available_analysis_results.md#consistency-with-human-labeled-data) for details.
21
 
22
  """
23
 
24
  EVALUATION_QUEUE_TEXT = """
25
  ## Check out the submission steps at [our GitHub repo](https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/submit_to_leaderboard.md)
26
 
27
+ ## You can find the **STATUS of Your Submission** at the [Backend Space](https://huggingface.co/spaces/AIR-Bench/leaderboard_backend)
28
 
29
  - If the status is **✔️ Success**, then you can find your results at the [Leaderboard Space](https://huggingface.co/spaces/AIR-Bench/leaderboard) in no more than one hour.
30
  - If the status is **❌ Failed**, please check your submission steps and try again. If you have any questions, please feel free to open an issue [here](https://github.com/AIR-Bench/AIR-Bench/issues/new).
src/benchmarks.py CHANGED
@@ -1,5 +1,6 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
 
3
 
4
 
5
  def get_safe_name(name: str):
@@ -11,67 +12,6 @@ def get_safe_name(name: str):
11
  if (character.isalnum() or character == '_'))
12
 
13
 
14
- dataset_dict = {
15
- "qa": {
16
- "wiki": {
17
- "en": ["wikipedia_20240101", ],
18
- "zh": ["wikipedia_20240101", ]
19
- },
20
- "web": {
21
- "en": ["mC4", ],
22
- "zh": ["mC4", ]
23
- },
24
- "news": {
25
- "en": ["CC-News", ],
26
- "zh": ["CC-News", ]
27
- },
28
- "healthcare": {
29
- "en": ["PubMedQA", ],
30
- "zh": ["Huatuo-26M", ]
31
- },
32
- "law": {
33
- "en": ["pile-of-law", ],
34
- # "zh": ["flk_npc_gov_cn", ]
35
- },
36
- "finance": {
37
- "en": ["Reuters-Financial", ],
38
- "zh": ["FinCorpus", ]
39
- },
40
- "arxiv": {
41
- "en": ["Arxiv", ]},
42
- "msmarco": {
43
- "en": ["MS MARCO", ]},
44
- },
45
- "long-doc": {
46
- "arxiv": {
47
- "en": ["gpt3", "llama2", "llm-survey", "gemini"],
48
- },
49
- "book": {
50
- "en": [
51
- "origin-of-species_darwin",
52
- "a-brief-history-of-time_stephen-hawking"
53
- ]
54
- },
55
- "healthcare": {
56
- "en": [
57
- "pubmed_100k-200k_1",
58
- "pubmed_100k-200k_2",
59
- "pubmed_100k-200k_3",
60
- "pubmed_40k-50k_5-merged",
61
- "pubmed_30k-40k_10-merged"
62
- ]
63
- },
64
- "law": {
65
- "en": [
66
- "lex_files_300k-400k",
67
- "lex_files_400k-500k",
68
- "lex_files_500k-600k",
69
- "lex_files_600k-700k"
70
- ]
71
- }
72
- }
73
- }
74
-
75
  METRIC_LIST = [
76
  "ndcg_at_1",
77
  "ndcg_at_3",
@@ -118,7 +58,7 @@ class Benchmark:
118
 
119
  qa_benchmark_dict = {}
120
  long_doc_benchmark_dict = {}
121
- for task, domain_dict in dataset_dict.items():
122
  for domain, lang_dict in domain_dict.items():
123
  for lang, dataset_list in lang_dict.items():
124
  if task == "qa":
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
+ from air_benchmark.tasks.tasks import BenchmarkTable
4
 
5
 
6
  def get_safe_name(name: str):
 
12
  if (character.isalnum() or character == '_'))
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  METRIC_LIST = [
16
  "ndcg_at_1",
17
  "ndcg_at_3",
 
58
 
59
  qa_benchmark_dict = {}
60
  long_doc_benchmark_dict = {}
61
+ for task, domain_dict in BenchmarkTable['AIR-Bench_24.04'].items():
62
  for domain, lang_dict in domain_dict.items():
63
  for lang, dataset_list in lang_dict.items():
64
  if task == "qa":
src/envs.py CHANGED
@@ -27,7 +27,7 @@ BM25_LINK = model_hyperlink("https://github.com/castorini/pyserini", "BM25")
27
 
28
  BENCHMARK_VERSION_LIST = [
29
  "AIR-Bench_24.04",
30
- # "AIR-Bench_24.05",
31
  ]
32
 
33
  LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]
 
27
 
28
  BENCHMARK_VERSION_LIST = [
29
  "AIR-Bench_24.04",
30
+ "AIR-Bench_24.05",
31
  ]
32
 
33
  LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]