meg-huggingface committed
Commit 595e24c · 1 Parent(s): b266265

Read evals code

Files changed (4):
  1. app.py +95 -93
  2. src/about.py +6 -5
  3. src/envs.py +3 -3
  4. src/leaderboard/read_evals.py +2 -2
app.py CHANGED
@@ -140,8 +140,8 @@ def filter_models(
     return filtered_df


-ui = gr.Blocks(css=custom_css)
-with ui:
+demo = gr.Blocks(css=custom_css)
+with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

@@ -150,36 +150,46 @@ with ui:
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
+                        with gr.Column():
+                            shown_columns = gr.CheckboxGroup(
+                                choices=[
+                                    c.name
+                                    for c in fields(AutoEvalColumn)
+                                    if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced and not c.dummy
+                                ],
+                                value=[
+                                    c.name
+                                    for c in fields(AutoEvalColumn)
+                                    if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced
+                                ],
+                                label="Select metrics to show",
+                                elem_id="column-select",
+                                interactive=True,
+                            )
                     with gr.Row():
-                        with gr.Column(scale=1):
-                            with gr.Row():
-                                with gr.Column():
-                                    shown_columns = gr.CheckboxGroup(
-                                        choices=[
-                                            c.name
-                                            for c in fields(AutoEvalColumn)
-                                            if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced and not c.dummy
-                                        ],
-                                        value=[
-                                            c.name
-                                            for c in fields(AutoEvalColumn)
-                                            if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced
-                                        ],
-                                        label="Select metrics to show",
-                                        elem_id="column-select",
-                                        interactive=True,
-                                    )
-                        with gr.Column(scale=3):
+                        with gr.Column():
                             for c in fields(AutoEvalColumn):
                                 if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.advanced and not c.dummy:
-                                    gr.Markdown("**" + c.name + "**. " + c.cite, elem_classes="markdown-text")
-                    with gr.Row():
-                        with gr.Accordion("Advanced options [WIP]", open=False):
+                                    with gr.Row():
+                                        gr.Markdown("**" + c.name + "**. " + c.cite)
+                        with gr.Column(min_width=320):
+                            #with gr.Box(elem_id="box-filter"):
+                            filter_columns_precision = gr.CheckboxGroup(
+                                label="Select precision levels to include",
+                                choices=[i.value.name for i in Precision],
+                                value=[i.value.name for i in Precision],
+                                interactive=True,
+                                elem_id="filter-columns-precision",
+                            )
+                            filter_columns_size = gr.CheckboxGroup(
+                                label="Select model sizes (in billions of parameters) to include",
+                                choices=list(NUMERIC_INTERVALS.keys()),
+                                value=list(NUMERIC_INTERVALS.keys()),
+                                interactive=True,
+                                elem_id="filter-columns-size",
+                            )
+                    #with gr.Row():
+                    with gr.Accordion("Advanced options [WIP]", open=False):
                             shown_columns_advanced = gr.CheckboxGroup(
                                 choices=[
                                     c.name
@@ -198,30 +208,20 @@ with ui:
                             deleted_models_visibility = gr.Checkbox(
                                 value=False, label="Show gated/private/deleted models", interactive=True, visible=True,
                             )
-                with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Select model types to include",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Select precision levels to include",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Select model sizes (in billions of parameters) to include",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )

+            filter_columns_type = gr.CheckboxGroup(
+                label="Select model types to include",
+                choices=[t.to_str() for t in ModelType],
+                value=[t.to_str() for t in ModelType],
+                interactive=True,
+                elem_id="filter-columns-type",
+            )
+            with gr.Row():
+                search_bar = gr.Textbox(
+                    placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                    show_label=False,
+                    elem_id="search-bar",
+                )
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
@@ -277,45 +277,6 @@ with ui:
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

@@ -363,6 +324,47 @@ with ui:
                 submission_result,
             )

+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("Submission Status", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Column():
+                with gr.Accordion(
+                    f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        finished_eval_table = gr.components.Dataframe(
+                            value=finished_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+                with gr.Accordion(
+                    f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        running_eval_table = gr.components.Dataframe(
+                            value=running_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
+                with gr.Accordion(
+                    f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        pending_eval_table = gr.components.Dataframe(
+                            value=pending_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
@@ -375,6 +377,6 @@ with ui:

 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
+scheduler.add_job(launch_backend, "interval", seconds=100)
 scheduler.start()
-ui.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
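Note: taken together, the app.py changes rename the Blocks object from `ui` to `demo`, move the finished/running/pending evaluation-queue accordions out of the Submit tab into a new "Submission Status" tab (id=4), relocate the model-type filter and search bar, and drop the stale comment on the `launch_backend` job. Below is a minimal, self-contained sketch of the resulting tab-plus-scheduler wiring, assuming Gradio 4.x and APScheduler; the job bodies are placeholders, not this app's real `restart_space`/`launch_backend` implementations.

```python
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler

def restart_space():
    # Placeholder: the real job restarts the Space via the Hub API.
    pass

def launch_backend():
    # Placeholder: the real job kicks off the evaluation backend.
    pass

demo = gr.Blocks()
with demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("🚀 Submit here! ", id=3):
            gr.Markdown("# ✉️✨ Submit your model here!")
        with gr.TabItem("Submission Status", id=4):
            # Finished / running / pending queues each get their own accordion.
            with gr.Accordion("✅ Finished Evaluations (0)", open=False):
                gr.Dataframe(headers=["model", "status"], row_count=5)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.add_job(launch_backend, "interval", seconds=100)
scheduler.start()

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=40).launch()
```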
src/about.py CHANGED
@@ -15,8 +15,9 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("toxigen", "acc", "Toxicity (lower is better)", cite="_ToxiGen: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection._ Hartvigsen et al., ACL 2022.")
-    task1 = Task("truthfulqa_gen", "acc", "Truthful QA", cite="TODO")
-    task2 = Task("crows_pairs_english", "acc", "CrowS-Pairs English", cite="TODO")
+    task1 = Task("truthfulqa_gen", "bleurt_acc", "Truthful QA", cite="_TruthfulQA: Measuring How Models Mimic Human Falsehoods._ Lin et al., ACL 2022.")
+    # https://aclanthology.org/2020.emnlp-main.154/
+    task2 = Task("crows_pairs_english", "pct_stereotype", "CrowS-Pairs English", cite="_CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models._ Nangia et al., EMNLP 2020.")
     #task2 = Task("anli_r1", "acc", "ANLI", cite="_Adversarial NLI: A New Benchmark for Natural Language Understanding._ Nie et al., ACL 2020.")
     #task3 = Task("logiqa", "acc_norm", "LogiQA", cite="_LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning_. Liu et al., IJCAI 2020.")

@@ -36,9 +37,9 @@ LLM_BENCHMARKS_TEXT = f"""
 ## How it works

 ## Reproducibility
-To reproduce the toxicity results, here is the command you can run:
+To reproduce the toxicity results, here is the command you can run using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the EleutherAI LM Evaluation Harness:

-```python main.py --model=hf-causal-experimental --model_args="pretrained=<your_model>,use_accelerate=True" --tasks=toxigen --batch_size=1 --output_path=<output_path>```
+```python main.py --model=hf-causal-experimental --model_args="pretrained=<your_model>,use_accelerate=True" --tasks=<task> --batch_size=1 --output_path=<output_path>```

 """

@@ -69,7 +70,7 @@ When we add extra information about models to the leaderboard, it will be automa
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+If everything is done, check you can launch the EleutherAIHarness on your model locally. See About tab for exact command.
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
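Note: the about.py change fills in the real metric keys (`bleurt_acc`, `pct_stereotype`) and citations for TruthfulQA and CrowS-Pairs, and pins the reproducibility command to a specific lm-evaluation-harness commit. As a rough illustration of how a `Task` entry is consumed when results are read back, here is a sketch; the `Task` field names and the results-JSON shape are assumptions based on the standard leaderboard template and harness output, not code from this commit.

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # task key in the results json
    metric: str     # metric key in the results json
    col_name: str   # column name shown on the leaderboard
    cite: str = ""  # citation rendered next to the column

class Tasks(Enum):
    task1 = Task("truthfulqa_gen", "bleurt_acc", "Truthful QA", cite="Lin et al., ACL 2022.")
    task2 = Task("crows_pairs_english", "pct_stereotype", "CrowS-Pairs English", cite="Nangia et al., EMNLP 2020.")

# Hypothetical harness-style results dict, keyed by benchmark name then metric name.
results_json = {"results": {"truthfulqa_gen": {"bleurt_acc": 0.42},
                            "crows_pairs_english": {"pct_stereotype": 0.61}}}

for task in Tasks:
    t = task.value
    score = results_json["results"].get(t.benchmark, {}).get(t.metric)
    print(f"{t.col_name}: {score}")
```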
src/envs.py CHANGED
@@ -20,11 +20,11 @@ LIMIT = None # 20
 # Define some input/output variables.
 # Don't forget to create a results and requests Dataset for your org
 # Leaderboard Space
-REPO_ID = f"{OWNER}/leaderboard"
+REPO_ID = f"{OWNER}/leaderboard-backend"
 # Leaderboard input Dataset
-QUEUE_REPO = f"{OWNER}/requests"
+QUEUE_REPO = f"{OWNER}/requests-tmp"
 # Leaderboard output Dataset
-RESULTS_REPO = f"{OWNER}/results"
+RESULTS_REPO = f"{OWNER}/results-tmp"

 # If you setup a cache, set HF_HOME.
 CACHE_PATH=os.getenv("HF_HOME", ".")
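Note: src/envs.py now points the Space at the backend repo and temporary request/result datasets. For context, here is a sketch of how these constants are typically consumed in the leaderboard template; the download calls, local paths, and `OWNER` value below are illustrative assumptions, not part of this commit.

```python
import os
from huggingface_hub import snapshot_download

OWNER = "example-org"  # hypothetical; the real value is defined earlier in src/envs.py
QUEUE_REPO = f"{OWNER}/requests-tmp"
RESULTS_REPO = f"{OWNER}/results-tmp"

CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")

# Sync the request and result datasets locally before the app reads them.
snapshot_download(repo_id=QUEUE_REPO, repo_type="dataset", local_dir=EVAL_REQUESTS_PATH)
snapshot_download(repo_id=RESULTS_REPO, repo_type="dataset", local_dir=EVAL_RESULTS_PATH)
```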
src/leaderboard/read_evals.py CHANGED
@@ -71,7 +71,7 @@ class EvalResult:
         results = {}
         for task in Tasks:
             print("Looking at task:")
-            print(task)
+            print(task.value)
             try:
                 task = task.value
             except Exception as e:
@@ -169,7 +169,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     model_result_filepaths = []

     print("Getting raw eval results from:")
-    print(os.walk(results_path))
+    print(results_path)
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
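Note: both read_evals.py tweaks replace debug prints of unhelpful objects (the `Tasks` enum member, the `os.walk` generator) with the underlying values. For reference, a minimal sketch of the directory walk these prints accompany; the final `extend` and the function wrapper are assumptions based on the standard template, not code from this commit.

```python
import os

def collect_result_files(results_path: str) -> list[str]:
    """Collect result files: only directories containing nothing but JSON count."""
    print("Getting raw eval results from:")
    print(results_path)
    model_result_filepaths = []
    for root, _, files in os.walk(results_path):
        # Skip empty directories and directories holding non-JSON files.
        if len(files) == 0 or any(not f.endswith(".json") for f in files):
            continue
        model_result_filepaths.extend(os.path.join(root, f) for f in files)
    return model_result_filepaths
```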