ssmits committed
Commit f68b294
1 Parent(s): 75010bd

Update README.md

Files changed (1)
  1. README.md +131 -101
README.md CHANGED
@@ -196,7 +196,37 @@ outputs = pipeline(prompt, max_new_tokens=256, do_sample=True, temperature=0.7,
  print(outputs[0]["generated_text"])
  ```
 
- # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
+ ## 🏆 Evaluation
+
+ Initial benchmarks show interesting performance characteristics compared to the 72B model:
+
+ ### Strengths
+ The 95B model shows notable improvements in:
+
+ 1. **Mathematical Reasoning**
+ - Up to 5.83x improvement in algebra tasks
+ - 3.33x improvement in pre-algebra
+ - Consistent gains across geometry, number theory, and probability tasks
+ - Overall stronger performance in complex mathematical reasoning
+
+ 2. **Spatial & Object Understanding**
+ - 11% improvement in object placement tasks
+ - 7% better at tabular data interpretation
+ - Enhanced performance in logical deduction with multiple objects
+
+ 3. **Complex Language Tasks**
+ - 4% improvement in disambiguation tasks
+ - 2% better at movie recommendations
+ - Slight improvements in hyperbaton (complex word order) tasks
+
+ 4. **Creative & Analytical Reasoning**
+ - 10% improvement in murder mystery solving
+ - Better performance in tasks requiring creative problem-solving
+
+ ### Areas for Consideration
+ While the model shows improvements in specific areas, users should note that the 72B model still performs better in many general language and reasoning tasks. The 95B version appears to excel particularly in mathematical and spatial reasoning while maintaining comparable performance in other areas.
+
+ ### [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
  Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_ssmits__Qwen2.5-95B-Instruct)
 
  | Metric |Value|
@@ -212,103 +242,103 @@ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-le
 
  | Key | 72b Result | 95b Result | Difference | Which is Higher | Multiplier |
  |:--------------------------------------------------------------------------|-------------:|-------------:|-------------:|:------------------|:-------------|
- | leaderboard_musr.acc_norm,none | 0.419 | 0.427 | 0.008 | 95b | 1.02x |
- | leaderboard_bbh_sports_understanding.acc_norm,none | 0.892 | 0.876 | 0.016 | 72b | 0.98x |
- | leaderboard_bbh_logical_deduction_three_objects.acc_norm,none | 0.94 | 0.928 | 0.012 | 72b | 0.99x |
- | leaderboard_math_geometry_hard.exact_match,none | 0 | 0.008 | 0.008 | 95b | 0.00x |
- | leaderboard_gpqa.acc_norm,none | 0.375 | 0.364 | 0.011 | 72b | 0.97x |
- | leaderboard_math_hard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00x |
- | leaderboard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00x |
- | leaderboard.prompt_level_loose_acc,none | 0.861 | 0.839 | 0.022 | 72b | 0.97x |
- | leaderboard.prompt_level_strict_acc,none | 0.839 | 0.813 | 0.026 | 72b | 0.97x |
- | leaderboard.inst_level_loose_acc,none | 0.904 | 0.891 | 0.013 | 72b | 0.99x |
- | leaderboard.acc_norm,none | 0.641 | 0.622 | 0.02 | 72b | 0.97x |
- | leaderboard.inst_level_strict_acc,none | 0.888 | 0.873 | 0.016 | 72b | 0.98x |
- | leaderboard.acc,none | 0.563 | 0.522 | 0.041 | 72b | 0.93x |
- | leaderboard_bbh_causal_judgement.acc_norm,none | 0.668 | 0.663 | 0.005 | 72b | 0.99x |
- | leaderboard_bbh_salient_translation_error_detection.acc_norm,none | 0.668 | 0.588 | 0.08 | 72b | 0.88x |
- | leaderboard_gpqa_extended.acc_norm,none | 0.372 | 0.364 | 0.007 | 72b | 0.98x |
- | leaderboard_math_prealgebra_hard.exact_match,none | 0.047 | 0.155 | 0.109 | 95b | 3.33x |
- | leaderboard_math_algebra_hard.exact_match,none | 0.02 | 0.114 | 0.094 | 95b | 5.83x |
- | leaderboard_bbh_boolean_expressions.acc_norm,none | 0.936 | 0.92 | 0.016 | 72b | 0.98x |
- | leaderboard_math_num_theory_hard.exact_match,none | 0 | 0.058 | 0.058 | 95b | 0.00x |
- | leaderboard_bbh_movie_recommendation.acc_norm,none | 0.768 | 0.78 | 0.012 | 95b | 1.02x |
- | leaderboard_math_counting_and_prob_hard.exact_match,none | 0 | 0.024 | 0.024 | 95b | 0.00x |
- | leaderboard_math_intermediate_algebra_hard.exact_match,none | 0 | 0.004 | 0.004 | 95b | 0.00x |
- | leaderboard_ifeval.prompt_level_strict_acc,none | 0.839 | 0.813 | 0.026 | 72b | 0.97x |
- | leaderboard_ifeval.inst_level_strict_acc,none | 0.888 | 0.873 | 0.016 | 72b | 0.98x |
- | leaderboard_ifeval.inst_level_loose_acc,none | 0.904 | 0.891 | 0.013 | 72b | 0.99x |
- | leaderboard_ifeval.prompt_level_loose_acc,none | 0.861 | 0.839 | 0.022 | 72b | 0.97x |
- | leaderboard_bbh_snarks.acc_norm,none | 0.927 | 0.904 | 0.022 | 72b | 0.98x |
- | leaderboard_bbh_web_of_lies.acc_norm,none | 0.676 | 0.616 | 0.06 | 72b | 0.91x |
- | leaderboard_bbh_penguins_in_a_table.acc_norm,none | 0.719 | 0.767 | 0.048 | 95b | 1.07x |
- | leaderboard_bbh_hyperbaton.acc_norm,none | 0.892 | 0.9 | 0.008 | 95b | 1.01x |
- | leaderboard_bbh_object_counting.acc_norm,none | 0.612 | 0.544 | 0.068 | 72b | 0.89x |
- | leaderboard_musr_object_placements.acc_norm,none | 0.258 | 0.285 | 0.027 | 95b | 1.11x |
- | leaderboard_bbh_logical_deduction_five_objects.acc_norm,none | 0.704 | 0.592 | 0.112 | 72b | 0.84x |
- | leaderboard_musr_team_allocation.acc_norm,none | 0.456 | 0.396 | 0.06 | 72b | 0.87x |
- | leaderboard_bbh_navigate.acc_norm,none | 0.832 | 0.788 | 0.044 | 72b | 0.95x |
- | leaderboard_bbh_tracking_shuffled_objects_seven_objects.acc_norm,none | 0.34 | 0.304 | 0.036 | 72b | 0.89x |
- | leaderboard_bbh_formal_fallacies.acc_norm,none | 0.776 | 0.756 | 0.02 | 72b | 0.97x |
- | all.leaderboard_musr.acc_norm,none | 0.419 | 0.427 | 0.008 | 95b | 1.02x |
- | all.leaderboard_bbh_sports_understanding.acc_norm,none | 0.892 | 0.876 | 0.016 | 72b | 0.98x |
- | all.leaderboard_bbh_logical_deduction_three_objects.acc_norm,none | 0.94 | 0.928 | 0.012 | 72b | 0.99x |
- | all.leaderboard_math_geometry_hard.exact_match,none | 0 | 0.008 | 0.008 | 95b | 0.00x |
- | all.leaderboard_gpqa.acc_norm,none | 0.375 | 0.364 | 0.011 | 72b | 0.97x |
- | all.leaderboard_math_hard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00x |
- | all.leaderboard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00x |
- | all.leaderboard.prompt_level_loose_acc,none | 0.861 | 0.839 | 0.022 | 72b | 0.97x |
- | all.leaderboard.prompt_level_strict_acc,none | 0.839 | 0.813 | 0.026 | 72b | 0.97x |
- | all.leaderboard.inst_level_loose_acc,none | 0.904 | 0.891 | 0.013 | 72b | 0.99x |
- | all.leaderboard.acc_norm,none | 0.641 | 0.622 | 0.02 | 72b | 0.97x |
- | all.leaderboard.inst_level_strict_acc,none | 0.888 | 0.873 | 0.016 | 72b | 0.98x |
- | all.leaderboard.acc,none | 0.563 | 0.522 | 0.041 | 72b | 0.93x |
- | all.leaderboard_bbh_causal_judgement.acc_norm,none | 0.668 | 0.663 | 0.005 | 72b | 0.99x |
- | all.leaderboard_bbh_salient_translation_error_detection.acc_norm,none | 0.668 | 0.588 | 0.08 | 72b | 0.88x |
- | all.leaderboard_gpqa_extended.acc_norm,none | 0.372 | 0.364 | 0.007 | 72b | 0.98x |
- | all.leaderboard_math_prealgebra_hard.exact_match,none | 0.047 | 0.155 | 0.109 | 95b | 3.33x |
- | all.leaderboard_math_algebra_hard.exact_match,none | 0.02 | 0.114 | 0.094 | 95b | 5.83x |
- | all.leaderboard_bbh_boolean_expressions.acc_norm,none | 0.936 | 0.92 | 0.016 | 72b | 0.98x |
- | all.leaderboard_math_num_theory_hard.exact_match,none | 0 | 0.058 | 0.058 | 95b | 0.00x |
- | all.leaderboard_bbh_movie_recommendation.acc_norm,none | 0.768 | 0.78 | 0.012 | 95b | 1.02x |
- | all.leaderboard_math_counting_and_prob_hard.exact_match,none | 0 | 0.024 | 0.024 | 95b | 0.00x |
- | all.leaderboard_math_intermediate_algebra_hard.exact_match,none | 0 | 0.004 | 0.004 | 95b | 0.00x |
- | all.leaderboard_ifeval.prompt_level_strict_acc,none | 0.839 | 0.813 | 0.026 | 72b | 0.97x |
- | all.leaderboard_ifeval.inst_level_strict_acc,none | 0.888 | 0.873 | 0.016 | 72b | 0.98x |
- | all.leaderboard_ifeval.inst_level_loose_acc,none | 0.904 | 0.891 | 0.013 | 72b | 0.99x |
- | all.leaderboard_ifeval.prompt_level_loose_acc,none | 0.861 | 0.839 | 0.022 | 72b | 0.97x |
- | all.leaderboard_bbh_snarks.acc_norm,none | 0.927 | 0.904 | 0.022 | 72b | 0.98x |
- | all.leaderboard_bbh_web_of_lies.acc_norm,none | 0.676 | 0.616 | 0.06 | 72b | 0.91x |
- | all.leaderboard_bbh_penguins_in_a_table.acc_norm,none | 0.719 | 0.767 | 0.048 | 95b | 1.07x |
- | all.leaderboard_bbh_hyperbaton.acc_norm,none | 0.892 | 0.9 | 0.008 | 95b | 1.01x |
- | all.leaderboard_bbh_object_counting.acc_norm,none | 0.612 | 0.544 | 0.068 | 72b | 0.89x |
- | all.leaderboard_musr_object_placements.acc_norm,none | 0.258 | 0.285 | 0.027 | 95b | 1.11x |
- | all.leaderboard_bbh_logical_deduction_five_objects.acc_norm,none | 0.704 | 0.592 | 0.112 | 72b | 0.84x |
- | all.leaderboard_musr_team_allocation.acc_norm,none | 0.456 | 0.396 | 0.06 | 72b | 0.87x |
- | all.leaderboard_bbh_navigate.acc_norm,none | 0.832 | 0.788 | 0.044 | 72b | 0.95x |
- | all.leaderboard_bbh_tracking_shuffled_objects_seven_objects.acc_norm,none | 0.34 | 0.304 | 0.036 | 72b | 0.89x |
- | all.leaderboard_bbh_formal_fallacies.acc_norm,none | 0.776 | 0.756 | 0.02 | 72b | 0.97x |
- | all.leaderboard_gpqa_main.acc_norm,none | 0.375 | 0.355 | 0.02 | 72b | 0.95x |
- | all.leaderboard_bbh_disambiguation_qa.acc_norm,none | 0.744 | 0.772 | 0.028 | 95b | 1.04x |
- | all.leaderboard_bbh_tracking_shuffled_objects_five_objects.acc_norm,none | 0.32 | 0.284 | 0.036 | 72b | 0.89x |
- | all.leaderboard_bbh_date_understanding.acc_norm,none | 0.784 | 0.764 | 0.02 | 72b | 0.97x |
- | all.leaderboard_bbh_geometric_shapes.acc_norm,none | 0.464 | 0.412 | 0.052 | 72b | 0.89x |
- | all.leaderboard_bbh_reasoning_about_colored_objects.acc_norm,none | 0.864 | 0.84 | 0.024 | 72b | 0.97x |
- | all.leaderboard_musr_murder_mysteries.acc_norm,none | 0.548 | 0.604 | 0.056 | 95b | 1.10x |
- | all.leaderboard_bbh_ruin_names.acc_norm,none | 0.888 | 0.86 | 0.028 | 72b | 0.97x |
- | all.leaderboard_bbh_logical_deduction_seven_objects.acc_norm,none | 0.644 | 0.664 | 0.02 | 95b | 1.03x |
- | all.leaderboard_bbh.acc_norm,none | 0.726 | 0.701 | 0.025 | 72b | 0.97x |
- | all.leaderboard_bbh_temporal_sequences.acc_norm,none | 0.996 | 0.968 | 0.028 | 72b | 0.97x |
- | all.leaderboard_mmlu_pro.acc,none | 0.563 | 0.522 | 0.041 | 72b | 0.93x |
- | leaderboard_gpqa_main.acc_norm,none | 0.375 | 0.355 | 0.02 | 72b | 0.95x |
- | leaderboard_bbh_disambiguation_qa.acc_norm,none | 0.744 | 0.772 | 0.028 | 95b | 1.04x |
- | leaderboard_bbh_tracking_shuffled_objects_five_objects.acc_norm,none | 0.32 | 0.284 | 0.036 | 72b | 0.89x |
- | leaderboard_bbh_date_understanding.acc_norm,none | 0.784 | 0.764 | 0.02 | 72b | 0.97x |
- | leaderboard_bbh_geometric_shapes.acc_norm,none | 0.464 | 0.412 | 0.052 | 72b | 0.89x |
- | leaderboard_bbh_reasoning_about_colored_objects.acc_norm,none | 0.864 | 0.84 | 0.024 | 72b | 0.97x |
- | leaderboard_musr_murder_mysteries.acc_norm,none | 0.548 | 0.604 | 0.056 | 95b | 1.10x |
- | leaderboard_bbh_ruin_names.acc_norm,none | 0.888 | 0.86 | 0.028 | 72b | 0.97x |
- | leaderboard_bbh_logical_deduction_seven_objects.acc_norm,none | 0.644 | 0.664 | 0.02 | 95b | 1.03x |
- | leaderboard_bbh.acc_norm,none | 0.726 | 0.701 | 0.025 | 72b | 0.97x |
- | leaderboard_bbh_temporal_sequences.acc_norm,none | 0.996 | 0.968 | 0.028 | 72b | 0.97x |
- | leaderboard_mmlu_pro.acc,none | 0.563 | 0.522 | 0.041 | 72b | 0.93x |
+ | leaderboard_musr.acc_norm,none | 0.419 | 0.427 | 0.008 | 95b | 1.02 |
+ | leaderboard_bbh_sports_understanding.acc_norm,none | 0.892 | 0.876 | -0.016 | 72b | 0.98 |
+ | leaderboard_bbh_logical_deduction_three_objects.acc_norm,none | 0.94 | 0.928 | -0.012 | 72b | 0.99 |
+ | leaderboard_math_geometry_hard.exact_match,none | 0 | 0.008 | 0.008 | 95b | 0.00 |
+ | leaderboard_gpqa.acc_norm,none | 0.375 | 0.364 | -0.011 | 72b | 0.97 |
+ | leaderboard_math_hard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00 |
+ | leaderboard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00 |
+ | leaderboard.prompt_level_loose_acc,none | 0.861 | 0.839 | -0.022 | 72b | 0.97 |
+ | leaderboard.prompt_level_strict_acc,none | 0.839 | 0.813 | -0.026 | 72b | 0.97 |
+ | leaderboard.inst_level_loose_acc,none | 0.904 | 0.891 | -0.013 | 72b | 0.99 |
+ | leaderboard.acc_norm,none | 0.641 | 0.622 | -0.020 | 72b | 0.97 |
+ | leaderboard.inst_level_strict_acc,none | 0.888 | 0.873 | -0.016 | 72b | 0.98 |
+ | leaderboard.acc,none | 0.563 | 0.522 | -0.041 | 72b | 0.93 |
+ | leaderboard_bbh_causal_judgement.acc_norm,none | 0.668 | 0.663 | -0.005 | 72b | 0.99 |
+ | leaderboard_bbh_salient_translation_error_detection.acc_norm,none | 0.668 | 0.588 | -0.080 | 72b | 0.88 |
+ | leaderboard_gpqa_extended.acc_norm,none | 0.372 | 0.364 | -0.007 | 72b | 0.98 |
+ | leaderboard_math_prealgebra_hard.exact_match,none | 0.047 | 0.155 | 0.109 | 95b | 3.33 |
+ | leaderboard_math_algebra_hard.exact_match,none | 0.02 | 0.114 | 0.094 | 95b | 5.83 |
+ | leaderboard_bbh_boolean_expressions.acc_norm,none | 0.936 | 0.92 | -0.016 | 72b | 0.98 |
+ | leaderboard_math_num_theory_hard.exact_match,none | 0 | 0.058 | 0.058 | 95b | 0.00 |
+ | leaderboard_bbh_movie_recommendation.acc_norm,none | 0.768 | 0.78 | 0.012 | 95b | 1.02 |
+ | leaderboard_math_counting_and_prob_hard.exact_match,none | 0 | 0.024 | 0.024 | 95b | 0.00 |
+ | leaderboard_math_intermediate_algebra_hard.exact_match,none | 0 | 0.004 | 0.004 | 95b | 0.00 |
+ | leaderboard_ifeval.prompt_level_strict_acc,none | 0.839 | 0.813 | -0.026 | 72b | 0.97 |
+ | leaderboard_ifeval.inst_level_strict_acc,none | 0.888 | 0.873 | -0.016 | 72b | 0.98 |
+ | leaderboard_ifeval.inst_level_loose_acc,none | 0.904 | 0.891 | -0.013 | 72b | 0.99 |
+ | leaderboard_ifeval.prompt_level_loose_acc,none | 0.861 | 0.839 | -0.022 | 72b | 0.97 |
+ | leaderboard_bbh_snarks.acc_norm,none | 0.927 | 0.904 | -0.022 | 72b | 0.98 |
+ | leaderboard_bbh_web_of_lies.acc_norm,none | 0.676 | 0.616 | -0.060 | 72b | 0.91 |
+ | leaderboard_bbh_penguins_in_a_table.acc_norm,none | 0.719 | 0.767 | 0.048 | 95b | 1.07 |
+ | leaderboard_bbh_hyperbaton.acc_norm,none | 0.892 | 0.9 | 0.008 | 95b | 1.01 |
+ | leaderboard_bbh_object_counting.acc_norm,none | 0.612 | 0.544 | -0.068 | 72b | 0.89 |
+ | leaderboard_musr_object_placements.acc_norm,none | 0.258 | 0.285 | 0.027 | 95b | 1.11 |
+ | leaderboard_bbh_logical_deduction_five_objects.acc_norm,none | 0.704 | 0.592 | -0.112 | 72b | 0.84 |
+ | leaderboard_musr_team_allocation.acc_norm,none | 0.456 | 0.396 | -0.060 | 72b | 0.87 |
+ | leaderboard_bbh_navigate.acc_norm,none | 0.832 | 0.788 | -0.044 | 72b | 0.95 |
+ | leaderboard_bbh_tracking_shuffled_objects_seven_objects.acc_norm,none | 0.34 | 0.304 | -0.036 | 72b | 0.89 |
+ | leaderboard_bbh_formal_fallacies.acc_norm,none | 0.776 | 0.756 | -0.020 | 72b | 0.97 |
+ | all.leaderboard_musr.acc_norm,none | 0.419 | 0.427 | 0.008 | 95b | 1.02 |
+ | all.leaderboard_bbh_sports_understanding.acc_norm,none | 0.892 | 0.876 | -0.016 | 72b | 0.98 |
+ | all.leaderboard_bbh_logical_deduction_three_objects.acc_norm,none | 0.94 | 0.928 | -0.012 | 72b | 0.99 |
+ | all.leaderboard_math_geometry_hard.exact_match,none | 0 | 0.008 | 0.008 | 95b | 0.00 |
+ | all.leaderboard_gpqa.acc_norm,none | 0.375 | 0.364 | -0.011 | 72b | 0.97 |
+ | all.leaderboard_math_hard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00 |
+ | all.leaderboard.exact_match,none | 0.012 | 0.06 | 0.048 | 95b | 5.00 |
+ | all.leaderboard.prompt_level_loose_acc,none | 0.861 | 0.839 | -0.022 | 72b | 0.97 |
+ | all.leaderboard.prompt_level_strict_acc,none | 0.839 | 0.813 | -0.026 | 72b | 0.97 |
+ | all.leaderboard.inst_level_loose_acc,none | 0.904 | 0.891 | -0.013 | 72b | 0.99 |
+ | all.leaderboard.acc_norm,none | 0.641 | 0.622 | -0.020 | 72b | 0.97 |
+ | all.leaderboard.inst_level_strict_acc,none | 0.888 | 0.873 | -0.016 | 72b | 0.98 |
+ | all.leaderboard.acc,none | 0.563 | 0.522 | -0.041 | 72b | 0.93 |
+ | all.leaderboard_bbh_causal_judgement.acc_norm,none | 0.668 | 0.663 | -0.005 | 72b | 0.99 |
+ | all.leaderboard_bbh_salient_translation_error_detection.acc_norm,none | 0.668 | 0.588 | -0.080 | 72b | 0.88 |
+ | all.leaderboard_gpqa_extended.acc_norm,none | 0.372 | 0.364 | -0.007 | 72b | 0.98 |
+ | all.leaderboard_math_prealgebra_hard.exact_match,none | 0.047 | 0.155 | 0.109 | 95b | 3.33 |
+ | all.leaderboard_math_algebra_hard.exact_match,none | 0.02 | 0.114 | 0.094 | 95b | 5.83 |
+ | all.leaderboard_bbh_boolean_expressions.acc_norm,none | 0.936 | 0.92 | -0.016 | 72b | 0.98 |
+ | all.leaderboard_math_num_theory_hard.exact_match,none | 0 | 0.058 | 0.058 | 95b | 0.00 |
+ | all.leaderboard_bbh_movie_recommendation.acc_norm,none | 0.768 | 0.78 | 0.012 | 95b | 1.02 |
+ | all.leaderboard_math_counting_and_prob_hard.exact_match,none | 0 | 0.024 | 0.024 | 95b | 0.00 |
+ | all.leaderboard_math_intermediate_algebra_hard.exact_match,none | 0 | 0.004 | 0.004 | 95b | 0.00 |
+ | all.leaderboard_ifeval.prompt_level_strict_acc,none | 0.839 | 0.813 | -0.026 | 72b | 0.97 |
+ | all.leaderboard_ifeval.inst_level_strict_acc,none | 0.888 | 0.873 | -0.016 | 72b | 0.98 |
+ | all.leaderboard_ifeval.inst_level_loose_acc,none | 0.904 | 0.891 | -0.013 | 72b | 0.99 |
+ | all.leaderboard_ifeval.prompt_level_loose_acc,none | 0.861 | 0.839 | -0.022 | 72b | 0.97 |
+ | all.leaderboard_bbh_snarks.acc_norm,none | 0.927 | 0.904 | -0.022 | 72b | 0.98 |
+ | all.leaderboard_bbh_web_of_lies.acc_norm,none | 0.676 | 0.616 | -0.060 | 72b | 0.91 |
+ | all.leaderboard_bbh_penguins_in_a_table.acc_norm,none | 0.719 | 0.767 | 0.048 | 95b | 1.07 |
+ | all.leaderboard_bbh_hyperbaton.acc_norm,none | 0.892 | 0.9 | 0.008 | 95b | 1.01 |
+ | all.leaderboard_bbh_object_counting.acc_norm,none | 0.612 | 0.544 | -0.068 | 72b | 0.89 |
+ | all.leaderboard_musr_object_placements.acc_norm,none | 0.258 | 0.285 | 0.027 | 95b | 1.11 |
+ | all.leaderboard_bbh_logical_deduction_five_objects.acc_norm,none | 0.704 | 0.592 | -0.112 | 72b | 0.84 |
+ | all.leaderboard_musr_team_allocation.acc_norm,none | 0.456 | 0.396 | -0.060 | 72b | 0.87 |
+ | all.leaderboard_bbh_navigate.acc_norm,none | 0.832 | 0.788 | -0.044 | 72b | 0.95 |
+ | all.leaderboard_bbh_tracking_shuffled_objects_seven_objects.acc_norm,none | 0.34 | 0.304 | -0.036 | 72b | 0.89 |
+ | all.leaderboard_bbh_formal_fallacies.acc_norm,none | 0.776 | 0.756 | -0.020 | 72b | 0.97 |
+ | all.leaderboard_gpqa_main.acc_norm,none | 0.375 | 0.355 | -0.020 | 72b | 0.95 |
+ | all.leaderboard_bbh_disambiguation_qa.acc_norm,none | 0.744 | 0.772 | 0.028 | 95b | 1.04 |
+ | all.leaderboard_bbh_tracking_shuffled_objects_five_objects.acc_norm,none | 0.32 | 0.284 | -0.036 | 72b | 0.89 |
+ | all.leaderboard_bbh_date_understanding.acc_norm,none | 0.784 | 0.764 | -0.020 | 72b | 0.97 |
+ | all.leaderboard_bbh_geometric_shapes.acc_norm,none | 0.464 | 0.412 | -0.052 | 72b | 0.89 |
+ | all.leaderboard_bbh_reasoning_about_colored_objects.acc_norm,none | 0.864 | 0.84 | -0.024 | 72b | 0.97 |
+ | all.leaderboard_musr_murder_mysteries.acc_norm,none | 0.548 | 0.604 | 0.056 | 95b | 1.10 |
+ | all.leaderboard_bbh_ruin_names.acc_norm,none | 0.888 | 0.86 | -0.028 | 72b | 0.97 |
+ | all.leaderboard_bbh_logical_deduction_seven_objects.acc_norm,none | 0.644 | 0.664 | 0.020 | 95b | 1.03 |
+ | all.leaderboard_bbh.acc_norm,none | 0.726 | 0.701 | -0.025 | 72b | 0.97 |
+ | all.leaderboard_bbh_temporal_sequences.acc_norm,none | 0.996 | 0.968 | -0.028 | 72b | 0.97 |
+ | all.leaderboard_mmlu_pro.acc,none | 0.563 | 0.522 | -0.041 | 72b | 0.93 |
+ | leaderboard_gpqa_main.acc_norm,none | 0.375 | 0.355 | -0.020 | 72b | 0.95 |
+ | leaderboard_bbh_disambiguation_qa.acc_norm,none | 0.744 | 0.772 | 0.028 | 95b | 1.04 |
+ | leaderboard_bbh_tracking_shuffled_objects_five_objects.acc_norm,none | 0.32 | 0.284 | -0.036 | 72b | 0.89 |
+ | leaderboard_bbh_date_understanding.acc_norm,none | 0.784 | 0.764 | -0.020 | 72b | 0.97 |
+ | leaderboard_bbh_geometric_shapes.acc_norm,none | 0.464 | 0.412 | -0.052 | 72b | 0.89 |
+ | leaderboard_bbh_reasoning_about_colored_objects.acc_norm,none | 0.864 | 0.84 | -0.024 | 72b | 0.97 |
+ | leaderboard_musr_murder_mysteries.acc_norm,none | 0.548 | 0.604 | 0.056 | 95b | 1.10 |
+ | leaderboard_bbh_ruin_names.acc_norm,none | 0.888 | 0.86 | -0.028 | 72b | 0.97 |
+ | leaderboard_bbh_logical_deduction_seven_objects.acc_norm,none | 0.644 | 0.664 | 0.020 | 95b | 1.03 |
+ | leaderboard_bbh.acc_norm,none | 0.726 | 0.701 | -0.025 | 72b | 0.97 |
+ | leaderboard_bbh_temporal_sequences.acc_norm,none | 0.996 | 0.968 | -0.028 | 72b | 0.97 |
+ | leaderboard_mmlu_pro.acc,none | 0.563 | 0.522 | -0.041 | 72b | 0.93 |
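
Editor's note (not part of the commit): in the updated table, `Difference` is the 95b score minus the 72b score and `Multiplier` is the 95b score divided by the 72b score, reported as 0.00 where the 72b score is 0. The minimal sketch below illustrates that derivation; `compare` is a hypothetical helper, and because the table shows rounded scores, recomputing from the displayed values will not always reproduce the published multiplier exactly (for example 0.114 / 0.02 = 5.70 versus the listed 5.83, which was presumably computed from unrounded results).

```python
# Illustrative sketch: derive the comparison columns from the per-model scores
# shown in the table above. Not part of this repository or commit.

def compare(metric: str, result_72b: float, result_95b: float) -> dict:
    """Return difference, higher-scoring model, and multiplier for one metric."""
    difference = result_95b - result_72b
    higher = "95b" if result_95b > result_72b else "72b"
    # The table reports 0.00 when the 72b score is 0, so guard the division.
    multiplier = result_95b / result_72b if result_72b else 0.0
    return {
        "metric": metric,
        "difference": round(difference, 3),
        "higher": higher,
        "multiplier": round(multiplier, 2),
    }

# Values taken from the leaderboard_math_hard row above.
print(compare("leaderboard_math_hard.exact_match,none", 0.012, 0.06))
# {'metric': 'leaderboard_math_hard.exact_match,none', 'difference': 0.048, 'higher': '95b', 'multiplier': 5.0}
```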