Spestly committed on
Commit
56743aa
·
verified ·
1 Parent(s): 558b1c4

Upload results.html

Browse files
Files changed (1) hide show
  1. results.html +732 -0
results.html ADDED
@@ -0,0 +1,732 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <style type="text/css">
2
+ #T_a7af5 td {
3
+ overflow-wrap: break-word;
4
+ max-width: 1px;
5
+ }
6
+ #T_a7af5 .col_heading {
7
+ width: 25.0%;
8
+ }
9
+ #T_a7af5_row15_col0, #T_a7af5_row118_col1 {
10
+ background-color: #f7cbe4;
11
+ color: #000000;
12
+ }
13
+ #T_a7af5_row15_col1, #T_a7af5_row31_col3 {
14
+ background-color: #f9eff4;
15
+ color: #000000;
16
+ }
17
+ #T_a7af5_row15_col2, #T_a7af5_row58_col3, #T_a7af5_row109_col0 {
18
+ background-color: #f4bfdf;
19
+ color: #000000;
20
+ }
21
+ #T_a7af5_row15_col3, #T_a7af5_row28_col2, #T_a7af5_row46_col2, #T_a7af5_row85_col0, #T_a7af5_row107_col3 {
22
+ background-color: #f6c7e3;
23
+ color: #000000;
24
+ }
25
+ #T_a7af5_row19_col0 {
26
+ background-color: #a9d874;
27
+ color: #000000;
28
+ }
29
+ #T_a7af5_row19_col1 {
30
+ background-color: #549825;
31
+ color: #f1f1f1;
32
+ }
33
+ #T_a7af5_row19_col2, #T_a7af5_row70_col0, #T_a7af5_row115_col0 {
34
+ background-color: #f7f7f6;
35
+ color: #000000;
36
+ }
37
+ #T_a7af5_row19_col3 {
38
+ background-color: #f1f6ea;
39
+ color: #000000;
40
+ }
41
+ #T_a7af5_row22_col0 {
42
+ background-color: #f5f7f3;
43
+ color: #000000;
44
+ }
45
+ #T_a7af5_row22_col1 {
46
+ background-color: #edf6df;
47
+ color: #000000;
48
+ }
49
+ #T_a7af5_row22_col2, #T_a7af5_row22_col3, #T_a7af5_row37_col2, #T_a7af5_row70_col1 {
50
+ background-color: #f4f7f0;
51
+ color: #000000;
52
+ }
53
+ #T_a7af5_row25_col0, #T_a7af5_row100_col2, #T_a7af5_row145_col1 {
54
+ background-color: #eeabd2;
55
+ color: #000000;
56
+ }
57
+ #T_a7af5_row25_col1 {
58
+ background-color: #eff6e4;
59
+ color: #000000;
60
+ }
61
+ #T_a7af5_row25_col2, #T_a7af5_row40_col2 {
62
+ background-color: #df7cb1;
63
+ color: #f1f1f1;
64
+ }
65
+ #T_a7af5_row25_col3 {
66
+ background-color: #fbd9ec;
67
+ color: #000000;
68
+ }
69
+ #T_a7af5_row28_col0, #T_a7af5_row130_col0 {
70
+ background-color: #fbe7f2;
71
+ color: #000000;
72
+ }
73
+ #T_a7af5_row28_col1, #T_a7af5_row73_col1 {
74
+ background-color: #ecf6de;
75
+ color: #000000;
76
+ }
77
+ #T_a7af5_row28_col3 {
78
+ background-color: #f2badc;
79
+ color: #000000;
80
+ }
81
+ #T_a7af5_row31_col0, #T_a7af5_row105_col3 {
82
+ background-color: #f9f1f5;
83
+ color: #000000;
84
+ }
85
+ #T_a7af5_row31_col1, #T_a7af5_row37_col1 {
86
+ background-color: #e8f5d5;
87
+ color: #000000;
88
+ }
89
+ #T_a7af5_row31_col2, #T_a7af5_row40_col1 {
90
+ background-color: #f9eef4;
91
+ color: #000000;
92
+ }
93
+ #T_a7af5_row34_col0, #T_a7af5_row43_col0, #T_a7af5_row124_col1 {
94
+ background-color: #d24c97;
95
+ color: #f1f1f1;
96
+ }
97
+ #T_a7af5_row34_col1 {
98
+ background-color: #faeaf2;
99
+ color: #000000;
100
+ }
101
+ #T_a7af5_row34_col2, #T_a7af5_row67_col3 {
102
+ background-color: #e283b7;
103
+ color: #f1f1f1;
104
+ }
105
+ #T_a7af5_row34_col3 {
106
+ background-color: #fbe6f1;
107
+ color: #000000;
108
+ }
109
+ #T_a7af5_row37_col0, #T_a7af5_row49_col0 {
110
+ background-color: #f7f6f7;
111
+ color: #000000;
112
+ }
113
+ #T_a7af5_row37_col3, #T_a7af5_row46_col3, #T_a7af5_row52_col3 {
114
+ background-color: #f8f3f6;
115
+ color: #000000;
116
+ }
117
+ #T_a7af5_row40_col0 {
118
+ background-color: #e388ba;
119
+ color: #f1f1f1;
120
+ }
121
+ #T_a7af5_row40_col3 {
122
+ background-color: #eeadd4;
123
+ color: #000000;
124
+ }
125
+ #T_a7af5_row43_col1 {
126
+ background-color: #f6c9e3;
127
+ color: #000000;
128
+ }
129
+ #T_a7af5_row43_col2, #T_a7af5_row67_col2, #T_a7af5_row79_col1 {
130
+ background-color: #cf4191;
131
+ color: #f1f1f1;
132
+ }
133
+ #T_a7af5_row43_col3 {
134
+ background-color: #d34f99;
135
+ color: #f1f1f1;
136
+ }
137
+ #T_a7af5_row46_col0 {
138
+ background-color: #fde2f0;
139
+ color: #000000;
140
+ }
141
+ #T_a7af5_row46_col1 {
142
+ background-color: #bbe28a;
143
+ color: #000000;
144
+ }
145
+ #T_a7af5_row49_col1 {
146
+ background-color: #e9f5d6;
147
+ color: #000000;
148
+ }
149
+ #T_a7af5_row49_col2, #T_a7af5_row67_col1, #T_a7af5_row148_col0 {
150
+ background-color: #ea9fca;
151
+ color: #000000;
152
+ }
153
+ #T_a7af5_row49_col3 {
154
+ background-color: #f0b2d7;
155
+ color: #000000;
156
+ }
157
+ #T_a7af5_row52_col0, #T_a7af5_row142_col0 {
158
+ background-color: #f6f7f5;
159
+ color: #000000;
160
+ }
161
+ #T_a7af5_row52_col1 {
162
+ background-color: #c0e593;
163
+ color: #000000;
164
+ }
165
+ #T_a7af5_row52_col2, #T_a7af5_row58_col1, #T_a7af5_row64_col1, #T_a7af5_row109_col2 {
166
+ background-color: #fce5f1;
167
+ color: #000000;
168
+ }
169
+ #T_a7af5_row55_col0, #T_a7af5_row90_col1 {
170
+ background-color: #f5c6e2;
171
+ color: #000000;
172
+ }
173
+ #T_a7af5_row55_col1 {
174
+ background-color: #fad6ea;
175
+ color: #000000;
176
+ }
177
+ #T_a7af5_row55_col2 {
178
+ background-color: #f8cee6;
179
+ color: #000000;
180
+ }
181
+ #T_a7af5_row55_col3 {
182
+ background-color: #c82884;
183
+ color: #f1f1f1;
184
+ }
185
+ #T_a7af5_row58_col0, #T_a7af5_row121_col1 {
186
+ background-color: #e897c4;
187
+ color: #000000;
188
+ }
189
+ #T_a7af5_row58_col2, #T_a7af5_row79_col0, #T_a7af5_row107_col0 {
190
+ background-color: #e07eb3;
191
+ color: #f1f1f1;
192
+ }
193
+ #T_a7af5_row61_col0 {
194
+ background-color: #eba3cd;
195
+ color: #000000;
196
+ }
197
+ #T_a7af5_row61_col1, #T_a7af5_row88_col0, #T_a7af5_row88_col1, #T_a7af5_row88_col2, #T_a7af5_row88_col3, #T_a7af5_row109_col3 {
198
+ background-color: #f8f4f6;
199
+ color: #000000;
200
+ }
201
+ #T_a7af5_row61_col2, #T_a7af5_row136_col0 {
202
+ background-color: #dc70aa;
203
+ color: #f1f1f1;
204
+ }
205
+ #T_a7af5_row61_col3 {
206
+ background-color: #d861a2;
207
+ color: #f1f1f1;
208
+ }
209
+ #T_a7af5_row64_col0, #T_a7af5_row145_col3 {
210
+ background-color: #e181b5;
211
+ color: #f1f1f1;
212
+ }
213
+ #T_a7af5_row64_col2, #T_a7af5_row94_col3, #T_a7af5_row136_col3 {
214
+ background-color: #e48bbc;
215
+ color: #f1f1f1;
216
+ }
217
+ #T_a7af5_row64_col3, #T_a7af5_row138_col3, #T_a7af5_row148_col2 {
218
+ background-color: #f9d1e8;
219
+ color: #000000;
220
+ }
221
+ #T_a7af5_row67_col0, #T_a7af5_row76_col1 {
222
+ background-color: #cc368b;
223
+ color: #f1f1f1;
224
+ }
225
+ #T_a7af5_row70_col2, #T_a7af5_row73_col0 {
226
+ background-color: #f0f6e7;
227
+ color: #000000;
228
+ }
229
+ #T_a7af5_row70_col3, #T_a7af5_row73_col3 {
230
+ background-color: #f8f2f5;
231
+ color: #000000;
232
+ }
233
+ #T_a7af5_row73_col2 {
234
+ background-color: #f3f6ed;
235
+ color: #000000;
236
+ }
237
+ #T_a7af5_row76_col0, #T_a7af5_row133_col0 {
238
+ background-color: #b51370;
239
+ color: #f1f1f1;
240
+ }
241
+ #T_a7af5_row76_col2, #T_a7af5_row97_col2, #T_a7af5_row145_col0 {
242
+ background-color: #e590bf;
243
+ color: #f1f1f1;
244
+ }
245
+ #T_a7af5_row76_col3 {
246
+ background-color: #e692c1;
247
+ color: #000000;
248
+ }
249
+ #T_a7af5_row79_col2 {
250
+ background-color: #e286b8;
251
+ color: #f1f1f1;
252
+ }
253
+ #T_a7af5_row79_col3 {
254
+ background-color: #c72482;
255
+ color: #f1f1f1;
256
+ }
257
+ #T_a7af5_row82_col0 {
258
+ background-color: #d65a9f;
259
+ color: #f1f1f1;
260
+ }
261
+ #T_a7af5_row82_col1, #T_a7af5_row82_col3, #T_a7af5_row136_col2 {
262
+ background-color: #c92b86;
263
+ color: #f1f1f1;
264
+ }
265
+ #T_a7af5_row82_col2 {
266
+ background-color: #d14895;
267
+ color: #f1f1f1;
268
+ }
269
+ #T_a7af5_row85_col1, #T_a7af5_row94_col1 {
270
+ background-color: #f3bdde;
271
+ color: #000000;
272
+ }
273
+ #T_a7af5_row85_col2, #T_a7af5_row100_col1 {
274
+ background-color: #f5c2e0;
275
+ color: #000000;
276
+ }
277
+ #T_a7af5_row85_col3, #T_a7af5_row100_col3 {
278
+ background-color: #f1b7da;
279
+ color: #000000;
280
+ }
281
+ #T_a7af5_row90_col0, #T_a7af5_row105_col0, #T_a7af5_row136_col1 {
282
+ background-color: #f1b5d9;
283
+ color: #000000;
284
+ }
285
+ #T_a7af5_row90_col2 {
286
+ background-color: #e89ac6;
287
+ color: #000000;
288
+ }
289
+ #T_a7af5_row90_col3, #T_a7af5_row107_col2 {
290
+ background-color: #eda8d1;
291
+ color: #000000;
292
+ }
293
+ #T_a7af5_row94_col0, #T_a7af5_row127_col0 {
294
+ background-color: #eba1cb;
295
+ color: #000000;
296
+ }
297
+ #T_a7af5_row94_col2, #T_a7af5_row145_col2 {
298
+ background-color: #e58dbe;
299
+ color: #f1f1f1;
300
+ }
301
+ #T_a7af5_row97_col0, #T_a7af5_row107_col1 {
302
+ background-color: #f3bcdd;
303
+ color: #000000;
304
+ }
305
+ #T_a7af5_row97_col1 {
306
+ background-color: #f7cce5;
307
+ color: #000000;
308
+ }
309
+ #T_a7af5_row97_col3, #T_a7af5_row103_col2 {
310
+ background-color: #eca6cf;
311
+ color: #000000;
312
+ }
313
+ #T_a7af5_row100_col0 {
314
+ background-color: #efb0d6;
315
+ color: #000000;
316
+ }
317
+ #T_a7af5_row103_col0 {
318
+ background-color: #db6ca8;
319
+ color: #f1f1f1;
320
+ }
321
+ #T_a7af5_row103_col1 {
322
+ background-color: #e795c3;
323
+ color: #000000;
324
+ }
325
+ #T_a7af5_row103_col3, #T_a7af5_row138_col0 {
326
+ background-color: #f5c4e1;
327
+ color: #000000;
328
+ }
329
+ #T_a7af5_row105_col1 {
330
+ background-color: #fbd8eb;
331
+ color: #000000;
332
+ }
333
+ #T_a7af5_row105_col2, #T_a7af5_row127_col1 {
334
+ background-color: #fce4f0;
335
+ color: #000000;
336
+ }
337
+ #T_a7af5_row109_col1 {
338
+ background-color: #fbe9f2;
339
+ color: #000000;
340
+ }
341
+ #T_a7af5_row111_col0 {
342
+ background-color: #e99cc8;
343
+ color: #000000;
344
+ }
345
+ #T_a7af5_row111_col1, #T_a7af5_row138_col1 {
346
+ background-color: #fcdbed;
347
+ color: #000000;
348
+ }
349
+ #T_a7af5_row111_col2, #T_a7af5_row111_col3, #T_a7af5_row115_col2, #T_a7af5_row115_col3, #T_a7af5_row118_col2, #T_a7af5_row118_col3, #T_a7af5_row121_col2, #T_a7af5_row121_col3, #T_a7af5_row124_col2, #T_a7af5_row124_col3, #T_a7af5_row127_col2, #T_a7af5_row127_col3, #T_a7af5_row130_col2, #T_a7af5_row130_col3, #T_a7af5_row133_col2, #T_a7af5_row133_col3 {
350
+ background-color: #8e0152;
351
+ color: #f1f1f1;
352
+ }
353
+ #T_a7af5_row115_col1 {
354
+ background-color: #ddf1c1;
355
+ color: #000000;
356
+ }
357
+ #T_a7af5_row118_col0 {
358
+ background-color: #d4539b;
359
+ color: #f1f1f1;
360
+ }
361
+ #T_a7af5_row121_col0 {
362
+ background-color: #ca2f88;
363
+ color: #f1f1f1;
364
+ }
365
+ #T_a7af5_row124_col0 {
366
+ background-color: #b1116d;
367
+ color: #f1f1f1;
368
+ }
369
+ #T_a7af5_row130_col1 {
370
+ background-color: #d0ecad;
371
+ color: #000000;
372
+ }
373
+ #T_a7af5_row133_col1 {
374
+ background-color: #cb3289;
375
+ color: #f1f1f1;
376
+ }
377
+ #T_a7af5_row138_col2 {
378
+ background-color: #f8d0e7;
379
+ color: #000000;
380
+ }
381
+ #T_a7af5_row142_col1 {
382
+ background-color: #f3f7ef;
383
+ color: #000000;
384
+ }
385
+ #T_a7af5_row142_col2, #T_a7af5_row142_col3 {
386
+ background-color: #f8f5f6;
387
+ color: #000000;
388
+ }
389
+ #T_a7af5_row148_col1 {
390
+ background-color: #fad4e9;
391
+ color: #000000;
392
+ }
393
+ #T_a7af5_row148_col3 {
394
+ background-color: #fcdded;
395
+ color: #000000;
396
+ }
397
+ </style>
398
+ <table id="T_a7af5">
399
+ <thead>
400
+ <tr>
401
+ <th class="blank level0" >&nbsp;</th>
402
+ <th id="T_a7af5_level0_col0" class="col_heading level0 col0" >Spestly/Atlas-Pro-1.5B-Preview</th>
403
+ <th id="T_a7af5_level0_col1" class="col_heading level0 col1" >Spestly/Atlas-Pro-7B-Preview</th>
404
+ <th id="T_a7af5_level0_col2" class="col_heading level0 col2" >deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B</th>
405
+ <th id="T_a7af5_level0_col3" class="col_heading level0 col3" >deepseek-ai/DeepSeek-R1-Distill-Qwen-7B</th>
406
+ </tr>
407
+ </thead>
408
+ <tbody>
409
+ <tr>
410
+ <th id="T_a7af5_level0_row15" class="row_heading level0 row15" >bbh.acc_norm</th>
411
+ <td id="T_a7af5_row15_col0" class="data row15 col0" >0.348030</td>
412
+ <td id="T_a7af5_row15_col1" class="data row15 col1" >0.465891</td>
413
+ <td id="T_a7af5_row15_col2" class="data row15 col2" >0.321298</td>
414
+ <td id="T_a7af5_row15_col3" class="data row15 col3" >0.341087</td>
415
+ </tr>
416
+ <tr>
417
+ <th id="T_a7af5_level0_row19" class="row_heading level0 row19" >bbh_boolean_expressions.acc_norm</th>
418
+ <td id="T_a7af5_row19_col0" class="data row19 col0" >0.724000</td>
419
+ <td id="T_a7af5_row19_col1" class="data row19 col1" >0.884000</td>
420
+ <td id="T_a7af5_row19_col2" class="data row19 col2" >0.500000</td>
421
+ <td id="T_a7af5_row19_col3" class="data row19 col3" >0.532000</td>
422
+ </tr>
423
+ <tr>
424
+ <th id="T_a7af5_level0_row22" class="row_heading level0 row22" >bbh_causal_judgement.acc_norm</th>
425
+ <td id="T_a7af5_row22_col0" class="data row22 col0" >0.508021</td>
426
+ <td id="T_a7af5_row22_col1" class="data row22 col1" >0.561497</td>
427
+ <td id="T_a7af5_row22_col2" class="data row22 col2" >0.518717</td>
428
+ <td id="T_a7af5_row22_col3" class="data row22 col3" >0.518717</td>
429
+ </tr>
430
+ <tr>
431
+ <th id="T_a7af5_level0_row25" class="row_heading level0 row25" >bbh_date_understanding.acc_norm</th>
432
+ <td id="T_a7af5_row25_col0" class="data row25 col0" >0.284000</td>
433
+ <td id="T_a7af5_row25_col1" class="data row25 col1" >0.548000</td>
434
+ <td id="T_a7af5_row25_col2" class="data row25 col2" >0.208000</td>
435
+ <td id="T_a7af5_row25_col3" class="data row25 col3" >0.384000</td>
436
+ </tr>
437
+ <tr>
438
+ <th id="T_a7af5_level0_row28" class="row_heading level0 row28" >bbh_disambiguation_qa.acc_norm</th>
439
+ <td id="T_a7af5_row28_col0" class="data row28 col0" >0.432000</td>
440
+ <td id="T_a7af5_row28_col1" class="data row28 col1" >0.564000</td>
441
+ <td id="T_a7af5_row28_col2" class="data row28 col2" >0.340000</td>
442
+ <td id="T_a7af5_row28_col3" class="data row28 col3" >0.312000</td>
443
+ </tr>
444
+ <tr>
445
+ <th id="T_a7af5_level0_row31" class="row_heading level0 row31" >bbh_formal_fallacies.acc_norm</th>
446
+ <td id="T_a7af5_row31_col0" class="data row31 col0" >0.476000</td>
447
+ <td id="T_a7af5_row31_col1" class="data row31 col1" >0.588000</td>
448
+ <td id="T_a7af5_row31_col2" class="data row31 col2" >0.464000</td>
449
+ <td id="T_a7af5_row31_col3" class="data row31 col3" >0.468000</td>
450
+ </tr>
451
+ <tr>
452
+ <th id="T_a7af5_level0_row34" class="row_heading level0 row34" >bbh_geometric_shapes.acc_norm</th>
453
+ <td id="T_a7af5_row34_col0" class="data row34 col0" >0.156000</td>
454
+ <td id="T_a7af5_row34_col1" class="data row34 col1" >0.444000</td>
455
+ <td id="T_a7af5_row34_col2" class="data row34 col2" >0.220000</td>
456
+ <td id="T_a7af5_row34_col3" class="data row34 col3" >0.428000</td>
457
+ </tr>
458
+ <tr>
459
+ <th id="T_a7af5_level0_row37" class="row_heading level0 row37" >bbh_hyperbaton.acc_norm</th>
460
+ <td id="T_a7af5_row37_col0" class="data row37 col0" >0.496000</td>
461
+ <td id="T_a7af5_row37_col1" class="data row37 col1" >0.588000</td>
462
+ <td id="T_a7af5_row37_col2" class="data row37 col2" >0.516000</td>
463
+ <td id="T_a7af5_row37_col3" class="data row37 col3" >0.484000</td>
464
+ </tr>
465
+ <tr>
466
+ <th id="T_a7af5_level0_row40" class="row_heading level0 row40" >bbh_logical_deduction_five_objects.acc_norm</th>
467
+ <td id="T_a7af5_row40_col0" class="data row40 col0" >0.228000</td>
468
+ <td id="T_a7af5_row40_col1" class="data row40 col1" >0.464000</td>
469
+ <td id="T_a7af5_row40_col2" class="data row40 col2" >0.208000</td>
470
+ <td id="T_a7af5_row40_col3" class="data row40 col3" >0.288000</td>
471
+ </tr>
472
+ <tr>
473
+ <th id="T_a7af5_level0_row43" class="row_heading level0 row43" >bbh_logical_deduction_seven_objects.acc_norm</th>
474
+ <td id="T_a7af5_row43_col0" class="data row43 col0" >0.156000</td>
475
+ <td id="T_a7af5_row43_col1" class="data row43 col1" >0.344000</td>
476
+ <td id="T_a7af5_row43_col2" class="data row43 col2" >0.144000</td>
477
+ <td id="T_a7af5_row43_col3" class="data row43 col3" >0.160000</td>
478
+ </tr>
479
+ <tr>
480
+ <th id="T_a7af5_level0_row46" class="row_heading level0 row46" >bbh_logical_deduction_three_objects.acc_norm</th>
481
+ <td id="T_a7af5_row46_col0" class="data row46 col0" >0.408000</td>
482
+ <td id="T_a7af5_row46_col1" class="data row46 col1" >0.692000</td>
483
+ <td id="T_a7af5_row46_col2" class="data row46 col2" >0.340000</td>
484
+ <td id="T_a7af5_row46_col3" class="data row46 col3" >0.484000</td>
485
+ </tr>
486
+ <tr>
487
+ <th id="T_a7af5_level0_row49" class="row_heading level0 row49" >bbh_movie_recommendation.acc_norm</th>
488
+ <td id="T_a7af5_row49_col0" class="data row49 col0" >0.496000</td>
489
+ <td id="T_a7af5_row49_col1" class="data row49 col1" >0.584000</td>
490
+ <td id="T_a7af5_row49_col2" class="data row49 col2" >0.264000</td>
491
+ <td id="T_a7af5_row49_col3" class="data row49 col3" >0.296000</td>
492
+ </tr>
493
+ <tr>
494
+ <th id="T_a7af5_level0_row52" class="row_heading level0 row52" >bbh_navigate.acc_norm</th>
495
+ <td id="T_a7af5_row52_col0" class="data row52 col0" >0.504000</td>
496
+ <td id="T_a7af5_row52_col1" class="data row52 col1" >0.680000</td>
497
+ <td id="T_a7af5_row52_col2" class="data row52 col2" >0.420000</td>
498
+ <td id="T_a7af5_row52_col3" class="data row52 col3" >0.484000</td>
499
+ </tr>
500
+ <tr>
501
+ <th id="T_a7af5_level0_row55" class="row_heading level0 row55" >bbh_object_counting.acc_norm</th>
502
+ <td id="T_a7af5_row55_col0" class="data row55 col0" >0.336000</td>
503
+ <td id="T_a7af5_row55_col1" class="data row55 col1" >0.376000</td>
504
+ <td id="T_a7af5_row55_col2" class="data row55 col2" >0.356000</td>
505
+ <td id="T_a7af5_row55_col3" class="data row55 col3" >0.116000</td>
506
+ </tr>
507
+ <tr>
508
+ <th id="T_a7af5_level0_row58" class="row_heading level0 row58" >bbh_penguins_in_a_table.acc_norm</th>
509
+ <td id="T_a7af5_row58_col0" class="data row58 col0" >0.253425</td>
510
+ <td id="T_a7af5_row58_col1" class="data row58 col1" >0.424658</td>
511
+ <td id="T_a7af5_row58_col2" class="data row58 col2" >0.212329</td>
512
+ <td id="T_a7af5_row58_col3" class="data row58 col3" >0.321918</td>
513
+ </tr>
514
+ <tr>
515
+ <th id="T_a7af5_level0_row61" class="row_heading level0 row61" >bbh_reasoning_about_colored_objects.acc_norm</th>
516
+ <td id="T_a7af5_row61_col0" class="data row61 col0" >0.272000</td>
517
+ <td id="T_a7af5_row61_col1" class="data row61 col1" >0.488000</td>
518
+ <td id="T_a7af5_row61_col2" class="data row61 col2" >0.192000</td>
519
+ <td id="T_a7af5_row61_col3" class="data row61 col3" >0.176000</td>
520
+ </tr>
521
+ <tr>
522
+ <th id="T_a7af5_level0_row64" class="row_heading level0 row64" >bbh_ruin_names.acc_norm</th>
523
+ <td id="T_a7af5_row64_col0" class="data row64 col0" >0.216000</td>
524
+ <td id="T_a7af5_row64_col1" class="data row64 col1" >0.424000</td>
525
+ <td id="T_a7af5_row64_col2" class="data row64 col2" >0.232000</td>
526
+ <td id="T_a7af5_row64_col3" class="data row64 col3" >0.364000</td>
527
+ </tr>
528
+ <tr>
529
+ <th id="T_a7af5_level0_row67" class="row_heading level0 row67" >bbh_salient_translation_error_detection.acc_norm</th>
530
+ <td id="T_a7af5_row67_col0" class="data row67 col0" >0.132000</td>
531
+ <td id="T_a7af5_row67_col1" class="data row67 col1" >0.264000</td>
532
+ <td id="T_a7af5_row67_col2" class="data row67 col2" >0.144000</td>
533
+ <td id="T_a7af5_row67_col3" class="data row67 col3" >0.220000</td>
534
+ </tr>
535
+ <tr>
536
+ <th id="T_a7af5_level0_row70" class="row_heading level0 row70" >bbh_snarks.acc_norm</th>
537
+ <td id="T_a7af5_row70_col0" class="data row70 col0" >0.500000</td>
538
+ <td id="T_a7af5_row70_col1" class="data row70 col1" >0.516854</td>
539
+ <td id="T_a7af5_row70_col2" class="data row70 col2" >0.539326</td>
540
+ <td id="T_a7af5_row70_col3" class="data row70 col3" >0.477528</td>
541
+ </tr>
542
+ <tr>
543
+ <th id="T_a7af5_level0_row73" class="row_heading level0 row73" >bbh_sports_understanding.acc_norm</th>
544
+ <td id="T_a7af5_row73_col0" class="data row73 col0" >0.540000</td>
545
+ <td id="T_a7af5_row73_col1" class="data row73 col1" >0.564000</td>
546
+ <td id="T_a7af5_row73_col2" class="data row73 col2" >0.524000</td>
547
+ <td id="T_a7af5_row73_col3" class="data row73 col3" >0.480000</td>
548
+ </tr>
549
+ <tr>
550
+ <th id="T_a7af5_level0_row76" class="row_heading level0 row76" >bbh_temporal_sequences.acc_norm</th>
551
+ <td id="T_a7af5_row76_col0" class="data row76 col0" >0.072000</td>
552
+ <td id="T_a7af5_row76_col1" class="data row76 col1" >0.132000</td>
553
+ <td id="T_a7af5_row76_col2" class="data row76 col2" >0.240000</td>
554
+ <td id="T_a7af5_row76_col3" class="data row76 col3" >0.244000</td>
555
+ </tr>
556
+ <tr>
557
+ <th id="T_a7af5_level0_row79" class="row_heading level0 row79" >bbh_tracking_shuffled_objects_five_objects.acc_norm</th>
558
+ <td id="T_a7af5_row79_col0" class="data row79 col0" >0.212000</td>
559
+ <td id="T_a7af5_row79_col1" class="data row79 col1" >0.144000</td>
560
+ <td id="T_a7af5_row79_col2" class="data row79 col2" >0.224000</td>
561
+ <td id="T_a7af5_row79_col3" class="data row79 col3" >0.112000</td>
562
+ </tr>
563
+ <tr>
564
+ <th id="T_a7af5_level0_row82" class="row_heading level0 row82" >bbh_tracking_shuffled_objects_seven_objects.acc_norm</th>
565
+ <td id="T_a7af5_row82_col0" class="data row82 col0" >0.168000</td>
566
+ <td id="T_a7af5_row82_col1" class="data row82 col1" >0.120000</td>
567
+ <td id="T_a7af5_row82_col2" class="data row82 col2" >0.152000</td>
568
+ <td id="T_a7af5_row82_col3" class="data row82 col3" >0.120000</td>
569
+ </tr>
570
+ <tr>
571
+ <th id="T_a7af5_level0_row85" class="row_heading level0 row85" >bbh_tracking_shuffled_objects_three_objects.acc_norm</th>
572
+ <td id="T_a7af5_row85_col0" class="data row85 col0" >0.340000</td>
573
+ <td id="T_a7af5_row85_col1" class="data row85 col1" >0.320000</td>
574
+ <td id="T_a7af5_row85_col2" class="data row85 col2" >0.332000</td>
575
+ <td id="T_a7af5_row85_col3" class="data row85 col3" >0.304000</td>
576
+ </tr>
577
+ <tr>
578
+ <th id="T_a7af5_level0_row88" class="row_heading level0 row88" >bbh_web_of_lies.acc_norm</th>
579
+ <td id="T_a7af5_row88_col0" class="data row88 col0" >0.488000</td>
580
+ <td id="T_a7af5_row88_col1" class="data row88 col1" >0.488000</td>
581
+ <td id="T_a7af5_row88_col2" class="data row88 col2" >0.488000</td>
582
+ <td id="T_a7af5_row88_col3" class="data row88 col3" >0.488000</td>
583
+ </tr>
584
+ <tr>
585
+ <th id="T_a7af5_level0_row90" class="row_heading level0 row90" >gpqa.acc_norm</th>
586
+ <td id="T_a7af5_row90_col0" class="data row90 col0" >0.296980</td>
587
+ <td id="T_a7af5_row90_col1" class="data row90 col1" >0.337248</td>
588
+ <td id="T_a7af5_row90_col2" class="data row90 col2" >0.255872</td>
589
+ <td id="T_a7af5_row90_col3" class="data row90 col3" >0.279362</td>
590
+ </tr>
591
+ <tr>
592
+ <th id="T_a7af5_level0_row94" class="row_heading level0 row94" >gpqa_diamond.acc_norm</th>
593
+ <td id="T_a7af5_row94_col0" class="data row94 col0" >0.267677</td>
594
+ <td id="T_a7af5_row94_col1" class="data row94 col1" >0.318182</td>
595
+ <td id="T_a7af5_row94_col2" class="data row94 col2" >0.237374</td>
596
+ <td id="T_a7af5_row94_col3" class="data row94 col3" >0.232323</td>
597
+ </tr>
598
+ <tr>
599
+ <th id="T_a7af5_level0_row97" class="row_heading level0 row97" >gpqa_extended.acc_norm</th>
600
+ <td id="T_a7af5_row97_col0" class="data row97 col0" >0.313187</td>
601
+ <td id="T_a7af5_row97_col1" class="data row97 col1" >0.351648</td>
602
+ <td id="T_a7af5_row97_col2" class="data row97 col2" >0.239927</td>
603
+ <td id="T_a7af5_row97_col3" class="data row97 col3" >0.276557</td>
604
+ </tr>
605
+ <tr>
606
+ <th id="T_a7af5_level0_row100" class="row_heading level0 row100" >gpqa_main.acc_norm</th>
607
+ <td id="T_a7af5_row100_col0" class="data row100 col0" >0.290179</td>
608
+ <td id="T_a7af5_row100_col1" class="data row100 col1" >0.328125</td>
609
+ <td id="T_a7af5_row100_col2" class="data row100 col2" >0.283482</td>
610
+ <td id="T_a7af5_row100_col3" class="data row100 col3" >0.303571</td>
611
+ </tr>
612
+ <tr>
613
+ <th id="T_a7af5_level0_row103" class="row_heading level0 row103" >ifeval.prompt_level_strict_acc</th>
614
+ <td id="T_a7af5_row103_col0" class="data row103 col0" >0.188540</td>
615
+ <td id="T_a7af5_row103_col1" class="data row103 col1" >0.249538</td>
616
+ <td id="T_a7af5_row103_col2" class="data row103 col2" >0.275416</td>
617
+ <td id="T_a7af5_row103_col3" class="data row103 col3" >0.332717</td>
618
+ </tr>
619
+ <tr>
620
+ <th id="T_a7af5_level0_row105" class="row_heading level0 row105" >ifeval.inst_level_strict_acc</th>
621
+ <td id="T_a7af5_row105_col0" class="data row105 col0" >0.297362</td>
622
+ <td id="T_a7af5_row105_col1" class="data row105 col1" >0.381295</td>
623
+ <td id="T_a7af5_row105_col2" class="data row105 col2" >0.417266</td>
624
+ <td id="T_a7af5_row105_col3" class="data row105 col3" >0.474820</td>
625
+ </tr>
626
+ <tr>
627
+ <th id="T_a7af5_level0_row107" class="row_heading level0 row107" >ifeval.prompt_level_loose_acc</th>
628
+ <td id="T_a7af5_row107_col0" class="data row107 col0" >0.214418</td>
629
+ <td id="T_a7af5_row107_col1" class="data row107 col1" >0.314233</td>
630
+ <td id="T_a7af5_row107_col2" class="data row107 col2" >0.280961</td>
631
+ <td id="T_a7af5_row107_col3" class="data row107 col3" >0.340111</td>
632
+ </tr>
633
+ <tr>
634
+ <th id="T_a7af5_level0_row109" class="row_heading level0 row109" >ifeval.inst_level_loose_acc</th>
635
+ <td id="T_a7af5_row109_col0" class="data row109 col0" >0.323741</td>
636
+ <td id="T_a7af5_row109_col1" class="data row109 col1" >0.437650</td>
637
+ <td id="T_a7af5_row109_col2" class="data row109 col2" >0.423261</td>
638
+ <td id="T_a7af5_row109_col3" class="data row109 col3" >0.484412</td>
639
+ </tr>
640
+ <tr>
641
+ <th id="T_a7af5_level0_row111" class="row_heading level0 row111" >math_hard.exact_match</th>
642
+ <td id="T_a7af5_row111_col0" class="data row111 col0" >0.258308</td>
643
+ <td id="T_a7af5_row111_col1" class="data row111 col1" >0.388973</td>
644
+ <td id="T_a7af5_row111_col2" class="data row111 col2" >0.000000</td>
645
+ <td id="T_a7af5_row111_col3" class="data row111 col3" >0.000000</td>
646
+ </tr>
647
+ <tr>
648
+ <th id="T_a7af5_level0_row115" class="row_heading level0 row115" >math_algebra_hard.exact_match</th>
649
+ <td id="T_a7af5_row115_col0" class="data row115 col0" >0.501629</td>
650
+ <td id="T_a7af5_row115_col1" class="data row115 col1" >0.618893</td>
651
+ <td id="T_a7af5_row115_col2" class="data row115 col2" >0.000000</td>
652
+ <td id="T_a7af5_row115_col3" class="data row115 col3" >0.000000</td>
653
+ </tr>
654
+ <tr>
655
+ <th id="T_a7af5_level0_row118" class="row_heading level0 row118" >math_counting_and_prob_hard.exact_match</th>
656
+ <td id="T_a7af5_row118_col0" class="data row118 col0" >0.162602</td>
657
+ <td id="T_a7af5_row118_col1" class="data row118 col1" >0.349593</td>
658
+ <td id="T_a7af5_row118_col2" class="data row118 col2" >0.000000</td>
659
+ <td id="T_a7af5_row118_col3" class="data row118 col3" >0.000000</td>
660
+ </tr>
661
+ <tr>
662
+ <th id="T_a7af5_level0_row121" class="row_heading level0 row121" >math_geometry_hard.exact_match</th>
663
+ <td id="T_a7af5_row121_col0" class="data row121 col0" >0.121212</td>
664
+ <td id="T_a7af5_row121_col1" class="data row121 col1" >0.250000</td>
665
+ <td id="T_a7af5_row121_col2" class="data row121 col2" >0.000000</td>
666
+ <td id="T_a7af5_row121_col3" class="data row121 col3" >0.000000</td>
667
+ </tr>
668
+ <tr>
669
+ <th id="T_a7af5_level0_row124" class="row_heading level0 row124" >math_intermediate_algebra_hard.exact_match</th>
670
+ <td id="T_a7af5_row124_col0" class="data row124 col0" >0.064286</td>
671
+ <td id="T_a7af5_row124_col1" class="data row124 col1" >0.153571</td>
672
+ <td id="T_a7af5_row124_col2" class="data row124 col2" >0.000000</td>
673
+ <td id="T_a7af5_row124_col3" class="data row124 col3" >0.000000</td>
674
+ </tr>
675
+ <tr>
676
+ <th id="T_a7af5_level0_row127" class="row_heading level0 row127" >math_num_theory_hard.exact_match</th>
677
+ <td id="T_a7af5_row127_col0" class="data row127 col0" >0.266234</td>
678
+ <td id="T_a7af5_row127_col1" class="data row127 col1" >0.415584</td>
679
+ <td id="T_a7af5_row127_col2" class="data row127 col2" >0.000000</td>
680
+ <td id="T_a7af5_row127_col3" class="data row127 col3" >0.000000</td>
681
+ </tr>
682
+ <tr>
683
+ <th id="T_a7af5_level0_row130" class="row_heading level0 row130" >math_prealgebra_hard.exact_match</th>
684
+ <td id="T_a7af5_row130_col0" class="data row130 col0" >0.430052</td>
685
+ <td id="T_a7af5_row130_col1" class="data row130 col1" >0.647668</td>
686
+ <td id="T_a7af5_row130_col2" class="data row130 col2" >0.000000</td>
687
+ <td id="T_a7af5_row130_col3" class="data row130 col3" >0.000000</td>
688
+ </tr>
689
+ <tr>
690
+ <th id="T_a7af5_level0_row133" class="row_heading level0 row133" >math_precalculus_hard.exact_match</th>
691
+ <td id="T_a7af5_row133_col0" class="data row133 col0" >0.074074</td>
692
+ <td id="T_a7af5_row133_col1" class="data row133 col1" >0.125926</td>
693
+ <td id="T_a7af5_row133_col2" class="data row133 col2" >0.000000</td>
694
+ <td id="T_a7af5_row133_col3" class="data row133 col3" >0.000000</td>
695
+ </tr>
696
+ <tr>
697
+ <th id="T_a7af5_level0_row136" class="row_heading level0 row136" >mmlu_pro.acc</th>
698
+ <td id="T_a7af5_row136_col0" class="data row136 col0" >0.192487</td>
699
+ <td id="T_a7af5_row136_col1" class="data row136 col1" >0.297041</td>
700
+ <td id="T_a7af5_row136_col2" class="data row136 col2" >0.118684</td>
701
+ <td id="T_a7af5_row136_col3" class="data row136 col3" >0.232131</td>
702
+ </tr>
703
+ <tr>
704
+ <th id="T_a7af5_level0_row138" class="row_heading level0 row138" >musr.acc_norm</th>
705
+ <td id="T_a7af5_row138_col0" class="data row138 col0" >0.334656</td>
706
+ <td id="T_a7af5_row138_col1" class="data row138 col1" >0.390212</td>
707
+ <td id="T_a7af5_row138_col2" class="data row138 col2" >0.362434</td>
708
+ <td id="T_a7af5_row138_col3" class="data row138 col3" >0.365079</td>
709
+ </tr>
710
+ <tr>
711
+ <th id="T_a7af5_level0_row142" class="row_heading level0 row142" >musr_murder_mysteries.acc_norm</th>
712
+ <td id="T_a7af5_row142_col0" class="data row142 col0" >0.504000</td>
713
+ <td id="T_a7af5_row142_col1" class="data row142 col1" >0.520000</td>
714
+ <td id="T_a7af5_row142_col2" class="data row142 col2" >0.492000</td>
715
+ <td id="T_a7af5_row142_col3" class="data row142 col3" >0.492000</td>
716
+ </tr>
717
+ <tr>
718
+ <th id="T_a7af5_level0_row145" class="row_heading level0 row145" >musr_object_placements.acc_norm</th>
719
+ <td id="T_a7af5_row145_col0" class="data row145 col0" >0.238281</td>
720
+ <td id="T_a7af5_row145_col1" class="data row145 col1" >0.281250</td>
721
+ <td id="T_a7af5_row145_col2" class="data row145 col2" >0.234375</td>
722
+ <td id="T_a7af5_row145_col3" class="data row145 col3" >0.214844</td>
723
+ </tr>
724
+ <tr>
725
+ <th id="T_a7af5_level0_row148" class="row_heading level0 row148" >musr_team_allocation.acc_norm</th>
726
+ <td id="T_a7af5_row148_col0" class="data row148 col0" >0.264000</td>
727
+ <td id="T_a7af5_row148_col1" class="data row148 col1" >0.372000</td>
728
+ <td id="T_a7af5_row148_col2" class="data row148 col2" >0.364000</td>
729
+ <td id="T_a7af5_row148_col3" class="data row148 col3" >0.392000</td>
730
+ </tr>
731
+ </tbody>
732
+ </table>