Spaces:
Running
Running
update results
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
@@ -285,6 +285,17 @@
|
|
285 |
"Total Puzzles": 1000,
|
286 |
"Reason Lens": "1216.40"
|
287 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
288 |
{
|
289 |
"Model": "Meta-Llama-3-8B-Instruct",
|
290 |
"Mode": "sampling",
|
@@ -318,6 +329,17 @@
|
|
318 |
"Total Puzzles": 1000,
|
319 |
"Reason Lens": "1074.80"
|
320 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
321 |
{
|
322 |
"Model": "Qwen2-7B-Instruct",
|
323 |
"Mode": "greedy",
|
@@ -328,5 +350,16 @@
|
|
328 |
"Hard Puzzle Acc": "0.28",
|
329 |
"Total Puzzles": 1000,
|
330 |
"Reason Lens": "1473.23"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
331 |
}
|
332 |
]
|
|
|
285 |
"Total Puzzles": 1000,
|
286 |
"Reason Lens": "1216.40"
|
287 |
},
|
288 |
+
{
|
289 |
+
"Model": "Yi-1.5-34B-Chat",
|
290 |
+
"Mode": "greedy",
|
291 |
+
"Puzzle Acc": "11.50",
|
292 |
+
"Cell Acc": "32.73",
|
293 |
+
"No answer": "4.40",
|
294 |
+
"Easy Puzzle Acc": "37.50",
|
295 |
+
"Hard Puzzle Acc": "1.39",
|
296 |
+
"Total Puzzles": 1000,
|
297 |
+
"Reason Lens": "869.65"
|
298 |
+
},
|
299 |
{
|
300 |
"Model": "Meta-Llama-3-8B-Instruct",
|
301 |
"Mode": "sampling",
|
|
|
329 |
"Total Puzzles": 1000,
|
330 |
"Reason Lens": "1074.80"
|
331 |
},
|
332 |
+
{
|
333 |
+
"Model": "mathstral-7B-v0.1",
|
334 |
+
"Mode": "greedy",
|
335 |
+
"Puzzle Acc": "9.00",
|
336 |
+
"Cell Acc": "20.42",
|
337 |
+
"No answer": "36.00",
|
338 |
+
"Easy Puzzle Acc": "30.00",
|
339 |
+
"Hard Puzzle Acc": "0.83",
|
340 |
+
"Total Puzzles": 1000,
|
341 |
+
"Reason Lens": "1148.16"
|
342 |
+
},
|
343 |
{
|
344 |
"Model": "Qwen2-7B-Instruct",
|
345 |
"Mode": "greedy",
|
|
|
350 |
"Hard Puzzle Acc": "0.28",
|
351 |
"Total Puzzles": 1000,
|
352 |
"Reason Lens": "1473.23"
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"Model": "Yi-1.5-9B-Chat",
|
356 |
+
"Mode": "greedy",
|
357 |
+
"Puzzle Acc": "2.30",
|
358 |
+
"Cell Acc": "7.53",
|
359 |
+
"No answer": "11.30",
|
360 |
+
"Easy Puzzle Acc": "8.21",
|
361 |
+
"Hard Puzzle Acc": "0.00",
|
362 |
+
"Total Puzzles": 1000,
|
363 |
+
"Reason Lens": "1592.60"
|
364 |
}
|
365 |
]
|