Spaces:
Running
Running
update results
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
@@ -120,6 +120,17 @@
|
|
120 |
"Total Puzzles": 1000,
|
121 |
"Reason Lens": "1324.55"
|
122 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
{
|
124 |
"Model": "gemini-1.5-pro",
|
125 |
"Mode": "sampling",
|
@@ -361,5 +372,16 @@
|
|
361 |
"Hard Puzzle Acc": "0.00",
|
362 |
"Total Puzzles": 1000,
|
363 |
"Reason Lens": "1592.60"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
}
|
365 |
]
|
|
|
120 |
"Total Puzzles": 1000,
|
121 |
"Reason Lens": "1324.55"
|
122 |
},
|
123 |
+
{
|
124 |
+
"Model": "gpt-4o-mini-2024-07-18",
|
125 |
+
"Mode": "greedy",
|
126 |
+
"Puzzle Acc": "20.10",
|
127 |
+
"Cell Acc": "41.26",
|
128 |
+
"No answer": "0.10",
|
129 |
+
"Easy Puzzle Acc": "62.50",
|
130 |
+
"Hard Puzzle Acc": "3.61",
|
131 |
+
"Total Puzzles": 1000,
|
132 |
+
"Reason Lens": "943.52"
|
133 |
+
},
|
134 |
{
|
135 |
"Model": "gemini-1.5-pro",
|
136 |
"Mode": "sampling",
|
|
|
372 |
"Hard Puzzle Acc": "0.00",
|
373 |
"Total Puzzles": 1000,
|
374 |
"Reason Lens": "1592.60"
|
375 |
+
},
|
376 |
+
{
|
377 |
+
"Model": "gemma-2-27b-it@vllm",
|
378 |
+
"Mode": "greedy",
|
379 |
+
"Puzzle Acc": "0.47",
|
380 |
+
"Cell Acc": "0.31",
|
381 |
+
"No answer": "96.23",
|
382 |
+
"Easy Puzzle Acc": "2.08",
|
383 |
+
"Hard Puzzle Acc": "0.00",
|
384 |
+
"Total Puzzles": 212,
|
385 |
+
"Reason Lens": "1280.62"
|
386 |
}
|
387 |
]
|
model_info.json
CHANGED
@@ -32,6 +32,7 @@
|
|
32 |
"gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
|
33 |
"gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
|
34 |
"gpt-4o-2024-05-13": {"pretty_name": "gpt-4o-2024-05-13", "hf_model_id": "https://platform.openai.com/"},
|
|
|
35 |
"gpt-4-turbo-2024-04-09": {"pretty_name": "gpt-4-turbo-2024-04-09", "hf_model_id": "https://platform.openai.com/"},
|
36 |
"gpt-4-0314": {"pretty_name": "gpt-4-0314", "hf_model_id": "https://platform.openai.com/"},
|
37 |
"tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "allenai/tulu-2-dpo-70b"},
|
|
|
32 |
"gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
|
33 |
"gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
|
34 |
"gpt-4o-2024-05-13": {"pretty_name": "gpt-4o-2024-05-13", "hf_model_id": "https://platform.openai.com/"},
|
35 |
+
"gpt-4o-mini-2024-07-18": {"pretty_name": "gpt-4o-mini-2024-07-18", "hf_model_id": "https://platform.openai.com/"},
|
36 |
"gpt-4-turbo-2024-04-09": {"pretty_name": "gpt-4-turbo-2024-04-09", "hf_model_id": "https://platform.openai.com/"},
|
37 |
"gpt-4-0314": {"pretty_name": "gpt-4-0314", "hf_model_id": "https://platform.openai.com/"},
|
38 |
"tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "allenai/tulu-2-dpo-70b"},
|