yuchenlin committed on
Commit
7302659
1 Parent(s): 9abf560

update results

Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json CHANGED
@@ -120,6 +120,17 @@
120
  "Total Puzzles": 1000,
121
  "Reason Lens": "1324.55"
122
  },
 
 
 
 
 
 
 
 
 
 
 
123
  {
124
  "Model": "gemini-1.5-pro",
125
  "Mode": "sampling",
@@ -361,5 +372,16 @@
361
  "Hard Puzzle Acc": "0.00",
362
  "Total Puzzles": 1000,
363
  "Reason Lens": "1592.60"
 
 
 
 
 
 
 
 
 
 
 
364
  }
365
  ]
 
120
  "Total Puzzles": 1000,
121
  "Reason Lens": "1324.55"
122
  },
123
+ {
124
+ "Model": "gpt-4o-mini-2024-07-18",
125
+ "Mode": "greedy",
126
+ "Puzzle Acc": "20.10",
127
+ "Cell Acc": "41.26",
128
+ "No answer": "0.10",
129
+ "Easy Puzzle Acc": "62.50",
130
+ "Hard Puzzle Acc": "3.61",
131
+ "Total Puzzles": 1000,
132
+ "Reason Lens": "943.52"
133
+ },
134
  {
135
  "Model": "gemini-1.5-pro",
136
  "Mode": "sampling",
 
372
  "Hard Puzzle Acc": "0.00",
373
  "Total Puzzles": 1000,
374
  "Reason Lens": "1592.60"
375
+ },
376
+ {
377
+ "Model": "gemma-2-27b-it@vllm",
378
+ "Mode": "greedy",
379
+ "Puzzle Acc": "0.47",
380
+ "Cell Acc": "0.31",
381
+ "No answer": "96.23",
382
+ "Easy Puzzle Acc": "2.08",
383
+ "Hard Puzzle Acc": "0.00",
384
+ "Total Puzzles": 212,
385
+ "Reason Lens": "1280.62"
386
  }
387
  ]
model_info.json CHANGED
@@ -32,6 +32,7 @@
32
  "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
33
  "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
34
  "gpt-4o-2024-05-13": {"pretty_name": "gpt-4o-2024-05-13", "hf_model_id": "https://platform.openai.com/"},
 
35
  "gpt-4-turbo-2024-04-09": {"pretty_name": "gpt-4-turbo-2024-04-09", "hf_model_id": "https://platform.openai.com/"},
36
  "gpt-4-0314": {"pretty_name": "gpt-4-0314", "hf_model_id": "https://platform.openai.com/"},
37
  "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "allenai/tulu-2-dpo-70b"},
 
32
  "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
33
  "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
34
  "gpt-4o-2024-05-13": {"pretty_name": "gpt-4o-2024-05-13", "hf_model_id": "https://platform.openai.com/"},
35
+ "gpt-4o-mini-2024-07-18": {"pretty_name": "gpt-4o-mini-2024-07-18", "hf_model_id": "https://platform.openai.com/"},
36
  "gpt-4-turbo-2024-04-09": {"pretty_name": "gpt-4-turbo-2024-04-09", "hf_model_id": "https://platform.openai.com/"},
37
  "gpt-4-0314": {"pretty_name": "gpt-4-0314", "hf_model_id": "https://platform.openai.com/"},
38
  "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "allenai/tulu-2-dpo-70b"},