Spaces:
Running
Running
{ | |
"time": "2025-01-24 15:10:27", | |
"results": { | |
"IO": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 37.83, | |
"Pass rate": 0.9992, | |
"Cost($)": 0.3328, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 586553, | |
"Total input tokens": 546990, | |
"Average input tokens": 415, | |
"Total output tokens": 39563, | |
"Average output tokens": 30 | |
}, | |
"AQuA": { | |
"Score": 38.98, | |
"Pass rate": 1.0, | |
"Cost($)": 0.038, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 42471, | |
"Total input tokens": 25701, | |
"Average input tokens": 101, | |
"Total output tokens": 16770, | |
"Average output tokens": 66 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 72.02, | |
"Pass rate": 0.9992, | |
"Cost($)": 0.0354, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 740483, | |
"Total input tokens": 617377, | |
"Average input tokens": 468, | |
"Total output tokens": 123106, | |
"Average output tokens": 93 | |
}, | |
"AQuA": { | |
"Score": 79.13, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0058, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 87742, | |
"Total input tokens": 33058, | |
"Average input tokens": 130, | |
"Total output tokens": 54684, | |
"Average output tokens": 215 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 88.4, | |
"Pass rate": 1.0, | |
"Cost($)": 3.3463, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 741446, | |
"Total input tokens": 542416, | |
"Average input tokens": 411, | |
"Total output tokens": 199030, | |
"Average output tokens": 151 | |
}, | |
"AQuA": { | |
"Score": 75.59, | |
"Pass rate": 0.9724, | |
"Cost($)": 1.1453, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 133752, | |
"Total input tokens": 25631, | |
"Average input tokens": 101, | |
"Total output tokens": 108121, | |
"Average output tokens": 426 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 86.58, | |
"Pass rate": 1.0, | |
"Cost($)": 0.4899, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 869060, | |
"Total input tokens": 555340, | |
"Average input tokens": 421, | |
"Total output tokens": 313720, | |
"Average output tokens": 238 | |
}, | |
"AQuA": { | |
"Score": 84.25, | |
"Pass rate": 0.9961, | |
"Cost($)": 0.0742, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 131604, | |
"Total input tokens": 25397, | |
"Average input tokens": 100, | |
"Total output tokens": 106207, | |
"Average output tokens": 418 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 92.27, | |
"Pass rate": 1.0, | |
"Cost($)": 0.4709, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 835275, | |
"Total input tokens": 583916, | |
"Average input tokens": 443, | |
"Total output tokens": 251359, | |
"Average output tokens": 191 | |
}, | |
"AQuA": { | |
"Score": 82.68, | |
"Pass rate": 0.9921, | |
"Cost($)": 0.0798, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 141567, | |
"Total input tokens": 32809, | |
"Average input tokens": 129, | |
"Total output tokens": 108758, | |
"Average output tokens": 428 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 57.24, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 887913, | |
"Total input tokens": 596229, | |
"Average input tokens": 452, | |
"Total output tokens": 291684, | |
"Average output tokens": 221 | |
}, | |
"AQuA": { | |
"Score": 78.74, | |
"Pass rate": 0.9843, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 137771, | |
"Total input tokens": 33271, | |
"Average input tokens": 131, | |
"Total output tokens": 104500, | |
"Average output tokens": 411 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 57.16, | |
"Pass rate": 0.9955, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1745429, | |
"Total input tokens": 550941, | |
"Average input tokens": 418, | |
"Total output tokens": 1194488, | |
"Average output tokens": 906 | |
}, | |
"AQuA": { | |
"Score": 51.18, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 133106, | |
"Total input tokens": 26459, | |
"Average input tokens": 104, | |
"Total output tokens": 106647, | |
"Average output tokens": 420 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 11.6, | |
"Pass rate": 0.9795, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1113728, | |
"Total input tokens": 679302, | |
"Average input tokens": 515, | |
"Total output tokens": 434426, | |
"Average output tokens": 329 | |
}, | |
"AQuA": { | |
"Score": 47.64, | |
"Pass rate": 0.9094, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 185041, | |
"Total input tokens": 50232, | |
"Average input tokens": 198, | |
"Total output tokens": 134809, | |
"Average output tokens": 531 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 16.68, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 736996, | |
"Total input tokens": 568530, | |
"Average input tokens": 431, | |
"Total output tokens": 168466, | |
"Average output tokens": 128 | |
}, | |
"AQuA": { | |
"Score": 29.13, | |
"Pass rate": 0.9764, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 71047, | |
"Total input tokens": 27937, | |
"Average input tokens": 110, | |
"Total output tokens": 43110, | |
"Average output tokens": 170 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 14.71, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 834897, | |
"Total input tokens": 568116, | |
"Average input tokens": 431, | |
"Total output tokens": 266781, | |
"Average output tokens": 202 | |
}, | |
"AQuA": { | |
"Score": 27.17, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 110415, | |
"Total input tokens": 27937, | |
"Average input tokens": 110, | |
"Total output tokens": 82478, | |
"Average output tokens": 325 | |
} | |
} | |
}, | |
"ReAct-Pro*": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 74.91, | |
"Pass rate": 0.9939, | |
"Cost($)": 3.4633, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 6646286, | |
"Total input tokens": 6506164, | |
"Average input tokens": 4933, | |
"Total output tokens": 140122, | |
"Average output tokens": 106 | |
}, | |
"AQuA": { | |
"Score": 64.57, | |
"Pass rate": 0.9803, | |
"Cost($)": 0.4928, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 903587, | |
"Total input tokens": 862614, | |
"Average input tokens": 3396, | |
"Total output tokens": 40973, | |
"Average output tokens": 161 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 85.6, | |
"Pass rate": 0.9962, | |
"Cost($)": 0.2512, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 5998639, | |
"Total input tokens": 5862016, | |
"Average input tokens": 4444, | |
"Total output tokens": 136623, | |
"Average output tokens": 104 | |
}, | |
"AQuA": { | |
"Score": 77.56, | |
"Pass rate": 0.9606, | |
"Cost($)": 0.0445, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 1032841, | |
"Total input tokens": 977890, | |
"Average input tokens": 3850, | |
"Total output tokens": 54951, | |
"Average output tokens": 216 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 63.31, | |
"Pass rate": 0.9955, | |
"Cost($)": 39.0751, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 14715887, | |
"Total input tokens": 14411173, | |
"Average input tokens": 10926, | |
"Total output tokens": 304714, | |
"Average output tokens": 231 | |
}, | |
"AQuA": { | |
"Score": 57.48, | |
"Pass rate": 0.9724, | |
"Cost($)": 2.304, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 692096, | |
"Total input tokens": 615589, | |
"Average input tokens": 2424, | |
"Total output tokens": 76507, | |
"Average output tokens": 301 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 87.26, | |
"Pass rate": 1.0, | |
"Cost($)": 10.5479, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 18710437, | |
"Total input tokens": 18160983, | |
"Average input tokens": 13769, | |
"Total output tokens": 549454, | |
"Average output tokens": 417 | |
}, | |
"AQuA": { | |
"Score": 73.23, | |
"Pass rate": 1.0, | |
"Cost($)": 0.3177, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 563603, | |
"Total input tokens": 441765, | |
"Average input tokens": 1739, | |
"Total output tokens": 121838, | |
"Average output tokens": 480 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 87.64, | |
"Pass rate": 0.9992, | |
"Cost($)": 10.1124, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 17937864, | |
"Total input tokens": 17038928, | |
"Average input tokens": 12918, | |
"Total output tokens": 898936, | |
"Average output tokens": 682 | |
}, | |
"AQuA": { | |
"Score": 79.13, | |
"Pass rate": 0.9961, | |
"Cost($)": 0.768, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 1362379, | |
"Total input tokens": 1119143, | |
"Average input tokens": 4406, | |
"Total output tokens": 243236, | |
"Average output tokens": 958 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 82.87, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 14850914, | |
"Total input tokens": 14355752, | |
"Average input tokens": 10884, | |
"Total output tokens": 495162, | |
"Average output tokens": 375 | |
}, | |
"AQuA": { | |
"Score": 74.41, | |
"Pass rate": 0.9921, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 695844, | |
"Total input tokens": 564165, | |
"Average input tokens": 2221, | |
"Total output tokens": 131679, | |
"Average output tokens": 518 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 67.78, | |
"Pass rate": 0.9856, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 22835767, | |
"Total input tokens": 21044978, | |
"Average input tokens": 15955, | |
"Total output tokens": 1790789, | |
"Average output tokens": 1358 | |
}, | |
"AQuA": { | |
"Score": 55.51, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 4340821, | |
"Total input tokens": 3764723, | |
"Average input tokens": 14822, | |
"Total output tokens": 576098, | |
"Average output tokens": 2268 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 33.51, | |
"Pass rate": 0.9795, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 35669989, | |
"Total input tokens": 30120070, | |
"Average input tokens": 22836, | |
"Total output tokens": 5549919, | |
"Average output tokens": 4208 | |
}, | |
"AQuA": { | |
"Score": 40.94, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 4428801, | |
"Total input tokens": 3592039, | |
"Average input tokens": 14142, | |
"Total output tokens": 836762, | |
"Average output tokens": 3294 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 24.87, | |
"Pass rate": 0.8021, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 9828001, | |
"Total input tokens": 9133603, | |
"Average input tokens": 6925, | |
"Total output tokens": 694398, | |
"Average output tokens": 526 | |
}, | |
"AQuA": { | |
"Score": 25.59, | |
"Pass rate": 0.9606, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 5072004, | |
"Total input tokens": 4555858, | |
"Average input tokens": 17936, | |
"Total output tokens": 516146, | |
"Average output tokens": 2032 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 7.66, | |
"Pass rate": 0.9522, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 55392611, | |
"Total input tokens": 52431343, | |
"Average input tokens": 39751, | |
"Total output tokens": 2961268, | |
"Average output tokens": 2245 | |
}, | |
"AQuA": { | |
"Score": 24.02, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 7170087, | |
"Total input tokens": 6344167, | |
"Average input tokens": 24977, | |
"Total output tokens": 825920, | |
"Average output tokens": 3252 | |
} | |
} | |
}, | |
"PoT": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 76.88, | |
"Pass rate": 0.9924, | |
"Cost($)": 0.6902, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1187080, | |
"Total input tokens": 1090418, | |
"Average input tokens": 827, | |
"Total output tokens": 96662, | |
"Average output tokens": 73 | |
}, | |
"AQuA": { | |
"Score": 59.45, | |
"Pass rate": 1.0, | |
"Cost($)": 0.1748, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 266654, | |
"Total input tokens": 225162, | |
"Average input tokens": 886, | |
"Total output tokens": 41492, | |
"Average output tokens": 163 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 79.61, | |
"Pass rate": 0.9257, | |
"Cost($)": 0.0576, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1288055, | |
"Total input tokens": 1170038, | |
"Average input tokens": 887, | |
"Total output tokens": 118017, | |
"Average output tokens": 89 | |
}, | |
"AQuA": { | |
"Score": 71.65, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0147, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 309436, | |
"Total input tokens": 259863, | |
"Average input tokens": 1023, | |
"Total output tokens": 49573, | |
"Average output tokens": 195 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 93.1, | |
"Pass rate": 0.9977, | |
"Cost($)": 4.2166, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1247912, | |
"Total input tokens": 1101672, | |
"Average input tokens": 835, | |
"Total output tokens": 146240, | |
"Average output tokens": 111 | |
}, | |
"AQuA": { | |
"Score": 75.2, | |
"Pass rate": 1.0, | |
"Cost($)": 1.6087, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 327908, | |
"Total input tokens": 222717, | |
"Average input tokens": 877, | |
"Total output tokens": 105191, | |
"Average output tokens": 414 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 92.34, | |
"Pass rate": 0.9939, | |
"Cost($)": 0.7054, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1251210, | |
"Total input tokens": 1106682, | |
"Average input tokens": 839, | |
"Total output tokens": 144528, | |
"Average output tokens": 110 | |
}, | |
"AQuA": { | |
"Score": 75.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.1645, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 291764, | |
"Total input tokens": 249215, | |
"Average input tokens": 981, | |
"Total output tokens": 42549, | |
"Average output tokens": 168 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 73.09, | |
"Pass rate": 0.7961, | |
"Cost($)": 0.9736, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1727044, | |
"Total input tokens": 1126025, | |
"Average input tokens": 854, | |
"Total output tokens": 601019, | |
"Average output tokens": 456 | |
}, | |
"AQuA": { | |
"Score": 79.53, | |
"Pass rate": 0.9921, | |
"Cost($)": 0.1746, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 309799, | |
"Total input tokens": 240735, | |
"Average input tokens": 948, | |
"Total output tokens": 69064, | |
"Average output tokens": 272 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 58.83, | |
"Pass rate": 0.7051, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1362822, | |
"Total input tokens": 1145390, | |
"Average input tokens": 868, | |
"Total output tokens": 217432, | |
"Average output tokens": 165 | |
}, | |
"AQuA": { | |
"Score": 68.11, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 313728, | |
"Total input tokens": 264517, | |
"Average input tokens": 1041, | |
"Total output tokens": 49211, | |
"Average output tokens": 194 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 38.67, | |
"Pass rate": 0.5542, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1391111, | |
"Total input tokens": 1147538, | |
"Average input tokens": 870, | |
"Total output tokens": 243573, | |
"Average output tokens": 185 | |
}, | |
"AQuA": { | |
"Score": 36.61, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 290914, | |
"Total input tokens": 240613, | |
"Average input tokens": 947, | |
"Total output tokens": 50301, | |
"Average output tokens": 198 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 38.21, | |
"Pass rate": 0.489, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1324949, | |
"Total input tokens": 1136843, | |
"Average input tokens": 862, | |
"Total output tokens": 188106, | |
"Average output tokens": 143 | |
}, | |
"AQuA": { | |
"Score": 36.61, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 301962, | |
"Total input tokens": 233505, | |
"Average input tokens": 919, | |
"Total output tokens": 68457, | |
"Average output tokens": 270 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 18.5, | |
"Pass rate": 0.3101, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1327522, | |
"Total input tokens": 1151528, | |
"Average input tokens": 873, | |
"Total output tokens": 175994, | |
"Average output tokens": 133 | |
}, | |
"AQuA": { | |
"Score": 30.71, | |
"Pass rate": 0.9646, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 298475, | |
"Total input tokens": 246560, | |
"Average input tokens": 971, | |
"Total output tokens": 51915, | |
"Average output tokens": 204 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 9.62, | |
"Pass rate": 0.1691, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1389135, | |
"Total input tokens": 1151528, | |
"Average input tokens": 873, | |
"Total output tokens": 237607, | |
"Average output tokens": 180 | |
}, | |
"AQuA": { | |
"Score": 17.32, | |
"Pass rate": 0.9213, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 322281, | |
"Total input tokens": 258867, | |
"Average input tokens": 1019, | |
"Total output tokens": 63414, | |
"Average output tokens": 250 | |
} | |
} | |
}, | |
"CoT": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 78.7, | |
"Pass rate": 1.0, | |
"Cost($)": 0.6788, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1088041, | |
"Total input tokens": 953242, | |
"Average input tokens": 723, | |
"Total output tokens": 134799, | |
"Average output tokens": 102 | |
}, | |
"AQuA": { | |
"Score": 61.02, | |
"Pass rate": 0.937, | |
"Cost($)": 0.0957, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 80793, | |
"Total input tokens": 25447, | |
"Average input tokens": 100, | |
"Total output tokens": 55346, | |
"Average output tokens": 218 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 89.31, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0558, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1201820, | |
"Total input tokens": 1042095, | |
"Average input tokens": 790, | |
"Total output tokens": 159725, | |
"Average output tokens": 121 | |
}, | |
"AQuA": { | |
"Score": 82.68, | |
"Pass rate": 0.9724, | |
"Cost($)": 0.0066, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 94577, | |
"Total input tokens": 27978, | |
"Average input tokens": 110, | |
"Total output tokens": 66599, | |
"Average output tokens": 262 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 94.09, | |
"Pass rate": 1.0, | |
"Cost($)": 4.5367, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1165166, | |
"Total input tokens": 948668, | |
"Average input tokens": 719, | |
"Total output tokens": 216498, | |
"Average output tokens": 164 | |
}, | |
"AQuA": { | |
"Score": 82.68, | |
"Pass rate": 0.9803, | |
"Cost($)": 1.0417, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 123017, | |
"Total input tokens": 25123, | |
"Average input tokens": 99, | |
"Total output tokens": 97894, | |
"Average output tokens": 385 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 92.87, | |
"Pass rate": 1.0, | |
"Cost($)": 0.7195, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1276252, | |
"Total input tokens": 1005119, | |
"Average input tokens": 762, | |
"Total output tokens": 271133, | |
"Average output tokens": 206 | |
}, | |
"AQuA": { | |
"Score": 86.22, | |
"Pass rate": 0.9921, | |
"Cost($)": 0.0808, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 143289, | |
"Total input tokens": 25143, | |
"Average input tokens": 99, | |
"Total output tokens": 118146, | |
"Average output tokens": 465 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 93.93, | |
"Pass rate": 1.0, | |
"Cost($)": 0.687, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1218665, | |
"Total input tokens": 990168, | |
"Average input tokens": 751, | |
"Total output tokens": 228497, | |
"Average output tokens": 173 | |
}, | |
"AQuA": { | |
"Score": 83.46, | |
"Pass rate": 0.9843, | |
"Cost($)": 0.0927, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 164389, | |
"Total input tokens": 32555, | |
"Average input tokens": 128, | |
"Total output tokens": 131834, | |
"Average output tokens": 519 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 85.67, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1290805, | |
"Total input tokens": 1046008, | |
"Average input tokens": 793, | |
"Total output tokens": 244797, | |
"Average output tokens": 186 | |
}, | |
"AQuA": { | |
"Score": 80.71, | |
"Pass rate": 0.9961, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 149736, | |
"Total input tokens": 33017, | |
"Average input tokens": 130, | |
"Total output tokens": 116719, | |
"Average output tokens": 460 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 75.44, | |
"Pass rate": 0.9992, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1248329, | |
"Total input tokens": 990168, | |
"Average input tokens": 751, | |
"Total output tokens": 258161, | |
"Average output tokens": 196 | |
}, | |
"AQuA": { | |
"Score": 60.63, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 144435, | |
"Total input tokens": 32555, | |
"Average input tokens": 128, | |
"Total output tokens": 111880, | |
"Average output tokens": 440 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 77.71, | |
"Pass rate": 0.997, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1202163, | |
"Total input tokens": 968163, | |
"Average input tokens": 734, | |
"Total output tokens": 234000, | |
"Average output tokens": 177 | |
}, | |
"AQuA": { | |
"Score": 52.76, | |
"Pass rate": 0.8937, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 127520, | |
"Total input tokens": 26610, | |
"Average input tokens": 105, | |
"Total output tokens": 100910, | |
"Average output tokens": 397 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 55.5, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1218525, | |
"Total input tokens": 1032818, | |
"Average input tokens": 783, | |
"Total output tokens": 185707, | |
"Average output tokens": 141 | |
}, | |
"AQuA": { | |
"Score": 40.55, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 110040, | |
"Total input tokens": 30477, | |
"Average input tokens": 120, | |
"Total output tokens": 79563, | |
"Average output tokens": 313 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 35.94, | |
"Pass rate": 0.9992, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 1223459, | |
"Total input tokens": 1032818, | |
"Average input tokens": 783, | |
"Total output tokens": 190641, | |
"Average output tokens": 145 | |
}, | |
"AQuA": { | |
"Score": 33.07, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 117339, | |
"Total input tokens": 30477, | |
"Average input tokens": 120, | |
"Total output tokens": 86862, | |
"Average output tokens": 342 | |
} | |
} | |
}, | |
"SC-CoT": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 79.91, | |
"Pass rate": 0.9992, | |
"Cost($)": 3.3938, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 4089612, | |
"Total input tokens": 2740652, | |
"Average input tokens": 2078, | |
"Total output tokens": 1348960, | |
"Average output tokens": 1023 | |
}, | |
"AQuA": { | |
"Score": 66.14, | |
"Pass rate": 0.9921, | |
"Cost($)": 0.7888, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 847335, | |
"Total input tokens": 482192, | |
"Average input tokens": 1898, | |
"Total output tokens": 365143, | |
"Average output tokens": 1438 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 87.26, | |
"Pass rate": 0.9992, | |
"Cost($)": 0.2083, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 3888813, | |
"Total input tokens": 2691714, | |
"Average input tokens": 2041, | |
"Total output tokens": 1197099, | |
"Average output tokens": 908 | |
}, | |
"AQuA": { | |
"Score": 81.1, | |
"Pass rate": 0.9724, | |
"Cost($)": 0.0519, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 885986, | |
"Total input tokens": 503751, | |
"Average input tokens": 1983, | |
"Total output tokens": 382235, | |
"Average output tokens": 1505 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 90.3, | |
"Pass rate": 0.9992, | |
"Cost($)": 31.0542, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 5798173, | |
"Total input tokens": 3590336, | |
"Average input tokens": 2722, | |
"Total output tokens": 2207837, | |
"Average output tokens": 1674 | |
}, | |
"AQuA": { | |
"Score": 86.61, | |
"Pass rate": 0.9882, | |
"Cost($)": 8.1485, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 1373206, | |
"Total input tokens": 744478, | |
"Average input tokens": 2931, | |
"Total output tokens": 628728, | |
"Average output tokens": 2475 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 93.86, | |
"Pass rate": 1.0, | |
"Cost($)": 5.9858, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 10618008, | |
"Total input tokens": 8136223, | |
"Average input tokens": 6168, | |
"Total output tokens": 2481785, | |
"Average output tokens": 1882 | |
}, | |
"AQuA": { | |
"Score": 85.04, | |
"Pass rate": 0.9921, | |
"Cost($)": 1.0348, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 1835669, | |
"Total input tokens": 1051218, | |
"Average input tokens": 4139, | |
"Total output tokens": 784451, | |
"Average output tokens": 3088 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 95.07, | |
"Pass rate": 1.0, | |
"Cost($)": 6.2005, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 10998794, | |
"Total input tokens": 8413717, | |
"Average input tokens": 6379, | |
"Total output tokens": 2585077, | |
"Average output tokens": 1960 | |
}, | |
"AQuA": { | |
"Score": 82.28, | |
"Pass rate": 0.9921, | |
"Cost($)": 1.0756, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 1907924, | |
"Total input tokens": 1135251, | |
"Average input tokens": 4469, | |
"Total output tokens": 772673, | |
"Average output tokens": 3042 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 91.13, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 11140985, | |
"Total input tokens": 8586888, | |
"Average input tokens": 6510, | |
"Total output tokens": 2554097, | |
"Average output tokens": 1936 | |
}, | |
"AQuA": { | |
"Score": 79.92, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 1845332, | |
"Total input tokens": 1098280, | |
"Average input tokens": 4324, | |
"Total output tokens": 747052, | |
"Average output tokens": 2941 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 73.46, | |
"Pass rate": 0.9955, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 11778716, | |
"Total input tokens": 8630514, | |
"Average input tokens": 6543, | |
"Total output tokens": 3148202, | |
"Average output tokens": 2387 | |
}, | |
"AQuA": { | |
"Score": 59.45, | |
"Pass rate": 0.9724, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 1651333, | |
"Total input tokens": 971003, | |
"Average input tokens": 3823, | |
"Total output tokens": 680330, | |
"Average output tokens": 2678 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 48.22, | |
"Pass rate": 0.9841, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 14526431, | |
"Total input tokens": 10678792, | |
"Average input tokens": 8096, | |
"Total output tokens": 3847639, | |
"Average output tokens": 2917 | |
}, | |
"AQuA": { | |
"Score": 39.37, | |
"Pass rate": 0.9803, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 2296222, | |
"Total input tokens": 1420494, | |
"Average input tokens": 5592, | |
"Total output tokens": 875728, | |
"Average output tokens": 3448 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 11.75, | |
"Pass rate": 0.9189, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 12411942, | |
"Total input tokens": 9066115, | |
"Average input tokens": 6873, | |
"Total output tokens": 3345827, | |
"Average output tokens": 2537 | |
}, | |
"AQuA": { | |
"Score": 23.62, | |
"Pass rate": 0.9646, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 1775335, | |
"Total input tokens": 1034362, | |
"Average input tokens": 4072, | |
"Total output tokens": 740973, | |
"Average output tokens": 2917 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 1.67, | |
"Pass rate": 0.9469, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8.0", | |
"Samples": 1319, | |
"All tokens": 16465720, | |
"Total input tokens": 11019864, | |
"Average input tokens": 8355, | |
"Total output tokens": 5445856, | |
"Average output tokens": 4129 | |
}, | |
"AQuA": { | |
"Score": 22.83, | |
"Pass rate": 0.9724, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0.0", | |
"Samples": 254, | |
"All tokens": 2215091, | |
"Total input tokens": 1246929, | |
"Average input tokens": 4909, | |
"Total output tokens": 968162, | |
"Average output tokens": 3812 | |
} | |
} | |
} | |
} | |
} |