{ "time": "2025-01-24 15:10:27", "results": { "IO": { "gpt-3.5-turbo": { "META": { "Algorithm": "IO", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 37.83, "Pass rate": 0.9992, "Cost($)": 0.3328, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 586553, "Total input tokens": 546990, "Average input tokens": 415, "Total output tokens": 39563, "Average output tokens": 30 }, "AQuA": { "Score": 38.98, "Pass rate": 1.0, "Cost($)": 0.038, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 42471, "Total input tokens": 25701, "Average input tokens": 101, "Total output tokens": 16770, "Average output tokens": 66 } }, "Doubao-lite-32k": { "META": { "Algorithm": "IO", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 72.02, "Pass rate": 0.9992, "Cost($)": 0.0354, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 740483, "Total input tokens": 617377, "Average input tokens": 468, "Total output tokens": 123106, "Average output tokens": 93 }, "AQuA": { "Score": 79.13, "Pass rate": 1.0, "Cost($)": 0.0058, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 87742, "Total input tokens": 33058, "Average input tokens": 130, "Total output tokens": 54684, "Average output tokens": 215 } }, "gpt-4o": { "META": { "Algorithm": "IO", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 88.4, "Pass rate": 1.0, "Cost($)": 3.3463, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 741446, "Total input tokens": 542416, "Average input tokens": 411, "Total output tokens": 199030, "Average output tokens": 151 }, "AQuA": { "Score": 75.59, "Pass rate": 0.9724, "Cost($)": 1.1453, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 133752, "Total input tokens": 25631, "Average input tokens": 101, "Total output tokens": 108121, "Average output tokens": 426 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 86.58, "Pass rate": 1.0, "Cost($)": 0.4899, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 869060, "Total input tokens": 555340, "Average input tokens": 421, "Total output tokens": 313720, "Average output tokens": 238 }, "AQuA": { "Score": 84.25, "Pass rate": 0.9961, "Cost($)": 0.0742, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 131604, "Total input tokens": 25397, "Average input tokens": 100, "Total output tokens": 106207, "Average output tokens": 418 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 92.27, "Pass rate": 1.0, "Cost($)": 0.4709, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 835275, "Total input tokens": 583916, "Average input tokens": 443, "Total output tokens": 251359, "Average output tokens": 191 }, "AQuA": { "Score": 82.68, "Pass rate": 0.9921, "Cost($)": 0.0798, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 141567, "Total input tokens": 32809, "Average input tokens": 129, "Total output tokens": 108758, "Average output tokens": 428 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 57.24, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 887913, "Total input tokens": 596229, "Average input tokens": 452, "Total output tokens": 291684, "Average output tokens": 221 }, "AQuA": { "Score": 78.74, "Pass rate": 0.9843, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 137771, "Total input tokens": 33271, "Average input tokens": 131, "Total output tokens": 104500, "Average output tokens": 411 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 57.16, "Pass rate": 0.9955, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1745429, "Total input tokens": 550941, "Average input tokens": 418, "Total output tokens": 1194488, "Average output tokens": 906 }, "AQuA": { "Score": 51.18, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 133106, "Total input tokens": 26459, "Average input tokens": 104, "Total output tokens": 106647, "Average output tokens": 420 } }, "Internllm2_5-7B": { "META": { "Algorithm": "IO", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 11.6, "Pass rate": 0.9795, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1113728, "Total input tokens": 679302, "Average input tokens": 515, "Total output tokens": 434426, "Average output tokens": 329 }, "AQuA": { "Score": 47.64, "Pass rate": 0.9094, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 185041, "Total input tokens": 50232, "Average input tokens": 198, "Total output tokens": 134809, "Average output tokens": 531 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 16.68, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 736996, "Total input tokens": 568530, "Average input tokens": 431, "Total output tokens": 168466, "Average output tokens": 128 }, "AQuA": { "Score": 29.13, "Pass rate": 0.9764, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 71047, "Total input tokens": 27937, "Average input tokens": 110, "Total output tokens": 43110, "Average output tokens": 170 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 14.71, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 834897, "Total input tokens": 568116, "Average input tokens": 431, "Total output tokens": 266781, "Average output tokens": 202 }, "AQuA": { "Score": 27.17, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 110415, "Total input tokens": 27937, "Average input tokens": 110, "Total output tokens": 82478, "Average output tokens": 325 } } }, "ReAct-Pro*": { "gpt-3.5-turbo": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 74.91, "Pass rate": 0.9939, "Cost($)": 3.4633, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 6646286, "Total input tokens": 6506164, "Average input tokens": 4933, "Total output tokens": 140122, "Average output tokens": 106 }, "AQuA": { "Score": 64.57, "Pass rate": 0.9803, "Cost($)": 0.4928, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 903587, "Total input tokens": 862614, "Average input tokens": 3396, "Total output tokens": 40973, "Average output tokens": 161 } }, "Doubao-lite-32k": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 85.6, "Pass rate": 0.9962, "Cost($)": 0.2512, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 5998639, "Total input tokens": 5862016, "Average input tokens": 4444, "Total output tokens": 136623, "Average output tokens": 104 }, "AQuA": { "Score": 77.56, "Pass rate": 0.9606, "Cost($)": 0.0445, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 1032841, "Total input tokens": 977890, "Average input tokens": 3850, "Total output tokens": 54951, "Average output tokens": 216 } }, "gpt-4o": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 63.31, "Pass rate": 0.9955, "Cost($)": 39.0751, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 14715887, "Total input tokens": 14411173, "Average input tokens": 10926, "Total output tokens": 304714, "Average output tokens": 231 }, "AQuA": { "Score": 57.48, "Pass rate": 0.9724, "Cost($)": 2.304, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 692096, "Total input tokens": 615589, "Average input tokens": 2424, "Total output tokens": 76507, "Average output tokens": 301 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 87.26, "Pass rate": 1.0, "Cost($)": 10.5479, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 18710437, "Total input tokens": 18160983, "Average input tokens": 13769, "Total output tokens": 549454, "Average output tokens": 417 }, "AQuA": { "Score": 73.23, "Pass rate": 1.0, "Cost($)": 0.3177, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 563603, "Total input tokens": 441765, "Average input tokens": 1739, "Total output tokens": 121838, "Average output tokens": 480 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 87.64, "Pass rate": 0.9992, "Cost($)": 10.1124, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 17937864, "Total input tokens": 17038928, "Average input tokens": 12918, "Total output tokens": 898936, "Average output tokens": 682 }, "AQuA": { "Score": 79.13, "Pass rate": 0.9961, "Cost($)": 0.768, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 1362379, "Total input tokens": 1119143, "Average input tokens": 4406, "Total output tokens": 243236, "Average output tokens": 958 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 82.87, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 14850914, "Total input tokens": 14355752, "Average input tokens": 10884, "Total output tokens": 495162, "Average output tokens": 375 }, "AQuA": { "Score": 74.41, "Pass rate": 0.9921, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 695844, "Total input tokens": 564165, "Average input tokens": 2221, "Total output tokens": 131679, "Average output tokens": 518 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 67.78, "Pass rate": 0.9856, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 22835767, "Total input tokens": 21044978, "Average input tokens": 15955, "Total output tokens": 1790789, "Average output tokens": 1358 }, "AQuA": { "Score": 55.51, "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 4340821, "Total input tokens": 3764723, "Average input tokens": 14822, "Total output tokens": 576098, "Average output tokens": 2268 } }, "Internllm2_5-7B": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 33.51, "Pass rate": 0.9795, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 35669989, "Total input tokens": 30120070, "Average input tokens": 22836, "Total output tokens": 5549919, "Average output tokens": 4208 }, "AQuA": { "Score": 40.94, "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 4428801, "Total input tokens": 3592039, "Average input tokens": 14142, "Total output tokens": 836762, "Average output tokens": 3294 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 24.87, "Pass rate": 0.8021, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 9828001, "Total input tokens": 9133603, "Average input tokens": 6925, "Total output tokens": 694398, "Average output tokens": 526 }, "AQuA": { "Score": 25.59, "Pass rate": 0.9606, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 5072004, "Total input tokens": 4555858, "Average input tokens": 17936, "Total output tokens": 516146, "Average output tokens": 2032 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 7.66, "Pass rate": 0.9522, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 55392611, "Total input tokens": 52431343, "Average input tokens": 39751, "Total output tokens": 2961268, "Average output tokens": 2245 }, "AQuA": { "Score": 24.02, "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 7170087, "Total input tokens": 6344167, "Average input tokens": 24977, "Total output tokens": 825920, "Average output tokens": 3252 } } }, "PoT": { "gpt-3.5-turbo": { "META": { "Algorithm": "PoT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 76.88, "Pass rate": 0.9924, "Cost($)": 0.6902, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1187080, "Total input tokens": 1090418, "Average input tokens": 827, "Total output tokens": 96662, "Average output tokens": 73 }, "AQuA": { "Score": 59.45, "Pass rate": 1.0, "Cost($)": 0.1748, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 266654, "Total input tokens": 225162, "Average input tokens": 886, "Total output tokens": 41492, "Average output tokens": 163 } }, "Doubao-lite-32k": { "META": { "Algorithm": "PoT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 79.61, "Pass rate": 0.9257, "Cost($)": 0.0576, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1288055, "Total input tokens": 1170038, "Average input tokens": 887, "Total output tokens": 118017, "Average output tokens": 89 }, "AQuA": { "Score": 71.65, "Pass rate": 0.9685, "Cost($)": 0.0147, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 309436, "Total input tokens": 259863, "Average input tokens": 1023, "Total output tokens": 49573, "Average output tokens": 195 } }, "gpt-4o": { "META": { "Algorithm": "PoT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 93.1, "Pass rate": 0.9977, "Cost($)": 4.2166, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1247912, "Total input tokens": 1101672, "Average input tokens": 835, "Total output tokens": 146240, "Average output tokens": 111 }, "AQuA": { "Score": 75.2, "Pass rate": 1.0, "Cost($)": 1.6087, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 327908, "Total input tokens": 222717, "Average input tokens": 877, "Total output tokens": 105191, "Average output tokens": 414 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 92.34, "Pass rate": 0.9939, "Cost($)": 0.7054, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1251210, "Total input tokens": 1106682, "Average input tokens": 839, "Total output tokens": 144528, "Average output tokens": 110 }, "AQuA": { "Score": 75.2, "Pass rate": 1.0, "Cost($)": 0.1645, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 291764, "Total input tokens": 249215, "Average input tokens": 981, "Total output tokens": 42549, "Average output tokens": 168 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 73.09, "Pass rate": 0.7961, "Cost($)": 0.9736, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1727044, "Total input tokens": 1126025, "Average input tokens": 854, "Total output tokens": 601019, "Average output tokens": 456 }, "AQuA": { "Score": 79.53, "Pass rate": 0.9921, "Cost($)": 0.1746, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 309799, "Total input tokens": 240735, "Average input tokens": 948, "Total output tokens": 69064, "Average output tokens": 272 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 58.83, "Pass rate": 0.7051, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1362822, "Total input tokens": 1145390, "Average input tokens": 868, "Total output tokens": 217432, "Average output tokens": 165 }, "AQuA": { "Score": 68.11, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 313728, "Total input tokens": 264517, "Average input tokens": 1041, "Total output tokens": 49211, "Average output tokens": 194 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 38.67, "Pass rate": 0.5542, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1391111, "Total input tokens": 1147538, "Average input tokens": 870, "Total output tokens": 243573, "Average output tokens": 185 }, "AQuA": { "Score": 36.61, "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 290914, "Total input tokens": 240613, "Average input tokens": 947, "Total output tokens": 50301, "Average output tokens": 198 } }, "Internllm2_5-7B": { "META": { "Algorithm": "PoT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 38.21, "Pass rate": 0.489, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1324949, "Total input tokens": 1136843, "Average input tokens": 862, "Total output tokens": 188106, "Average output tokens": 143 }, "AQuA": { "Score": 36.61, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 301962, "Total input tokens": 233505, "Average input tokens": 919, "Total output tokens": 68457, "Average output tokens": 270 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 18.5, "Pass rate": 0.3101, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1327522, "Total input tokens": 1151528, "Average input tokens": 873, "Total output tokens": 175994, "Average output tokens": 133 }, "AQuA": { "Score": 30.71, "Pass rate": 0.9646, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 298475, "Total input tokens": 246560, "Average input tokens": 971, "Total output tokens": 51915, "Average output tokens": 204 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 9.62, "Pass rate": 0.1691, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1389135, "Total input tokens": 1151528, "Average input tokens": 873, "Total output tokens": 237607, "Average output tokens": 180 }, "AQuA": { "Score": 17.32, "Pass rate": 0.9213, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 322281, "Total input tokens": 258867, "Average input tokens": 1019, "Total output tokens": 63414, "Average output tokens": 250 } } }, "CoT": { "gpt-3.5-turbo": { "META": { "Algorithm": "CoT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 78.7, "Pass rate": 1.0, "Cost($)": 0.6788, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1088041, "Total input tokens": 953242, "Average input tokens": 723, "Total output tokens": 134799, "Average output tokens": 102 }, "AQuA": { "Score": 61.02, "Pass rate": 0.937, "Cost($)": 0.0957, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 80793, "Total input tokens": 25447, "Average input tokens": 100, "Total output tokens": 55346, "Average output tokens": 218 } }, "Doubao-lite-32k": { "META": { "Algorithm": "CoT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 89.31, "Pass rate": 1.0, "Cost($)": 0.0558, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1201820, "Total input tokens": 1042095, "Average input tokens": 790, "Total output tokens": 159725, "Average output tokens": 121 }, "AQuA": { "Score": 82.68, "Pass rate": 0.9724, "Cost($)": 0.0066, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 94577, "Total input tokens": 27978, "Average input tokens": 110, "Total output tokens": 66599, "Average output tokens": 262 } }, "gpt-4o": { "META": { "Algorithm": "CoT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 94.09, "Pass rate": 1.0, "Cost($)": 4.5367, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1165166, "Total input tokens": 948668, "Average input tokens": 719, "Total output tokens": 216498, "Average output tokens": 164 }, "AQuA": { "Score": 82.68, "Pass rate": 0.9803, "Cost($)": 1.0417, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 123017, "Total input tokens": 25123, "Average input tokens": 99, "Total output tokens": 97894, "Average output tokens": 385 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 92.87, "Pass rate": 1.0, "Cost($)": 0.7195, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1276252, "Total input tokens": 1005119, "Average input tokens": 762, "Total output tokens": 271133, "Average output tokens": 206 }, "AQuA": { "Score": 86.22, "Pass rate": 0.9921, "Cost($)": 0.0808, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 143289, "Total input tokens": 25143, "Average input tokens": 99, "Total output tokens": 118146, "Average output tokens": 465 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 93.93, "Pass rate": 1.0, "Cost($)": 0.687, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1218665, "Total input tokens": 990168, "Average input tokens": 751, "Total output tokens": 228497, "Average output tokens": 173 }, "AQuA": { "Score": 83.46, "Pass rate": 0.9843, "Cost($)": 0.0927, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 164389, "Total input tokens": 32555, "Average input tokens": 128, "Total output tokens": 131834, "Average output tokens": 519 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 85.67, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1290805, "Total input tokens": 1046008, "Average input tokens": 793, "Total output tokens": 244797, "Average output tokens": 186 }, "AQuA": { "Score": 80.71, "Pass rate": 0.9961, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 149736, "Total input tokens": 33017, "Average input tokens": 130, "Total output tokens": 116719, "Average output tokens": 460 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 75.44, "Pass rate": 0.9992, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1248329, "Total input tokens": 990168, "Average input tokens": 751, "Total output tokens": 258161, "Average output tokens": 196 }, "AQuA": { "Score": 60.63, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 144435, "Total input tokens": 32555, "Average input tokens": 128, "Total output tokens": 111880, "Average output tokens": 440 } }, "Internllm2_5-7B": { "META": { "Algorithm": "CoT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 77.71, "Pass rate": 0.997, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1202163, "Total input tokens": 968163, "Average input tokens": 734, "Total output tokens": 234000, "Average output tokens": 177 }, "AQuA": { "Score": 52.76, "Pass rate": 0.8937, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 127520, "Total input tokens": 26610, "Average input tokens": 105, "Total output tokens": 100910, "Average output tokens": 397 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 55.5, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1218525, "Total input tokens": 1032818, "Average input tokens": 783, "Total output tokens": 185707, "Average output tokens": 141 }, "AQuA": { "Score": 40.55, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 110040, "Total input tokens": 30477, "Average input tokens": 120, "Total output tokens": 79563, "Average output tokens": 313 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 35.94, "Pass rate": 0.9992, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 1223459, "Total input tokens": 1032818, "Average input tokens": 783, "Total output tokens": 190641, "Average output tokens": 145 }, "AQuA": { "Score": 33.07, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 117339, "Total input tokens": 30477, "Average input tokens": 120, "Total output tokens": 86862, "Average output tokens": 342 } } }, "SC-CoT": { "gpt-3.5-turbo": { "META": { "Algorithm": "SC-CoT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 79.91, "Pass rate": 0.9992, "Cost($)": 3.3938, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 4089612, "Total input tokens": 2740652, "Average input tokens": 2078, "Total output tokens": 1348960, "Average output tokens": 1023 }, "AQuA": { "Score": 66.14, "Pass rate": 0.9921, "Cost($)": 0.7888, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 847335, "Total input tokens": 482192, "Average input tokens": 1898, "Total output tokens": 365143, "Average output tokens": 1438 } }, "Doubao-lite-32k": { "META": { "Algorithm": "SC-CoT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 87.26, "Pass rate": 0.9992, "Cost($)": 0.2083, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 3888813, "Total input tokens": 2691714, "Average input tokens": 2041, "Total output tokens": 1197099, "Average output tokens": 908 }, "AQuA": { "Score": 81.1, "Pass rate": 0.9724, "Cost($)": 0.0519, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 885986, "Total input tokens": 503751, "Average input tokens": 1983, "Total output tokens": 382235, "Average output tokens": 1505 } }, "gpt-4o": { "META": { "Algorithm": "SC-CoT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 90.3, "Pass rate": 0.9992, "Cost($)": 31.0542, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 5798173, "Total input tokens": 3590336, "Average input tokens": 2722, "Total output tokens": 2207837, "Average output tokens": 1674 }, "AQuA": { "Score": 86.61, "Pass rate": 0.9882, "Cost($)": 8.1485, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 1373206, "Total input tokens": 744478, "Average input tokens": 2931, "Total output tokens": 628728, "Average output tokens": 2475 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 93.86, "Pass rate": 1.0, "Cost($)": 5.9858, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 10618008, "Total input tokens": 8136223, "Average input tokens": 6168, "Total output tokens": 2481785, "Average output tokens": 1882 }, "AQuA": { "Score": 85.04, "Pass rate": 0.9921, "Cost($)": 1.0348, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 1835669, "Total input tokens": 1051218, "Average input tokens": 4139, "Total output tokens": 784451, "Average output tokens": 3088 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 95.07, "Pass rate": 1.0, "Cost($)": 6.2005, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 10998794, "Total input tokens": 8413717, "Average input tokens": 6379, "Total output tokens": 2585077, "Average output tokens": 1960 }, "AQuA": { "Score": 82.28, "Pass rate": 0.9921, "Cost($)": 1.0756, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 1907924, "Total input tokens": 1135251, "Average input tokens": 4469, "Total output tokens": 772673, "Average output tokens": 3042 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 91.13, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 11140985, "Total input tokens": 8586888, "Average input tokens": 6510, "Total output tokens": 2554097, "Average output tokens": 1936 }, "AQuA": { "Score": 79.92, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 1845332, "Total input tokens": 1098280, "Average input tokens": 4324, "Total output tokens": 747052, "Average output tokens": 2941 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 73.46, "Pass rate": 0.9955, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 11778716, "Total input tokens": 8630514, "Average input tokens": 6543, "Total output tokens": 3148202, "Average output tokens": 2387 }, "AQuA": { "Score": 59.45, "Pass rate": 0.9724, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 1651333, "Total input tokens": 971003, "Average input tokens": 3823, "Total output tokens": 680330, "Average output tokens": 2678 } }, "Internllm2_5-7B": { "META": { "Algorithm": "SC-CoT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 48.22, "Pass rate": 0.9841, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 14526431, "Total input tokens": 10678792, "Average input tokens": 8096, "Total output tokens": 3847639, "Average output tokens": 2917 }, "AQuA": { "Score": 39.37, "Pass rate": 0.9803, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 2296222, "Total input tokens": 1420494, "Average input tokens": 5592, "Total output tokens": 875728, "Average output tokens": 3448 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 11.75, "Pass rate": 0.9189, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 12411942, "Total input tokens": 9066115, "Average input tokens": 6873, "Total output tokens": 3345827, "Average output tokens": 2537 }, "AQuA": { "Score": 23.62, "Pass rate": 0.9646, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 1775335, "Total input tokens": 1034362, "Average input tokens": 4072, "Total output tokens": 740973, "Average output tokens": 2917 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 1.67, "Pass rate": 0.9469, "Cost($)": 0.0, "Framework": "", "X-shot": "8.0", "Samples": 1319, "All tokens": 16465720, "Total input tokens": 11019864, "Average input tokens": 8355, "Total output tokens": 5445856, "Average output tokens": 4129 }, "AQuA": { "Score": 22.83, "Pass rate": 0.9724, "Cost($)": 0.0, "Framework": "", "X-shot": "0.0", "Samples": 254, "All tokens": 2215091, "Total input tokens": 1246929, "Average input tokens": 4909, "Total output tokens": 968162, "Average output tokens": 3812 } } } } }