open-agent-leaderboard / src /detail_results.csv
liaojiajia
update sc-cot scores
be9cdf5
Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
1,SC-CoT,AQuA,gpt-4o,2025/1/22,86.61,0.9882,0.0,8.1485,,254,1373206,744478,2931,628728,2475
2,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.04,0.9921,0.0,1.0348,,254,1835669,1051218,4139,784451,3088
4,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0.0,0.0742,,254,131604,25397,100,106207,418
5,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0.0,0.0927,,254,164389,32555,128,131834,519
6,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0.0,0.0798,,254,141567,32809,129,108758,428
7,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0.0,0.0066,,254,94577,27978,110,66599,262
8,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0.0,1.0417,,254,123017,25123,99,97894,385
9,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.28,0.9921,0.0,1.0756,,254,1907924,1135251,4469,772673,3042
10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.1,0.9724,0.0,0.0519,,254,885986,503751,1983,382235,1505
11,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0.0,0.0,,254,149736,33017,130,116719,460
12,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.92,1.0,0.0,0.0,,254,1845332,1098280,4324,747052,2941
13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0.0,0.768,,254,1362379,1119143,4406,243236,958
16,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0.0,0.0,,254,137771,33271,131,104500,411
17,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0.0,0.0445,,254,1032841,977890,3850,54951,216
18,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0.0,1.1453,,254,133752,25631,101,108121,426
19,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0.0,1.6087,,254,327908,222717,877,105191,414
20,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0.0,0.1645,,254,291764,249215,981,42549,168
21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
24,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
25,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,66.14,0.9921,0.0,0.7888,,254,847335,482192,1898,365143,1438
26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0.0,0.0,,254,144435,32555,128,111880,440
29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
30,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9724,0.0,0.0,,254,1651333,971003,3823,680330,2678
31,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
32,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0.0,0.0,,254,133106,26459,104,106647,420
35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0.0,0.0,,254,110040,30477,120,79563,313
38,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,39.37,0.9803,0.0,0.0,,254,2296222,1420494,5592,875728,3448
39,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0.0,0.038,,254,42471,25701,101,16770,66
40,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
41,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0.0,0.0,,254,301962,233505,919,68457,270
42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0.0,0.0,,254,117339,30477,120,86862,342
43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0.0,0.0,,254,298475,246560,971,51915,204
44,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0.0,0.0,,254,71047,27937,110,43110,170
45,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0.0,0.0,,254,110415,27937,110,82478,325
46,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
47,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0.0,0.0,,254,7170087,6344167,24977,825920,3252
48,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,23.62,0.9646,0.0,0.0,,254,1775335,1034362,4072,740973,2917
49,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,22.83,0.9724,0.0,0.0,,254,2215091,1246929,4909,968162,3812
50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0.0,0.0,,254,322281,258867,1019,63414,250
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.07,1.0,8.0,6.2005,,1319,10998794,8413717,6379,2585077,1960
2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
4,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,93.86,1.0,8.0,5.9858,,1319,10618008,8136223,6168,2481785,1882
5,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
6,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
7,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
8,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8.0,0.4709,,1319,835275,583916,443,251359,191
9,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,91.13,1.0,8.0,0.0,,1319,11140985,8586888,6510,2554097,1936
10,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.3,0.9992,8.0,31.0542,,1319,5798173,3590336,2722,2207837,1674
11,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
12,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8.0,10.1124,,1319,17937864,17038928,12918,898936,682
14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
15,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,87.26,0.9992,8.0,0.2083,,1319,3888813,2691714,2041,1197099,908
16,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
17,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
18,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8.0,0.2512,,1319,5998639,5862016,4444,136623,104
19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,79.91,0.9992,8.0,3.3938,,1319,4089612,2740652,2078,1348960,1023
21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8.0,0.0,,1319,1202163,968163,734,234000,177
24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
26,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8.0,3.4633,,1319,6646286,6506164,4933,140122,106
27,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,73.46,0.9955,8.0,0.0,,1319,11778716,8630514,6543,3148202,2387
28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8.0,0.9736,,1319,1727044,1126025,854,601019,456
29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
31,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.31,0.9955,8.0,39.0751,,1319,14715887,14411173,10926,304714,231
32,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.7051,8.0,0.0,,1319,1362822,1145390,868,217432,165
33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8.0,0.0,,1319,1745429,550941,418,1194488,906
35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,48.22,0.9841,8.0,0.0,,1319,14526431,10678792,8096,3847639,2917
37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
40,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.94,0.9992,8.0,0.0,,1319,1223459,1032818,783,190641,145
41,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,8.0,0.0,,1319,35669989,30120070,22836,5549919,4208
42,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.87,0.8021,8.0,0.0,,1319,9828001,9133603,6925,694398,526
43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8.0,0.0,,1319,1327522,1151528,873,175994,133
44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
46,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,11.75,0.9189,8.0,0.0,,1319,12411942,9066115,6873,3345827,2537
47,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8.0,0.0,,1319,1113728,679302,515,434426,329
48,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.1691,8.0,0.0,,1319,1389135,1151528,873,237607,180
49,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8.0,0.0,,1319,55392611,52431343,39751,2961268,2245
50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,1.67,0.9469,8.0,0.0,,1319,16465720,11019864,8355,5445856,4129