liaojiajia commited on
Commit
52f14c3
ยท
1 Parent(s): c9a97c2

update score

Browse files
src/detail_math_score.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "time": "2025-01-23 09:27:24",
3
  "results": {
4
  "IO": {
5
  "gpt-3.5-turbo": {
@@ -22,7 +22,7 @@
22
  "Average output tokens": 30
23
  },
24
  "AQuA": {
25
- "Score": 38.97,
26
  "Pass rate": 1.0,
27
  "Cost($)": 0.038,
28
  "Framework": "",
@@ -122,7 +122,7 @@
122
  },
123
  "AQuA": {
124
  "Score": 84.25,
125
- "Pass rate": 0.996,
126
  "Cost($)": 0.0742,
127
  "Framework": "",
128
  "X-shot": "0.0",
@@ -141,7 +141,7 @@
141
  "Eval Date": "2025/1/22"
142
  },
143
  "gsm8k": {
144
- "Score": 92.26,
145
  "Pass rate": 1.0,
146
  "Cost($)": 0.4709,
147
  "Framework": "",
@@ -154,7 +154,7 @@
154
  "Average output tokens": 191
155
  },
156
  "AQuA": {
157
- "Score": 82.67,
158
  "Pass rate": 0.9921,
159
  "Cost($)": 0.0798,
160
  "Framework": "",
@@ -188,7 +188,7 @@
188
  },
189
  "AQuA": {
190
  "Score": 78.74,
191
- "Pass rate": 0.9842,
192
  "Cost($)": 0.0,
193
  "Framework": "",
194
  "X-shot": "0.0",
@@ -208,7 +208,7 @@
208
  },
209
  "gsm8k": {
210
  "Score": 57.16,
211
- "Pass rate": 0.9954,
212
  "Cost($)": 0.0,
213
  "Framework": "",
214
  "X-shot": "8.0",
@@ -221,7 +221,7 @@
221
  },
222
  "AQuA": {
223
  "Score": 51.18,
224
- "Pass rate": 0.9881,
225
  "Cost($)": 0.0,
226
  "Framework": "",
227
  "X-shot": "0.0",
@@ -240,7 +240,7 @@
240
  "Eval Date": "2025/1/22"
241
  },
242
  "gsm8k": {
243
- "Score": 11.59,
244
  "Pass rate": 0.9795,
245
  "Cost($)": 0.0,
246
  "Framework": "",
@@ -253,7 +253,7 @@
253
  "Average output tokens": 329
254
  },
255
  "AQuA": {
256
- "Score": 47.63,
257
  "Pass rate": 0.9094,
258
  "Cost($)": 0.0,
259
  "Framework": "",
@@ -273,7 +273,7 @@
273
  "Eval Date": "2025/1/22"
274
  },
275
  "gsm8k": {
276
- "Score": 16.67,
277
  "Pass rate": 1.0,
278
  "Cost($)": 0.0,
279
  "Framework": "",
@@ -287,7 +287,7 @@
287
  },
288
  "AQuA": {
289
  "Score": 29.13,
290
- "Pass rate": 0.9763,
291
  "Cost($)": 0.0,
292
  "Framework": "",
293
  "X-shot": "0.0",
@@ -306,7 +306,7 @@
306
  "Eval Date": "2025/1/22"
307
  },
308
  "gsm8k": {
309
- "Score": 14.7,
310
  "Pass rate": 1.0,
311
  "Cost($)": 0.0,
312
  "Framework": "",
@@ -319,8 +319,8 @@
319
  "Average output tokens": 202
320
  },
321
  "AQuA": {
322
- "Score": 27.16,
323
- "Pass rate": 0.9881,
324
  "Cost($)": 0.0,
325
  "Framework": "",
326
  "X-shot": "0.0",
@@ -341,7 +341,7 @@
341
  "Eval Date": "2025/1/7"
342
  },
343
  "gsm8k": {
344
- "Score": 74.9,
345
  "Pass rate": 0.9939,
346
  "Cost($)": 3.4633,
347
  "Framework": "",
@@ -354,7 +354,7 @@
354
  "Average output tokens": 106
355
  },
356
  "AQuA": {
357
- "Score": 64.56,
358
  "Pass rate": 0.9803,
359
  "Cost($)": 0.4928,
360
  "Framework": "",
@@ -374,7 +374,7 @@
374
  "Eval Date": "2025/1/7"
375
  },
376
  "gsm8k": {
377
- "Score": 85.59,
378
  "Pass rate": 0.9962,
379
  "Cost($)": 0.2512,
380
  "Framework": "",
@@ -387,7 +387,7 @@
387
  "Average output tokens": 104
388
  },
389
  "AQuA": {
390
- "Score": 77.55,
391
  "Pass rate": 0.9606,
392
  "Cost($)": 0.0445,
393
  "Framework": "",
@@ -407,8 +407,8 @@
407
  "Eval Date": "2025/1/22"
408
  },
409
  "gsm8k": {
410
- "Score": 63.3,
411
- "Pass rate": 0.9954,
412
  "Cost($)": 39.0751,
413
  "Framework": "",
414
  "X-shot": "8.0",
@@ -453,7 +453,7 @@
453
  "Average output tokens": 417
454
  },
455
  "AQuA": {
456
- "Score": 73.22,
457
  "Pass rate": 1.0,
458
  "Cost($)": 0.3177,
459
  "Framework": "",
@@ -477,7 +477,7 @@
477
  "Pass rate": 0.9992,
478
  "Cost($)": 10.1124,
479
  "Framework": "",
480
- "X-shot": "",
481
  "Samples": 1319,
482
  "All tokens": 17937864,
483
  "Total input tokens": 17038928,
@@ -487,7 +487,7 @@
487
  },
488
  "AQuA": {
489
  "Score": 79.13,
490
- "Pass rate": 0.996,
491
  "Cost($)": 0.768,
492
  "Framework": "",
493
  "X-shot": "0.0",
@@ -506,7 +506,7 @@
506
  "Eval Date": "2025/1/22"
507
  },
508
  "gsm8k": {
509
- "Score": 82.86,
510
  "Pass rate": 1.0,
511
  "Cost($)": 0.0,
512
  "Framework": "",
@@ -519,7 +519,7 @@
519
  "Average output tokens": 375
520
  },
521
  "AQuA": {
522
- "Score": 74.4,
523
  "Pass rate": 0.9921,
524
  "Cost($)": 0.0,
525
  "Framework": "",
@@ -539,8 +539,8 @@
539
  "Eval Date": "2025/1/22"
540
  },
541
  "gsm8k": {
542
- "Score": 67.77,
543
- "Pass rate": 0.9855,
544
  "Cost($)": 0.0,
545
  "Framework": "",
546
  "X-shot": "8.0",
@@ -576,7 +576,7 @@
576
  "Pass rate": 0.9795,
577
  "Cost($)": 0.0,
578
  "Framework": "",
579
- "X-shot": "",
580
  "Samples": 1319,
581
  "All tokens": 35669989,
582
  "Total input tokens": 30120070,
@@ -605,7 +605,7 @@
605
  "Eval Date": "2025/1/22"
606
  },
607
  "gsm8k": {
608
- "Score": 24.86,
609
  "Pass rate": 0.8021,
610
  "Cost($)": 0.0,
611
  "Framework": "",
@@ -638,7 +638,7 @@
638
  "Eval Date": "2025/1/22"
639
  },
640
  "gsm8k": {
641
- "Score": 7.65,
642
  "Pass rate": 0.9522,
643
  "Cost($)": 0.0,
644
  "Framework": "",
@@ -651,7 +651,7 @@
651
  "Average output tokens": 2245
652
  },
653
  "AQuA": {
654
- "Score": 24.01,
655
  "Pass rate": 0.9685,
656
  "Cost($)": 0.0,
657
  "Framework": "",
@@ -673,7 +673,7 @@
673
  "Eval Date": "2025/1/7"
674
  },
675
  "gsm8k": {
676
- "Score": 76.87,
677
  "Pass rate": 0.9924,
678
  "Cost($)": 0.6902,
679
  "Framework": "",
@@ -686,7 +686,7 @@
686
  "Average output tokens": 73
687
  },
688
  "AQuA": {
689
- "Score": 59.44,
690
  "Pass rate": 1.0,
691
  "Cost($)": 0.1748,
692
  "Framework": "",
@@ -706,7 +706,7 @@
706
  "Eval Date": "2025/1/7"
707
  },
708
  "gsm8k": {
709
- "Score": 79.6,
710
  "Pass rate": 0.9257,
711
  "Cost($)": 0.0576,
712
  "Framework": "",
@@ -752,7 +752,7 @@
752
  "Average output tokens": 111
753
  },
754
  "AQuA": {
755
- "Score": 75.19,
756
  "Pass rate": 1.0,
757
  "Cost($)": 1.6087,
758
  "Framework": "",
@@ -785,7 +785,7 @@
785
  "Average output tokens": 110
786
  },
787
  "AQuA": {
788
- "Score": 75.19,
789
  "Pass rate": 1.0,
790
  "Cost($)": 0.1645,
791
  "Framework": "",
@@ -805,8 +805,8 @@
805
  "Eval Date": "2025/1/22"
806
  },
807
  "gsm8k": {
808
- "Score": 73.08,
809
- "Pass rate": 0.796,
810
  "Cost($)": 0.9736,
811
  "Framework": "",
812
  "X-shot": "8.0",
@@ -818,7 +818,7 @@
818
  "Average output tokens": 456
819
  },
820
  "AQuA": {
821
- "Score": 79.52,
822
  "Pass rate": 0.9921,
823
  "Cost($)": 0.1746,
824
  "Framework": "",
@@ -839,7 +839,7 @@
839
  },
840
  "gsm8k": {
841
  "Score": 58.83,
842
- "Pass rate": 0.705,
843
  "Cost($)": 0.0,
844
  "Framework": "",
845
  "X-shot": "8.0",
@@ -871,7 +871,7 @@
871
  "Eval Date": "2025/1/22"
872
  },
873
  "gsm8k": {
874
- "Score": 38.66,
875
  "Pass rate": 0.5542,
876
  "Cost($)": 0.0,
877
  "Framework": "",
@@ -918,7 +918,7 @@
918
  },
919
  "AQuA": {
920
  "Score": 36.61,
921
- "Pass rate": 0.9881,
922
  "Cost($)": 0.0,
923
  "Framework": "",
924
  "X-shot": "0.0",
@@ -937,8 +937,8 @@
937
  "Eval Date": "2025/1/22"
938
  },
939
  "gsm8k": {
940
- "Score": 18.49,
941
- "Pass rate": 0.31,
942
  "Cost($)": 0.0,
943
  "Framework": "",
944
  "X-shot": "8.0",
@@ -950,8 +950,8 @@
950
  "Average output tokens": 133
951
  },
952
  "AQuA": {
953
- "Score": 30.7,
954
- "Pass rate": 0.9645,
955
  "Cost($)": 0.0,
956
  "Framework": "",
957
  "X-shot": "0.0",
@@ -984,7 +984,7 @@
984
  },
985
  "AQuA": {
986
  "Score": 17.32,
987
- "Pass rate": 0.9212,
988
  "Cost($)": 0.0,
989
  "Framework": "",
990
  "X-shot": "0.0",
@@ -1005,7 +1005,7 @@
1005
  "Eval Date": "2025/1/7"
1006
  },
1007
  "gsm8k": {
1008
- "Score": 78.69,
1009
  "Pass rate": 1.0,
1010
  "Cost($)": 0.6788,
1011
  "Framework": "",
@@ -1051,7 +1051,7 @@
1051
  "Average output tokens": 121
1052
  },
1053
  "AQuA": {
1054
- "Score": 82.67,
1055
  "Pass rate": 0.9724,
1056
  "Cost($)": 0.0066,
1057
  "Framework": "",
@@ -1071,7 +1071,7 @@
1071
  "Eval Date": "2025/1/22"
1072
  },
1073
  "gsm8k": {
1074
- "Score": 94.08,
1075
  "Pass rate": 1.0,
1076
  "Cost($)": 4.5367,
1077
  "Framework": "",
@@ -1084,7 +1084,7 @@
1084
  "Average output tokens": 164
1085
  },
1086
  "AQuA": {
1087
- "Score": 82.67,
1088
  "Pass rate": 0.9803,
1089
  "Cost($)": 1.0417,
1090
  "Framework": "",
@@ -1151,7 +1151,7 @@
1151
  },
1152
  "AQuA": {
1153
  "Score": 83.46,
1154
- "Pass rate": 0.9842,
1155
  "Cost($)": 0.0927,
1156
  "Framework": "",
1157
  "X-shot": "0.0",
@@ -1183,8 +1183,8 @@
1183
  "Average output tokens": 186
1184
  },
1185
  "AQuA": {
1186
- "Score": 80.7,
1187
- "Pass rate": 0.996,
1188
  "Cost($)": 0.0,
1189
  "Framework": "",
1190
  "X-shot": "0.0",
@@ -1203,7 +1203,7 @@
1203
  "Eval Date": "2025/1/22"
1204
  },
1205
  "gsm8k": {
1206
- "Score": 75.43,
1207
  "Pass rate": 0.9992,
1208
  "Cost($)": 0.0,
1209
  "Framework": "",
@@ -1216,7 +1216,7 @@
1216
  "Average output tokens": 196
1217
  },
1218
  "AQuA": {
1219
- "Score": 60.62,
1220
  "Pass rate": 1.0,
1221
  "Cost($)": 0.0,
1222
  "Framework": "",
@@ -1237,7 +1237,7 @@
1237
  },
1238
  "gsm8k": {
1239
  "Score": 77.71,
1240
- "Pass rate": 0.9969,
1241
  "Cost($)": 0.0,
1242
  "Framework": "",
1243
  "X-shot": "8.0",
@@ -1249,7 +1249,7 @@
1249
  "Average output tokens": 177
1250
  },
1251
  "AQuA": {
1252
- "Score": 52.75,
1253
  "Pass rate": 0.8937,
1254
  "Cost($)": 0.0,
1255
  "Framework": "",
@@ -1269,7 +1269,7 @@
1269
  "Eval Date": "2025/1/22"
1270
  },
1271
  "gsm8k": {
1272
- "Score": 55.49,
1273
  "Pass rate": 1.0,
1274
  "Cost($)": 0.0,
1275
  "Framework": "",
@@ -1283,7 +1283,7 @@
1283
  },
1284
  "AQuA": {
1285
  "Score": 40.55,
1286
- "Pass rate": 0.9881,
1287
  "Cost($)": 0.0,
1288
  "Framework": "",
1289
  "X-shot": "0.0",
@@ -1302,7 +1302,7 @@
1302
  "Eval Date": "2025/1/22"
1303
  },
1304
  "gsm8k": {
1305
- "Score": 35.93,
1306
  "Pass rate": 0.9992,
1307
  "Cost($)": 0.0,
1308
  "Framework": "",
@@ -1316,7 +1316,7 @@
1316
  },
1317
  "AQuA": {
1318
  "Score": 33.07,
1319
- "Pass rate": 0.9881,
1320
  "Cost($)": 0.0,
1321
  "Framework": "",
1322
  "X-shot": "0.0",
@@ -1449,8 +1449,8 @@
1449
  "Average output tokens": 1723
1450
  },
1451
  "AQuA": {
1452
- "Score": 85.82,
1453
- "Pass rate": 0.9842,
1454
  "Cost($)": 0.5576,
1455
  "Framework": "",
1456
  "X-shot": "0.0",
@@ -1503,7 +1503,7 @@
1503
  },
1504
  "gsm8k": {
1505
  "Score": 88.32,
1506
- "Pass rate": 0.9984,
1507
  "Cost($)": 0.0,
1508
  "Framework": "",
1509
  "X-shot": "8.0",
@@ -1515,7 +1515,7 @@
1515
  "Average output tokens": 1900
1516
  },
1517
  "AQuA": {
1518
- "Score": 81.49,
1519
  "Pass rate": 1.0,
1520
  "Cost($)": 0.0,
1521
  "Framework": "",
@@ -1535,8 +1535,8 @@
1535
  "Eval Date": "2025/1/22"
1536
  },
1537
  "gsm8k": {
1538
- "Score": 75.2,
1539
- "Pass rate": 0.9954,
1540
  "Cost($)": 0.0,
1541
  "Framework": "",
1542
  "X-shot": "8.0",
@@ -1548,7 +1548,7 @@
1548
  "Average output tokens": 2358
1549
  },
1550
  "AQuA": {
1551
- "Score": 53.14,
1552
  "Pass rate": 0.9606,
1553
  "Cost($)": 0.0,
1554
  "Framework": "",
@@ -1569,7 +1569,7 @@
1569
  },
1570
  "gsm8k": {
1571
  "Score": 41.39,
1572
- "Pass rate": 0.9825,
1573
  "Cost($)": 0.0,
1574
  "Framework": "",
1575
  "X-shot": "8.0",
@@ -1647,8 +1647,8 @@
1647
  "Average output tokens": 3036
1648
  },
1649
  "AQuA": {
1650
- "Score": 30.7,
1651
- "Pass rate": 0.9842,
1652
  "Cost($)": 0.0,
1653
  "Framework": "",
1654
  "X-shot": "0.0",
 
1
  {
2
+ "time": "2025-01-23 11:23:17",
3
  "results": {
4
  "IO": {
5
  "gpt-3.5-turbo": {
 
22
  "Average output tokens": 30
23
  },
24
  "AQuA": {
25
+ "Score": 38.98,
26
  "Pass rate": 1.0,
27
  "Cost($)": 0.038,
28
  "Framework": "",
 
122
  },
123
  "AQuA": {
124
  "Score": 84.25,
125
+ "Pass rate": 0.9961,
126
  "Cost($)": 0.0742,
127
  "Framework": "",
128
  "X-shot": "0.0",
 
141
  "Eval Date": "2025/1/22"
142
  },
143
  "gsm8k": {
144
+ "Score": 92.27,
145
  "Pass rate": 1.0,
146
  "Cost($)": 0.4709,
147
  "Framework": "",
 
154
  "Average output tokens": 191
155
  },
156
  "AQuA": {
157
+ "Score": 82.68,
158
  "Pass rate": 0.9921,
159
  "Cost($)": 0.0798,
160
  "Framework": "",
 
188
  },
189
  "AQuA": {
190
  "Score": 78.74,
191
+ "Pass rate": 0.9843,
192
  "Cost($)": 0.0,
193
  "Framework": "",
194
  "X-shot": "0.0",
 
208
  },
209
  "gsm8k": {
210
  "Score": 57.16,
211
+ "Pass rate": 0.9955,
212
  "Cost($)": 0.0,
213
  "Framework": "",
214
  "X-shot": "8.0",
 
221
  },
222
  "AQuA": {
223
  "Score": 51.18,
224
+ "Pass rate": 0.9882,
225
  "Cost($)": 0.0,
226
  "Framework": "",
227
  "X-shot": "0.0",
 
240
  "Eval Date": "2025/1/22"
241
  },
242
  "gsm8k": {
243
+ "Score": 11.6,
244
  "Pass rate": 0.9795,
245
  "Cost($)": 0.0,
246
  "Framework": "",
 
253
  "Average output tokens": 329
254
  },
255
  "AQuA": {
256
+ "Score": 47.64,
257
  "Pass rate": 0.9094,
258
  "Cost($)": 0.0,
259
  "Framework": "",
 
273
  "Eval Date": "2025/1/22"
274
  },
275
  "gsm8k": {
276
+ "Score": 16.68,
277
  "Pass rate": 1.0,
278
  "Cost($)": 0.0,
279
  "Framework": "",
 
287
  },
288
  "AQuA": {
289
  "Score": 29.13,
290
+ "Pass rate": 0.9764,
291
  "Cost($)": 0.0,
292
  "Framework": "",
293
  "X-shot": "0.0",
 
306
  "Eval Date": "2025/1/22"
307
  },
308
  "gsm8k": {
309
+ "Score": 14.71,
310
  "Pass rate": 1.0,
311
  "Cost($)": 0.0,
312
  "Framework": "",
 
319
  "Average output tokens": 202
320
  },
321
  "AQuA": {
322
+ "Score": 27.17,
323
+ "Pass rate": 0.9882,
324
  "Cost($)": 0.0,
325
  "Framework": "",
326
  "X-shot": "0.0",
 
341
  "Eval Date": "2025/1/7"
342
  },
343
  "gsm8k": {
344
+ "Score": 74.91,
345
  "Pass rate": 0.9939,
346
  "Cost($)": 3.4633,
347
  "Framework": "",
 
354
  "Average output tokens": 106
355
  },
356
  "AQuA": {
357
+ "Score": 64.57,
358
  "Pass rate": 0.9803,
359
  "Cost($)": 0.4928,
360
  "Framework": "",
 
374
  "Eval Date": "2025/1/7"
375
  },
376
  "gsm8k": {
377
+ "Score": 85.6,
378
  "Pass rate": 0.9962,
379
  "Cost($)": 0.2512,
380
  "Framework": "",
 
387
  "Average output tokens": 104
388
  },
389
  "AQuA": {
390
+ "Score": 77.56,
391
  "Pass rate": 0.9606,
392
  "Cost($)": 0.0445,
393
  "Framework": "",
 
407
  "Eval Date": "2025/1/22"
408
  },
409
  "gsm8k": {
410
+ "Score": 63.31,
411
+ "Pass rate": 0.9955,
412
  "Cost($)": 39.0751,
413
  "Framework": "",
414
  "X-shot": "8.0",
 
453
  "Average output tokens": 417
454
  },
455
  "AQuA": {
456
+ "Score": 73.23,
457
  "Pass rate": 1.0,
458
  "Cost($)": 0.3177,
459
  "Framework": "",
 
477
  "Pass rate": 0.9992,
478
  "Cost($)": 10.1124,
479
  "Framework": "",
480
+ "X-shot": "8.0",
481
  "Samples": 1319,
482
  "All tokens": 17937864,
483
  "Total input tokens": 17038928,
 
487
  },
488
  "AQuA": {
489
  "Score": 79.13,
490
+ "Pass rate": 0.9961,
491
  "Cost($)": 0.768,
492
  "Framework": "",
493
  "X-shot": "0.0",
 
506
  "Eval Date": "2025/1/22"
507
  },
508
  "gsm8k": {
509
+ "Score": 82.87,
510
  "Pass rate": 1.0,
511
  "Cost($)": 0.0,
512
  "Framework": "",
 
519
  "Average output tokens": 375
520
  },
521
  "AQuA": {
522
+ "Score": 74.41,
523
  "Pass rate": 0.9921,
524
  "Cost($)": 0.0,
525
  "Framework": "",
 
539
  "Eval Date": "2025/1/22"
540
  },
541
  "gsm8k": {
542
+ "Score": 67.78,
543
+ "Pass rate": 0.9856,
544
  "Cost($)": 0.0,
545
  "Framework": "",
546
  "X-shot": "8.0",
 
576
  "Pass rate": 0.9795,
577
  "Cost($)": 0.0,
578
  "Framework": "",
579
+ "X-shot": "8.0",
580
  "Samples": 1319,
581
  "All tokens": 35669989,
582
  "Total input tokens": 30120070,
 
605
  "Eval Date": "2025/1/22"
606
  },
607
  "gsm8k": {
608
+ "Score": 24.87,
609
  "Pass rate": 0.8021,
610
  "Cost($)": 0.0,
611
  "Framework": "",
 
638
  "Eval Date": "2025/1/22"
639
  },
640
  "gsm8k": {
641
+ "Score": 7.66,
642
  "Pass rate": 0.9522,
643
  "Cost($)": 0.0,
644
  "Framework": "",
 
651
  "Average output tokens": 2245
652
  },
653
  "AQuA": {
654
+ "Score": 24.02,
655
  "Pass rate": 0.9685,
656
  "Cost($)": 0.0,
657
  "Framework": "",
 
673
  "Eval Date": "2025/1/7"
674
  },
675
  "gsm8k": {
676
+ "Score": 76.88,
677
  "Pass rate": 0.9924,
678
  "Cost($)": 0.6902,
679
  "Framework": "",
 
686
  "Average output tokens": 73
687
  },
688
  "AQuA": {
689
+ "Score": 59.45,
690
  "Pass rate": 1.0,
691
  "Cost($)": 0.1748,
692
  "Framework": "",
 
706
  "Eval Date": "2025/1/7"
707
  },
708
  "gsm8k": {
709
+ "Score": 79.61,
710
  "Pass rate": 0.9257,
711
  "Cost($)": 0.0576,
712
  "Framework": "",
 
752
  "Average output tokens": 111
753
  },
754
  "AQuA": {
755
+ "Score": 75.2,
756
  "Pass rate": 1.0,
757
  "Cost($)": 1.6087,
758
  "Framework": "",
 
785
  "Average output tokens": 110
786
  },
787
  "AQuA": {
788
+ "Score": 75.2,
789
  "Pass rate": 1.0,
790
  "Cost($)": 0.1645,
791
  "Framework": "",
 
805
  "Eval Date": "2025/1/22"
806
  },
807
  "gsm8k": {
808
+ "Score": 73.09,
809
+ "Pass rate": 0.7961,
810
  "Cost($)": 0.9736,
811
  "Framework": "",
812
  "X-shot": "8.0",
 
818
  "Average output tokens": 456
819
  },
820
  "AQuA": {
821
+ "Score": 79.53,
822
  "Pass rate": 0.9921,
823
  "Cost($)": 0.1746,
824
  "Framework": "",
 
839
  },
840
  "gsm8k": {
841
  "Score": 58.83,
842
+ "Pass rate": 0.7051,
843
  "Cost($)": 0.0,
844
  "Framework": "",
845
  "X-shot": "8.0",
 
871
  "Eval Date": "2025/1/22"
872
  },
873
  "gsm8k": {
874
+ "Score": 38.67,
875
  "Pass rate": 0.5542,
876
  "Cost($)": 0.0,
877
  "Framework": "",
 
918
  },
919
  "AQuA": {
920
  "Score": 36.61,
921
+ "Pass rate": 0.9882,
922
  "Cost($)": 0.0,
923
  "Framework": "",
924
  "X-shot": "0.0",
 
937
  "Eval Date": "2025/1/22"
938
  },
939
  "gsm8k": {
940
+ "Score": 18.5,
941
+ "Pass rate": 0.3101,
942
  "Cost($)": 0.0,
943
  "Framework": "",
944
  "X-shot": "8.0",
 
950
  "Average output tokens": 133
951
  },
952
  "AQuA": {
953
+ "Score": 30.71,
954
+ "Pass rate": 0.9646,
955
  "Cost($)": 0.0,
956
  "Framework": "",
957
  "X-shot": "0.0",
 
984
  },
985
  "AQuA": {
986
  "Score": 17.32,
987
+ "Pass rate": 0.9213,
988
  "Cost($)": 0.0,
989
  "Framework": "",
990
  "X-shot": "0.0",
 
1005
  "Eval Date": "2025/1/7"
1006
  },
1007
  "gsm8k": {
1008
+ "Score": 78.7,
1009
  "Pass rate": 1.0,
1010
  "Cost($)": 0.6788,
1011
  "Framework": "",
 
1051
  "Average output tokens": 121
1052
  },
1053
  "AQuA": {
1054
+ "Score": 82.68,
1055
  "Pass rate": 0.9724,
1056
  "Cost($)": 0.0066,
1057
  "Framework": "",
 
1071
  "Eval Date": "2025/1/22"
1072
  },
1073
  "gsm8k": {
1074
+ "Score": 94.09,
1075
  "Pass rate": 1.0,
1076
  "Cost($)": 4.5367,
1077
  "Framework": "",
 
1084
  "Average output tokens": 164
1085
  },
1086
  "AQuA": {
1087
+ "Score": 82.68,
1088
  "Pass rate": 0.9803,
1089
  "Cost($)": 1.0417,
1090
  "Framework": "",
 
1151
  },
1152
  "AQuA": {
1153
  "Score": 83.46,
1154
+ "Pass rate": 0.9843,
1155
  "Cost($)": 0.0927,
1156
  "Framework": "",
1157
  "X-shot": "0.0",
 
1183
  "Average output tokens": 186
1184
  },
1185
  "AQuA": {
1186
+ "Score": 80.71,
1187
+ "Pass rate": 0.9961,
1188
  "Cost($)": 0.0,
1189
  "Framework": "",
1190
  "X-shot": "0.0",
 
1203
  "Eval Date": "2025/1/22"
1204
  },
1205
  "gsm8k": {
1206
+ "Score": 75.44,
1207
  "Pass rate": 0.9992,
1208
  "Cost($)": 0.0,
1209
  "Framework": "",
 
1216
  "Average output tokens": 196
1217
  },
1218
  "AQuA": {
1219
+ "Score": 60.63,
1220
  "Pass rate": 1.0,
1221
  "Cost($)": 0.0,
1222
  "Framework": "",
 
1237
  },
1238
  "gsm8k": {
1239
  "Score": 77.71,
1240
+ "Pass rate": 0.997,
1241
  "Cost($)": 0.0,
1242
  "Framework": "",
1243
  "X-shot": "8.0",
 
1249
  "Average output tokens": 177
1250
  },
1251
  "AQuA": {
1252
+ "Score": 52.76,
1253
  "Pass rate": 0.8937,
1254
  "Cost($)": 0.0,
1255
  "Framework": "",
 
1269
  "Eval Date": "2025/1/22"
1270
  },
1271
  "gsm8k": {
1272
+ "Score": 55.5,
1273
  "Pass rate": 1.0,
1274
  "Cost($)": 0.0,
1275
  "Framework": "",
 
1283
  },
1284
  "AQuA": {
1285
  "Score": 40.55,
1286
+ "Pass rate": 0.9882,
1287
  "Cost($)": 0.0,
1288
  "Framework": "",
1289
  "X-shot": "0.0",
 
1302
  "Eval Date": "2025/1/22"
1303
  },
1304
  "gsm8k": {
1305
+ "Score": 35.94,
1306
  "Pass rate": 0.9992,
1307
  "Cost($)": 0.0,
1308
  "Framework": "",
 
1316
  },
1317
  "AQuA": {
1318
  "Score": 33.07,
1319
+ "Pass rate": 0.9882,
1320
  "Cost($)": 0.0,
1321
  "Framework": "",
1322
  "X-shot": "0.0",
 
1449
  "Average output tokens": 1723
1450
  },
1451
  "AQuA": {
1452
+ "Score": 85.83,
1453
+ "Pass rate": 0.9843,
1454
  "Cost($)": 0.5576,
1455
  "Framework": "",
1456
  "X-shot": "0.0",
 
1503
  },
1504
  "gsm8k": {
1505
  "Score": 88.32,
1506
+ "Pass rate": 0.9985,
1507
  "Cost($)": 0.0,
1508
  "Framework": "",
1509
  "X-shot": "8.0",
 
1515
  "Average output tokens": 1900
1516
  },
1517
  "AQuA": {
1518
+ "Score": 81.5,
1519
  "Pass rate": 1.0,
1520
  "Cost($)": 0.0,
1521
  "Framework": "",
 
1535
  "Eval Date": "2025/1/22"
1536
  },
1537
  "gsm8k": {
1538
+ "Score": 75.21,
1539
+ "Pass rate": 0.9955,
1540
  "Cost($)": 0.0,
1541
  "Framework": "",
1542
  "X-shot": "8.0",
 
1548
  "Average output tokens": 2358
1549
  },
1550
  "AQuA": {
1551
+ "Score": 53.15,
1552
  "Pass rate": 0.9606,
1553
  "Cost($)": 0.0,
1554
  "Framework": "",
 
1569
  },
1570
  "gsm8k": {
1571
  "Score": 41.39,
1572
+ "Pass rate": 0.9826,
1573
  "Cost($)": 0.0,
1574
  "Framework": "",
1575
  "X-shot": "8.0",
 
1647
  "Average output tokens": 3036
1648
  },
1649
  "AQuA": {
1650
+ "Score": 30.71,
1651
+ "Pass rate": 0.9843,
1652
  "Cost($)": 0.0,
1653
  "Framework": "",
1654
  "X-shot": "0.0",
src/detail_results.csv CHANGED
@@ -2,100 +2,100 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
2
  1,SC-CoT,AQuA,gpt-4o,2025/1/22,88.19,1.0,0.0,6.2412,,254,678811,72916,287,605895,2385
3
  2,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,86.61,0.9921,0.0,0.5847,,254,1037124,283248,1115,753876,2968
4
  3,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
5
- 4,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.82,0.9842,0.0,0.5576,,254,989058,241149,949,747909,2945
6
- 5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.996,0.0,0.0742,,254,131604,25397,100,106207,418
7
- 6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9842,0.0,0.0927,,254,164389,32555,128,131834,519
8
- 7,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.67,0.9921,0.0,0.0798,,254,141567,32809,129,108758,428
9
- 8,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.67,0.9724,0.0,0.0066,,254,94577,27978,110,66599,262
10
- 9,CoT,AQuA,gpt-4o,2025/1/22,82.67,0.9803,0.0,1.0417,,254,123017,25123,99,97894,385
11
  10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.5,0.9764,0.0,0.0347,,254,465846,83830,330,382016,1504
12
- 11,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,81.49,1.0,0.0,0.0,,254,1015368,278848,1098,736520,2900
13
- 12,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.7,0.996,0.0,0.0,,254,149736,33017,130,116719,460
14
- 13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.52,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
15
  14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
16
- 15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.996,0.0,0.768,,254,1362379,1119143,4406,243236,958
17
- 16,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9842,0.0,0.0,,254,137771,33271,131,104500,411
18
- 17,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.55,0.9606,0.0,0.0445,,254,1032841,977890,3850,54951,216
19
  18,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0.0,1.1453,,254,133752,25631,101,108121,426
20
- 19,PoT,AQuA,gpt-4o,2025/1/22,75.19,1.0,0.0,1.6087,,254,327908,222717,877,105191,414
21
- 20,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.19,1.0,0.0,0.1645,,254,291764,249215,981,42549,168
22
- 21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.4,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
23
- 22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.22,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
24
  23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
25
  24,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,70.47,0.9882,0.0,0.5578,,254,418617,70157,276,348460,1372
26
  25,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
27
- 26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.56,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
28
  27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
29
- 28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.62,1.0,0.0,0.0,,254,144435,32555,128,111880,440
30
- 29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.44,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
31
  30,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
32
  31,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
33
- 32,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,53.14,0.9606,0.0,0.0,,254,1041346,372968,1468,668378,2631
34
- 33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.75,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
35
- 34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9881,0.0,0.0,,254,133106,26459,104,106647,420
36
- 35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.63,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
37
  36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
38
- 37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9881,0.0,0.0,,254,110040,30477,120,79563,313
39
- 38,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.97,1.0,0.0,0.038,,254,42471,25701,101,16770,66
40
  39,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
41
- 40,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9881,0.0,0.0,,254,301962,233505,919,68457,270
42
  41,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,35.85,0.988,0.0,0.0,,254,1240388,530701,2089,709687,2794
43
- 42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9881,0.0,0.0,,254,117339,30477,120,86862,342
44
- 43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.7,0.9645,0.0,0.0,,254,298475,246560,971,51915,204
45
- 44,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,30.7,0.9842,0.0,0.0,,254,1225539,496206,1954,729333,2871
46
  45,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.31,0.9724,0.0,0.0,,254,1157076,430703,1696,726373,2860
47
- 46,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9763,0.0,0.0,,254,71047,27937,110,43110,170
48
- 47,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.16,0.9881,0.0,0.0,,254,110415,27937,110,82478,325
49
  48,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
50
- 49,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.01,0.9685,0.0,0.0,,254,7170087,6344167,24977,825920,3252
51
- 50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9212,0.0,0.0,,254,322281,258867,1019,63414,250
52
  1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.45,1.0,8.0,4.5021,,1319,7985996,5406763,4099,2579233,1955
53
- 2,CoT,gsm8k,gpt-4o,2025/1/22,94.08,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
54
  3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
55
  4,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
56
  5,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
57
  6,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
58
- 7,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.26,1.0,8.0,0.4709,,1319,835275,583916,443,251359,191
59
  8,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.75,1.0,8.0,24.2428,,1319,3300971,1168927,886,2132044,1616
60
  9,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,90.67,1.0,8.0,4.2651,,1319,7565637,5292383,4012,2273254,1723
61
  10,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
62
  11,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
63
- 12,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,88.32,0.9984,8.0,0.0,,1319,8173818,5668252,4297,2505566,1900
64
- 13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,,10.1124,,1319,17937864,17038928,12918,898936,682
65
  14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
66
  15,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
67
  16,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
68
- 17,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.59,0.9962,8.0,0.2512,,1319,5998639,5862016,4444,136623,104
69
  18,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,83.7,0.997,8.0,0.155,,1319,2507687,1230019,933,1277668,969
70
- 19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.86,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
71
  20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,82.56,0.9985,8.0,2.6285,,1319,2560697,1212520,919,1348177,1022
72
- 21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.6,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
73
- 22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.69,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
74
- 23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.9969,8.0,0.0,,1319,1202163,968163,734,234000,177
75
- 24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.87,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
76
- 25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.43,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
77
- 26,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.2,0.9954,8.0,0.0,,1319,8444203,5334657,4044,3109546,2358
78
- 27,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.9,0.9939,8.0,3.4633,,1319,6646286,6506164,4933,140122,106
79
- 28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.08,0.796,8.0,0.9736,,1319,1727044,1126025,854,601019,456
80
  29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
81
- 30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.77,0.9855,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
82
- 31,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.3,0.9954,8.0,39.0751,,1319,14715887,14411173,10926,304714,231
83
- 32,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.705,8.0,0.0,,1319,1362822,1145390,868,217432,165
84
  33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
85
- 34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9954,8.0,0.0,,1319,1745429,550941,418,1194488,906
86
- 35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.49,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
87
- 36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,41.39,0.9825,8.0,0.0,,1319,10024857,6674518,5060,3350339,2540
88
- 37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.66,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
89
  38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
90
  39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
91
- 40,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.93,0.9992,8.0,0.0,,1319,1223459,1032818,783,190641,145
92
- 41,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,,0.0,,1319,35669989,30120070,22836,5549919,4208
93
- 42,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.86,0.8021,8.0,0.0,,1319,9828001,9133603,6925,694398,526
94
- 43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.49,0.31,8.0,0.0,,1319,1327522,1151528,873,175994,133
95
- 44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.67,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
96
- 45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.7,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
97
- 46,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.59,0.9795,8.0,0.0,,1319,1113728,679302,515,434426,329
98
  47,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.169,8.0,0.0,,1319,1389135,1151528,873,237607,180
99
- 48,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.65,0.9522,8.0,0.0,,1319,55392611,52431343,39751,2961268,2245
100
  49,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,5.53,0.8673,8.0,0.0,,1319,8961768,5844218,4431,3117550,2364
101
  50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,3.79,0.9484,8.0,0.0,,1319,10533815,6529832,4951,4003983,3036
 
2
  1,SC-CoT,AQuA,gpt-4o,2025/1/22,88.19,1.0,0.0,6.2412,,254,678811,72916,287,605895,2385
3
  2,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,86.61,0.9921,0.0,0.5847,,254,1037124,283248,1115,753876,2968
4
  3,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
5
+ 4,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.83,0.9843,0.0,0.5576,,254,989058,241149,949,747909,2945
6
+ 5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0.0,0.0742,,254,131604,25397,100,106207,418
7
+ 6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0.0,0.0927,,254,164389,32555,128,131834,519
8
+ 7,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0.0,0.0798,,254,141567,32809,129,108758,428
9
+ 8,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0.0,0.0066,,254,94577,27978,110,66599,262
10
+ 9,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0.0,1.0417,,254,123017,25123,99,97894,385
11
  10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.5,0.9764,0.0,0.0347,,254,465846,83830,330,382016,1504
12
+ 11,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,81.5,1.0,0.0,0.0,,254,1015368,278848,1098,736520,2900
13
+ 12,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0.0,0.0,,254,149736,33017,130,116719,460
14
+ 13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
15
  14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
16
+ 15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0.0,0.768,,254,1362379,1119143,4406,243236,958
17
+ 16,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0.0,0.0,,254,137771,33271,131,104500,411
18
+ 17,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0.0,0.0445,,254,1032841,977890,3850,54951,216
19
  18,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0.0,1.1453,,254,133752,25631,101,108121,426
20
+ 19,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0.0,1.6087,,254,327908,222717,877,105191,414
21
+ 20,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0.0,0.1645,,254,291764,249215,981,42549,168
22
+ 21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
23
+ 22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
24
  23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
25
  24,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,70.47,0.9882,0.0,0.5578,,254,418617,70157,276,348460,1372
26
  25,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
27
+ 26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
28
  27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
29
+ 28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0.0,0.0,,254,144435,32555,128,111880,440
30
+ 29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
31
  30,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
32
  31,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
33
+ 32,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,53.15,0.9606,0.0,0.0,,254,1041346,372968,1468,668378,2631
34
+ 33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
35
+ 34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0.0,0.0,,254,133106,26459,104,106647,420
36
+ 35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
37
  36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
38
+ 37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0.0,0.0,,254,110040,30477,120,79563,313
39
+ 38,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0.0,0.038,,254,42471,25701,101,16770,66
40
  39,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
41
+ 40,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0.0,0.0,,254,301962,233505,919,68457,270
42
  41,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,35.85,0.988,0.0,0.0,,254,1240388,530701,2089,709687,2794
43
+ 42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0.0,0.0,,254,117339,30477,120,86862,342
44
+ 43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0.0,0.0,,254,298475,246560,971,51915,204
45
+ 44,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,30.71,0.9843,0.0,0.0,,254,1225539,496206,1954,729333,2871
46
  45,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.31,0.9724,0.0,0.0,,254,1157076,430703,1696,726373,2860
47
+ 46,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0.0,0.0,,254,71047,27937,110,43110,170
48
+ 47,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0.0,0.0,,254,110415,27937,110,82478,325
49
  48,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
50
+ 49,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0.0,0.0,,254,7170087,6344167,24977,825920,3252
51
+ 50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0.0,0.0,,254,322281,258867,1019,63414,250
52
  1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.45,1.0,8.0,4.5021,,1319,7985996,5406763,4099,2579233,1955
53
+ 2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
54
  3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
55
  4,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
56
  5,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
57
  6,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
58
+ 7,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8.0,0.4709,,1319,835275,583916,443,251359,191
59
  8,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.75,1.0,8.0,24.2428,,1319,3300971,1168927,886,2132044,1616
60
  9,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,90.67,1.0,8.0,4.2651,,1319,7565637,5292383,4012,2273254,1723
61
  10,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
62
  11,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
63
+ 12,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,88.32,0.9985,8.0,0.0,,1319,8173818,5668252,4297,2505566,1900
64
+ 13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8.0,10.1124,,1319,17937864,17038928,12918,898936,682
65
  14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
66
  15,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
67
  16,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
68
+ 17,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8.0,0.2512,,1319,5998639,5862016,4444,136623,104
69
  18,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,83.7,0.997,8.0,0.155,,1319,2507687,1230019,933,1277668,969
70
+ 19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
71
  20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,82.56,0.9985,8.0,2.6285,,1319,2560697,1212520,919,1348177,1022
72
+ 21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
73
+ 22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
74
+ 23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8.0,0.0,,1319,1202163,968163,734,234000,177
75
+ 24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
76
+ 25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
77
+ 26,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.21,0.9955,8.0,0.0,,1319,8444203,5334657,4044,3109546,2358
78
+ 27,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8.0,3.4633,,1319,6646286,6506164,4933,140122,106
79
+ 28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8.0,0.9736,,1319,1727044,1126025,854,601019,456
80
  29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
81
+ 30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
82
+ 31,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.31,0.9955,8.0,39.0751,,1319,14715887,14411173,10926,304714,231
83
+ 32,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.7051,8.0,0.0,,1319,1362822,1145390,868,217432,165
84
  33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
85
+ 34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8.0,0.0,,1319,1745429,550941,418,1194488,906
86
+ 35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
87
+ 36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,41.39,0.9826,8.0,0.0,,1319,10024857,6674518,5060,3350339,2540
88
+ 37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
89
  38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
90
  39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
91
+ 40,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.94,0.9992,8.0,0.0,,1319,1223459,1032818,783,190641,145
92
+ 41,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,8.0,0.0,,1319,35669989,30120070,22836,5549919,4208
93
+ 42,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.87,0.8021,8.0,0.0,,1319,9828001,9133603,6925,694398,526
94
+ 43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8.0,0.0,,1319,1327522,1151528,873,175994,133
95
+ 44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
96
+ 45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
97
+ 46,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8.0,0.0,,1319,1113728,679302,515,434426,329
98
  47,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.169,8.0,0.0,,1319,1389135,1151528,873,237607,180
99
+ 48,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8.0,0.0,,1319,55392611,52431343,39751,2961268,2245
100
  49,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,5.53,0.8673,8.0,0.0,,1319,8961768,5844218,4431,3117550,2364
101
  50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,3.79,0.9484,8.0,0.0,,1319,10533815,6529832,4951,4003983,3036
src/overall_math_score.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "time": "2025-01-23 09:27:24",
3
  "results": {
4
  "IO": {
5
  "META": {
@@ -12,7 +12,7 @@
12
  "Cost($)": 0.3328
13
  },
14
  "AQuA": {
15
- "Score": 38.97,
16
  "Cost($)": 0.038
17
  }
18
  },
@@ -23,11 +23,11 @@
23
  "Eval Date": "2025/1/7"
24
  },
25
  "gsm8k": {
26
- "Score": 74.9,
27
  "Cost($)": 3.4633
28
  },
29
  "AQuA": {
30
- "Score": 64.56,
31
  "Cost($)": 0.4928
32
  }
33
  },
@@ -38,11 +38,11 @@
38
  "Eval Date": "2025/1/7"
39
  },
40
  "gsm8k": {
41
- "Score": 76.87,
42
  "Cost($)": 0.6902
43
  },
44
  "AQuA": {
45
- "Score": 59.44,
46
  "Cost($)": 0.1748
47
  }
48
  },
@@ -53,7 +53,7 @@
53
  "Eval Date": "2025/1/7"
54
  },
55
  "gsm8k": {
56
- "Score": 78.69,
57
  "Cost($)": 0.6788
58
  },
59
  "AQuA": {
@@ -98,11 +98,11 @@
98
  "Eval Date": "2025/1/7"
99
  },
100
  "gsm8k": {
101
- "Score": 85.59,
102
  "Cost($)": 0.2512
103
  },
104
  "AQuA": {
105
- "Score": 77.55,
106
  "Cost($)": 0.0445
107
  }
108
  },
@@ -113,7 +113,7 @@
113
  "Eval Date": "2025/1/7"
114
  },
115
  "gsm8k": {
116
- "Score": 79.6,
117
  "Cost($)": 0.0576
118
  },
119
  "AQuA": {
@@ -132,7 +132,7 @@
132
  "Cost($)": 0.0558
133
  },
134
  "AQuA": {
135
- "Score": 82.67,
136
  "Cost($)": 0.0066
137
  }
138
  },
@@ -173,7 +173,7 @@
173
  "Eval Date": "2025/1/22"
174
  },
175
  "gsm8k": {
176
- "Score": 63.3,
177
  "Cost($)": 39.0751
178
  },
179
  "AQuA": {
@@ -192,7 +192,7 @@
192
  "Cost($)": 4.2166
193
  },
194
  "AQuA": {
195
- "Score": 75.19,
196
  "Cost($)": 1.6087
197
  }
198
  },
@@ -203,11 +203,11 @@
203
  "Eval Date": "2025/1/22"
204
  },
205
  "gsm8k": {
206
- "Score": 94.08,
207
  "Cost($)": 4.5367
208
  },
209
  "AQuA": {
210
- "Score": 82.67,
211
  "Cost($)": 1.0417
212
  }
213
  },
@@ -252,7 +252,7 @@
252
  "Cost($)": 10.5479
253
  },
254
  "AQuA": {
255
- "Score": 73.22,
256
  "Cost($)": 0.3177
257
  }
258
  },
@@ -267,7 +267,7 @@
267
  "Cost($)": 0.7054
268
  },
269
  "AQuA": {
270
- "Score": 75.19,
271
  "Cost($)": 0.1645
272
  }
273
  },
@@ -297,7 +297,7 @@
297
  "Cost($)": 4.2651
298
  },
299
  "AQuA": {
300
- "Score": 85.82,
301
  "Cost($)": 0.5576
302
  }
303
  },
@@ -308,11 +308,11 @@
308
  "Eval Date": "2025/1/22"
309
  },
310
  "gsm8k": {
311
- "Score": 92.26,
312
  "Cost($)": 0.4709
313
  },
314
  "AQuA": {
315
- "Score": 82.67,
316
  "Cost($)": 0.0798
317
  }
318
  },
@@ -338,11 +338,11 @@
338
  "Eval Date": "2025/1/22"
339
  },
340
  "gsm8k": {
341
- "Score": 73.08,
342
  "Cost($)": 0.9736
343
  },
344
  "AQuA": {
345
- "Score": 79.52,
346
  "Cost($)": 0.1746
347
  }
348
  },
@@ -398,11 +398,11 @@
398
  "Eval Date": "2025/1/22"
399
  },
400
  "gsm8k": {
401
- "Score": 82.86,
402
  "Cost($)": 0.0
403
  },
404
  "AQuA": {
405
- "Score": 74.4,
406
  "Cost($)": 0.0
407
  }
408
  },
@@ -432,7 +432,7 @@
432
  "Cost($)": 0.0
433
  },
434
  "AQuA": {
435
- "Score": 80.7,
436
  "Cost($)": 0.0
437
  }
438
  },
@@ -447,7 +447,7 @@
447
  "Cost($)": 0.0
448
  },
449
  "AQuA": {
450
- "Score": 81.49,
451
  "Cost($)": 0.0
452
  }
453
  },
@@ -473,7 +473,7 @@
473
  "Eval Date": "2025/1/22"
474
  },
475
  "gsm8k": {
476
- "Score": 67.77,
477
  "Cost($)": 0.0
478
  },
479
  "AQuA": {
@@ -488,7 +488,7 @@
488
  "Eval Date": "2025/1/22"
489
  },
490
  "gsm8k": {
491
- "Score": 38.66,
492
  "Cost($)": 0.0
493
  },
494
  "AQuA": {
@@ -503,11 +503,11 @@
503
  "Eval Date": "2025/1/22"
504
  },
505
  "gsm8k": {
506
- "Score": 75.43,
507
  "Cost($)": 0.0
508
  },
509
  "AQuA": {
510
- "Score": 60.62,
511
  "Cost($)": 0.0
512
  }
513
  },
@@ -518,11 +518,11 @@
518
  "Eval Date": "2025/1/22"
519
  },
520
  "gsm8k": {
521
- "Score": 75.2,
522
  "Cost($)": 0.0
523
  },
524
  "AQuA": {
525
- "Score": 53.14,
526
  "Cost($)": 0.0
527
  }
528
  },
@@ -533,11 +533,11 @@
533
  "Eval Date": "2025/1/22"
534
  },
535
  "gsm8k": {
536
- "Score": 11.59,
537
  "Cost($)": 0.0
538
  },
539
  "AQuA": {
540
- "Score": 47.63,
541
  "Cost($)": 0.0
542
  }
543
  },
@@ -582,7 +582,7 @@
582
  "Cost($)": 0.0
583
  },
584
  "AQuA": {
585
- "Score": 52.75,
586
  "Cost($)": 0.0
587
  }
588
  },
@@ -608,7 +608,7 @@
608
  "Eval Date": "2025/1/22"
609
  },
610
  "gsm8k": {
611
- "Score": 16.67,
612
  "Cost($)": 0.0
613
  },
614
  "AQuA": {
@@ -623,7 +623,7 @@
623
  "Eval Date": "2025/1/22"
624
  },
625
  "gsm8k": {
626
- "Score": 24.86,
627
  "Cost($)": 0.0
628
  },
629
  "AQuA": {
@@ -638,11 +638,11 @@
638
  "Eval Date": "2025/1/22"
639
  },
640
  "gsm8k": {
641
- "Score": 18.49,
642
  "Cost($)": 0.0
643
  },
644
  "AQuA": {
645
- "Score": 30.7,
646
  "Cost($)": 0.0
647
  }
648
  },
@@ -653,7 +653,7 @@
653
  "Eval Date": "2025/1/22"
654
  },
655
  "gsm8k": {
656
- "Score": 55.49,
657
  "Cost($)": 0.0
658
  },
659
  "AQuA": {
@@ -683,11 +683,11 @@
683
  "Eval Date": "2025/1/22"
684
  },
685
  "gsm8k": {
686
- "Score": 14.7,
687
  "Cost($)": 0.0
688
  },
689
  "AQuA": {
690
- "Score": 27.16,
691
  "Cost($)": 0.0
692
  }
693
  },
@@ -698,11 +698,11 @@
698
  "Eval Date": "2025/1/22"
699
  },
700
  "gsm8k": {
701
- "Score": 7.65,
702
  "Cost($)": 0.0
703
  },
704
  "AQuA": {
705
- "Score": 24.01,
706
  "Cost($)": 0.0
707
  }
708
  },
@@ -728,7 +728,7 @@
728
  "Eval Date": "2025/1/22"
729
  },
730
  "gsm8k": {
731
- "Score": 35.93,
732
  "Cost($)": 0.0
733
  },
734
  "AQuA": {
@@ -747,7 +747,7 @@
747
  "Cost($)": 0.0
748
  },
749
  "AQuA": {
750
- "Score": 30.7,
751
  "Cost($)": 0.0
752
  }
753
  }
 
1
  {
2
+ "time": "2025-01-23 11:23:17",
3
  "results": {
4
  "IO": {
5
  "META": {
 
12
  "Cost($)": 0.3328
13
  },
14
  "AQuA": {
15
+ "Score": 38.98,
16
  "Cost($)": 0.038
17
  }
18
  },
 
23
  "Eval Date": "2025/1/7"
24
  },
25
  "gsm8k": {
26
+ "Score": 74.91,
27
  "Cost($)": 3.4633
28
  },
29
  "AQuA": {
30
+ "Score": 64.57,
31
  "Cost($)": 0.4928
32
  }
33
  },
 
38
  "Eval Date": "2025/1/7"
39
  },
40
  "gsm8k": {
41
+ "Score": 76.88,
42
  "Cost($)": 0.6902
43
  },
44
  "AQuA": {
45
+ "Score": 59.45,
46
  "Cost($)": 0.1748
47
  }
48
  },
 
53
  "Eval Date": "2025/1/7"
54
  },
55
  "gsm8k": {
56
+ "Score": 78.7,
57
  "Cost($)": 0.6788
58
  },
59
  "AQuA": {
 
98
  "Eval Date": "2025/1/7"
99
  },
100
  "gsm8k": {
101
+ "Score": 85.6,
102
  "Cost($)": 0.2512
103
  },
104
  "AQuA": {
105
+ "Score": 77.56,
106
  "Cost($)": 0.0445
107
  }
108
  },
 
113
  "Eval Date": "2025/1/7"
114
  },
115
  "gsm8k": {
116
+ "Score": 79.61,
117
  "Cost($)": 0.0576
118
  },
119
  "AQuA": {
 
132
  "Cost($)": 0.0558
133
  },
134
  "AQuA": {
135
+ "Score": 82.68,
136
  "Cost($)": 0.0066
137
  }
138
  },
 
173
  "Eval Date": "2025/1/22"
174
  },
175
  "gsm8k": {
176
+ "Score": 63.31,
177
  "Cost($)": 39.0751
178
  },
179
  "AQuA": {
 
192
  "Cost($)": 4.2166
193
  },
194
  "AQuA": {
195
+ "Score": 75.2,
196
  "Cost($)": 1.6087
197
  }
198
  },
 
203
  "Eval Date": "2025/1/22"
204
  },
205
  "gsm8k": {
206
+ "Score": 94.09,
207
  "Cost($)": 4.5367
208
  },
209
  "AQuA": {
210
+ "Score": 82.68,
211
  "Cost($)": 1.0417
212
  }
213
  },
 
252
  "Cost($)": 10.5479
253
  },
254
  "AQuA": {
255
+ "Score": 73.23,
256
  "Cost($)": 0.3177
257
  }
258
  },
 
267
  "Cost($)": 0.7054
268
  },
269
  "AQuA": {
270
+ "Score": 75.2,
271
  "Cost($)": 0.1645
272
  }
273
  },
 
297
  "Cost($)": 4.2651
298
  },
299
  "AQuA": {
300
+ "Score": 85.83,
301
  "Cost($)": 0.5576
302
  }
303
  },
 
308
  "Eval Date": "2025/1/22"
309
  },
310
  "gsm8k": {
311
+ "Score": 92.27,
312
  "Cost($)": 0.4709
313
  },
314
  "AQuA": {
315
+ "Score": 82.68,
316
  "Cost($)": 0.0798
317
  }
318
  },
 
338
  "Eval Date": "2025/1/22"
339
  },
340
  "gsm8k": {
341
+ "Score": 73.09,
342
  "Cost($)": 0.9736
343
  },
344
  "AQuA": {
345
+ "Score": 79.53,
346
  "Cost($)": 0.1746
347
  }
348
  },
 
398
  "Eval Date": "2025/1/22"
399
  },
400
  "gsm8k": {
401
+ "Score": 82.87,
402
  "Cost($)": 0.0
403
  },
404
  "AQuA": {
405
+ "Score": 74.41,
406
  "Cost($)": 0.0
407
  }
408
  },
 
432
  "Cost($)": 0.0
433
  },
434
  "AQuA": {
435
+ "Score": 80.71,
436
  "Cost($)": 0.0
437
  }
438
  },
 
447
  "Cost($)": 0.0
448
  },
449
  "AQuA": {
450
+ "Score": 81.5,
451
  "Cost($)": 0.0
452
  }
453
  },
 
473
  "Eval Date": "2025/1/22"
474
  },
475
  "gsm8k": {
476
+ "Score": 67.78,
477
  "Cost($)": 0.0
478
  },
479
  "AQuA": {
 
488
  "Eval Date": "2025/1/22"
489
  },
490
  "gsm8k": {
491
+ "Score": 38.67,
492
  "Cost($)": 0.0
493
  },
494
  "AQuA": {
 
503
  "Eval Date": "2025/1/22"
504
  },
505
  "gsm8k": {
506
+ "Score": 75.44,
507
  "Cost($)": 0.0
508
  },
509
  "AQuA": {
510
+ "Score": 60.63,
511
  "Cost($)": 0.0
512
  }
513
  },
 
518
  "Eval Date": "2025/1/22"
519
  },
520
  "gsm8k": {
521
+ "Score": 75.21,
522
  "Cost($)": 0.0
523
  },
524
  "AQuA": {
525
+ "Score": 53.15,
526
  "Cost($)": 0.0
527
  }
528
  },
 
533
  "Eval Date": "2025/1/22"
534
  },
535
  "gsm8k": {
536
+ "Score": 11.6,
537
  "Cost($)": 0.0
538
  },
539
  "AQuA": {
540
+ "Score": 47.64,
541
  "Cost($)": 0.0
542
  }
543
  },
 
582
  "Cost($)": 0.0
583
  },
584
  "AQuA": {
585
+ "Score": 52.76,
586
  "Cost($)": 0.0
587
  }
588
  },
 
608
  "Eval Date": "2025/1/22"
609
  },
610
  "gsm8k": {
611
+ "Score": 16.68,
612
  "Cost($)": 0.0
613
  },
614
  "AQuA": {
 
623
  "Eval Date": "2025/1/22"
624
  },
625
  "gsm8k": {
626
+ "Score": 24.87,
627
  "Cost($)": 0.0
628
  },
629
  "AQuA": {
 
638
  "Eval Date": "2025/1/22"
639
  },
640
  "gsm8k": {
641
+ "Score": 18.5,
642
  "Cost($)": 0.0
643
  },
644
  "AQuA": {
645
+ "Score": 30.71,
646
  "Cost($)": 0.0
647
  }
648
  },
 
653
  "Eval Date": "2025/1/22"
654
  },
655
  "gsm8k": {
656
+ "Score": 55.5,
657
  "Cost($)": 0.0
658
  },
659
  "AQuA": {
 
683
  "Eval Date": "2025/1/22"
684
  },
685
  "gsm8k": {
686
+ "Score": 14.71,
687
  "Cost($)": 0.0
688
  },
689
  "AQuA": {
690
+ "Score": 27.17,
691
  "Cost($)": 0.0
692
  }
693
  },
 
698
  "Eval Date": "2025/1/22"
699
  },
700
  "gsm8k": {
701
+ "Score": 7.66,
702
  "Cost($)": 0.0
703
  },
704
  "AQuA": {
705
+ "Score": 24.02,
706
  "Cost($)": 0.0
707
  }
708
  },
 
728
  "Eval Date": "2025/1/22"
729
  },
730
  "gsm8k": {
731
+ "Score": 35.94,
732
  "Cost($)": 0.0
733
  },
734
  "AQuA": {
 
747
  "Cost($)": 0.0
748
  },
749
  "AQuA": {
750
+ "Score": 30.71,
751
  "Cost($)": 0.0
752
  }
753
  }
src/overall_results.csv CHANGED
@@ -3,49 +3,49 @@ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA
3
  2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
4
  3.0,SC-CoT,gpt-4o,2025/1/22,89.47,90.75,24.2428,88.19,6.2412
5
  4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
6
- 5.0,CoT,gpt-4o,2025/1/22,88.38,94.08,4.5367,82.67,1.0417
7
- 6.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,88.25,90.67,4.2651,85.82,0.5576
8
- 7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.47,92.26,0.4709,82.67,0.0798
9
- 8.0,CoT,Doubao-lite-32k,2025/1/7,85.99,89.31,0.0558,82.67,0.0066
10
  9.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
11
- 10.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,84.91,88.32,0.0,81.49,0.0
12
- 11.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.19,1.6087
13
- 12.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.19,0.1645
14
  13.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
15
- 14.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.7,0.0
16
  15.0,SC-CoT,Doubao-lite-32k,2025/1/7,82.60,83.7,0.155,81.5,0.0347
17
  16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
18
- 17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.57,85.59,0.2512,77.55,0.0445
19
- 18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.24,87.26,10.5479,73.22,0.3177
20
- 19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.63,82.86,0.0,74.4,0.0
21
  20.0,SC-CoT,gpt-3.5-turbo,2025/1/7,76.52,82.56,2.6285,70.47,0.5578
22
- 21.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.30,73.08,0.9736,79.52,0.1746
23
- 22.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.6,0.0576,71.65,0.0147
24
  23.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
25
- 24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.69,0.6788,61.02,0.0957
26
- 25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.73,74.9,3.4633,64.56,0.4928
27
- 26.0,PoT,gpt-3.5-turbo,2025/1/7,68.16,76.87,0.6902,59.44,0.1748
28
- 27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.03,75.43,0.0,60.62,0.0
29
  28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
30
- 29.0,CoT,Internllm2_5-7B,2025/1/22,65.23,77.71,0.0,52.75,0.0
31
- 30.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,64.17,75.2,0.0,53.14,0.0
32
  31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
33
- 32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.64,67.77,0.0,55.51,0.0
34
- 33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.39,63.3,39.0751,57.48,2.304
35
  34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
36
- 35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.02,55.49,0.0,40.55,0.0
37
  36.0,SC-CoT,Internllm2_5-7B,2025/1/22,38.62,41.39,0.0,35.85,0.0
38
- 37.0,IO,gpt-3.5-turbo,2025/1/7,38.40,37.83,0.3328,38.97,0.038
39
- 38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.66,0.0,36.61,0.0
40
  39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
41
  40.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,37.23,33.51,0.0,40.94,0.0
42
- 41.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,34.50,35.93,0.0,33.07,0.0
43
- 42.0,IO,Internllm2_5-7B,2025/1/22,29.61,11.59,0.0,47.63,0.0
44
- 43.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,25.23,24.86,0.0,25.59,0.0
45
- 44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.60,18.49,0.0,30.7,0.0
46
- 45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.90,16.67,0.0,29.13,0.0
47
- 46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.93,14.7,0.0,27.16,0.0
48
  47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.92,5.53,0.0,30.31,0.0
49
- 48.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,17.25,3.79,0.0,30.7,0.0
50
- 49.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.83,7.65,0.0,24.01,0.0
51
  50.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
 
3
  2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
4
  3.0,SC-CoT,gpt-4o,2025/1/22,89.47,90.75,24.2428,88.19,6.2412
5
  4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
6
+ 5.0,CoT,gpt-4o,2025/1/22,88.39,94.09,4.5367,82.68,1.0417
7
+ 6.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,88.25,90.67,4.2651,85.83,0.5576
8
+ 7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.48,92.27,0.4709,82.68,0.0798
9
+ 8.0,CoT,Doubao-lite-32k,2025/1/7,86.00,89.31,0.0558,82.68,0.0066
10
  9.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
11
+ 10.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,84.91,88.32,0.0,81.5,0.0
12
+ 11.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.2,1.6087
13
+ 12.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.2,0.1645
14
  13.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
15
+ 14.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.71,0.0
16
  15.0,SC-CoT,Doubao-lite-32k,2025/1/7,82.60,83.7,0.155,81.5,0.0347
17
  16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
18
+ 17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.58,85.6,0.2512,77.56,0.0445
19
+ 18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.25,87.26,10.5479,73.23,0.3177
20
+ 19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.64,82.87,0.0,74.41,0.0
21
  20.0,SC-CoT,gpt-3.5-turbo,2025/1/7,76.52,82.56,2.6285,70.47,0.5578
22
+ 21.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.31,73.09,0.9736,79.53,0.1746
23
+ 22.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.61,0.0576,71.65,0.0147
24
  23.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
25
+ 24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.7,0.6788,61.02,0.0957
26
+ 25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.74,74.91,3.4633,64.57,0.4928
27
+ 26.0,PoT,gpt-3.5-turbo,2025/1/7,68.17,76.88,0.6902,59.45,0.1748
28
+ 27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.04,75.44,0.0,60.63,0.0
29
  28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
30
+ 29.0,CoT,Internllm2_5-7B,2025/1/22,65.24,77.71,0.0,52.76,0.0
31
+ 30.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,64.18,75.21,0.0,53.15,0.0
32
  31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
33
+ 32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.65,67.78,0.0,55.51,0.0
34
+ 33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.40,63.31,39.0751,57.48,2.304
35
  34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
36
+ 35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.03,55.5,0.0,40.55,0.0
37
  36.0,SC-CoT,Internllm2_5-7B,2025/1/22,38.62,41.39,0.0,35.85,0.0
38
+ 37.0,IO,gpt-3.5-turbo,2025/1/7,38.41,37.83,0.3328,38.98,0.038
39
+ 38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.67,0.0,36.61,0.0
40
  39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
41
  40.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,37.23,33.51,0.0,40.94,0.0
42
+ 41.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,34.51,35.94,0.0,33.07,0.0
43
+ 42.0,IO,Internllm2_5-7B,2025/1/22,29.62,11.6,0.0,47.64,0.0
44
+ 43.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,25.23,24.87,0.0,25.59,0.0
45
+ 44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.61,18.5,0.0,30.71,0.0
46
+ 45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.91,16.68,0.0,29.13,0.0
47
+ 46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.94,14.71,0.0,27.17,0.0
48
  47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.92,5.53,0.0,30.31,0.0
49
+ 48.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,17.25,3.79,0.0,30.71,0.0
50
+ 49.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.84,7.66,0.0,24.02,0.0
51
  50.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
src/record.csv CHANGED
@@ -1,148 +1,148 @@
1
- ๏ปฟAlgorithm,dataset,llm,Score,Pass rate,X-shot,X-shot,Parameters,Nums,Total input tokens,Average input tokens,Total output tokens,Average output tokens,All tokens,Cost($),Eval Date,Note,,,,,,,,,,,,,,,,,,,
2
- IO,gsm8k,gpt-3.5-turbo,37.83,99.92,8,few_shot,,1319,"546,990",415,"39,563",30,"586,553",0.3328,2025/1/7,,,,,,,,,,,,,,,,,,,,
3
- IO,gsm8k,Doubao-lite-32k,72.02,99.92,8,few_shot,,1319,"617,377",468,"123,106",93,"740,483",0.0354,2025/1/7,0.2590 ๏ผˆๅ…ƒ๏ผ‰,,,,,,,,,,,,,,,,,,,
4
- IO,gsm8k,gpt-4o,88.4,100,8,few_shot,,1319,"542,416",411,"199,030",151,"741,446",3.3463,2025/1/22,,,,,,,,,,,,,,,,,,,,
5
- IO,gsm8k,Qwen2.5-72B-Instruct,86.58,100,8,few_shot,,1319,"555,340",421,"313,720",238,"869,060",0.4899,2025/1/22,,,,,,,,,,,,,,,,,,,,
6
- IO,gsm8k,Llama-3.3-70B-Instruct,92.26,100,8,few_shot,,1319,"583,916",443,"251,359",191,"835,275",0.4709,2025/1/22,,,,,,,,,,,,,,,,,,,,
7
- IO,gsm8k,Qwen2.5-7B-Instruct,57.24,100,8,few_shot,,1319,"596,229",452,"291,684",221,"887,913",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
8
- IO,gsm8k,Llama-3.1-8B-Instruct,57.16,99.54,8,few_shot,,1319,"550,941",418,"1,194,488",906,"1,745,429",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
9
- IO,gsm8k,Internllm2_5-7B,11.59,97.95,8,few_shot,,1319,"679,302",515,"434,426",329,"1,113,728",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
10
- IO,gsm8k,Qwen2-1.5B-Instruct,16.67,100,8,few_shot,,1319,"568,530",431,"168,466",128,"736,996",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
11
- IO,gsm8k,Qwen2-0.5B-Instruct,14.7,100,8,few_shot,,1319,"568,116",431,"266,781",202,"834,897",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
12
- ReAct-Pro*,gsm8k,gpt-3.5-turbo,74.9,99.39,8,few_shot,max_steps=10,1319,"6,506,164","4,933","140,122",106,"6,646,286",3.4633,2025/1/7,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
13
- ReAct-Pro*,gsm8k,Doubao-lite-32k,85.59,99.62,8,few_shot,max_steps=10,1319,"5,862,016","4,444","136,623",104,"5,998,639",0.2512,2025/1/7,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
14
- ReAct-Pro*,gsm8k,gpt-4o,63.3,99.54,8,few_shot,max_steps=10,1319,"14,411,173","10,926","304,714",231,"14,715,887",39.0751,2025/1/22,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
15
- ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,87.26,100,8,few_shot,max_steps=10,1319,"18,160,983","13,769","549,454",417,"18,710,437",10.5479,2025/1/22,,,,,,,,,,,,,,,,,,,,
16
- ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,87.64,99.92,,few_shot,max_steps=10,1319,"17,038,928","12,918","898,936",682,"17,937,864",10.1124,2025/1/22,,,,,,,,,,,,,,,,,,,,
17
- ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,82.86,100,8,few_shot,max_steps=10,1319,"14,355,752","10,884","495,162",375,"14,850,914",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
18
- ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,67.77,98.55,8,few_shot,max_steps=10,1319,"21,044,978","15,955","1,790,789","1,358","22,835,767",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
19
- ReAct-Pro*,gsm8k,Internllm2_5-7B,33.51,97.95,,few_shot,max_steps=10,1319,"30,120,070","22,836","5,549,919","4,208","35,669,989",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
20
- ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,24.86,80.21,8,few_shot,max_steps=10,1319,"9,133,603","6,925","694,398",526,"9,828,001",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
21
- ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,7.65,95.22,8,few_shot,max_steps=10,1319,"52,431,343","39,751","2,961,268","2,245","55,392,611",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
22
- PoT,gsm8k,gpt-3.5-turbo,76.87,99.24,8,few_shot,,1319,"1,090,418",827,"96,662",73,"1,187,080",0.6902,2025/1/7,,,,,,,,,,,,,,,,,,,,
23
- PoT,gsm8k,Doubao-lite-32k,79.6,92.57,8,few_shot,,1319,"1,170,038",887,"118,017",89,"1,288,055",0.0576,2025/1/7,,,,,,,,,,,,,,,,,,,,
24
- PoT,gsm8k,gpt-4o,93.1,99.77,8,few_shot,,1319,"1,101,672",835,"146,240",111,"1,247,912",4.2166,2025/1/22,,,,,,,,,,,,,,,,,,,,
25
- PoT,gsm8k,Qwen2.5-72B-Instruct,92.34,99.39,8,few_shot,,1319,"1,106,682",839,"144,528",110,"1,251,210",0.7054,2025/1/22,,,,,,,,,,,,,,,,,,,,
26
- PoT,gsm8k,Llama-3.3-70B-Instruct,73.08,79.6,8,few_shot,,1319,"1,126,025",854,"601,019",456,"1,727,044",0.9736,2025/1/22,,,,,,,,,,,,,,,,,,,,
27
- PoT,gsm8k,Qwen2.5-7B-Instruct,58.83,70.5,8,few_shot,,1319,"1,145,390",868,"217,432",165,"1,362,822",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
28
- PoT,gsm8k,Llama-3.1-8B-Instruct,38.66,55.42,8,few_shot,,1319,"1,147,538",870,"243,573",185,"1,391,111",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
29
- PoT,gsm8k,Internllm2_5-7B,38.21,48.9,8,few_shot,,1319,"1,136,843",862,"188,106",143,"1,324,949",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
30
- PoT,gsm8k,Qwen2-1.5B-Instruct,18.49,31,8,few_shot,,1319,"1,151,528",873,"175,994",133,"1,327,522",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
31
- PoT,gsm8k,Qwen2-0.5B-Instruct,9.62,16.9,8,few_shot,,1319,"1,151,528",873,"237,607",180,"1,389,135",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
32
- CoT,gsm8k,gpt-3.5-turbo,78.69,100,8,few_shot,,1319,"953,242",723,"134,799",102,"1,088,041",0.6788,2025/1/7,,,,,,,,,,,,,,,,,,,,
33
- CoT,gsm8k,Doubao-lite-32k,89.31,100,8,few_shot,,1319,"1,042,095",790,"159,725",121,"1,201,820",0.0558,2025/1/7,0.4084635 ๏ผˆๅ…ƒ๏ผ‰,,,,,,,,,,,,,,,,,,,
34
- CoT,gsm8k,gpt-4o,94.08,100,8,few_shot,,1319,"948,668",719,"216,498",164,"1,165,166",4.5367,2025/1/22,,,,,,,,,,,,,,,,,,,,
35
- CoT,gsm8k,Qwen2.5-72B-Instruct,92.87,100,8,few_shot,,1319,"1,005,119",762,"271,133",206,"1,276,252",0.7195,2025/1/22,,,,,,,,,,,,,,,,,,,,
36
- CoT,gsm8k,Llama-3.3-70B-Instruct,93.93,100,8,few_shot,,1319,"990,168",751,"228,497",173,"1,218,665",0.6870,2025/1/22,,,,,,,,,,,,,,,,,,,,
37
- CoT,gsm8k,Qwen2.5-7B-Instruct,85.67,100,8,few_shot,,1319,"1,046,008",793,"244,797",186,"1,290,805",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
38
- CoT,gsm8k,Llama-3.1-8B-Instruct,75.43,99.92,8,few_shot,,1319,"990,168",751,"258,161",196,"1,248,329",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
39
- CoT,gsm8k,Internllm2_5-7B,77.71,99.69,8,few_shot,,1319,"968,163",734,"234,000",177,"1,202,163",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
40
- CoT,gsm8k,Qwen2-1.5B-Instruct,55.49,100,8,few_shot,,1319,"1,032,818",783,"185,707",141,"1,218,525",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
41
- CoT,gsm8k,Qwen2-0.5B-Instruct,35.93,99.92,8,few_shot,,1319,"1,032,818",783,"190,641",145,"1,223,459",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
42
- SC-CoT,gsm8k,gpt-3.5-turbo,82.56,99.85,8,few_shot,"temperature=1, path_num=5",1319,"1,212,520",919,"1,348,177","1,022","2,560,697",2.6285,2025/1/7,,,,,,,,,,,,,,,,,,,,
43
- SC-CoT,gsm8k,Doubao-lite-32k,83.7,99.70,8,few_shot,"temperature=1, path_num=5",1319,"1,230,019",933,"1,277,668",969,"2,507,687",0.1550,2025/1/7,,,,,,,,,,,,,,,,,,,,
44
- SC-CoT,gsm8k,gpt-4o,90.75,100,8,few_shot,"temperature=1, path_num=5",1319,"1,168,927",886,"2,132,044","1,616","3,300,971",24.2428,2025/1/22,,,,,,,,,,,,,,,,,,,,
45
- SC-CoT,gsm8k,Qwen2.5-72B-Instruct,90.67,100,8,few_shot,"temperature=1, path_num=5",1319,"5,292,383","4,012","2,273,254","1,723","7,565,637",4.2651,2025/1/22,,,,,,,,,,,,,,,,,,,,
46
- SC-CoT,gsm8k,Llama-3.3-70B-Instruct,95.45,100,8,few_shot,"temperature=1, path_num=5",1319,"5,406,763","4,099","2,579,233","1,955","7,985,996",4.5021,2025/1/22,,,,,,,,,,,,,,,,,,,,
47
- SC-CoT,gsm8k,Qwen2.5-7B-Instruct,88.32,99.84,8,few_shot,"temperature=1, path_num=5",1319,"5,668,252","4,297","2,505,566","1,900","8,173,818",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
48
- SC-CoT,gsm8k,Llama-3.1-8B-Instruct,75.2,99.54,8,few_shot,"temperature=1, path_num=5",1319,"5,334,657","4,044","3,109,546","2,358","8,444,203",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
49
- SC-CoT,gsm8k,Internllm2_5-7B,41.39,98.25,8,few_shot,"temperature=1, path_num=5",1319,"6,674,518","5,060","3,350,339","2,540","10,024,857",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
50
- SC-CoT,gsm8k,Qwen2-1.5B-Instruct,5.53,86.73,8,few_shot,"temperature=1, path_num=5",1319,"5,844,218","4,431","3,117,550","2,364","8,961,768",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
51
- SC-CoT,gsm8k,Qwen2-0.5B-Instruct,3.79,94.84,8,few_shot,"temperature=1, path_num=5",1319,"6,529,832","4,951","4,003,983","3,036","10,533,815",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
52
- IO,AQuA,gpt-3.5-turbo,38.97,100,0,zero_shot,,254,"25,701",101,"16,770",66,"42,471",0.0380,2025/1/7,,,,,,,,,,,,,,,,,,,,
53
- IO,AQuA,Doubao-lite-32k,79.13,100,0,zero_shot,,254,"33,058",130,"54,684",215,"87,742",0.0058,2025/1/7,0.0427๏ผˆๅ…ƒ๏ผ‰,,,,,,,,,,,,,,,,,,,
54
- IO,AQuA,gpt-4o,75.59,97.24,0,zero_shot,,254,"25,631",101,"108,121",426,"133,752",1.1453,2025/1/22,,,,,,,,,,,,,,,,,,,,
55
- IO,AQuA,Qwen2.5-72B-Instruct,84.25,99.6,0,zero_shot,,254,"25,397",100,"106,207",418,"131,604",0.0742,2025/1/22,,,,,,,,,,,,,,,,,,,,
56
- IO,AQuA,Llama-3.3-70B-Instruct,82.67,99.21,0,zero_shot,,254,"32,809",129,"108,758",428,"141,567",0.0798,2025/1/22,,,,,,,,,,,,,,,,,,,,
57
- IO,AQuA,Qwen2.5-7B-Instruct,78.74,98.42,0,zero_shot,,254,"33,271",131,"104,500",411,"137,771",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
58
- IO,AQuA,Llama-3.1-8B-Instruct,51.18,98.81,0,zero_shot,,254,"26,459",104,"106,647",420,"133,106",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
59
- IO,AQuA,Internllm2_5-7B,47.63,90.94,0,zero_shot,,254,"50,232",198,"134,809",531,"185,041",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
60
- IO,AQuA,Qwen2-1.5B-Instruct,29.13,97.63,0,zero_shot,,254,"27,937",110,"43,110",170,"71,047",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
61
- IO,AQuA,Qwen2-0.5B-Instruct,27.16,98.81,0,zero_shot,,254,"27,937",110,"82,478",325,"110,415",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
62
- CoT,AQuA,gpt-3.5-turbo,61.02,93.7,0,zero_shot,,254,"25,447",100,"55,346",218,"80,793",0.0957,2025/1/22,,,,,,,,,,,,,,,,,,,,
63
- CoT,AQuA,Doubao-lite-32k,82.67,97.24,0,zero_shot,,254,"27,978",110,"66,599",262,"94,577",0.0066,2025/1/7,0.0483 ๏ผˆๅ…ƒ๏ผ‰,,,,,,,,,,,,,,,,,,,
64
- CoT,AQuA,gpt-4o,82.67,98.03,0,zero_shot,,254,"25,123",99,"97,894",385,"123,017",1.0417,2025/1/22,,,,,,,,,,,,,,,,,,,,
65
- CoT,AQuA,Qwen2.5-72B-Instruct,86.22,99.21,0,zero_shot,,254,"25,143",99,"118,146",465,"143,289",0.0808,2025/1/22,,,,,,,,,,,,,,,,,,,,
66
- CoT,AQuA,Llama-3.3-70B-Instruct,83.46,98.42,0,zero_shot,,254,"32,555",128,"131,834",519,"164,389",0.0927,2025/1/22,,,,,,,,,,,,,,,,,,,,
67
- CoT,AQuA,Qwen2.5-7B-Instruct,80.7,99.6,0,zero_shot,,254,"33,017",130,"116,719",460,"149,736",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
68
- CoT,AQuA,Llama-3.1-8B-Instruct,60.62,100,0,zero_shot,,254,"32,555",128,"111,880",440,"144,435",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
69
- CoT,AQuA,Internllm2_5-7B,52.75,89.37,0,zero_shot,,254,"26,610",105,"100,910",397,"127,520",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
70
- CoT,AQuA,Qwen2-1.5B-Instruct,40.55,98.81,0,zero_shot,,254,"30,477",120,"79,563",313,"110,040",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
71
- CoT,AQuA,Qwen2-0.5B-Instruct,33.07,98.81,0,zero_shot,,254,"30,477",120,"86,862",342,"117,339",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
72
- PoT,AQuA,gpt-3.5-turbo,59.44,100,0,zero_shot,,254,"225,162",886,"41,492",163,"266,654",0.1748,2025/1/7,,,,,,,,,,,,,,,,,,,,
73
- PoT,AQuA,gpt-4o,75.19,100,0,zero_shot,,254,"222,717",877,"105,191",414,"327,908",1.6087,2025/1/22,,,,,,,,,,,,,,,,,,,,
74
- PoT,AQuA,Doubao-lite-32k,71.65,96.85,0,zero_shot,,254,"259,863","1,023","49,573",195,"309,436",0.0147,2025/1/7,,,,,,,,,,,,,,,,,,,,
75
- PoT,AQuA,Qwen2.5-72B-Instruct,75.19,100,0,zero_shot,,254,"249,215",981,"42,549",168,"291,764",0.1645,2025/1/22,,,,,,,,,,,,,,,,,,,,
76
- PoT,AQuA,Llama-3.3-70B-Instruct,79.52,99.21,0,zero_shot,,254,"240,735",948,"69,064",272,"309,799",0.1746,2025/1/22,,,,,,,,,,,,,,,,,,,,
77
- PoT,AQuA,Qwen2.5-7B-Instruct,68.11,100,0,zero_shot,,254,"264,517","1,041","49,211",194,"313,728",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
78
- PoT,AQuA,Llama-3.1-8B-Instruct,36.61,96.85,0,zero_shot,,254,"240,613",947,"50,301",198,"290,914",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
79
- PoT,AQuA,Internllm2_5-7B,36.61,98.81,0,zero_shot,,254,"233,505",919,"68,457",270,"301,962",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
80
- PoT,AQuA,Qwen2-1.5B-Instruct,30.7,96.45,0,zero_shot,,254,"246,560",971,"51,915",204,"298,475",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
81
- PoT,AQuA,Qwen2-0.5B-Instruct,17.32,92.12,0,zero_shot,,254,"258,867","1,019","63,414",250,"322,281",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
82
- SC-CoT,AQuA,gpt-3.5-turbo,70.47,98.82,0,zero_shot,"temperature=1, path_num=5",254,"70,157",276,"348,460","1,372","418,617",0.5578,2025/1/7,,,,,,,,,,,,,,,,,,,,
83
- SC-CoT,AQuA,Doubao-lite-32k,81.5,97.64,0,zero_shot,"temperature=1, path_num=5",254,"83,830",330,"382,016","1,504","465,846",0.0347,2025/1/7,,,,,,,,,,,,,,,,,,,,
84
- SC-CoT,AQuA,gpt-4o,88.19,100,0,zero_shot,"temperature=1, path_num=5",254,"72,916",287,"605,895","2,385","678,811",6.2412,2025/1/22,,,,,,,,,,,,,,,,,,,,
85
- SC-CoT,AQuA,Qwen2.5-72B-Instruct,85.82,98.42,0,zero_shot,"temperature=1, path_num=5",254,"241,149",949,"747,909","2,945","989,058",0.5576,2025/1/22,,,,,,,,,,,,,,,,,,,,
86
- SC-CoT,AQuA,Llama-3.3-70B-Instruct,86.61,99.21,0,zero_shot,"temperature=1, path_num=5",254,"283,248","1,115","753,876","2,968","1,037,124",0.5847,2025/1/22,,,,,,,,,,,,,,,,,,,,
87
- SC-CoT,AQuA,Qwen2.5-7B-Instruct,81.49,100,0,zero_shot,"temperature=1, path_num=5",254,"278,848","1,098","736,520","2,900","1,015,368",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
88
- SC-CoT,AQuA,Llama-3.1-8B-Instruct,53.14,96.06,0,zero_shot,"temperature=1, path_num=5",254,"372,968","1,468","668,378","2,631","1,041,346",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
89
- SC-CoT,AQuA,Internllm2_5-7B,35.85,98.8,0,zero_shot,"temperature=1, path_num=5",254,"530,701","2,089","709,687","2,794","1,240,388",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
90
- SC-CoT,AQuA,Qwen2-1.5B-Instruct,30.31,97.24,0,zero_shot,"temperature=1, path_num=5",254,"430,703","1,696","726,373","2,860","1,157,076",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
91
- SC-CoT,AQuA,Qwen2-0.5B-Instruct,30.7,98.42,0,zero_shot,"temperature=1, path_num=5",254,"496,206","1,954","729,333","2,871","1,225,539",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
92
- ReAct-Pro*,AQuA,gpt-3.5-turbo,64.56,98.03,0,zero_shot,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,2025/1/7,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
93
- ReAct-Pro*,AQuA,Doubao-lite-32k,77.55,96.06,0,zero_shot,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,2025/1/7,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
94
- ReAct-Pro*,AQuA,gpt-4o,57.48,97.24,0,zero_shot,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,2025/1/22,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
95
- ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,73.22,100,0,zero_shot,max_steps=10,254,"441,765","1,739","121,838",480,"563,603",0.3177,2025/1/22,,,,,,,,,,,,,,,,,,,,
96
- ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,79.13,99.6,0,zero_shot,max_steps=10,254,"1,119,143","4,406","243,236",958,"1,362,379",0.7680,2025/1/22,,,,,,,,,,,,,,,,,,,,
97
- ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,74.4,99.21,0,zero_shot,max_steps=10,254,"564,165","2,221","131,679",518,"695,844",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
98
- ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,55.51,96.85,0,zero_shot,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
99
- ReAct-Pro*,AQuA,Internllm2_5-7B,40.94,96.85,0,zero_shot,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
100
- ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,25.59,96.06,0,zero_shot,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
101
- ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,24.01,96.85,0,zero_shot,max_steps=10,254,6344167,"24,977",825920,"3,252","7,170,087",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
102
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
104
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
105
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
106
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
107
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
108
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
109
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
110
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
111
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
112
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
113
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
114
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
115
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
116
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
117
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
118
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
120
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
121
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
122
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
123
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
124
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
125
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
126
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
127
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
128
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
129
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
130
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
131
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
132
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
133
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
134
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
135
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
136
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
137
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
138
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
139
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
140
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
141
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
142
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
143
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
144
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
145
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
146
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
147
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
148
- ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 
1
+ ๏ปฟAlgorithm,Dataset,Eval Date,LLM,Score,Pass rate,X-shot,Parameters,Samples,Total input tokens,Average input tokens,Total output tokens,Average output tokens,All tokens,Cost($),Note,,,,,,,,,,,,,,,,,,,
2
+ IO,gsm8k,2025/1/7,gpt-3.5-turbo,37.83,99.92,8,,1319,"546,990",415,"39,563",30,"586,553",0.3328,,,,,,,,,,,,,,,,,,,,
3
+ IO,gsm8k,2025/1/7,Doubao-lite-32k,72.02,99.92,8,,1319,"617,377",468,"123,106",93,"740,483",0.0354,0.2590 ๏ผˆๅ…ƒ๏ผ‰,,,,,,,,,,,,,,,,,,,
4
+ IO,gsm8k,2025/1/22,gpt-4o,88.4,100,8,,1319,"542,416",411,"199,030",151,"741,446",3.3463,,,,,,,,,,,,,,,,,,,,
5
+ IO,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,86.58,100,8,,1319,"555,340",421,"313,720",238,"869,060",0.4899,,,,,,,,,,,,,,,,,,,,
6
+ IO,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,92.27,100,8,,1319,"583,916",443,"251,359",191,"835,275",0.4709,,,,,,,,,,,,,,,,,,,,
7
+ IO,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,57.24,100,8,,1319,"596,229",452,"291,684",221,"887,913",0.0000,,,,,,,,,,,,,,,,,,,,
8
+ IO,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,57.16,99.55,8,,1319,"550,941",418,"1,194,488",906,"1,745,429",0.0000,,,,,,,,,,,,,,,,,,,,
9
+ IO,gsm8k,2025/1/22,Internllm2_5-7B,11.6,97.95,8,,1319,"679,302",515,"434,426",329,"1,113,728",0.0000,,,,,,,,,,,,,,,,,,,,
10
+ IO,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,16.68,100,8,,1319,"568,530",431,"168,466",128,"736,996",0.0000,,,,,,,,,,,,,,,,,,,,
11
+ IO,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,14.71,100,8,,1319,"568,116",431,"266,781",202,"834,897",0.0000,,,,,,,,,,,,,,,,,,,,
12
+ ReAct-Pro*,gsm8k,2025/1/7,gpt-3.5-turbo,74.91,99.39,8,max_steps=10,1319,"6,506,164","4,933","140,122",106,"6,646,286",3.4633,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
13
+ ReAct-Pro*,gsm8k,2025/1/7,Doubao-lite-32k,85.6,99.62,8,max_steps=10,1319,"5,862,016","4,444","136,623",104,"5,998,639",0.2512,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
14
+ ReAct-Pro*,gsm8k,2025/1/22,gpt-4o,63.31,99.55,8,max_steps=10,1319,"14,411,173","10,926","304,714",231,"14,715,887",39.0751,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
15
+ ReAct-Pro*,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,87.26,100,8,max_steps=10,1319,"18,160,983","13,769","549,454",417,"18,710,437",10.5479,,,,,,,,,,,,,,,,,,,,
16
+ ReAct-Pro*,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,87.64,99.92,8,max_steps=10,1319,"17,038,928","12,918","898,936",682,"17,937,864",10.1124,,,,,,,,,,,,,,,,,,,,
17
+ ReAct-Pro*,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,82.87,100,8,max_steps=10,1319,"14,355,752","10,884","495,162",375,"14,850,914",0.0000,,,,,,,,,,,,,,,,,,,,
18
+ ReAct-Pro*,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,67.78,98.56,8,max_steps=10,1319,"21,044,978","15,955","1,790,789","1,358","22,835,767",0.0000,,,,,,,,,,,,,,,,,,,,
19
+ ReAct-Pro*,gsm8k,2025/1/22,Internllm2_5-7B,33.51,97.95,8,max_steps=10,1319,"30,120,070","22,836","5,549,919","4,208","35,669,989",0.0000,,,,,,,,,,,,,,,,,,,,
20
+ ReAct-Pro*,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,24.87,80.21,8,max_steps=10,1319,"9,133,603","6,925","694,398",526,"9,828,001",0.0000,,,,,,,,,,,,,,,,,,,,
21
+ ReAct-Pro*,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,7.66,95.22,8,max_steps=10,1319,"52,431,343","39,751","2,961,268","2,245","55,392,611",0.0000,,,,,,,,,,,,,,,,,,,,
22
+ PoT,gsm8k,2025/1/7,gpt-3.5-turbo,76.88,99.24,8,,1319,"1,090,418",827,"96,662",73,"1,187,080",0.6902,,,,,,,,,,,,,,,,,,,,
23
+ PoT,gsm8k,2025/1/7,Doubao-lite-32k,79.61,92.57,8,,1319,"1,170,038",887,"118,017",89,"1,288,055",0.0576,,,,,,,,,,,,,,,,,,,,
24
+ PoT,gsm8k,2025/1/22,gpt-4o,93.1,99.77,8,,1319,"1,101,672",835,"146,240",111,"1,247,912",4.2166,,,,,,,,,,,,,,,,,,,,
25
+ PoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,92.34,99.39,8,,1319,"1,106,682",839,"144,528",110,"1,251,210",0.7054,,,,,,,,,,,,,,,,,,,,
26
+ PoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,73.09,79.61,8,,1319,"1,126,025",854,"601,019",456,"1,727,044",0.9736,,,,,,,,,,,,,,,,,,,,
27
+ PoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,58.83,70.51,8,,1319,"1,145,390",868,"217,432",165,"1,362,822",0.0000,,,,,,,,,,,,,,,,,,,,
28
+ PoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,38.67,55.42,8,,1319,"1,147,538",870,"243,573",185,"1,391,111",0.0000,,,,,,,,,,,,,,,,,,,,
29
+ PoT,gsm8k,2025/1/22,Internllm2_5-7B,38.21,48.9,8,,1319,"1,136,843",862,"188,106",143,"1,324,949",0.0000,,,,,,,,,,,,,,,,,,,,
30
+ PoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,18.5,31.01,8,,1319,"1,151,528",873,"175,994",133,"1,327,522",0.0000,,,,,,,,,,,,,,,,,,,,
31
+ PoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,9.62,16.9,8,,1319,"1,151,528",873,"237,607",180,"1,389,135",0.0000,,,,,,,,,,,,,,,,,,,,
32
+ CoT,gsm8k,2025/1/7,gpt-3.5-turbo,78.7,100,8,,1319,"953,242",723,"134,799",102,"1,088,041",0.6788,,,,,,,,,,,,,,,,,,,,
33
+ CoT,gsm8k,2025/1/7,Doubao-lite-32k,89.31,100,8,,1319,"1,042,095",790,"159,725",121,"1,201,820",0.0558,0.4084635 ๏ผˆๅ…ƒ๏ผ‰,,,,,,,,,,,,,,,,,,,
34
+ CoT,gsm8k,2025/1/22,gpt-4o,94.09,100,8,,1319,"948,668",719,"216,498",164,"1,165,166",4.5367,,,,,,,,,,,,,,,,,,,,
35
+ CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,92.87,100,8,,1319,"1,005,119",762,"271,133",206,"1,276,252",0.7195,,,,,,,,,,,,,,,,,,,,
36
+ CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,93.93,100,8,,1319,"990,168",751,"228,497",173,"1,218,665",0.6870,,,,,,,,,,,,,,,,,,,,
37
+ CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,85.67,100,8,,1319,"1,046,008",793,"244,797",186,"1,290,805",0.0000,,,,,,,,,,,,,,,,,,,,
38
+ CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,75.44,99.92,8,,1319,"990,168",751,"258,161",196,"1,248,329",0.0000,,,,,,,,,,,,,,,,,,,,
39
+ CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.7,8,,1319,"968,163",734,"234,000",177,"1,202,163",0.0000,,,,,,,,,,,,,,,,,,,,
40
+ CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.5,100,8,,1319,"1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,
41
+ CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,1319,"1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,
42
+ SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,82.56,99.85,8,"temperature=1, path_num=5",1319,"1,212,520",919,"1,348,177","1,022","2,560,697",2.6285,,,,,,,,,,,,,,,,,,,,
43
+ SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,83.7,99.7,8,"temperature=1, path_num=5",1319,"1,230,019",933,"1,277,668",969,"2,507,687",0.1550,,,,,,,,,,,,,,,,,,,,
44
+ SC-CoT,gsm8k,2025/1/22,gpt-4o,90.75,100,8,"temperature=1, path_num=5",1319,"1,168,927",886,"2,132,044","1,616","3,300,971",24.2428,,,,,,,,,,,,,,,,,,,,
45
+ SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,90.67,100,8,"temperature=1, path_num=5",1319,"5,292,383","4,012","2,273,254","1,723","7,565,637",4.2651,,,,,,,,,,,,,,,,,,,,
46
+ SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.45,100,8,"temperature=1, path_num=5",1319,"5,406,763","4,099","2,579,233","1,955","7,985,996",4.5021,,,,,,,,,,,,,,,,,,,,
47
+ SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,88.32,99.85,8,"temperature=1, path_num=5",1319,"5,668,252","4,297","2,505,566","1,900","8,173,818",0.0000,,,,,,,,,,,,,,,,,,,,
48
+ SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,75.21,99.55,8,"temperature=1, path_num=5",1319,"5,334,657","4,044","3,109,546","2,358","8,444,203",0.0000,,,,,,,,,,,,,,,,,,,,
49
+ SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,41.39,98.26,8,"temperature=1, path_num=5",1319,"6,674,518","5,060","3,350,339","2,540","10,024,857",0.0000,,,,,,,,,,,,,,,,,,,,
50
+ SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,5.53,86.73,8,"temperature=1, path_num=5",1319,"5,844,218","4,431","3,117,550","2,364","8,961,768",0.0000,,,,,,,,,,,,,,,,,,,,
51
+ SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,3.79,94.84,8,"temperature=1, path_num=5",1319,"6,529,832","4,951","4,003,983","3,036","10,533,815",0.0000,,,,,,,,,,,,,,,,,,,,
52
+ IO,AQuA,2025/1/7,gpt-3.5-turbo,38.98,100,0,,254,"25,701",101,"16,770",66,"42,471",0.0380,,,,,,,,,,,,,,,,,,,,
53
+ IO,AQuA,2025/1/7,Doubao-lite-32k,79.13,100,0,,254,"33,058",130,"54,684",215,"87,742",0.0058,0.0427๏ผˆๅ…ƒ๏ผ‰,,,,,,,,,,,,,,,,,,,
54
+ IO,AQuA,2025/1/22,gpt-4o,75.59,97.24,0,,254,"25,631",101,"108,121",426,"133,752",1.1453,,,,,,,,,,,,,,,,,,,,
55
+ IO,AQuA,2025/1/22,Qwen2.5-72B-Instruct,84.25,99.61,0,,254,"25,397",100,"106,207",418,"131,604",0.0742,,,,,,,,,,,,,,,,,,,,
56
+ IO,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.68,99.21,0,,254,"32,809",129,"108,758",428,"141,567",0.0798,,,,,,,,,,,,,,,,,,,,
57
+ IO,AQuA,2025/1/22,Qwen2.5-7B-Instruct,78.74,98.43,0,,254,"33,271",131,"104,500",411,"137,771",0.0000,,,,,,,,,,,,,,,,,,,,
58
+ IO,AQuA,2025/1/22,Llama-3.1-8B-Instruct,51.18,98.82,0,,254,"26,459",104,"106,647",420,"133,106",0.0000,,,,,,,,,,,,,,,,,,,,
59
+ IO,AQuA,2025/1/22,Internllm2_5-7B,47.64,90.94,0,,254,"50,232",198,"134,809",531,"185,041",0.0000,,,,,,,,,,,,,,,,,,,,
60
+ IO,AQuA,2025/1/22,Qwen2-1.5B-Instruct,29.13,97.64,0,,254,"27,937",110,"43,110",170,"71,047",0.0000,,,,,,,,,,,,,,,,,,,,
61
+ IO,AQuA,2025/1/22,Qwen2-0.5B-Instruct,27.17,98.82,0,,254,"27,937",110,"82,478",325,"110,415",0.0000,,,,,,,,,,,,,,,,,,,,
62
+ CoT,AQuA,2025/1/22,gpt-3.5-turbo,61.02,93.7,0,,254,"25,447",100,"55,346",218,"80,793",0.0957,,,,,,,,,,,,,,,,,,,,
63
+ CoT,AQuA,2025/1/7,Doubao-lite-32k,82.68,97.24,0,,254,"27,978",110,"66,599",262,"94,577",0.0066,0.0483 ๏ผˆๅ…ƒ๏ผ‰,,,,,,,,,,,,,,,,,,,
64
+ CoT,AQuA,2025/1/22,gpt-4o,82.68,98.03,0,,254,"25,123",99,"97,894",385,"123,017",1.0417,,,,,,,,,,,,,,,,,,,,
65
+ CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,86.22,99.21,0,,254,"25,143",99,"118,146",465,"143,289",0.0808,,,,,,,,,,,,,,,,,,,,
66
+ CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,83.46,98.43,0,,254,"32,555",128,"131,834",519,"164,389",0.0927,,,,,,,,,,,,,,,,,,,,
67
+ CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,80.71,99.61,0,,254,"33,017",130,"116,719",460,"149,736",0.0000,,,,,,,,,,,,,,,,,,,,
68
+ CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,60.63,100,0,,254,"32,555",128,"111,880",440,"144,435",0.0000,,,,,,,,,,,,,,,,,,,,
69
+ CoT,AQuA,2025/1/22,Internllm2_5-7B,52.76,89.37,0,,254,"26,610",105,"100,910",397,"127,520",0.0000,,,,,,,,,,,,,,,,,,,,
70
+ CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,40.55,98.82,0,,254,"30,477",120,"79,563",313,"110,040",0.0000,,,,,,,,,,,,,,,,,,,,
71
+ CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,33.07,98.82,0,,254,"30,477",120,"86,862",342,"117,339",0.0000,,,,,,,,,,,,,,,,,,,,
72
+ PoT,AQuA,2025/1/7,gpt-3.5-turbo,59.45,100,0,,254,"225,162",886,"41,492",163,"266,654",0.1748,,,,,,,,,,,,,,,,,,,,
73
+ PoT,AQuA,2025/1/7,Doubao-lite-32k,71.65,96.85,0,,254,"259,863","1,023","49,573",195,"309,436",0.0147,,,,,,,,,,,,,,,,,,,,
74
+ PoT,AQuA,2025/1/22,gpt-4o,75.2,100,0,,254,"222,717",877,"105,191",414,"327,908",1.6087,,,,,,,,,,,,,,,,,,,,
75
+ PoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,75.2,100,0,,254,"249,215",981,"42,549",168,"291,764",0.1645,,,,,,,,,,,,,,,,,,,,
76
+ PoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,79.53,99.21,0,,254,"240,735",948,"69,064",272,"309,799",0.1746,,,,,,,,,,,,,,,,,,,,
77
+ PoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,68.11,100,0,,254,"264,517","1,041","49,211",194,"313,728",0.0000,,,,,,,,,,,,,,,,,,,,
78
+ PoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,36.61,96.85,0,,254,"240,613",947,"50,301",198,"290,914",0.0000,,,,,,,,,,,,,,,,,,,,
79
+ PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270,"301,962",0.0000,,,,,,,,,,,,,,,,,,,,
80
+ PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,
81
+ PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,
82
+ SC-CoT,AQuA,2025/1/7,gpt-3.5-turbo,70.47,98.82,0,"temperature=1, path_num=5",254,"70,157",276,"348,460","1,372","418,617",0.5578,,,,,,,,,,,,,,,,,,,,
83
+ SC-CoT,AQuA,2025/1/7,Doubao-lite-32k,81.5,97.64,0,"temperature=1, path_num=5",254,"83,830",330,"382,016","1,504","465,846",0.0347,,,,,,,,,,,,,,,,,,,,
84
+ SC-CoT,AQuA,2025/1/22,gpt-4o,88.19,100,0,"temperature=1, path_num=5",254,"72,916",287,"605,895","2,385","678,811",6.2412,,,,,,,,,,,,,,,,,,,,
85
+ SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.83,98.43,0,"temperature=1, path_num=5",254,"241,149",949,"747,909","2,945","989,058",0.5576,,,,,,,,,,,,,,,,,,,,
86
+ SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,86.61,99.21,0,"temperature=1, path_num=5",254,"283,248","1,115","753,876","2,968","1,037,124",0.5847,,,,,,,,,,,,,,,,,,,,
87
+ SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,81.5,100,0,"temperature=1, path_num=5",254,"278,848","1,098","736,520","2,900","1,015,368",0.0000,,,,,,,,,,,,,,,,,,,,
88
+ SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,53.15,96.06,0,"temperature=1, path_num=5",254,"372,968","1,468","668,378","2,631","1,041,346",0.0000,,,,,,,,,,,,,,,,,,,,
89
+ SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,35.85,98.8,0,"temperature=1, path_num=5",254,"530,701","2,089","709,687","2,794","1,240,388",0.0000,,,,,,,,,,,,,,,,,,,,
90
+ SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.31,97.24,0,"temperature=1, path_num=5",254,"430,703","1,696","726,373","2,860","1,157,076",0.0000,,,,,,,,,,,,,,,,,,,,
91
+ SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,30.71,98.43,0,"temperature=1, path_num=5",254,"496,206","1,954","729,333","2,871","1,225,539",0.0000,,,,,,,,,,,,,,,,,,,,
92
+ ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
93
+ ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
94
+ ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action ๅ•็‹ฌ่ฟ”ๅ›ž,prompt v1",,,,,,,,,,,,,,,,,,,
95
+ ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-72B-Instruct,73.23,100,0,max_steps=10,254,"441,765","1,739","121,838",480,"563,603",0.3177,,,,,,,,,,,,,,,,,,,,
96
+ ReAct-Pro*,AQuA,2025/1/22,Llama-3.3-70B-Instruct,79.13,99.61,0,max_steps=10,254,"1,119,143","4,406","243,236",958,"1,362,379",0.7680,,,,,,,,,,,,,,,,,,,,
97
+ ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-7B-Instruct,74.41,99.21,0,max_steps=10,254,"564,165","2,221","131,679",518,"695,844",0.0000,,,,,,,,,,,,,,,,,,,,
98
+ ReAct-Pro*,AQuA,2025/1/22,Llama-3.1-8B-Instruct,55.51,96.85,0,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,,,,,,,,,,,,,,,,,,,,
99
+ ReAct-Pro*,AQuA,2025/1/22,Internllm2_5-7B,40.94,96.85,0,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,,,,,,,,,,,,,,,,,,,,
100
+ ReAct-Pro*,AQuA,2025/1/22,Qwen2-1.5B-Instruct,25.59,96.06,0,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,,,,,,,,,,,,,,,,,,,,
101
+ ReAct-Pro*,AQuA,2025/1/22,Qwen2-0.5B-Instruct,24.02,96.85,0,max_steps=10,254,6344167,"24,977",825920,"3,252","7,170,087",0.0000,,,,,,,,,,,,,,,,,,,,
102
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
104
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
105
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
106
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
107
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
108
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
109
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
110
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
111
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
112
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
113
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
114
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
115
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
116
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
117
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
118
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
120
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
121
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
122
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
123
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
124
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
125
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
126
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
127
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
128
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
129
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
130
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
131
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
132
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
133
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
134
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
135
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
136
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
137
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
138
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
139
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
140
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
141
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
142
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
143
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
144
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
145
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
146
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
147
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
148
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,