Spaces:
Running
Running
liaojiajia
committed on
Commit
·
52f14c3
1
Parent(s):
c9a97c2
update score
Browse files
- src/detail_math_score.json +73 -73
- src/detail_results.csv +62 -62
- src/overall_math_score.json +46 -46
- src/overall_results.csv +32 -32
- src/record.csv +148 -148
src/detail_math_score.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"time": "2025-01-23
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"gpt-3.5-turbo": {
|
@@ -22,7 +22,7 @@
|
|
22 |
"Average output tokens": 30
|
23 |
},
|
24 |
"AQuA": {
|
25 |
-
"Score": 38.
|
26 |
"Pass rate": 1.0,
|
27 |
"Cost($)": 0.038,
|
28 |
"Framework": "",
|
@@ -122,7 +122,7 @@
|
|
122 |
},
|
123 |
"AQuA": {
|
124 |
"Score": 84.25,
|
125 |
-
"Pass rate": 0.
|
126 |
"Cost($)": 0.0742,
|
127 |
"Framework": "",
|
128 |
"X-shot": "0.0",
|
@@ -141,7 +141,7 @@
|
|
141 |
"Eval Date": "2025/1/22"
|
142 |
},
|
143 |
"gsm8k": {
|
144 |
-
"Score": 92.
|
145 |
"Pass rate": 1.0,
|
146 |
"Cost($)": 0.4709,
|
147 |
"Framework": "",
|
@@ -154,7 +154,7 @@
|
|
154 |
"Average output tokens": 191
|
155 |
},
|
156 |
"AQuA": {
|
157 |
-
"Score": 82.
|
158 |
"Pass rate": 0.9921,
|
159 |
"Cost($)": 0.0798,
|
160 |
"Framework": "",
|
@@ -188,7 +188,7 @@
|
|
188 |
},
|
189 |
"AQuA": {
|
190 |
"Score": 78.74,
|
191 |
-
"Pass rate": 0.
|
192 |
"Cost($)": 0.0,
|
193 |
"Framework": "",
|
194 |
"X-shot": "0.0",
|
@@ -208,7 +208,7 @@
|
|
208 |
},
|
209 |
"gsm8k": {
|
210 |
"Score": 57.16,
|
211 |
-
"Pass rate": 0.
|
212 |
"Cost($)": 0.0,
|
213 |
"Framework": "",
|
214 |
"X-shot": "8.0",
|
@@ -221,7 +221,7 @@
|
|
221 |
},
|
222 |
"AQuA": {
|
223 |
"Score": 51.18,
|
224 |
-
"Pass rate": 0.
|
225 |
"Cost($)": 0.0,
|
226 |
"Framework": "",
|
227 |
"X-shot": "0.0",
|
@@ -240,7 +240,7 @@
|
|
240 |
"Eval Date": "2025/1/22"
|
241 |
},
|
242 |
"gsm8k": {
|
243 |
-
"Score": 11.
|
244 |
"Pass rate": 0.9795,
|
245 |
"Cost($)": 0.0,
|
246 |
"Framework": "",
|
@@ -253,7 +253,7 @@
|
|
253 |
"Average output tokens": 329
|
254 |
},
|
255 |
"AQuA": {
|
256 |
-
"Score": 47.
|
257 |
"Pass rate": 0.9094,
|
258 |
"Cost($)": 0.0,
|
259 |
"Framework": "",
|
@@ -273,7 +273,7 @@
|
|
273 |
"Eval Date": "2025/1/22"
|
274 |
},
|
275 |
"gsm8k": {
|
276 |
-
"Score": 16.
|
277 |
"Pass rate": 1.0,
|
278 |
"Cost($)": 0.0,
|
279 |
"Framework": "",
|
@@ -287,7 +287,7 @@
|
|
287 |
},
|
288 |
"AQuA": {
|
289 |
"Score": 29.13,
|
290 |
-
"Pass rate": 0.
|
291 |
"Cost($)": 0.0,
|
292 |
"Framework": "",
|
293 |
"X-shot": "0.0",
|
@@ -306,7 +306,7 @@
|
|
306 |
"Eval Date": "2025/1/22"
|
307 |
},
|
308 |
"gsm8k": {
|
309 |
-
"Score": 14.
|
310 |
"Pass rate": 1.0,
|
311 |
"Cost($)": 0.0,
|
312 |
"Framework": "",
|
@@ -319,8 +319,8 @@
|
|
319 |
"Average output tokens": 202
|
320 |
},
|
321 |
"AQuA": {
|
322 |
-
"Score": 27.
|
323 |
-
"Pass rate": 0.
|
324 |
"Cost($)": 0.0,
|
325 |
"Framework": "",
|
326 |
"X-shot": "0.0",
|
@@ -341,7 +341,7 @@
|
|
341 |
"Eval Date": "2025/1/7"
|
342 |
},
|
343 |
"gsm8k": {
|
344 |
-
"Score": 74.
|
345 |
"Pass rate": 0.9939,
|
346 |
"Cost($)": 3.4633,
|
347 |
"Framework": "",
|
@@ -354,7 +354,7 @@
|
|
354 |
"Average output tokens": 106
|
355 |
},
|
356 |
"AQuA": {
|
357 |
-
"Score": 64.
|
358 |
"Pass rate": 0.9803,
|
359 |
"Cost($)": 0.4928,
|
360 |
"Framework": "",
|
@@ -374,7 +374,7 @@
|
|
374 |
"Eval Date": "2025/1/7"
|
375 |
},
|
376 |
"gsm8k": {
|
377 |
-
"Score": 85.
|
378 |
"Pass rate": 0.9962,
|
379 |
"Cost($)": 0.2512,
|
380 |
"Framework": "",
|
@@ -387,7 +387,7 @@
|
|
387 |
"Average output tokens": 104
|
388 |
},
|
389 |
"AQuA": {
|
390 |
-
"Score": 77.
|
391 |
"Pass rate": 0.9606,
|
392 |
"Cost($)": 0.0445,
|
393 |
"Framework": "",
|
@@ -407,8 +407,8 @@
|
|
407 |
"Eval Date": "2025/1/22"
|
408 |
},
|
409 |
"gsm8k": {
|
410 |
-
"Score": 63.
|
411 |
-
"Pass rate": 0.
|
412 |
"Cost($)": 39.0751,
|
413 |
"Framework": "",
|
414 |
"X-shot": "8.0",
|
@@ -453,7 +453,7 @@
|
|
453 |
"Average output tokens": 417
|
454 |
},
|
455 |
"AQuA": {
|
456 |
-
"Score": 73.
|
457 |
"Pass rate": 1.0,
|
458 |
"Cost($)": 0.3177,
|
459 |
"Framework": "",
|
@@ -477,7 +477,7 @@
|
|
477 |
"Pass rate": 0.9992,
|
478 |
"Cost($)": 10.1124,
|
479 |
"Framework": "",
|
480 |
-
"X-shot": "",
|
481 |
"Samples": 1319,
|
482 |
"All tokens": 17937864,
|
483 |
"Total input tokens": 17038928,
|
@@ -487,7 +487,7 @@
|
|
487 |
},
|
488 |
"AQuA": {
|
489 |
"Score": 79.13,
|
490 |
-
"Pass rate": 0.
|
491 |
"Cost($)": 0.768,
|
492 |
"Framework": "",
|
493 |
"X-shot": "0.0",
|
@@ -506,7 +506,7 @@
|
|
506 |
"Eval Date": "2025/1/22"
|
507 |
},
|
508 |
"gsm8k": {
|
509 |
-
"Score": 82.
|
510 |
"Pass rate": 1.0,
|
511 |
"Cost($)": 0.0,
|
512 |
"Framework": "",
|
@@ -519,7 +519,7 @@
|
|
519 |
"Average output tokens": 375
|
520 |
},
|
521 |
"AQuA": {
|
522 |
-
"Score": 74.
|
523 |
"Pass rate": 0.9921,
|
524 |
"Cost($)": 0.0,
|
525 |
"Framework": "",
|
@@ -539,8 +539,8 @@
|
|
539 |
"Eval Date": "2025/1/22"
|
540 |
},
|
541 |
"gsm8k": {
|
542 |
-
"Score": 67.
|
543 |
-
"Pass rate": 0.
|
544 |
"Cost($)": 0.0,
|
545 |
"Framework": "",
|
546 |
"X-shot": "8.0",
|
@@ -576,7 +576,7 @@
|
|
576 |
"Pass rate": 0.9795,
|
577 |
"Cost($)": 0.0,
|
578 |
"Framework": "",
|
579 |
-
"X-shot": "",
|
580 |
"Samples": 1319,
|
581 |
"All tokens": 35669989,
|
582 |
"Total input tokens": 30120070,
|
@@ -605,7 +605,7 @@
|
|
605 |
"Eval Date": "2025/1/22"
|
606 |
},
|
607 |
"gsm8k": {
|
608 |
-
"Score": 24.
|
609 |
"Pass rate": 0.8021,
|
610 |
"Cost($)": 0.0,
|
611 |
"Framework": "",
|
@@ -638,7 +638,7 @@
|
|
638 |
"Eval Date": "2025/1/22"
|
639 |
},
|
640 |
"gsm8k": {
|
641 |
-
"Score": 7.
|
642 |
"Pass rate": 0.9522,
|
643 |
"Cost($)": 0.0,
|
644 |
"Framework": "",
|
@@ -651,7 +651,7 @@
|
|
651 |
"Average output tokens": 2245
|
652 |
},
|
653 |
"AQuA": {
|
654 |
-
"Score": 24.
|
655 |
"Pass rate": 0.9685,
|
656 |
"Cost($)": 0.0,
|
657 |
"Framework": "",
|
@@ -673,7 +673,7 @@
|
|
673 |
"Eval Date": "2025/1/7"
|
674 |
},
|
675 |
"gsm8k": {
|
676 |
-
"Score": 76.
|
677 |
"Pass rate": 0.9924,
|
678 |
"Cost($)": 0.6902,
|
679 |
"Framework": "",
|
@@ -686,7 +686,7 @@
|
|
686 |
"Average output tokens": 73
|
687 |
},
|
688 |
"AQuA": {
|
689 |
-
"Score": 59.
|
690 |
"Pass rate": 1.0,
|
691 |
"Cost($)": 0.1748,
|
692 |
"Framework": "",
|
@@ -706,7 +706,7 @@
|
|
706 |
"Eval Date": "2025/1/7"
|
707 |
},
|
708 |
"gsm8k": {
|
709 |
-
"Score": 79.
|
710 |
"Pass rate": 0.9257,
|
711 |
"Cost($)": 0.0576,
|
712 |
"Framework": "",
|
@@ -752,7 +752,7 @@
|
|
752 |
"Average output tokens": 111
|
753 |
},
|
754 |
"AQuA": {
|
755 |
-
"Score": 75.
|
756 |
"Pass rate": 1.0,
|
757 |
"Cost($)": 1.6087,
|
758 |
"Framework": "",
|
@@ -785,7 +785,7 @@
|
|
785 |
"Average output tokens": 110
|
786 |
},
|
787 |
"AQuA": {
|
788 |
-
"Score": 75.
|
789 |
"Pass rate": 1.0,
|
790 |
"Cost($)": 0.1645,
|
791 |
"Framework": "",
|
@@ -805,8 +805,8 @@
|
|
805 |
"Eval Date": "2025/1/22"
|
806 |
},
|
807 |
"gsm8k": {
|
808 |
-
"Score": 73.
|
809 |
-
"Pass rate": 0.
|
810 |
"Cost($)": 0.9736,
|
811 |
"Framework": "",
|
812 |
"X-shot": "8.0",
|
@@ -818,7 +818,7 @@
|
|
818 |
"Average output tokens": 456
|
819 |
},
|
820 |
"AQuA": {
|
821 |
-
"Score": 79.
|
822 |
"Pass rate": 0.9921,
|
823 |
"Cost($)": 0.1746,
|
824 |
"Framework": "",
|
@@ -839,7 +839,7 @@
|
|
839 |
},
|
840 |
"gsm8k": {
|
841 |
"Score": 58.83,
|
842 |
-
"Pass rate": 0.
|
843 |
"Cost($)": 0.0,
|
844 |
"Framework": "",
|
845 |
"X-shot": "8.0",
|
@@ -871,7 +871,7 @@
|
|
871 |
"Eval Date": "2025/1/22"
|
872 |
},
|
873 |
"gsm8k": {
|
874 |
-
"Score": 38.
|
875 |
"Pass rate": 0.5542,
|
876 |
"Cost($)": 0.0,
|
877 |
"Framework": "",
|
@@ -918,7 +918,7 @@
|
|
918 |
},
|
919 |
"AQuA": {
|
920 |
"Score": 36.61,
|
921 |
-
"Pass rate": 0.
|
922 |
"Cost($)": 0.0,
|
923 |
"Framework": "",
|
924 |
"X-shot": "0.0",
|
@@ -937,8 +937,8 @@
|
|
937 |
"Eval Date": "2025/1/22"
|
938 |
},
|
939 |
"gsm8k": {
|
940 |
-
"Score": 18.
|
941 |
-
"Pass rate": 0.
|
942 |
"Cost($)": 0.0,
|
943 |
"Framework": "",
|
944 |
"X-shot": "8.0",
|
@@ -950,8 +950,8 @@
|
|
950 |
"Average output tokens": 133
|
951 |
},
|
952 |
"AQuA": {
|
953 |
-
"Score": 30.
|
954 |
-
"Pass rate": 0.
|
955 |
"Cost($)": 0.0,
|
956 |
"Framework": "",
|
957 |
"X-shot": "0.0",
|
@@ -984,7 +984,7 @@
|
|
984 |
},
|
985 |
"AQuA": {
|
986 |
"Score": 17.32,
|
987 |
-
"Pass rate": 0.
|
988 |
"Cost($)": 0.0,
|
989 |
"Framework": "",
|
990 |
"X-shot": "0.0",
|
@@ -1005,7 +1005,7 @@
|
|
1005 |
"Eval Date": "2025/1/7"
|
1006 |
},
|
1007 |
"gsm8k": {
|
1008 |
-
"Score": 78.
|
1009 |
"Pass rate": 1.0,
|
1010 |
"Cost($)": 0.6788,
|
1011 |
"Framework": "",
|
@@ -1051,7 +1051,7 @@
|
|
1051 |
"Average output tokens": 121
|
1052 |
},
|
1053 |
"AQuA": {
|
1054 |
-
"Score": 82.
|
1055 |
"Pass rate": 0.9724,
|
1056 |
"Cost($)": 0.0066,
|
1057 |
"Framework": "",
|
@@ -1071,7 +1071,7 @@
|
|
1071 |
"Eval Date": "2025/1/22"
|
1072 |
},
|
1073 |
"gsm8k": {
|
1074 |
-
"Score": 94.
|
1075 |
"Pass rate": 1.0,
|
1076 |
"Cost($)": 4.5367,
|
1077 |
"Framework": "",
|
@@ -1084,7 +1084,7 @@
|
|
1084 |
"Average output tokens": 164
|
1085 |
},
|
1086 |
"AQuA": {
|
1087 |
-
"Score": 82.
|
1088 |
"Pass rate": 0.9803,
|
1089 |
"Cost($)": 1.0417,
|
1090 |
"Framework": "",
|
@@ -1151,7 +1151,7 @@
|
|
1151 |
},
|
1152 |
"AQuA": {
|
1153 |
"Score": 83.46,
|
1154 |
-
"Pass rate": 0.
|
1155 |
"Cost($)": 0.0927,
|
1156 |
"Framework": "",
|
1157 |
"X-shot": "0.0",
|
@@ -1183,8 +1183,8 @@
|
|
1183 |
"Average output tokens": 186
|
1184 |
},
|
1185 |
"AQuA": {
|
1186 |
-
"Score": 80.
|
1187 |
-
"Pass rate": 0.
|
1188 |
"Cost($)": 0.0,
|
1189 |
"Framework": "",
|
1190 |
"X-shot": "0.0",
|
@@ -1203,7 +1203,7 @@
|
|
1203 |
"Eval Date": "2025/1/22"
|
1204 |
},
|
1205 |
"gsm8k": {
|
1206 |
-
"Score": 75.
|
1207 |
"Pass rate": 0.9992,
|
1208 |
"Cost($)": 0.0,
|
1209 |
"Framework": "",
|
@@ -1216,7 +1216,7 @@
|
|
1216 |
"Average output tokens": 196
|
1217 |
},
|
1218 |
"AQuA": {
|
1219 |
-
"Score": 60.
|
1220 |
"Pass rate": 1.0,
|
1221 |
"Cost($)": 0.0,
|
1222 |
"Framework": "",
|
@@ -1237,7 +1237,7 @@
|
|
1237 |
},
|
1238 |
"gsm8k": {
|
1239 |
"Score": 77.71,
|
1240 |
-
"Pass rate": 0.
|
1241 |
"Cost($)": 0.0,
|
1242 |
"Framework": "",
|
1243 |
"X-shot": "8.0",
|
@@ -1249,7 +1249,7 @@
|
|
1249 |
"Average output tokens": 177
|
1250 |
},
|
1251 |
"AQuA": {
|
1252 |
-
"Score": 52.
|
1253 |
"Pass rate": 0.8937,
|
1254 |
"Cost($)": 0.0,
|
1255 |
"Framework": "",
|
@@ -1269,7 +1269,7 @@
|
|
1269 |
"Eval Date": "2025/1/22"
|
1270 |
},
|
1271 |
"gsm8k": {
|
1272 |
-
"Score": 55.
|
1273 |
"Pass rate": 1.0,
|
1274 |
"Cost($)": 0.0,
|
1275 |
"Framework": "",
|
@@ -1283,7 +1283,7 @@
|
|
1283 |
},
|
1284 |
"AQuA": {
|
1285 |
"Score": 40.55,
|
1286 |
-
"Pass rate": 0.
|
1287 |
"Cost($)": 0.0,
|
1288 |
"Framework": "",
|
1289 |
"X-shot": "0.0",
|
@@ -1302,7 +1302,7 @@
|
|
1302 |
"Eval Date": "2025/1/22"
|
1303 |
},
|
1304 |
"gsm8k": {
|
1305 |
-
"Score": 35.
|
1306 |
"Pass rate": 0.9992,
|
1307 |
"Cost($)": 0.0,
|
1308 |
"Framework": "",
|
@@ -1316,7 +1316,7 @@
|
|
1316 |
},
|
1317 |
"AQuA": {
|
1318 |
"Score": 33.07,
|
1319 |
-
"Pass rate": 0.
|
1320 |
"Cost($)": 0.0,
|
1321 |
"Framework": "",
|
1322 |
"X-shot": "0.0",
|
@@ -1449,8 +1449,8 @@
|
|
1449 |
"Average output tokens": 1723
|
1450 |
},
|
1451 |
"AQuA": {
|
1452 |
-
"Score": 85.
|
1453 |
-
"Pass rate": 0.
|
1454 |
"Cost($)": 0.5576,
|
1455 |
"Framework": "",
|
1456 |
"X-shot": "0.0",
|
@@ -1503,7 +1503,7 @@
|
|
1503 |
},
|
1504 |
"gsm8k": {
|
1505 |
"Score": 88.32,
|
1506 |
-
"Pass rate": 0.
|
1507 |
"Cost($)": 0.0,
|
1508 |
"Framework": "",
|
1509 |
"X-shot": "8.0",
|
@@ -1515,7 +1515,7 @@
|
|
1515 |
"Average output tokens": 1900
|
1516 |
},
|
1517 |
"AQuA": {
|
1518 |
-
"Score": 81.
|
1519 |
"Pass rate": 1.0,
|
1520 |
"Cost($)": 0.0,
|
1521 |
"Framework": "",
|
@@ -1535,8 +1535,8 @@
|
|
1535 |
"Eval Date": "2025/1/22"
|
1536 |
},
|
1537 |
"gsm8k": {
|
1538 |
-
"Score": 75.
|
1539 |
-
"Pass rate": 0.
|
1540 |
"Cost($)": 0.0,
|
1541 |
"Framework": "",
|
1542 |
"X-shot": "8.0",
|
@@ -1548,7 +1548,7 @@
|
|
1548 |
"Average output tokens": 2358
|
1549 |
},
|
1550 |
"AQuA": {
|
1551 |
-
"Score": 53.
|
1552 |
"Pass rate": 0.9606,
|
1553 |
"Cost($)": 0.0,
|
1554 |
"Framework": "",
|
@@ -1569,7 +1569,7 @@
|
|
1569 |
},
|
1570 |
"gsm8k": {
|
1571 |
"Score": 41.39,
|
1572 |
-
"Pass rate": 0.
|
1573 |
"Cost($)": 0.0,
|
1574 |
"Framework": "",
|
1575 |
"X-shot": "8.0",
|
@@ -1647,8 +1647,8 @@
|
|
1647 |
"Average output tokens": 3036
|
1648 |
},
|
1649 |
"AQuA": {
|
1650 |
-
"Score": 30.
|
1651 |
-
"Pass rate": 0.
|
1652 |
"Cost($)": 0.0,
|
1653 |
"Framework": "",
|
1654 |
"X-shot": "0.0",
|
|
|
1 |
{
|
2 |
+
"time": "2025-01-23 11:23:17",
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"gpt-3.5-turbo": {
|
|
|
22 |
"Average output tokens": 30
|
23 |
},
|
24 |
"AQuA": {
|
25 |
+
"Score": 38.98,
|
26 |
"Pass rate": 1.0,
|
27 |
"Cost($)": 0.038,
|
28 |
"Framework": "",
|
|
|
122 |
},
|
123 |
"AQuA": {
|
124 |
"Score": 84.25,
|
125 |
+
"Pass rate": 0.9961,
|
126 |
"Cost($)": 0.0742,
|
127 |
"Framework": "",
|
128 |
"X-shot": "0.0",
|
|
|
141 |
"Eval Date": "2025/1/22"
|
142 |
},
|
143 |
"gsm8k": {
|
144 |
+
"Score": 92.27,
|
145 |
"Pass rate": 1.0,
|
146 |
"Cost($)": 0.4709,
|
147 |
"Framework": "",
|
|
|
154 |
"Average output tokens": 191
|
155 |
},
|
156 |
"AQuA": {
|
157 |
+
"Score": 82.68,
|
158 |
"Pass rate": 0.9921,
|
159 |
"Cost($)": 0.0798,
|
160 |
"Framework": "",
|
|
|
188 |
},
|
189 |
"AQuA": {
|
190 |
"Score": 78.74,
|
191 |
+
"Pass rate": 0.9843,
|
192 |
"Cost($)": 0.0,
|
193 |
"Framework": "",
|
194 |
"X-shot": "0.0",
|
|
|
208 |
},
|
209 |
"gsm8k": {
|
210 |
"Score": 57.16,
|
211 |
+
"Pass rate": 0.9955,
|
212 |
"Cost($)": 0.0,
|
213 |
"Framework": "",
|
214 |
"X-shot": "8.0",
|
|
|
221 |
},
|
222 |
"AQuA": {
|
223 |
"Score": 51.18,
|
224 |
+
"Pass rate": 0.9882,
|
225 |
"Cost($)": 0.0,
|
226 |
"Framework": "",
|
227 |
"X-shot": "0.0",
|
|
|
240 |
"Eval Date": "2025/1/22"
|
241 |
},
|
242 |
"gsm8k": {
|
243 |
+
"Score": 11.6,
|
244 |
"Pass rate": 0.9795,
|
245 |
"Cost($)": 0.0,
|
246 |
"Framework": "",
|
|
|
253 |
"Average output tokens": 329
|
254 |
},
|
255 |
"AQuA": {
|
256 |
+
"Score": 47.64,
|
257 |
"Pass rate": 0.9094,
|
258 |
"Cost($)": 0.0,
|
259 |
"Framework": "",
|
|
|
273 |
"Eval Date": "2025/1/22"
|
274 |
},
|
275 |
"gsm8k": {
|
276 |
+
"Score": 16.68,
|
277 |
"Pass rate": 1.0,
|
278 |
"Cost($)": 0.0,
|
279 |
"Framework": "",
|
|
|
287 |
},
|
288 |
"AQuA": {
|
289 |
"Score": 29.13,
|
290 |
+
"Pass rate": 0.9764,
|
291 |
"Cost($)": 0.0,
|
292 |
"Framework": "",
|
293 |
"X-shot": "0.0",
|
|
|
306 |
"Eval Date": "2025/1/22"
|
307 |
},
|
308 |
"gsm8k": {
|
309 |
+
"Score": 14.71,
|
310 |
"Pass rate": 1.0,
|
311 |
"Cost($)": 0.0,
|
312 |
"Framework": "",
|
|
|
319 |
"Average output tokens": 202
|
320 |
},
|
321 |
"AQuA": {
|
322 |
+
"Score": 27.17,
|
323 |
+
"Pass rate": 0.9882,
|
324 |
"Cost($)": 0.0,
|
325 |
"Framework": "",
|
326 |
"X-shot": "0.0",
|
|
|
341 |
"Eval Date": "2025/1/7"
|
342 |
},
|
343 |
"gsm8k": {
|
344 |
+
"Score": 74.91,
|
345 |
"Pass rate": 0.9939,
|
346 |
"Cost($)": 3.4633,
|
347 |
"Framework": "",
|
|
|
354 |
"Average output tokens": 106
|
355 |
},
|
356 |
"AQuA": {
|
357 |
+
"Score": 64.57,
|
358 |
"Pass rate": 0.9803,
|
359 |
"Cost($)": 0.4928,
|
360 |
"Framework": "",
|
|
|
374 |
"Eval Date": "2025/1/7"
|
375 |
},
|
376 |
"gsm8k": {
|
377 |
+
"Score": 85.6,
|
378 |
"Pass rate": 0.9962,
|
379 |
"Cost($)": 0.2512,
|
380 |
"Framework": "",
|
|
|
387 |
"Average output tokens": 104
|
388 |
},
|
389 |
"AQuA": {
|
390 |
+
"Score": 77.56,
|
391 |
"Pass rate": 0.9606,
|
392 |
"Cost($)": 0.0445,
|
393 |
"Framework": "",
|
|
|
407 |
"Eval Date": "2025/1/22"
|
408 |
},
|
409 |
"gsm8k": {
|
410 |
+
"Score": 63.31,
|
411 |
+
"Pass rate": 0.9955,
|
412 |
"Cost($)": 39.0751,
|
413 |
"Framework": "",
|
414 |
"X-shot": "8.0",
|
|
|
453 |
"Average output tokens": 417
|
454 |
},
|
455 |
"AQuA": {
|
456 |
+
"Score": 73.23,
|
457 |
"Pass rate": 1.0,
|
458 |
"Cost($)": 0.3177,
|
459 |
"Framework": "",
|
|
|
477 |
"Pass rate": 0.9992,
|
478 |
"Cost($)": 10.1124,
|
479 |
"Framework": "",
|
480 |
+
"X-shot": "8.0",
|
481 |
"Samples": 1319,
|
482 |
"All tokens": 17937864,
|
483 |
"Total input tokens": 17038928,
|
|
|
487 |
},
|
488 |
"AQuA": {
|
489 |
"Score": 79.13,
|
490 |
+
"Pass rate": 0.9961,
|
491 |
"Cost($)": 0.768,
|
492 |
"Framework": "",
|
493 |
"X-shot": "0.0",
|
|
|
506 |
"Eval Date": "2025/1/22"
|
507 |
},
|
508 |
"gsm8k": {
|
509 |
+
"Score": 82.87,
|
510 |
"Pass rate": 1.0,
|
511 |
"Cost($)": 0.0,
|
512 |
"Framework": "",
|
|
|
519 |
"Average output tokens": 375
|
520 |
},
|
521 |
"AQuA": {
|
522 |
+
"Score": 74.41,
|
523 |
"Pass rate": 0.9921,
|
524 |
"Cost($)": 0.0,
|
525 |
"Framework": "",
|
|
|
539 |
"Eval Date": "2025/1/22"
|
540 |
},
|
541 |
"gsm8k": {
|
542 |
+
"Score": 67.78,
|
543 |
+
"Pass rate": 0.9856,
|
544 |
"Cost($)": 0.0,
|
545 |
"Framework": "",
|
546 |
"X-shot": "8.0",
|
|
|
576 |
"Pass rate": 0.9795,
|
577 |
"Cost($)": 0.0,
|
578 |
"Framework": "",
|
579 |
+
"X-shot": "8.0",
|
580 |
"Samples": 1319,
|
581 |
"All tokens": 35669989,
|
582 |
"Total input tokens": 30120070,
|
|
|
605 |
"Eval Date": "2025/1/22"
|
606 |
},
|
607 |
"gsm8k": {
|
608 |
+
"Score": 24.87,
|
609 |
"Pass rate": 0.8021,
|
610 |
"Cost($)": 0.0,
|
611 |
"Framework": "",
|
|
|
638 |
"Eval Date": "2025/1/22"
|
639 |
},
|
640 |
"gsm8k": {
|
641 |
+
"Score": 7.66,
|
642 |
"Pass rate": 0.9522,
|
643 |
"Cost($)": 0.0,
|
644 |
"Framework": "",
|
|
|
651 |
"Average output tokens": 2245
|
652 |
},
|
653 |
"AQuA": {
|
654 |
+
"Score": 24.02,
|
655 |
"Pass rate": 0.9685,
|
656 |
"Cost($)": 0.0,
|
657 |
"Framework": "",
|
|
|
673 |
"Eval Date": "2025/1/7"
|
674 |
},
|
675 |
"gsm8k": {
|
676 |
+
"Score": 76.88,
|
677 |
"Pass rate": 0.9924,
|
678 |
"Cost($)": 0.6902,
|
679 |
"Framework": "",
|
|
|
686 |
"Average output tokens": 73
|
687 |
},
|
688 |
"AQuA": {
|
689 |
+
"Score": 59.45,
|
690 |
"Pass rate": 1.0,
|
691 |
"Cost($)": 0.1748,
|
692 |
"Framework": "",
|
|
|
706 |
"Eval Date": "2025/1/7"
|
707 |
},
|
708 |
"gsm8k": {
|
709 |
+
"Score": 79.61,
|
710 |
"Pass rate": 0.9257,
|
711 |
"Cost($)": 0.0576,
|
712 |
"Framework": "",
|
|
|
752 |
"Average output tokens": 111
|
753 |
},
|
754 |
"AQuA": {
|
755 |
+
"Score": 75.2,
|
756 |
"Pass rate": 1.0,
|
757 |
"Cost($)": 1.6087,
|
758 |
"Framework": "",
|
|
|
785 |
"Average output tokens": 110
|
786 |
},
|
787 |
"AQuA": {
|
788 |
+
"Score": 75.2,
|
789 |
"Pass rate": 1.0,
|
790 |
"Cost($)": 0.1645,
|
791 |
"Framework": "",
|
|
|
805 |
"Eval Date": "2025/1/22"
|
806 |
},
|
807 |
"gsm8k": {
|
808 |
+
"Score": 73.09,
|
809 |
+
"Pass rate": 0.7961,
|
810 |
"Cost($)": 0.9736,
|
811 |
"Framework": "",
|
812 |
"X-shot": "8.0",
|
|
|
818 |
"Average output tokens": 456
|
819 |
},
|
820 |
"AQuA": {
|
821 |
+
"Score": 79.53,
|
822 |
"Pass rate": 0.9921,
|
823 |
"Cost($)": 0.1746,
|
824 |
"Framework": "",
|
|
|
839 |
},
|
840 |
"gsm8k": {
|
841 |
"Score": 58.83,
|
842 |
+
"Pass rate": 0.7051,
|
843 |
"Cost($)": 0.0,
|
844 |
"Framework": "",
|
845 |
"X-shot": "8.0",
|
|
|
871 |
"Eval Date": "2025/1/22"
|
872 |
},
|
873 |
"gsm8k": {
|
874 |
+
"Score": 38.67,
|
875 |
"Pass rate": 0.5542,
|
876 |
"Cost($)": 0.0,
|
877 |
"Framework": "",
|
|
|
918 |
},
|
919 |
"AQuA": {
|
920 |
"Score": 36.61,
|
921 |
+
"Pass rate": 0.9882,
|
922 |
"Cost($)": 0.0,
|
923 |
"Framework": "",
|
924 |
"X-shot": "0.0",
|
|
|
937 |
"Eval Date": "2025/1/22"
|
938 |
},
|
939 |
"gsm8k": {
|
940 |
+
"Score": 18.5,
|
941 |
+
"Pass rate": 0.3101,
|
942 |
"Cost($)": 0.0,
|
943 |
"Framework": "",
|
944 |
"X-shot": "8.0",
|
|
|
950 |
"Average output tokens": 133
|
951 |
},
|
952 |
"AQuA": {
|
953 |
+
"Score": 30.71,
|
954 |
+
"Pass rate": 0.9646,
|
955 |
"Cost($)": 0.0,
|
956 |
"Framework": "",
|
957 |
"X-shot": "0.0",
|
|
|
984 |
},
|
985 |
"AQuA": {
|
986 |
"Score": 17.32,
|
987 |
+
"Pass rate": 0.9213,
|
988 |
"Cost($)": 0.0,
|
989 |
"Framework": "",
|
990 |
"X-shot": "0.0",
|
|
|
1005 |
"Eval Date": "2025/1/7"
|
1006 |
},
|
1007 |
"gsm8k": {
|
1008 |
+
"Score": 78.7,
|
1009 |
"Pass rate": 1.0,
|
1010 |
"Cost($)": 0.6788,
|
1011 |
"Framework": "",
|
|
|
1051 |
"Average output tokens": 121
|
1052 |
},
|
1053 |
"AQuA": {
|
1054 |
+
"Score": 82.68,
|
1055 |
"Pass rate": 0.9724,
|
1056 |
"Cost($)": 0.0066,
|
1057 |
"Framework": "",
|
|
|
1071 |
"Eval Date": "2025/1/22"
|
1072 |
},
|
1073 |
"gsm8k": {
|
1074 |
+
"Score": 94.09,
|
1075 |
"Pass rate": 1.0,
|
1076 |
"Cost($)": 4.5367,
|
1077 |
"Framework": "",
|
|
|
1084 |
"Average output tokens": 164
|
1085 |
},
|
1086 |
"AQuA": {
|
1087 |
+
"Score": 82.68,
|
1088 |
"Pass rate": 0.9803,
|
1089 |
"Cost($)": 1.0417,
|
1090 |
"Framework": "",
|
|
|
1151 |
},
|
1152 |
"AQuA": {
|
1153 |
"Score": 83.46,
|
1154 |
+
"Pass rate": 0.9843,
|
1155 |
"Cost($)": 0.0927,
|
1156 |
"Framework": "",
|
1157 |
"X-shot": "0.0",
|
|
|
1183 |
"Average output tokens": 186
|
1184 |
},
|
1185 |
"AQuA": {
|
1186 |
+
"Score": 80.71,
|
1187 |
+
"Pass rate": 0.9961,
|
1188 |
"Cost($)": 0.0,
|
1189 |
"Framework": "",
|
1190 |
"X-shot": "0.0",
|
|
|
1203 |
"Eval Date": "2025/1/22"
|
1204 |
},
|
1205 |
"gsm8k": {
|
1206 |
+
"Score": 75.44,
|
1207 |
"Pass rate": 0.9992,
|
1208 |
"Cost($)": 0.0,
|
1209 |
"Framework": "",
|
|
|
1216 |
"Average output tokens": 196
|
1217 |
},
|
1218 |
"AQuA": {
|
1219 |
+
"Score": 60.63,
|
1220 |
"Pass rate": 1.0,
|
1221 |
"Cost($)": 0.0,
|
1222 |
"Framework": "",
|
|
|
1237 |
},
|
1238 |
"gsm8k": {
|
1239 |
"Score": 77.71,
|
1240 |
+
"Pass rate": 0.997,
|
1241 |
"Cost($)": 0.0,
|
1242 |
"Framework": "",
|
1243 |
"X-shot": "8.0",
|
|
|
1249 |
"Average output tokens": 177
|
1250 |
},
|
1251 |
"AQuA": {
|
1252 |
+
"Score": 52.76,
|
1253 |
"Pass rate": 0.8937,
|
1254 |
"Cost($)": 0.0,
|
1255 |
"Framework": "",
|
|
|
1269 |
"Eval Date": "2025/1/22"
|
1270 |
},
|
1271 |
"gsm8k": {
|
1272 |
+
"Score": 55.5,
|
1273 |
"Pass rate": 1.0,
|
1274 |
"Cost($)": 0.0,
|
1275 |
"Framework": "",
|
|
|
1283 |
},
|
1284 |
"AQuA": {
|
1285 |
"Score": 40.55,
|
1286 |
+
"Pass rate": 0.9882,
|
1287 |
"Cost($)": 0.0,
|
1288 |
"Framework": "",
|
1289 |
"X-shot": "0.0",
|
|
|
1302 |
"Eval Date": "2025/1/22"
|
1303 |
},
|
1304 |
"gsm8k": {
|
1305 |
+
"Score": 35.94,
|
1306 |
"Pass rate": 0.9992,
|
1307 |
"Cost($)": 0.0,
|
1308 |
"Framework": "",
|
|
|
1316 |
},
|
1317 |
"AQuA": {
|
1318 |
"Score": 33.07,
|
1319 |
+
"Pass rate": 0.9882,
|
1320 |
"Cost($)": 0.0,
|
1321 |
"Framework": "",
|
1322 |
"X-shot": "0.0",
|
|
|
1449 |
"Average output tokens": 1723
|
1450 |
},
|
1451 |
"AQuA": {
|
1452 |
+
"Score": 85.83,
|
1453 |
+
"Pass rate": 0.9843,
|
1454 |
"Cost($)": 0.5576,
|
1455 |
"Framework": "",
|
1456 |
"X-shot": "0.0",
|
|
|
1503 |
},
|
1504 |
"gsm8k": {
|
1505 |
"Score": 88.32,
|
1506 |
+
"Pass rate": 0.9985,
|
1507 |
"Cost($)": 0.0,
|
1508 |
"Framework": "",
|
1509 |
"X-shot": "8.0",
|
|
|
1515 |
"Average output tokens": 1900
|
1516 |
},
|
1517 |
"AQuA": {
|
1518 |
+
"Score": 81.5,
|
1519 |
"Pass rate": 1.0,
|
1520 |
"Cost($)": 0.0,
|
1521 |
"Framework": "",
|
|
|
1535 |
"Eval Date": "2025/1/22"
|
1536 |
},
|
1537 |
"gsm8k": {
|
1538 |
+
"Score": 75.21,
|
1539 |
+
"Pass rate": 0.9955,
|
1540 |
"Cost($)": 0.0,
|
1541 |
"Framework": "",
|
1542 |
"X-shot": "8.0",
|
|
|
1548 |
"Average output tokens": 2358
|
1549 |
},
|
1550 |
"AQuA": {
|
1551 |
+
"Score": 53.15,
|
1552 |
"Pass rate": 0.9606,
|
1553 |
"Cost($)": 0.0,
|
1554 |
"Framework": "",
|
|
|
1569 |
},
|
1570 |
"gsm8k": {
|
1571 |
"Score": 41.39,
|
1572 |
+
"Pass rate": 0.9826,
|
1573 |
"Cost($)": 0.0,
|
1574 |
"Framework": "",
|
1575 |
"X-shot": "8.0",
|
|
|
1647 |
"Average output tokens": 3036
|
1648 |
},
|
1649 |
"AQuA": {
|
1650 |
+
"Score": 30.71,
|
1651 |
+
"Pass rate": 0.9843,
|
1652 |
"Cost($)": 0.0,
|
1653 |
"Framework": "",
|
1654 |
"X-shot": "0.0",
|
src/detail_results.csv
CHANGED
@@ -2,100 +2,100 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
|
|
2 |
1,SC-CoT,AQuA,gpt-4o,2025/1/22,88.19,1.0,0.0,6.2412,,254,678811,72916,287,605895,2385
|
3 |
2,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,86.61,0.9921,0.0,0.5847,,254,1037124,283248,1115,753876,2968
|
4 |
3,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
|
5 |
-
4,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.
|
6 |
-
5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.
|
7 |
-
6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.
|
8 |
-
7,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.
|
9 |
-
8,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.
|
10 |
-
9,CoT,AQuA,gpt-4o,2025/1/22,82.
|
11 |
10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.5,0.9764,0.0,0.0347,,254,465846,83830,330,382016,1504
|
12 |
-
11,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,81.
|
13 |
-
12,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.
|
14 |
-
13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.
|
15 |
14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
|
16 |
-
15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.
|
17 |
-
16,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.
|
18 |
-
17,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.
|
19 |
18,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0.0,1.1453,,254,133752,25631,101,108121,426
|
20 |
-
19,PoT,AQuA,gpt-4o,2025/1/22,75.
|
21 |
-
20,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.
|
22 |
-
21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.
|
23 |
-
22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.
|
24 |
23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
|
25 |
24,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,70.47,0.9882,0.0,0.5578,,254,418617,70157,276,348460,1372
|
26 |
25,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
|
27 |
-
26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.
|
28 |
27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
|
29 |
-
28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.
|
30 |
-
29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.
|
31 |
30,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
|
32 |
31,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
|
33 |
-
32,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,53.
|
34 |
-
33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.
|
35 |
-
34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.
|
36 |
-
35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.
|
37 |
36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
|
38 |
-
37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.
|
39 |
-
38,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.
|
40 |
39,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
|
41 |
-
40,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.
|
42 |
41,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,35.85,0.988,0.0,0.0,,254,1240388,530701,2089,709687,2794
|
43 |
-
42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.
|
44 |
-
43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.
|
45 |
-
44,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,30.
|
46 |
45,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.31,0.9724,0.0,0.0,,254,1157076,430703,1696,726373,2860
|
47 |
-
46,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.
|
48 |
-
47,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.
|
49 |
48,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
|
50 |
-
49,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.
|
51 |
-
50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.
|
52 |
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.45,1.0,8.0,4.5021,,1319,7985996,5406763,4099,2579233,1955
|
53 |
-
2,CoT,gsm8k,gpt-4o,2025/1/22,94.
|
54 |
3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
|
55 |
4,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
|
56 |
5,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
|
57 |
6,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
|
58 |
-
7,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.
|
59 |
8,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.75,1.0,8.0,24.2428,,1319,3300971,1168927,886,2132044,1616
|
60 |
9,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,90.67,1.0,8.0,4.2651,,1319,7565637,5292383,4012,2273254,1723
|
61 |
10,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
|
62 |
11,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
|
63 |
-
12,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,88.32,0.
|
64 |
-
13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992
|
65 |
14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
|
66 |
15,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
|
67 |
16,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
|
68 |
-
17,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.
|
69 |
18,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,83.7,0.997,8.0,0.155,,1319,2507687,1230019,933,1277668,969
|
70 |
-
19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.
|
71 |
20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,82.56,0.9985,8.0,2.6285,,1319,2560697,1212520,919,1348177,1022
|
72 |
-
21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.
|
73 |
-
22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.
|
74 |
-
23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.
|
75 |
-
24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.
|
76 |
-
25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.
|
77 |
-
26,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.
|
78 |
-
27,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.
|
79 |
-
28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.
|
80 |
29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
|
81 |
-
30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.
|
82 |
-
31,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.
|
83 |
-
32,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.
|
84 |
33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
|
85 |
-
34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.
|
86 |
-
35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.
|
87 |
-
36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,41.39,0.
|
88 |
-
37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.
|
89 |
38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
|
90 |
39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
|
91 |
-
40,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.
|
92 |
-
41,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795
|
93 |
-
42,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.
|
94 |
-
43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.
|
95 |
-
44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.
|
96 |
-
45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.
|
97 |
-
46,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.
|
98 |
47,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.169,8.0,0.0,,1319,1389135,1151528,873,237607,180
|
99 |
-
48,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.
|
100 |
49,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,5.53,0.8673,8.0,0.0,,1319,8961768,5844218,4431,3117550,2364
|
101 |
50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,3.79,0.9484,8.0,0.0,,1319,10533815,6529832,4951,4003983,3036
|
|
|
2 |
1,SC-CoT,AQuA,gpt-4o,2025/1/22,88.19,1.0,0.0,6.2412,,254,678811,72916,287,605895,2385
|
3 |
2,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,86.61,0.9921,0.0,0.5847,,254,1037124,283248,1115,753876,2968
|
4 |
3,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
|
5 |
+
4,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.83,0.9843,0.0,0.5576,,254,989058,241149,949,747909,2945
|
6 |
+
5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0.0,0.0742,,254,131604,25397,100,106207,418
|
7 |
+
6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0.0,0.0927,,254,164389,32555,128,131834,519
|
8 |
+
7,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0.0,0.0798,,254,141567,32809,129,108758,428
|
9 |
+
8,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0.0,0.0066,,254,94577,27978,110,66599,262
|
10 |
+
9,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0.0,1.0417,,254,123017,25123,99,97894,385
|
11 |
10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.5,0.9764,0.0,0.0347,,254,465846,83830,330,382016,1504
|
12 |
+
11,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,81.5,1.0,0.0,0.0,,254,1015368,278848,1098,736520,2900
|
13 |
+
12,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0.0,0.0,,254,149736,33017,130,116719,460
|
14 |
+
13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
|
15 |
14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
|
16 |
+
15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0.0,0.768,,254,1362379,1119143,4406,243236,958
|
17 |
+
16,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9843,0.0,0.0,,254,137771,33271,131,104500,411
|
18 |
+
17,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.56,0.9606,0.0,0.0445,,254,1032841,977890,3850,54951,216
|
19 |
18,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0.0,1.1453,,254,133752,25631,101,108121,426
|
20 |
+
19,PoT,AQuA,gpt-4o,2025/1/22,75.2,1.0,0.0,1.6087,,254,327908,222717,877,105191,414
|
21 |
+
20,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.2,1.0,0.0,0.1645,,254,291764,249215,981,42549,168
|
22 |
+
21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
|
23 |
+
22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
|
24 |
23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
|
25 |
24,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,70.47,0.9882,0.0,0.5578,,254,418617,70157,276,348460,1372
|
26 |
25,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
|
27 |
+
26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
|
28 |
27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
|
29 |
+
28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0.0,0.0,,254,144435,32555,128,111880,440
|
30 |
+
29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
|
31 |
30,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
|
32 |
31,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
|
33 |
+
32,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,53.15,0.9606,0.0,0.0,,254,1041346,372968,1468,668378,2631
|
34 |
+
33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
|
35 |
+
34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0.0,0.0,,254,133106,26459,104,106647,420
|
36 |
+
35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
|
37 |
36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
|
38 |
+
37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0.0,0.0,,254,110040,30477,120,79563,313
|
39 |
+
38,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0.0,0.038,,254,42471,25701,101,16770,66
|
40 |
39,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
|
41 |
+
40,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0.0,0.0,,254,301962,233505,919,68457,270
|
42 |
41,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,35.85,0.988,0.0,0.0,,254,1240388,530701,2089,709687,2794
|
43 |
+
42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0.0,0.0,,254,117339,30477,120,86862,342
|
44 |
+
43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0.0,0.0,,254,298475,246560,971,51915,204
|
45 |
+
44,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,30.71,0.9843,0.0,0.0,,254,1225539,496206,1954,729333,2871
|
46 |
45,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.31,0.9724,0.0,0.0,,254,1157076,430703,1696,726373,2860
|
47 |
+
46,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0.0,0.0,,254,71047,27937,110,43110,170
|
48 |
+
47,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0.0,0.0,,254,110415,27937,110,82478,325
|
49 |
48,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
|
50 |
+
49,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0.0,0.0,,254,7170087,6344167,24977,825920,3252
|
51 |
+
50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0.0,0.0,,254,322281,258867,1019,63414,250
|
52 |
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.45,1.0,8.0,4.5021,,1319,7985996,5406763,4099,2579233,1955
|
53 |
+
2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
|
54 |
3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
|
55 |
4,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
|
56 |
5,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
|
57 |
6,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
|
58 |
+
7,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8.0,0.4709,,1319,835275,583916,443,251359,191
|
59 |
8,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.75,1.0,8.0,24.2428,,1319,3300971,1168927,886,2132044,1616
|
60 |
9,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,90.67,1.0,8.0,4.2651,,1319,7565637,5292383,4012,2273254,1723
|
61 |
10,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
|
62 |
11,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
|
63 |
+
12,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,88.32,0.9985,8.0,0.0,,1319,8173818,5668252,4297,2505566,1900
|
64 |
+
13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8.0,10.1124,,1319,17937864,17038928,12918,898936,682
|
65 |
14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
|
66 |
15,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
|
67 |
16,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
|
68 |
+
17,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8.0,0.2512,,1319,5998639,5862016,4444,136623,104
|
69 |
18,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,83.7,0.997,8.0,0.155,,1319,2507687,1230019,933,1277668,969
|
70 |
+
19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
|
71 |
20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,82.56,0.9985,8.0,2.6285,,1319,2560697,1212520,919,1348177,1022
|
72 |
+
21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
|
73 |
+
22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
|
74 |
+
23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8.0,0.0,,1319,1202163,968163,734,234000,177
|
75 |
+
24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
|
76 |
+
25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
|
77 |
+
26,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.21,0.9955,8.0,0.0,,1319,8444203,5334657,4044,3109546,2358
|
78 |
+
27,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8.0,3.4633,,1319,6646286,6506164,4933,140122,106
|
79 |
+
28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8.0,0.9736,,1319,1727044,1126025,854,601019,456
|
80 |
29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
|
81 |
+
30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
|
82 |
+
31,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.31,0.9955,8.0,39.0751,,1319,14715887,14411173,10926,304714,231
|
83 |
+
32,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.7051,8.0,0.0,,1319,1362822,1145390,868,217432,165
|
84 |
33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
|
85 |
+
34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8.0,0.0,,1319,1745429,550941,418,1194488,906
|
86 |
+
35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
|
87 |
+
36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,41.39,0.9826,8.0,0.0,,1319,10024857,6674518,5060,3350339,2540
|
88 |
+
37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
|
89 |
38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
|
90 |
39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
|
91 |
+
40,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.94,0.9992,8.0,0.0,,1319,1223459,1032818,783,190641,145
|
92 |
+
41,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,8.0,0.0,,1319,35669989,30120070,22836,5549919,4208
|
93 |
+
42,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.87,0.8021,8.0,0.0,,1319,9828001,9133603,6925,694398,526
|
94 |
+
43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8.0,0.0,,1319,1327522,1151528,873,175994,133
|
95 |
+
44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
|
96 |
+
45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
|
97 |
+
46,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8.0,0.0,,1319,1113728,679302,515,434426,329
|
98 |
47,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.169,8.0,0.0,,1319,1389135,1151528,873,237607,180
|
99 |
+
48,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8.0,0.0,,1319,55392611,52431343,39751,2961268,2245
|
100 |
49,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,5.53,0.8673,8.0,0.0,,1319,8961768,5844218,4431,3117550,2364
|
101 |
50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,3.79,0.9484,8.0,0.0,,1319,10533815,6529832,4951,4003983,3036
|
src/overall_math_score.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"time": "2025-01-23
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"META": {
|
@@ -12,7 +12,7 @@
|
|
12 |
"Cost($)": 0.3328
|
13 |
},
|
14 |
"AQuA": {
|
15 |
-
"Score": 38.
|
16 |
"Cost($)": 0.038
|
17 |
}
|
18 |
},
|
@@ -23,11 +23,11 @@
|
|
23 |
"Eval Date": "2025/1/7"
|
24 |
},
|
25 |
"gsm8k": {
|
26 |
-
"Score": 74.
|
27 |
"Cost($)": 3.4633
|
28 |
},
|
29 |
"AQuA": {
|
30 |
-
"Score": 64.
|
31 |
"Cost($)": 0.4928
|
32 |
}
|
33 |
},
|
@@ -38,11 +38,11 @@
|
|
38 |
"Eval Date": "2025/1/7"
|
39 |
},
|
40 |
"gsm8k": {
|
41 |
-
"Score": 76.
|
42 |
"Cost($)": 0.6902
|
43 |
},
|
44 |
"AQuA": {
|
45 |
-
"Score": 59.
|
46 |
"Cost($)": 0.1748
|
47 |
}
|
48 |
},
|
@@ -53,7 +53,7 @@
|
|
53 |
"Eval Date": "2025/1/7"
|
54 |
},
|
55 |
"gsm8k": {
|
56 |
-
"Score": 78.
|
57 |
"Cost($)": 0.6788
|
58 |
},
|
59 |
"AQuA": {
|
@@ -98,11 +98,11 @@
|
|
98 |
"Eval Date": "2025/1/7"
|
99 |
},
|
100 |
"gsm8k": {
|
101 |
-
"Score": 85.
|
102 |
"Cost($)": 0.2512
|
103 |
},
|
104 |
"AQuA": {
|
105 |
-
"Score": 77.
|
106 |
"Cost($)": 0.0445
|
107 |
}
|
108 |
},
|
@@ -113,7 +113,7 @@
|
|
113 |
"Eval Date": "2025/1/7"
|
114 |
},
|
115 |
"gsm8k": {
|
116 |
-
"Score": 79.
|
117 |
"Cost($)": 0.0576
|
118 |
},
|
119 |
"AQuA": {
|
@@ -132,7 +132,7 @@
|
|
132 |
"Cost($)": 0.0558
|
133 |
},
|
134 |
"AQuA": {
|
135 |
-
"Score": 82.
|
136 |
"Cost($)": 0.0066
|
137 |
}
|
138 |
},
|
@@ -173,7 +173,7 @@
|
|
173 |
"Eval Date": "2025/1/22"
|
174 |
},
|
175 |
"gsm8k": {
|
176 |
-
"Score": 63.
|
177 |
"Cost($)": 39.0751
|
178 |
},
|
179 |
"AQuA": {
|
@@ -192,7 +192,7 @@
|
|
192 |
"Cost($)": 4.2166
|
193 |
},
|
194 |
"AQuA": {
|
195 |
-
"Score": 75.
|
196 |
"Cost($)": 1.6087
|
197 |
}
|
198 |
},
|
@@ -203,11 +203,11 @@
|
|
203 |
"Eval Date": "2025/1/22"
|
204 |
},
|
205 |
"gsm8k": {
|
206 |
-
"Score": 94.
|
207 |
"Cost($)": 4.5367
|
208 |
},
|
209 |
"AQuA": {
|
210 |
-
"Score": 82.
|
211 |
"Cost($)": 1.0417
|
212 |
}
|
213 |
},
|
@@ -252,7 +252,7 @@
|
|
252 |
"Cost($)": 10.5479
|
253 |
},
|
254 |
"AQuA": {
|
255 |
-
"Score": 73.
|
256 |
"Cost($)": 0.3177
|
257 |
}
|
258 |
},
|
@@ -267,7 +267,7 @@
|
|
267 |
"Cost($)": 0.7054
|
268 |
},
|
269 |
"AQuA": {
|
270 |
-
"Score": 75.
|
271 |
"Cost($)": 0.1645
|
272 |
}
|
273 |
},
|
@@ -297,7 +297,7 @@
|
|
297 |
"Cost($)": 4.2651
|
298 |
},
|
299 |
"AQuA": {
|
300 |
-
"Score": 85.
|
301 |
"Cost($)": 0.5576
|
302 |
}
|
303 |
},
|
@@ -308,11 +308,11 @@
|
|
308 |
"Eval Date": "2025/1/22"
|
309 |
},
|
310 |
"gsm8k": {
|
311 |
-
"Score": 92.
|
312 |
"Cost($)": 0.4709
|
313 |
},
|
314 |
"AQuA": {
|
315 |
-
"Score": 82.
|
316 |
"Cost($)": 0.0798
|
317 |
}
|
318 |
},
|
@@ -338,11 +338,11 @@
|
|
338 |
"Eval Date": "2025/1/22"
|
339 |
},
|
340 |
"gsm8k": {
|
341 |
-
"Score": 73.
|
342 |
"Cost($)": 0.9736
|
343 |
},
|
344 |
"AQuA": {
|
345 |
-
"Score": 79.
|
346 |
"Cost($)": 0.1746
|
347 |
}
|
348 |
},
|
@@ -398,11 +398,11 @@
|
|
398 |
"Eval Date": "2025/1/22"
|
399 |
},
|
400 |
"gsm8k": {
|
401 |
-
"Score": 82.
|
402 |
"Cost($)": 0.0
|
403 |
},
|
404 |
"AQuA": {
|
405 |
-
"Score": 74.
|
406 |
"Cost($)": 0.0
|
407 |
}
|
408 |
},
|
@@ -432,7 +432,7 @@
|
|
432 |
"Cost($)": 0.0
|
433 |
},
|
434 |
"AQuA": {
|
435 |
-
"Score": 80.
|
436 |
"Cost($)": 0.0
|
437 |
}
|
438 |
},
|
@@ -447,7 +447,7 @@
|
|
447 |
"Cost($)": 0.0
|
448 |
},
|
449 |
"AQuA": {
|
450 |
-
"Score": 81.
|
451 |
"Cost($)": 0.0
|
452 |
}
|
453 |
},
|
@@ -473,7 +473,7 @@
|
|
473 |
"Eval Date": "2025/1/22"
|
474 |
},
|
475 |
"gsm8k": {
|
476 |
-
"Score": 67.
|
477 |
"Cost($)": 0.0
|
478 |
},
|
479 |
"AQuA": {
|
@@ -488,7 +488,7 @@
|
|
488 |
"Eval Date": "2025/1/22"
|
489 |
},
|
490 |
"gsm8k": {
|
491 |
-
"Score": 38.
|
492 |
"Cost($)": 0.0
|
493 |
},
|
494 |
"AQuA": {
|
@@ -503,11 +503,11 @@
|
|
503 |
"Eval Date": "2025/1/22"
|
504 |
},
|
505 |
"gsm8k": {
|
506 |
-
"Score": 75.
|
507 |
"Cost($)": 0.0
|
508 |
},
|
509 |
"AQuA": {
|
510 |
-
"Score": 60.
|
511 |
"Cost($)": 0.0
|
512 |
}
|
513 |
},
|
@@ -518,11 +518,11 @@
|
|
518 |
"Eval Date": "2025/1/22"
|
519 |
},
|
520 |
"gsm8k": {
|
521 |
-
"Score": 75.
|
522 |
"Cost($)": 0.0
|
523 |
},
|
524 |
"AQuA": {
|
525 |
-
"Score": 53.
|
526 |
"Cost($)": 0.0
|
527 |
}
|
528 |
},
|
@@ -533,11 +533,11 @@
|
|
533 |
"Eval Date": "2025/1/22"
|
534 |
},
|
535 |
"gsm8k": {
|
536 |
-
"Score": 11.
|
537 |
"Cost($)": 0.0
|
538 |
},
|
539 |
"AQuA": {
|
540 |
-
"Score": 47.
|
541 |
"Cost($)": 0.0
|
542 |
}
|
543 |
},
|
@@ -582,7 +582,7 @@
|
|
582 |
"Cost($)": 0.0
|
583 |
},
|
584 |
"AQuA": {
|
585 |
-
"Score": 52.
|
586 |
"Cost($)": 0.0
|
587 |
}
|
588 |
},
|
@@ -608,7 +608,7 @@
|
|
608 |
"Eval Date": "2025/1/22"
|
609 |
},
|
610 |
"gsm8k": {
|
611 |
-
"Score": 16.
|
612 |
"Cost($)": 0.0
|
613 |
},
|
614 |
"AQuA": {
|
@@ -623,7 +623,7 @@
|
|
623 |
"Eval Date": "2025/1/22"
|
624 |
},
|
625 |
"gsm8k": {
|
626 |
-
"Score": 24.
|
627 |
"Cost($)": 0.0
|
628 |
},
|
629 |
"AQuA": {
|
@@ -638,11 +638,11 @@
|
|
638 |
"Eval Date": "2025/1/22"
|
639 |
},
|
640 |
"gsm8k": {
|
641 |
-
"Score": 18.
|
642 |
"Cost($)": 0.0
|
643 |
},
|
644 |
"AQuA": {
|
645 |
-
"Score": 30.
|
646 |
"Cost($)": 0.0
|
647 |
}
|
648 |
},
|
@@ -653,7 +653,7 @@
|
|
653 |
"Eval Date": "2025/1/22"
|
654 |
},
|
655 |
"gsm8k": {
|
656 |
-
"Score": 55.
|
657 |
"Cost($)": 0.0
|
658 |
},
|
659 |
"AQuA": {
|
@@ -683,11 +683,11 @@
|
|
683 |
"Eval Date": "2025/1/22"
|
684 |
},
|
685 |
"gsm8k": {
|
686 |
-
"Score": 14.
|
687 |
"Cost($)": 0.0
|
688 |
},
|
689 |
"AQuA": {
|
690 |
-
"Score": 27.
|
691 |
"Cost($)": 0.0
|
692 |
}
|
693 |
},
|
@@ -698,11 +698,11 @@
|
|
698 |
"Eval Date": "2025/1/22"
|
699 |
},
|
700 |
"gsm8k": {
|
701 |
-
"Score": 7.
|
702 |
"Cost($)": 0.0
|
703 |
},
|
704 |
"AQuA": {
|
705 |
-
"Score": 24.
|
706 |
"Cost($)": 0.0
|
707 |
}
|
708 |
},
|
@@ -728,7 +728,7 @@
|
|
728 |
"Eval Date": "2025/1/22"
|
729 |
},
|
730 |
"gsm8k": {
|
731 |
-
"Score": 35.
|
732 |
"Cost($)": 0.0
|
733 |
},
|
734 |
"AQuA": {
|
@@ -747,7 +747,7 @@
|
|
747 |
"Cost($)": 0.0
|
748 |
},
|
749 |
"AQuA": {
|
750 |
-
"Score": 30.
|
751 |
"Cost($)": 0.0
|
752 |
}
|
753 |
}
|
|
|
1 |
{
|
2 |
+
"time": "2025-01-23 11:23:17",
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"META": {
|
|
|
12 |
"Cost($)": 0.3328
|
13 |
},
|
14 |
"AQuA": {
|
15 |
+
"Score": 38.98,
|
16 |
"Cost($)": 0.038
|
17 |
}
|
18 |
},
|
|
|
23 |
"Eval Date": "2025/1/7"
|
24 |
},
|
25 |
"gsm8k": {
|
26 |
+
"Score": 74.91,
|
27 |
"Cost($)": 3.4633
|
28 |
},
|
29 |
"AQuA": {
|
30 |
+
"Score": 64.57,
|
31 |
"Cost($)": 0.4928
|
32 |
}
|
33 |
},
|
|
|
38 |
"Eval Date": "2025/1/7"
|
39 |
},
|
40 |
"gsm8k": {
|
41 |
+
"Score": 76.88,
|
42 |
"Cost($)": 0.6902
|
43 |
},
|
44 |
"AQuA": {
|
45 |
+
"Score": 59.45,
|
46 |
"Cost($)": 0.1748
|
47 |
}
|
48 |
},
|
|
|
53 |
"Eval Date": "2025/1/7"
|
54 |
},
|
55 |
"gsm8k": {
|
56 |
+
"Score": 78.7,
|
57 |
"Cost($)": 0.6788
|
58 |
},
|
59 |
"AQuA": {
|
|
|
98 |
"Eval Date": "2025/1/7"
|
99 |
},
|
100 |
"gsm8k": {
|
101 |
+
"Score": 85.6,
|
102 |
"Cost($)": 0.2512
|
103 |
},
|
104 |
"AQuA": {
|
105 |
+
"Score": 77.56,
|
106 |
"Cost($)": 0.0445
|
107 |
}
|
108 |
},
|
|
|
113 |
"Eval Date": "2025/1/7"
|
114 |
},
|
115 |
"gsm8k": {
|
116 |
+
"Score": 79.61,
|
117 |
"Cost($)": 0.0576
|
118 |
},
|
119 |
"AQuA": {
|
|
|
132 |
"Cost($)": 0.0558
|
133 |
},
|
134 |
"AQuA": {
|
135 |
+
"Score": 82.68,
|
136 |
"Cost($)": 0.0066
|
137 |
}
|
138 |
},
|
|
|
173 |
"Eval Date": "2025/1/22"
|
174 |
},
|
175 |
"gsm8k": {
|
176 |
+
"Score": 63.31,
|
177 |
"Cost($)": 39.0751
|
178 |
},
|
179 |
"AQuA": {
|
|
|
192 |
"Cost($)": 4.2166
|
193 |
},
|
194 |
"AQuA": {
|
195 |
+
"Score": 75.2,
|
196 |
"Cost($)": 1.6087
|
197 |
}
|
198 |
},
|
|
|
203 |
"Eval Date": "2025/1/22"
|
204 |
},
|
205 |
"gsm8k": {
|
206 |
+
"Score": 94.09,
|
207 |
"Cost($)": 4.5367
|
208 |
},
|
209 |
"AQuA": {
|
210 |
+
"Score": 82.68,
|
211 |
"Cost($)": 1.0417
|
212 |
}
|
213 |
},
|
|
|
252 |
"Cost($)": 10.5479
|
253 |
},
|
254 |
"AQuA": {
|
255 |
+
"Score": 73.23,
|
256 |
"Cost($)": 0.3177
|
257 |
}
|
258 |
},
|
|
|
267 |
"Cost($)": 0.7054
|
268 |
},
|
269 |
"AQuA": {
|
270 |
+
"Score": 75.2,
|
271 |
"Cost($)": 0.1645
|
272 |
}
|
273 |
},
|
|
|
297 |
"Cost($)": 4.2651
|
298 |
},
|
299 |
"AQuA": {
|
300 |
+
"Score": 85.83,
|
301 |
"Cost($)": 0.5576
|
302 |
}
|
303 |
},
|
|
|
308 |
"Eval Date": "2025/1/22"
|
309 |
},
|
310 |
"gsm8k": {
|
311 |
+
"Score": 92.27,
|
312 |
"Cost($)": 0.4709
|
313 |
},
|
314 |
"AQuA": {
|
315 |
+
"Score": 82.68,
|
316 |
"Cost($)": 0.0798
|
317 |
}
|
318 |
},
|
|
|
338 |
"Eval Date": "2025/1/22"
|
339 |
},
|
340 |
"gsm8k": {
|
341 |
+
"Score": 73.09,
|
342 |
"Cost($)": 0.9736
|
343 |
},
|
344 |
"AQuA": {
|
345 |
+
"Score": 79.53,
|
346 |
"Cost($)": 0.1746
|
347 |
}
|
348 |
},
|
|
|
398 |
"Eval Date": "2025/1/22"
|
399 |
},
|
400 |
"gsm8k": {
|
401 |
+
"Score": 82.87,
|
402 |
"Cost($)": 0.0
|
403 |
},
|
404 |
"AQuA": {
|
405 |
+
"Score": 74.41,
|
406 |
"Cost($)": 0.0
|
407 |
}
|
408 |
},
|
|
|
432 |
"Cost($)": 0.0
|
433 |
},
|
434 |
"AQuA": {
|
435 |
+
"Score": 80.71,
|
436 |
"Cost($)": 0.0
|
437 |
}
|
438 |
},
|
|
|
447 |
"Cost($)": 0.0
|
448 |
},
|
449 |
"AQuA": {
|
450 |
+
"Score": 81.5,
|
451 |
"Cost($)": 0.0
|
452 |
}
|
453 |
},
|
|
|
473 |
"Eval Date": "2025/1/22"
|
474 |
},
|
475 |
"gsm8k": {
|
476 |
+
"Score": 67.78,
|
477 |
"Cost($)": 0.0
|
478 |
},
|
479 |
"AQuA": {
|
|
|
488 |
"Eval Date": "2025/1/22"
|
489 |
},
|
490 |
"gsm8k": {
|
491 |
+
"Score": 38.67,
|
492 |
"Cost($)": 0.0
|
493 |
},
|
494 |
"AQuA": {
|
|
|
503 |
"Eval Date": "2025/1/22"
|
504 |
},
|
505 |
"gsm8k": {
|
506 |
+
"Score": 75.44,
|
507 |
"Cost($)": 0.0
|
508 |
},
|
509 |
"AQuA": {
|
510 |
+
"Score": 60.63,
|
511 |
"Cost($)": 0.0
|
512 |
}
|
513 |
},
|
|
|
518 |
"Eval Date": "2025/1/22"
|
519 |
},
|
520 |
"gsm8k": {
|
521 |
+
"Score": 75.21,
|
522 |
"Cost($)": 0.0
|
523 |
},
|
524 |
"AQuA": {
|
525 |
+
"Score": 53.15,
|
526 |
"Cost($)": 0.0
|
527 |
}
|
528 |
},
|
|
|
533 |
"Eval Date": "2025/1/22"
|
534 |
},
|
535 |
"gsm8k": {
|
536 |
+
"Score": 11.6,
|
537 |
"Cost($)": 0.0
|
538 |
},
|
539 |
"AQuA": {
|
540 |
+
"Score": 47.64,
|
541 |
"Cost($)": 0.0
|
542 |
}
|
543 |
},
|
|
|
582 |
"Cost($)": 0.0
|
583 |
},
|
584 |
"AQuA": {
|
585 |
+
"Score": 52.76,
|
586 |
"Cost($)": 0.0
|
587 |
}
|
588 |
},
|
|
|
608 |
"Eval Date": "2025/1/22"
|
609 |
},
|
610 |
"gsm8k": {
|
611 |
+
"Score": 16.68,
|
612 |
"Cost($)": 0.0
|
613 |
},
|
614 |
"AQuA": {
|
|
|
623 |
"Eval Date": "2025/1/22"
|
624 |
},
|
625 |
"gsm8k": {
|
626 |
+
"Score": 24.87,
|
627 |
"Cost($)": 0.0
|
628 |
},
|
629 |
"AQuA": {
|
|
|
638 |
"Eval Date": "2025/1/22"
|
639 |
},
|
640 |
"gsm8k": {
|
641 |
+
"Score": 18.5,
|
642 |
"Cost($)": 0.0
|
643 |
},
|
644 |
"AQuA": {
|
645 |
+
"Score": 30.71,
|
646 |
"Cost($)": 0.0
|
647 |
}
|
648 |
},
|
|
|
653 |
"Eval Date": "2025/1/22"
|
654 |
},
|
655 |
"gsm8k": {
|
656 |
+
"Score": 55.5,
|
657 |
"Cost($)": 0.0
|
658 |
},
|
659 |
"AQuA": {
|
|
|
683 |
"Eval Date": "2025/1/22"
|
684 |
},
|
685 |
"gsm8k": {
|
686 |
+
"Score": 14.71,
|
687 |
"Cost($)": 0.0
|
688 |
},
|
689 |
"AQuA": {
|
690 |
+
"Score": 27.17,
|
691 |
"Cost($)": 0.0
|
692 |
}
|
693 |
},
|
|
|
698 |
"Eval Date": "2025/1/22"
|
699 |
},
|
700 |
"gsm8k": {
|
701 |
+
"Score": 7.66,
|
702 |
"Cost($)": 0.0
|
703 |
},
|
704 |
"AQuA": {
|
705 |
+
"Score": 24.02,
|
706 |
"Cost($)": 0.0
|
707 |
}
|
708 |
},
|
|
|
728 |
"Eval Date": "2025/1/22"
|
729 |
},
|
730 |
"gsm8k": {
|
731 |
+
"Score": 35.94,
|
732 |
"Cost($)": 0.0
|
733 |
},
|
734 |
"AQuA": {
|
|
|
747 |
"Cost($)": 0.0
|
748 |
},
|
749 |
"AQuA": {
|
750 |
+
"Score": 30.71,
|
751 |
"Cost($)": 0.0
|
752 |
}
|
753 |
}
|
src/overall_results.csv
CHANGED
@@ -3,49 +3,49 @@ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA
|
|
3 |
2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
|
4 |
3.0,SC-CoT,gpt-4o,2025/1/22,89.47,90.75,24.2428,88.19,6.2412
|
5 |
4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
|
6 |
-
5.0,CoT,gpt-4o,2025/1/22,88.
|
7 |
-
6.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,88.25,90.67,4.2651,85.
|
8 |
-
7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.
|
9 |
-
8.0,CoT,Doubao-lite-32k,2025/1/7,
|
10 |
9.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
|
11 |
-
10.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,84.91,88.32,0.0,81.
|
12 |
-
11.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.
|
13 |
-
12.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.
|
14 |
13.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
|
15 |
-
14.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.
|
16 |
15.0,SC-CoT,Doubao-lite-32k,2025/1/7,82.60,83.7,0.155,81.5,0.0347
|
17 |
16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
|
18 |
-
17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.
|
19 |
-
18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.
|
20 |
-
19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.
|
21 |
20.0,SC-CoT,gpt-3.5-turbo,2025/1/7,76.52,82.56,2.6285,70.47,0.5578
|
22 |
-
21.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.
|
23 |
-
22.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.
|
24 |
23.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
|
25 |
-
24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.
|
26 |
-
25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.
|
27 |
-
26.0,PoT,gpt-3.5-turbo,2025/1/7,68.
|
28 |
-
27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.
|
29 |
28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
|
30 |
-
29.0,CoT,Internllm2_5-7B,2025/1/22,65.
|
31 |
-
30.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,64.
|
32 |
31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
|
33 |
-
32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.
|
34 |
-
33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.
|
35 |
34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
|
36 |
-
35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.
|
37 |
36.0,SC-CoT,Internllm2_5-7B,2025/1/22,38.62,41.39,0.0,35.85,0.0
|
38 |
-
37.0,IO,gpt-3.5-turbo,2025/1/7,38.
|
39 |
-
38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.
|
40 |
39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
|
41 |
40.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,37.23,33.51,0.0,40.94,0.0
|
42 |
-
41.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,34.
|
43 |
-
42.0,IO,Internllm2_5-7B,2025/1/22,29.
|
44 |
-
43.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,25.23,24.
|
45 |
-
44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.
|
46 |
-
45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.
|
47 |
-
46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.
|
48 |
47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.92,5.53,0.0,30.31,0.0
|
49 |
-
48.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,17.25,3.79,0.0,30.
|
50 |
-
49.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.
|
51 |
50.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
|
|
|
3 |
2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
|
4 |
3.0,SC-CoT,gpt-4o,2025/1/22,89.47,90.75,24.2428,88.19,6.2412
|
5 |
4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
|
6 |
+
5.0,CoT,gpt-4o,2025/1/22,88.39,94.09,4.5367,82.68,1.0417
|
7 |
+
6.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,88.25,90.67,4.2651,85.83,0.5576
|
8 |
+
7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.48,92.27,0.4709,82.68,0.0798
|
9 |
+
8.0,CoT,Doubao-lite-32k,2025/1/7,86.00,89.31,0.0558,82.68,0.0066
|
10 |
9.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
|
11 |
+
10.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,84.91,88.32,0.0,81.5,0.0
|
12 |
+
11.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.2,1.6087
|
13 |
+
12.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.2,0.1645
|
14 |
13.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
|
15 |
+
14.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.71,0.0
|
16 |
15.0,SC-CoT,Doubao-lite-32k,2025/1/7,82.60,83.7,0.155,81.5,0.0347
|
17 |
16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
|
18 |
+
17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.58,85.6,0.2512,77.56,0.0445
|
19 |
+
18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.25,87.26,10.5479,73.23,0.3177
|
20 |
+
19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.64,82.87,0.0,74.41,0.0
|
21 |
20.0,SC-CoT,gpt-3.5-turbo,2025/1/7,76.52,82.56,2.6285,70.47,0.5578
|
22 |
+
21.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.31,73.09,0.9736,79.53,0.1746
|
23 |
+
22.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.61,0.0576,71.65,0.0147
|
24 |
23.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
|
25 |
+
24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.7,0.6788,61.02,0.0957
|
26 |
+
25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.74,74.91,3.4633,64.57,0.4928
|
27 |
+
26.0,PoT,gpt-3.5-turbo,2025/1/7,68.17,76.88,0.6902,59.45,0.1748
|
28 |
+
27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.04,75.44,0.0,60.63,0.0
|
29 |
28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
|
30 |
+
29.0,CoT,Internllm2_5-7B,2025/1/22,65.24,77.71,0.0,52.76,0.0
|
31 |
+
30.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,64.18,75.21,0.0,53.15,0.0
|
32 |
31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
|
33 |
+
32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.65,67.78,0.0,55.51,0.0
|
34 |
+
33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.40,63.31,39.0751,57.48,2.304
|
35 |
34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
|
36 |
+
35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.03,55.5,0.0,40.55,0.0
|
37 |
36.0,SC-CoT,Internllm2_5-7B,2025/1/22,38.62,41.39,0.0,35.85,0.0
|
38 |
+
37.0,IO,gpt-3.5-turbo,2025/1/7,38.41,37.83,0.3328,38.98,0.038
|
39 |
+
38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.67,0.0,36.61,0.0
|
40 |
39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
|
41 |
40.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,37.23,33.51,0.0,40.94,0.0
|
42 |
+
41.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,34.51,35.94,0.0,33.07,0.0
|
43 |
+
42.0,IO,Internllm2_5-7B,2025/1/22,29.62,11.6,0.0,47.64,0.0
|
44 |
+
43.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,25.23,24.87,0.0,25.59,0.0
|
45 |
+
44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.61,18.5,0.0,30.71,0.0
|
46 |
+
45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.91,16.68,0.0,29.13,0.0
|
47 |
+
46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.94,14.71,0.0,27.17,0.0
|
48 |
47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.92,5.53,0.0,30.31,0.0
|
49 |
+
48.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,17.25,3.79,0.0,30.71,0.0
|
50 |
+
49.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.84,7.66,0.0,24.02,0.0
|
51 |
50.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
|
src/record.csv
CHANGED
@@ -1,148 +1,148 @@
|
|
1 |
-
๏ปฟAlgorithm,
|
2 |
-
IO,gsm8k,gpt-3.5-turbo,37.83,99.92,8
|
3 |
-
IO,gsm8k,Doubao-lite-32k,72.02,99.92,8
|
4 |
-
IO,gsm8k,gpt-4o,88.4,100,8
|
5 |
-
IO,gsm8k,Qwen2.5-72B-Instruct,86.58,100,8
|
6 |
-
IO,gsm8k,Llama-3.3-70B-Instruct,92.
|
7 |
-
IO,gsm8k,Qwen2.5-7B-Instruct,57.24,100,8
|
8 |
-
IO,gsm8k,Llama-3.1-8B-Instruct,57.16,99.
|
9 |
-
IO,gsm8k,Internllm2_5-7B,11.
|
10 |
-
IO,gsm8k,Qwen2-1.5B-Instruct,16.
|
11 |
-
IO,gsm8k,Qwen2-0.5B-Instruct,14.
|
12 |
-
ReAct-Pro*,gsm8k,gpt-3.5-turbo,74.
|
13 |
-
ReAct-Pro*,gsm8k,Doubao-lite-32k,85.
|
14 |
-
ReAct-Pro*,gsm8k,gpt-4o,63.
|
15 |
-
ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,87.26,100,8,
|
16 |
-
ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,87.64,99.92
|
17 |
-
ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,82.
|
18 |
-
ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,67.
|
19 |
-
ReAct-Pro*,gsm8k,Internllm2_5-7B,33.51,97.95
|
20 |
-
ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,24.
|
21 |
-
ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,7.
|
22 |
-
PoT,gsm8k,gpt-3.5-turbo,76.
|
23 |
-
PoT,gsm8k,Doubao-lite-32k,79.
|
24 |
-
PoT,gsm8k,gpt-4o,93.1,99.77,8
|
25 |
-
PoT,gsm8k,Qwen2.5-72B-Instruct,92.34,99.39,8
|
26 |
-
PoT,gsm8k,Llama-3.3-70B-Instruct,73.
|
27 |
-
PoT,gsm8k,Qwen2.5-7B-Instruct,58.83,70.
|
28 |
-
PoT,gsm8k,Llama-3.1-8B-Instruct,38.
|
29 |
-
PoT,gsm8k,Internllm2_5-7B,38.21,48.9,8
|
30 |
-
PoT,gsm8k,Qwen2-1.5B-Instruct,18.
|
31 |
-
PoT,gsm8k,Qwen2-0.5B-Instruct,9.62,16.9,8
|
32 |
-
CoT,gsm8k,gpt-3.5-turbo,78.
|
33 |
-
CoT,gsm8k,Doubao-lite-32k,89.31,100,8
|
34 |
-
CoT,gsm8k,gpt-4o,94.
|
35 |
-
CoT,gsm8k,Qwen2.5-72B-Instruct,92.87,100,8
|
36 |
-
CoT,gsm8k,Llama-3.3-70B-Instruct,93.93,100,8
|
37 |
-
CoT,gsm8k,Qwen2.5-7B-Instruct,85.67,100,8
|
38 |
-
CoT,gsm8k,Llama-3.1-8B-Instruct,75.
|
39 |
-
CoT,gsm8k,Internllm2_5-7B,77.71,99.
|
40 |
-
CoT,gsm8k,Qwen2-1.5B-Instruct,55.
|
41 |
-
CoT,gsm8k,Qwen2-0.5B-Instruct,35.
|
42 |
-
SC-CoT,gsm8k,gpt-3.5-turbo,82.56,99.85,8,
|
43 |
-
SC-CoT,gsm8k,Doubao-lite-32k,83.7,99.
|
44 |
-
SC-CoT,gsm8k,gpt-4o,90.75,100,8,
|
45 |
-
SC-CoT,gsm8k,Qwen2.5-72B-Instruct,90.67,100,8,
|
46 |
-
SC-CoT,gsm8k,Llama-3.3-70B-Instruct,95.45,100,8,
|
47 |
-
SC-CoT,gsm8k,Qwen2.5-7B-Instruct,88.32,99.
|
48 |
-
SC-CoT,gsm8k,Llama-3.1-8B-Instruct,75.
|
49 |
-
SC-CoT,gsm8k,Internllm2_5-7B,41.39,98.
|
50 |
-
SC-CoT,gsm8k,Qwen2-1.5B-Instruct,5.53,86.73,8,
|
51 |
-
SC-CoT,gsm8k,Qwen2-0.5B-Instruct,3.79,94.84,8,
|
52 |
-
IO,AQuA,gpt-3.5-turbo,38.
|
53 |
-
IO,AQuA,Doubao-lite-32k,79.13,100,0
|
54 |
-
IO,AQuA,gpt-4o,75.59,97.24,0
|
55 |
-
IO,AQuA,Qwen2.5-72B-Instruct,84.25,99.
|
56 |
-
IO,AQuA,Llama-3.3-70B-Instruct,82.
|
57 |
-
IO,AQuA,Qwen2.5-7B-Instruct,78.74,98.
|
58 |
-
IO,AQuA,Llama-3.1-8B-Instruct,51.18,98.
|
59 |
-
IO,AQuA,Internllm2_5-7B,47.
|
60 |
-
IO,AQuA,Qwen2-1.5B-Instruct,29.13,97.
|
61 |
-
IO,AQuA,Qwen2-0.5B-Instruct,27.
|
62 |
-
CoT,AQuA,gpt-3.5-turbo,61.02,93.7,0
|
63 |
-
CoT,AQuA,Doubao-lite-32k,82.
|
64 |
-
CoT,AQuA,gpt-4o,82.
|
65 |
-
CoT,AQuA,Qwen2.5-72B-Instruct,86.22,99.21,0
|
66 |
-
CoT,AQuA,Llama-3.3-70B-Instruct,83.46,98.
|
67 |
-
CoT,AQuA,Qwen2.5-7B-Instruct,80.
|
68 |
-
CoT,AQuA,Llama-3.1-8B-Instruct,60.
|
69 |
-
CoT,AQuA,Internllm2_5-7B,52.
|
70 |
-
CoT,AQuA,Qwen2-1.5B-Instruct,40.55,98.
|
71 |
-
CoT,AQuA,Qwen2-0.5B-Instruct,33.07,98.
|
72 |
-
PoT,AQuA,gpt-3.5-turbo,59.
|
73 |
-
PoT,AQuA,
|
74 |
-
PoT,AQuA,
|
75 |
-
PoT,AQuA,Qwen2.5-72B-Instruct,75.
|
76 |
-
PoT,AQuA,Llama-3.3-70B-Instruct,79.
|
77 |
-
PoT,AQuA,Qwen2.5-7B-Instruct,68.11,100,0
|
78 |
-
PoT,AQuA,Llama-3.1-8B-Instruct,36.61,96.85,0
|
79 |
-
PoT,AQuA,Internllm2_5-7B,36.61,98.
|
80 |
-
PoT,AQuA,Qwen2-1.5B-Instruct,30.
|
81 |
-
PoT,AQuA,Qwen2-0.5B-Instruct,17.32,92.
|
82 |
-
SC-CoT,AQuA,gpt-3.5-turbo,70.47,98.82,0,
|
83 |
-
SC-CoT,AQuA,Doubao-lite-32k,81.5,97.64,0,
|
84 |
-
SC-CoT,AQuA,gpt-4o,88.19,100,0,
|
85 |
-
SC-CoT,AQuA,Qwen2.5-72B-Instruct,85.
|
86 |
-
SC-CoT,AQuA,Llama-3.3-70B-Instruct,86.61,99.21,0,
|
87 |
-
SC-CoT,AQuA,Qwen2.5-7B-Instruct,81.
|
88 |
-
SC-CoT,AQuA,Llama-3.1-8B-Instruct,53.
|
89 |
-
SC-CoT,AQuA,Internllm2_5-7B,35.85,98.8,0,
|
90 |
-
SC-CoT,AQuA,Qwen2-1.5B-Instruct,30.31,97.24,0,
|
91 |
-
SC-CoT,AQuA,Qwen2-0.5B-Instruct,30.
|
92 |
-
ReAct-Pro*,AQuA,gpt-3.5-turbo,64.
|
93 |
-
ReAct-Pro*,AQuA,Doubao-lite-32k,77.
|
94 |
-
ReAct-Pro*,AQuA,gpt-4o,57.48,97.24,0,
|
95 |
-
ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,73.
|
96 |
-
ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,79.13,99.
|
97 |
-
ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,74.
|
98 |
-
ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,55.51,96.85,0,
|
99 |
-
ReAct-Pro*,AQuA,Internllm2_5-7B,40.94,96.85,0,
|
100 |
-
ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,25.59,96.06,0,
|
101 |
-
ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,24.
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
|
|
1 |
+
๏ปฟAlgorithm,Dataset,Eval Date,LLM,Score,Pass rate,X-shot,Parameters,Samples,Total input tokens,Average input tokens,Total output tokens,Average output tokens,All tokens,Cost($),Note,,,,,,,,,,,,,,,,,,,
|
2 |
+
IO,gsm8k,2025/1/7,gpt-3.5-turbo,37.83,99.92,8,,1319,"546,990",415,"39,563",30,"586,553",0.3328,,,,,,,,,,,,,,,,,,,,
|
3 |
+
IO,gsm8k,2025/1/7,Doubao-lite-32k,72.02,99.92,8,,1319,"617,377",468,"123,106",93,"740,483",0.0354,0.2590 ๏ผๅ
๏ผ,,,,,,,,,,,,,,,,,,,
|
4 |
+
IO,gsm8k,2025/1/22,gpt-4o,88.4,100,8,,1319,"542,416",411,"199,030",151,"741,446",3.3463,,,,,,,,,,,,,,,,,,,,
|
5 |
+
IO,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,86.58,100,8,,1319,"555,340",421,"313,720",238,"869,060",0.4899,,,,,,,,,,,,,,,,,,,,
|
6 |
+
IO,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,92.27,100,8,,1319,"583,916",443,"251,359",191,"835,275",0.4709,,,,,,,,,,,,,,,,,,,,
|
7 |
+
IO,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,57.24,100,8,,1319,"596,229",452,"291,684",221,"887,913",0.0000,,,,,,,,,,,,,,,,,,,,
|
8 |
+
IO,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,57.16,99.55,8,,1319,"550,941",418,"1,194,488",906,"1,745,429",0.0000,,,,,,,,,,,,,,,,,,,,
|
9 |
+
IO,gsm8k,2025/1/22,Internllm2_5-7B,11.6,97.95,8,,1319,"679,302",515,"434,426",329,"1,113,728",0.0000,,,,,,,,,,,,,,,,,,,,
|
10 |
+
IO,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,16.68,100,8,,1319,"568,530",431,"168,466",128,"736,996",0.0000,,,,,,,,,,,,,,,,,,,,
|
11 |
+
IO,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,14.71,100,8,,1319,"568,116",431,"266,781",202,"834,897",0.0000,,,,,,,,,,,,,,,,,,,,
|
12 |
+
ReAct-Pro*,gsm8k,2025/1/7,gpt-3.5-turbo,74.91,99.39,8,max_steps=10,1319,"6,506,164","4,933","140,122",106,"6,646,286",3.4633,"think-action ๅ็ฌ่ฟๅ,prompt v1",,,,,,,,,,,,,,,,,,,
|
13 |
+
ReAct-Pro*,gsm8k,2025/1/7,Doubao-lite-32k,85.6,99.62,8,max_steps=10,1319,"5,862,016","4,444","136,623",104,"5,998,639",0.2512,"think-action ๅ็ฌ่ฟๅ,prompt v1",,,,,,,,,,,,,,,,,,,
|
14 |
+
ReAct-Pro*,gsm8k,2025/1/22,gpt-4o,63.31,99.55,8,max_steps=10,1319,"14,411,173","10,926","304,714",231,"14,715,887",39.0751,"think-action ๅ็ฌ่ฟๅ,prompt v1",,,,,,,,,,,,,,,,,,,
|
15 |
+
ReAct-Pro*,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,87.26,100,8,max_steps=10,1319,"18,160,983","13,769","549,454",417,"18,710,437",10.5479,,,,,,,,,,,,,,,,,,,,
|
16 |
+
ReAct-Pro*,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,87.64,99.92,8,max_steps=10,1319,"17,038,928","12,918","898,936",682,"17,937,864",10.1124,,,,,,,,,,,,,,,,,,,,
|
17 |
+
ReAct-Pro*,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,82.87,100,8,max_steps=10,1319,"14,355,752","10,884","495,162",375,"14,850,914",0.0000,,,,,,,,,,,,,,,,,,,,
|
18 |
+
ReAct-Pro*,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,67.78,98.56,8,max_steps=10,1319,"21,044,978","15,955","1,790,789","1,358","22,835,767",0.0000,,,,,,,,,,,,,,,,,,,,
|
19 |
+
ReAct-Pro*,gsm8k,2025/1/22,Internllm2_5-7B,33.51,97.95,8,max_steps=10,1319,"30,120,070","22,836","5,549,919","4,208","35,669,989",0.0000,,,,,,,,,,,,,,,,,,,,
|
20 |
+
ReAct-Pro*,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,24.87,80.21,8,max_steps=10,1319,"9,133,603","6,925","694,398",526,"9,828,001",0.0000,,,,,,,,,,,,,,,,,,,,
|
21 |
+
ReAct-Pro*,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,7.66,95.22,8,max_steps=10,1319,"52,431,343","39,751","2,961,268","2,245","55,392,611",0.0000,,,,,,,,,,,,,,,,,,,,
|
22 |
+
PoT,gsm8k,2025/1/7,gpt-3.5-turbo,76.88,99.24,8,,1319,"1,090,418",827,"96,662",73,"1,187,080",0.6902,,,,,,,,,,,,,,,,,,,,
|
23 |
+
PoT,gsm8k,2025/1/7,Doubao-lite-32k,79.61,92.57,8,,1319,"1,170,038",887,"118,017",89,"1,288,055",0.0576,,,,,,,,,,,,,,,,,,,,
|
24 |
+
PoT,gsm8k,2025/1/22,gpt-4o,93.1,99.77,8,,1319,"1,101,672",835,"146,240",111,"1,247,912",4.2166,,,,,,,,,,,,,,,,,,,,
|
25 |
+
PoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,92.34,99.39,8,,1319,"1,106,682",839,"144,528",110,"1,251,210",0.7054,,,,,,,,,,,,,,,,,,,,
|
26 |
+
PoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,73.09,79.61,8,,1319,"1,126,025",854,"601,019",456,"1,727,044",0.9736,,,,,,,,,,,,,,,,,,,,
|
27 |
+
PoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,58.83,70.51,8,,1319,"1,145,390",868,"217,432",165,"1,362,822",0.0000,,,,,,,,,,,,,,,,,,,,
|
28 |
+
PoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,38.67,55.42,8,,1319,"1,147,538",870,"243,573",185,"1,391,111",0.0000,,,,,,,,,,,,,,,,,,,,
|
29 |
+
PoT,gsm8k,2025/1/22,Internllm2_5-7B,38.21,48.9,8,,1319,"1,136,843",862,"188,106",143,"1,324,949",0.0000,,,,,,,,,,,,,,,,,,,,
|
30 |
+
PoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,18.5,31.01,8,,1319,"1,151,528",873,"175,994",133,"1,327,522",0.0000,,,,,,,,,,,,,,,,,,,,
|
31 |
+
PoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,9.62,16.9,8,,1319,"1,151,528",873,"237,607",180,"1,389,135",0.0000,,,,,,,,,,,,,,,,,,,,
|
32 |
+
CoT,gsm8k,2025/1/7,gpt-3.5-turbo,78.7,100,8,,1319,"953,242",723,"134,799",102,"1,088,041",0.6788,,,,,,,,,,,,,,,,,,,,
|
33 |
+
CoT,gsm8k,2025/1/7,Doubao-lite-32k,89.31,100,8,,1319,"1,042,095",790,"159,725",121,"1,201,820",0.0558,0.4084635 ๏ผๅ
๏ผ,,,,,,,,,,,,,,,,,,,
|
34 |
+
CoT,gsm8k,2025/1/22,gpt-4o,94.09,100,8,,1319,"948,668",719,"216,498",164,"1,165,166",4.5367,,,,,,,,,,,,,,,,,,,,
|
35 |
+
CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,92.87,100,8,,1319,"1,005,119",762,"271,133",206,"1,276,252",0.7195,,,,,,,,,,,,,,,,,,,,
|
36 |
+
CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,93.93,100,8,,1319,"990,168",751,"228,497",173,"1,218,665",0.6870,,,,,,,,,,,,,,,,,,,,
|
37 |
+
CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,85.67,100,8,,1319,"1,046,008",793,"244,797",186,"1,290,805",0.0000,,,,,,,,,,,,,,,,,,,,
|
38 |
+
CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,75.44,99.92,8,,1319,"990,168",751,"258,161",196,"1,248,329",0.0000,,,,,,,,,,,,,,,,,,,,
|
39 |
+
CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.7,8,,1319,"968,163",734,"234,000",177,"1,202,163",0.0000,,,,,,,,,,,,,,,,,,,,
|
40 |
+
CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.5,100,8,,1319,"1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,
|
41 |
+
CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,1319,"1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,
|
42 |
+
SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,82.56,99.85,8,"temperature=1, path_num=5",1319,"1,212,520",919,"1,348,177","1,022","2,560,697",2.6285,,,,,,,,,,,,,,,,,,,,
|
43 |
+
SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,83.7,99.7,8,"temperature=1, path_num=5",1319,"1,230,019",933,"1,277,668",969,"2,507,687",0.1550,,,,,,,,,,,,,,,,,,,,
|
44 |
+
SC-CoT,gsm8k,2025/1/22,gpt-4o,90.75,100,8,"temperature=1, path_num=5",1319,"1,168,927",886,"2,132,044","1,616","3,300,971",24.2428,,,,,,,,,,,,,,,,,,,,
|
45 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,90.67,100,8,"temperature=1, path_num=5",1319,"5,292,383","4,012","2,273,254","1,723","7,565,637",4.2651,,,,,,,,,,,,,,,,,,,,
|
46 |
+
SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.45,100,8,"temperature=1, path_num=5",1319,"5,406,763","4,099","2,579,233","1,955","7,985,996",4.5021,,,,,,,,,,,,,,,,,,,,
|
47 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,88.32,99.85,8,"temperature=1, path_num=5",1319,"5,668,252","4,297","2,505,566","1,900","8,173,818",0.0000,,,,,,,,,,,,,,,,,,,,
|
48 |
+
SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,75.21,99.55,8,"temperature=1, path_num=5",1319,"5,334,657","4,044","3,109,546","2,358","8,444,203",0.0000,,,,,,,,,,,,,,,,,,,,
|
49 |
+
SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,41.39,98.26,8,"temperature=1, path_num=5",1319,"6,674,518","5,060","3,350,339","2,540","10,024,857",0.0000,,,,,,,,,,,,,,,,,,,,
|
50 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,5.53,86.73,8,"temperature=1, path_num=5",1319,"5,844,218","4,431","3,117,550","2,364","8,961,768",0.0000,,,,,,,,,,,,,,,,,,,,
|
51 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,3.79,94.84,8,"temperature=1, path_num=5",1319,"6,529,832","4,951","4,003,983","3,036","10,533,815",0.0000,,,,,,,,,,,,,,,,,,,,
|
52 |
+
IO,AQuA,2025/1/7,gpt-3.5-turbo,38.98,100,0,,254,"25,701",101,"16,770",66,"42,471",0.0380,,,,,,,,,,,,,,,,,,,,
|
53 |
+
IO,AQuA,2025/1/7,Doubao-lite-32k,79.13,100,0,,254,"33,058",130,"54,684",215,"87,742",0.0058,0.0427๏ผๅ
๏ผ,,,,,,,,,,,,,,,,,,,
|
54 |
+
IO,AQuA,2025/1/22,gpt-4o,75.59,97.24,0,,254,"25,631",101,"108,121",426,"133,752",1.1453,,,,,,,,,,,,,,,,,,,,
|
55 |
+
IO,AQuA,2025/1/22,Qwen2.5-72B-Instruct,84.25,99.61,0,,254,"25,397",100,"106,207",418,"131,604",0.0742,,,,,,,,,,,,,,,,,,,,
|
56 |
+
IO,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.68,99.21,0,,254,"32,809",129,"108,758",428,"141,567",0.0798,,,,,,,,,,,,,,,,,,,,
|
57 |
+
IO,AQuA,2025/1/22,Qwen2.5-7B-Instruct,78.74,98.43,0,,254,"33,271",131,"104,500",411,"137,771",0.0000,,,,,,,,,,,,,,,,,,,,
|
58 |
+
IO,AQuA,2025/1/22,Llama-3.1-8B-Instruct,51.18,98.82,0,,254,"26,459",104,"106,647",420,"133,106",0.0000,,,,,,,,,,,,,,,,,,,,
|
59 |
+
IO,AQuA,2025/1/22,Internllm2_5-7B,47.64,90.94,0,,254,"50,232",198,"134,809",531,"185,041",0.0000,,,,,,,,,,,,,,,,,,,,
|
60 |
+
IO,AQuA,2025/1/22,Qwen2-1.5B-Instruct,29.13,97.64,0,,254,"27,937",110,"43,110",170,"71,047",0.0000,,,,,,,,,,,,,,,,,,,,
|
61 |
+
IO,AQuA,2025/1/22,Qwen2-0.5B-Instruct,27.17,98.82,0,,254,"27,937",110,"82,478",325,"110,415",0.0000,,,,,,,,,,,,,,,,,,,,
|
62 |
+
CoT,AQuA,2025/1/22,gpt-3.5-turbo,61.02,93.7,0,,254,"25,447",100,"55,346",218,"80,793",0.0957,,,,,,,,,,,,,,,,,,,,
|
63 |
+
CoT,AQuA,2025/1/7,Doubao-lite-32k,82.68,97.24,0,,254,"27,978",110,"66,599",262,"94,577",0.0066,0.0483 ๏ผๅ
๏ผ,,,,,,,,,,,,,,,,,,,
|
64 |
+
CoT,AQuA,2025/1/22,gpt-4o,82.68,98.03,0,,254,"25,123",99,"97,894",385,"123,017",1.0417,,,,,,,,,,,,,,,,,,,,
|
65 |
+
CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,86.22,99.21,0,,254,"25,143",99,"118,146",465,"143,289",0.0808,,,,,,,,,,,,,,,,,,,,
|
66 |
+
CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,83.46,98.43,0,,254,"32,555",128,"131,834",519,"164,389",0.0927,,,,,,,,,,,,,,,,,,,,
|
67 |
+
CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,80.71,99.61,0,,254,"33,017",130,"116,719",460,"149,736",0.0000,,,,,,,,,,,,,,,,,,,,
|
68 |
+
CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,60.63,100,0,,254,"32,555",128,"111,880",440,"144,435",0.0000,,,,,,,,,,,,,,,,,,,,
|
69 |
+
CoT,AQuA,2025/1/22,Internllm2_5-7B,52.76,89.37,0,,254,"26,610",105,"100,910",397,"127,520",0.0000,,,,,,,,,,,,,,,,,,,,
|
70 |
+
CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,40.55,98.82,0,,254,"30,477",120,"79,563",313,"110,040",0.0000,,,,,,,,,,,,,,,,,,,,
|
71 |
+
CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,33.07,98.82,0,,254,"30,477",120,"86,862",342,"117,339",0.0000,,,,,,,,,,,,,,,,,,,,
|
72 |
+
PoT,AQuA,2025/1/7,gpt-3.5-turbo,59.45,100,0,,254,"225,162",886,"41,492",163,"266,654",0.1748,,,,,,,,,,,,,,,,,,,,
|
73 |
+
PoT,AQuA,2025/1/7,Doubao-lite-32k,71.65,96.85,0,,254,"259,863","1,023","49,573",195,"309,436",0.0147,,,,,,,,,,,,,,,,,,,,
|
74 |
+
PoT,AQuA,2025/1/22,gpt-4o,75.2,100,0,,254,"222,717",877,"105,191",414,"327,908",1.6087,,,,,,,,,,,,,,,,,,,,
|
75 |
+
PoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,75.2,100,0,,254,"249,215",981,"42,549",168,"291,764",0.1645,,,,,,,,,,,,,,,,,,,,
|
76 |
+
PoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,79.53,99.21,0,,254,"240,735",948,"69,064",272,"309,799",0.1746,,,,,,,,,,,,,,,,,,,,
|
77 |
+
PoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,68.11,100,0,,254,"264,517","1,041","49,211",194,"313,728",0.0000,,,,,,,,,,,,,,,,,,,,
|
78 |
+
PoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,36.61,96.85,0,,254,"240,613",947,"50,301",198,"290,914",0.0000,,,,,,,,,,,,,,,,,,,,
|
79 |
+
PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270,"301,962",0.0000,,,,,,,,,,,,,,,,,,,,
|
80 |
+
PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,
|
81 |
+
PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,
|
82 |
+
SC-CoT,AQuA,2025/1/7,gpt-3.5-turbo,70.47,98.82,0,"temperature=1, path_num=5",254,"70,157",276,"348,460","1,372","418,617",0.5578,,,,,,,,,,,,,,,,,,,,
|
83 |
+
SC-CoT,AQuA,2025/1/7,Doubao-lite-32k,81.5,97.64,0,"temperature=1, path_num=5",254,"83,830",330,"382,016","1,504","465,846",0.0347,,,,,,,,,,,,,,,,,,,,
|
84 |
+
SC-CoT,AQuA,2025/1/22,gpt-4o,88.19,100,0,"temperature=1, path_num=5",254,"72,916",287,"605,895","2,385","678,811",6.2412,,,,,,,,,,,,,,,,,,,,
|
85 |
+
SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.83,98.43,0,"temperature=1, path_num=5",254,"241,149",949,"747,909","2,945","989,058",0.5576,,,,,,,,,,,,,,,,,,,,
|
86 |
+
SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,86.61,99.21,0,"temperature=1, path_num=5",254,"283,248","1,115","753,876","2,968","1,037,124",0.5847,,,,,,,,,,,,,,,,,,,,
|
87 |
+
SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,81.5,100,0,"temperature=1, path_num=5",254,"278,848","1,098","736,520","2,900","1,015,368",0.0000,,,,,,,,,,,,,,,,,,,,
|
88 |
+
SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,53.15,96.06,0,"temperature=1, path_num=5",254,"372,968","1,468","668,378","2,631","1,041,346",0.0000,,,,,,,,,,,,,,,,,,,,
|
89 |
+
SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,35.85,98.8,0,"temperature=1, path_num=5",254,"530,701","2,089","709,687","2,794","1,240,388",0.0000,,,,,,,,,,,,,,,,,,,,
|
90 |
+
SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.31,97.24,0,"temperature=1, path_num=5",254,"430,703","1,696","726,373","2,860","1,157,076",0.0000,,,,,,,,,,,,,,,,,,,,
|
91 |
+
SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,30.71,98.43,0,"temperature=1, path_num=5",254,"496,206","1,954","729,333","2,871","1,225,539",0.0000,,,,,,,,,,,,,,,,,,,,
|
92 |
+
ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action ๅ็ฌ่ฟๅ,prompt v1",,,,,,,,,,,,,,,,,,,
|
93 |
+
ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action ๅ็ฌ่ฟๅ,prompt v1",,,,,,,,,,,,,,,,,,,
|
94 |
+
ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action ๅ็ฌ่ฟๅ,prompt v1",,,,,,,,,,,,,,,,,,,
|
95 |
+
ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-72B-Instruct,73.23,100,0,max_steps=10,254,"441,765","1,739","121,838",480,"563,603",0.3177,,,,,,,,,,,,,,,,,,,,
|
96 |
+
ReAct-Pro*,AQuA,2025/1/22,Llama-3.3-70B-Instruct,79.13,99.61,0,max_steps=10,254,"1,119,143","4,406","243,236",958,"1,362,379",0.7680,,,,,,,,,,,,,,,,,,,,
|
97 |
+
ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-7B-Instruct,74.41,99.21,0,max_steps=10,254,"564,165","2,221","131,679",518,"695,844",0.0000,,,,,,,,,,,,,,,,,,,,
|
98 |
+
ReAct-Pro*,AQuA,2025/1/22,Llama-3.1-8B-Instruct,55.51,96.85,0,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,,,,,,,,,,,,,,,,,,,,
|
99 |
+
ReAct-Pro*,AQuA,2025/1/22,Internllm2_5-7B,40.94,96.85,0,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,,,,,,,,,,,,,,,,,,,,
|
100 |
+
ReAct-Pro*,AQuA,2025/1/22,Qwen2-1.5B-Instruct,25.59,96.06,0,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,,,,,,,,,,,,,,,,,,,,
|
101 |
+
ReAct-Pro*,AQuA,2025/1/22,Qwen2-0.5B-Instruct,24.02,96.85,0,max_steps=10,254,6344167,"24,977",825920,"3,252","7,170,087",0.0000,,,,,,,,,,,,,,,,,,,,
|
102 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
103 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
104 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
105 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
106 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
107 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
108 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
109 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
110 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
111 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
112 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
113 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
114 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
115 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
116 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
117 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
118 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
119 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
120 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
121 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
122 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
123 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
124 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
125 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
126 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
127 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
128 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
129 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
130 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
131 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
132 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
133 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
134 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
135 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
136 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
137 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
138 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
139 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
140 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
141 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
142 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
143 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
144 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
145 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
146 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
147 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
148 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|