liaojiajia commited on
Commit
be9cdf5
·
1 Parent(s): 52f14c3

update sc-cot scores

Browse files
src/detail_math_score.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "time": "2025-01-23 11:23:17",
3
  "results": {
4
  "IO": {
5
  "gpt-3.5-turbo": {
@@ -971,7 +971,7 @@
971
  },
972
  "gsm8k": {
973
  "Score": 9.62,
974
- "Pass rate": 0.169,
975
  "Cost($)": 0.0,
976
  "Framework": "",
977
  "X-shot": "8.0",
@@ -1337,30 +1337,30 @@
1337
  "Eval Date": "2025/1/7"
1338
  },
1339
  "gsm8k": {
1340
- "Score": 82.56,
1341
- "Pass rate": 0.9985,
1342
- "Cost($)": 2.6285,
1343
  "Framework": "",
1344
  "X-shot": "8.0",
1345
  "Samples": 1319,
1346
- "All tokens": 2560697,
1347
- "Total input tokens": 1212520,
1348
- "Average input tokens": 919,
1349
- "Total output tokens": 1348177,
1350
- "Average output tokens": 1022
1351
  },
1352
  "AQuA": {
1353
- "Score": 70.47,
1354
- "Pass rate": 0.9882,
1355
- "Cost($)": 0.5578,
1356
  "Framework": "",
1357
  "X-shot": "0.0",
1358
  "Samples": 254,
1359
- "All tokens": 418617,
1360
- "Total input tokens": 70157,
1361
- "Average input tokens": 276,
1362
- "Total output tokens": 348460,
1363
- "Average output tokens": 1372
1364
  }
1365
  },
1366
  "Doubao-lite-32k": {
@@ -1370,30 +1370,30 @@
1370
  "Eval Date": "2025/1/7"
1371
  },
1372
  "gsm8k": {
1373
- "Score": 83.7,
1374
- "Pass rate": 0.997,
1375
- "Cost($)": 0.155,
1376
  "Framework": "",
1377
  "X-shot": "8.0",
1378
  "Samples": 1319,
1379
- "All tokens": 2507687,
1380
- "Total input tokens": 1230019,
1381
- "Average input tokens": 933,
1382
- "Total output tokens": 1277668,
1383
- "Average output tokens": 969
1384
  },
1385
  "AQuA": {
1386
- "Score": 81.5,
1387
- "Pass rate": 0.9764,
1388
- "Cost($)": 0.0347,
1389
  "Framework": "",
1390
  "X-shot": "0.0",
1391
  "Samples": 254,
1392
- "All tokens": 465846,
1393
- "Total input tokens": 83830,
1394
- "Average input tokens": 330,
1395
- "Total output tokens": 382016,
1396
- "Average output tokens": 1504
1397
  }
1398
  },
1399
  "gpt-4o": {
@@ -1403,30 +1403,30 @@
1403
  "Eval Date": "2025/1/22"
1404
  },
1405
  "gsm8k": {
1406
- "Score": 90.75,
1407
- "Pass rate": 1.0,
1408
- "Cost($)": 24.2428,
1409
  "Framework": "",
1410
  "X-shot": "8.0",
1411
  "Samples": 1319,
1412
- "All tokens": 3300971,
1413
- "Total input tokens": 1168927,
1414
- "Average input tokens": 886,
1415
- "Total output tokens": 2132044,
1416
- "Average output tokens": 1616
1417
  },
1418
  "AQuA": {
1419
- "Score": 88.19,
1420
- "Pass rate": 1.0,
1421
- "Cost($)": 6.2412,
1422
  "Framework": "",
1423
  "X-shot": "0.0",
1424
  "Samples": 254,
1425
- "All tokens": 678811,
1426
- "Total input tokens": 72916,
1427
- "Average input tokens": 287,
1428
- "Total output tokens": 605895,
1429
- "Average output tokens": 2385
1430
  }
1431
  },
1432
  "Qwen2.5-72B-Instruct": {
@@ -1436,30 +1436,30 @@
1436
  "Eval Date": "2025/1/22"
1437
  },
1438
  "gsm8k": {
1439
- "Score": 90.67,
1440
  "Pass rate": 1.0,
1441
- "Cost($)": 4.2651,
1442
  "Framework": "",
1443
  "X-shot": "8.0",
1444
  "Samples": 1319,
1445
- "All tokens": 7565637,
1446
- "Total input tokens": 5292383,
1447
- "Average input tokens": 4012,
1448
- "Total output tokens": 2273254,
1449
- "Average output tokens": 1723
1450
  },
1451
  "AQuA": {
1452
- "Score": 85.83,
1453
- "Pass rate": 0.9843,
1454
- "Cost($)": 0.5576,
1455
  "Framework": "",
1456
  "X-shot": "0.0",
1457
  "Samples": 254,
1458
- "All tokens": 989058,
1459
- "Total input tokens": 241149,
1460
- "Average input tokens": 949,
1461
- "Total output tokens": 747909,
1462
- "Average output tokens": 2945
1463
  }
1464
  },
1465
  "Llama-3.3-70B-Instruct": {
@@ -1469,30 +1469,30 @@
1469
  "Eval Date": "2025/1/22"
1470
  },
1471
  "gsm8k": {
1472
- "Score": 95.45,
1473
  "Pass rate": 1.0,
1474
- "Cost($)": 4.5021,
1475
  "Framework": "",
1476
  "X-shot": "8.0",
1477
  "Samples": 1319,
1478
- "All tokens": 7985996,
1479
- "Total input tokens": 5406763,
1480
- "Average input tokens": 4099,
1481
- "Total output tokens": 2579233,
1482
- "Average output tokens": 1955
1483
  },
1484
  "AQuA": {
1485
- "Score": 86.61,
1486
  "Pass rate": 0.9921,
1487
- "Cost($)": 0.5847,
1488
  "Framework": "",
1489
  "X-shot": "0.0",
1490
  "Samples": 254,
1491
- "All tokens": 1037124,
1492
- "Total input tokens": 283248,
1493
- "Average input tokens": 1115,
1494
- "Total output tokens": 753876,
1495
- "Average output tokens": 2968
1496
  }
1497
  },
1498
  "Qwen2.5-7B-Instruct": {
@@ -1502,30 +1502,30 @@
1502
  "Eval Date": "2025/1/22"
1503
  },
1504
  "gsm8k": {
1505
- "Score": 88.32,
1506
- "Pass rate": 0.9985,
1507
  "Cost($)": 0.0,
1508
  "Framework": "",
1509
  "X-shot": "8.0",
1510
  "Samples": 1319,
1511
- "All tokens": 8173818,
1512
- "Total input tokens": 5668252,
1513
- "Average input tokens": 4297,
1514
- "Total output tokens": 2505566,
1515
- "Average output tokens": 1900
1516
  },
1517
  "AQuA": {
1518
- "Score": 81.5,
1519
  "Pass rate": 1.0,
1520
  "Cost($)": 0.0,
1521
  "Framework": "",
1522
  "X-shot": "0.0",
1523
  "Samples": 254,
1524
- "All tokens": 1015368,
1525
- "Total input tokens": 278848,
1526
- "Average input tokens": 1098,
1527
- "Total output tokens": 736520,
1528
- "Average output tokens": 2900
1529
  }
1530
  },
1531
  "Llama-3.1-8B-Instruct": {
@@ -1535,30 +1535,30 @@
1535
  "Eval Date": "2025/1/22"
1536
  },
1537
  "gsm8k": {
1538
- "Score": 75.21,
1539
  "Pass rate": 0.9955,
1540
  "Cost($)": 0.0,
1541
  "Framework": "",
1542
  "X-shot": "8.0",
1543
  "Samples": 1319,
1544
- "All tokens": 8444203,
1545
- "Total input tokens": 5334657,
1546
- "Average input tokens": 4044,
1547
- "Total output tokens": 3109546,
1548
- "Average output tokens": 2358
1549
  },
1550
  "AQuA": {
1551
- "Score": 53.15,
1552
- "Pass rate": 0.9606,
1553
  "Cost($)": 0.0,
1554
  "Framework": "",
1555
  "X-shot": "0.0",
1556
  "Samples": 254,
1557
- "All tokens": 1041346,
1558
- "Total input tokens": 372968,
1559
- "Average input tokens": 1468,
1560
- "Total output tokens": 668378,
1561
- "Average output tokens": 2631
1562
  }
1563
  },
1564
  "Internllm2_5-7B": {
@@ -1568,30 +1568,30 @@
1568
  "Eval Date": "2025/1/22"
1569
  },
1570
  "gsm8k": {
1571
- "Score": 41.39,
1572
- "Pass rate": 0.9826,
1573
  "Cost($)": 0.0,
1574
  "Framework": "",
1575
  "X-shot": "8.0",
1576
  "Samples": 1319,
1577
- "All tokens": 10024857,
1578
- "Total input tokens": 6674518,
1579
- "Average input tokens": 5060,
1580
- "Total output tokens": 3350339,
1581
- "Average output tokens": 2540
1582
  },
1583
  "AQuA": {
1584
- "Score": 35.85,
1585
- "Pass rate": 0.988,
1586
  "Cost($)": 0.0,
1587
  "Framework": "",
1588
  "X-shot": "0.0",
1589
  "Samples": 254,
1590
- "All tokens": 1240388,
1591
- "Total input tokens": 530701,
1592
- "Average input tokens": 2089,
1593
- "Total output tokens": 709687,
1594
- "Average output tokens": 2794
1595
  }
1596
  },
1597
  "Qwen2-1.5B-Instruct": {
@@ -1601,30 +1601,30 @@
1601
  "Eval Date": "2025/1/22"
1602
  },
1603
  "gsm8k": {
1604
- "Score": 5.53,
1605
- "Pass rate": 0.8673,
1606
  "Cost($)": 0.0,
1607
  "Framework": "",
1608
  "X-shot": "8.0",
1609
  "Samples": 1319,
1610
- "All tokens": 8961768,
1611
- "Total input tokens": 5844218,
1612
- "Average input tokens": 4431,
1613
- "Total output tokens": 3117550,
1614
- "Average output tokens": 2364
1615
  },
1616
  "AQuA": {
1617
- "Score": 30.31,
1618
- "Pass rate": 0.9724,
1619
  "Cost($)": 0.0,
1620
  "Framework": "",
1621
  "X-shot": "0.0",
1622
  "Samples": 254,
1623
- "All tokens": 1157076,
1624
- "Total input tokens": 430703,
1625
- "Average input tokens": 1696,
1626
- "Total output tokens": 726373,
1627
- "Average output tokens": 2860
1628
  }
1629
  },
1630
  "Qwen2-0.5B-Instruct": {
@@ -1634,30 +1634,30 @@
1634
  "Eval Date": "2025/1/22"
1635
  },
1636
  "gsm8k": {
1637
- "Score": 3.79,
1638
- "Pass rate": 0.9484,
1639
  "Cost($)": 0.0,
1640
  "Framework": "",
1641
  "X-shot": "8.0",
1642
  "Samples": 1319,
1643
- "All tokens": 10533815,
1644
- "Total input tokens": 6529832,
1645
- "Average input tokens": 4951,
1646
- "Total output tokens": 4003983,
1647
- "Average output tokens": 3036
1648
  },
1649
  "AQuA": {
1650
- "Score": 30.71,
1651
- "Pass rate": 0.9843,
1652
  "Cost($)": 0.0,
1653
  "Framework": "",
1654
  "X-shot": "0.0",
1655
  "Samples": 254,
1656
- "All tokens": 1225539,
1657
- "Total input tokens": 496206,
1658
- "Average input tokens": 1954,
1659
- "Total output tokens": 729333,
1660
- "Average output tokens": 2871
1661
  }
1662
  }
1663
  }
 
1
  {
2
+ "time": "2025-01-24 15:10:27",
3
  "results": {
4
  "IO": {
5
  "gpt-3.5-turbo": {
 
971
  },
972
  "gsm8k": {
973
  "Score": 9.62,
974
+ "Pass rate": 0.1691,
975
  "Cost($)": 0.0,
976
  "Framework": "",
977
  "X-shot": "8.0",
 
1337
  "Eval Date": "2025/1/7"
1338
  },
1339
  "gsm8k": {
1340
+ "Score": 79.91,
1341
+ "Pass rate": 0.9992,
1342
+ "Cost($)": 3.3938,
1343
  "Framework": "",
1344
  "X-shot": "8.0",
1345
  "Samples": 1319,
1346
+ "All tokens": 4089612,
1347
+ "Total input tokens": 2740652,
1348
+ "Average input tokens": 2078,
1349
+ "Total output tokens": 1348960,
1350
+ "Average output tokens": 1023
1351
  },
1352
  "AQuA": {
1353
+ "Score": 66.14,
1354
+ "Pass rate": 0.9921,
1355
+ "Cost($)": 0.7888,
1356
  "Framework": "",
1357
  "X-shot": "0.0",
1358
  "Samples": 254,
1359
+ "All tokens": 847335,
1360
+ "Total input tokens": 482192,
1361
+ "Average input tokens": 1898,
1362
+ "Total output tokens": 365143,
1363
+ "Average output tokens": 1438
1364
  }
1365
  },
1366
  "Doubao-lite-32k": {
 
1370
  "Eval Date": "2025/1/7"
1371
  },
1372
  "gsm8k": {
1373
+ "Score": 87.26,
1374
+ "Pass rate": 0.9992,
1375
+ "Cost($)": 0.2083,
1376
  "Framework": "",
1377
  "X-shot": "8.0",
1378
  "Samples": 1319,
1379
+ "All tokens": 3888813,
1380
+ "Total input tokens": 2691714,
1381
+ "Average input tokens": 2041,
1382
+ "Total output tokens": 1197099,
1383
+ "Average output tokens": 908
1384
  },
1385
  "AQuA": {
1386
+ "Score": 81.1,
1387
+ "Pass rate": 0.9724,
1388
+ "Cost($)": 0.0519,
1389
  "Framework": "",
1390
  "X-shot": "0.0",
1391
  "Samples": 254,
1392
+ "All tokens": 885986,
1393
+ "Total input tokens": 503751,
1394
+ "Average input tokens": 1983,
1395
+ "Total output tokens": 382235,
1396
+ "Average output tokens": 1505
1397
  }
1398
  },
1399
  "gpt-4o": {
 
1403
  "Eval Date": "2025/1/22"
1404
  },
1405
  "gsm8k": {
1406
+ "Score": 90.3,
1407
+ "Pass rate": 0.9992,
1408
+ "Cost($)": 31.0542,
1409
  "Framework": "",
1410
  "X-shot": "8.0",
1411
  "Samples": 1319,
1412
+ "All tokens": 5798173,
1413
+ "Total input tokens": 3590336,
1414
+ "Average input tokens": 2722,
1415
+ "Total output tokens": 2207837,
1416
+ "Average output tokens": 1674
1417
  },
1418
  "AQuA": {
1419
+ "Score": 86.61,
1420
+ "Pass rate": 0.9882,
1421
+ "Cost($)": 8.1485,
1422
  "Framework": "",
1423
  "X-shot": "0.0",
1424
  "Samples": 254,
1425
+ "All tokens": 1373206,
1426
+ "Total input tokens": 744478,
1427
+ "Average input tokens": 2931,
1428
+ "Total output tokens": 628728,
1429
+ "Average output tokens": 2475
1430
  }
1431
  },
1432
  "Qwen2.5-72B-Instruct": {
 
1436
  "Eval Date": "2025/1/22"
1437
  },
1438
  "gsm8k": {
1439
+ "Score": 93.86,
1440
  "Pass rate": 1.0,
1441
+ "Cost($)": 5.9858,
1442
  "Framework": "",
1443
  "X-shot": "8.0",
1444
  "Samples": 1319,
1445
+ "All tokens": 10618008,
1446
+ "Total input tokens": 8136223,
1447
+ "Average input tokens": 6168,
1448
+ "Total output tokens": 2481785,
1449
+ "Average output tokens": 1882
1450
  },
1451
  "AQuA": {
1452
+ "Score": 85.04,
1453
+ "Pass rate": 0.9921,
1454
+ "Cost($)": 1.0348,
1455
  "Framework": "",
1456
  "X-shot": "0.0",
1457
  "Samples": 254,
1458
+ "All tokens": 1835669,
1459
+ "Total input tokens": 1051218,
1460
+ "Average input tokens": 4139,
1461
+ "Total output tokens": 784451,
1462
+ "Average output tokens": 3088
1463
  }
1464
  },
1465
  "Llama-3.3-70B-Instruct": {
 
1469
  "Eval Date": "2025/1/22"
1470
  },
1471
  "gsm8k": {
1472
+ "Score": 95.07,
1473
  "Pass rate": 1.0,
1474
+ "Cost($)": 6.2005,
1475
  "Framework": "",
1476
  "X-shot": "8.0",
1477
  "Samples": 1319,
1478
+ "All tokens": 10998794,
1479
+ "Total input tokens": 8413717,
1480
+ "Average input tokens": 6379,
1481
+ "Total output tokens": 2585077,
1482
+ "Average output tokens": 1960
1483
  },
1484
  "AQuA": {
1485
+ "Score": 82.28,
1486
  "Pass rate": 0.9921,
1487
+ "Cost($)": 1.0756,
1488
  "Framework": "",
1489
  "X-shot": "0.0",
1490
  "Samples": 254,
1491
+ "All tokens": 1907924,
1492
+ "Total input tokens": 1135251,
1493
+ "Average input tokens": 4469,
1494
+ "Total output tokens": 772673,
1495
+ "Average output tokens": 3042
1496
  }
1497
  },
1498
  "Qwen2.5-7B-Instruct": {
 
1502
  "Eval Date": "2025/1/22"
1503
  },
1504
  "gsm8k": {
1505
+ "Score": 91.13,
1506
+ "Pass rate": 1.0,
1507
  "Cost($)": 0.0,
1508
  "Framework": "",
1509
  "X-shot": "8.0",
1510
  "Samples": 1319,
1511
+ "All tokens": 11140985,
1512
+ "Total input tokens": 8586888,
1513
+ "Average input tokens": 6510,
1514
+ "Total output tokens": 2554097,
1515
+ "Average output tokens": 1936
1516
  },
1517
  "AQuA": {
1518
+ "Score": 79.92,
1519
  "Pass rate": 1.0,
1520
  "Cost($)": 0.0,
1521
  "Framework": "",
1522
  "X-shot": "0.0",
1523
  "Samples": 254,
1524
+ "All tokens": 1845332,
1525
+ "Total input tokens": 1098280,
1526
+ "Average input tokens": 4324,
1527
+ "Total output tokens": 747052,
1528
+ "Average output tokens": 2941
1529
  }
1530
  },
1531
  "Llama-3.1-8B-Instruct": {
 
1535
  "Eval Date": "2025/1/22"
1536
  },
1537
  "gsm8k": {
1538
+ "Score": 73.46,
1539
  "Pass rate": 0.9955,
1540
  "Cost($)": 0.0,
1541
  "Framework": "",
1542
  "X-shot": "8.0",
1543
  "Samples": 1319,
1544
+ "All tokens": 11778716,
1545
+ "Total input tokens": 8630514,
1546
+ "Average input tokens": 6543,
1547
+ "Total output tokens": 3148202,
1548
+ "Average output tokens": 2387
1549
  },
1550
  "AQuA": {
1551
+ "Score": 59.45,
1552
+ "Pass rate": 0.9724,
1553
  "Cost($)": 0.0,
1554
  "Framework": "",
1555
  "X-shot": "0.0",
1556
  "Samples": 254,
1557
+ "All tokens": 1651333,
1558
+ "Total input tokens": 971003,
1559
+ "Average input tokens": 3823,
1560
+ "Total output tokens": 680330,
1561
+ "Average output tokens": 2678
1562
  }
1563
  },
1564
  "Internllm2_5-7B": {
 
1568
  "Eval Date": "2025/1/22"
1569
  },
1570
  "gsm8k": {
1571
+ "Score": 48.22,
1572
+ "Pass rate": 0.9841,
1573
  "Cost($)": 0.0,
1574
  "Framework": "",
1575
  "X-shot": "8.0",
1576
  "Samples": 1319,
1577
+ "All tokens": 14526431,
1578
+ "Total input tokens": 10678792,
1579
+ "Average input tokens": 8096,
1580
+ "Total output tokens": 3847639,
1581
+ "Average output tokens": 2917
1582
  },
1583
  "AQuA": {
1584
+ "Score": 39.37,
1585
+ "Pass rate": 0.9803,
1586
  "Cost($)": 0.0,
1587
  "Framework": "",
1588
  "X-shot": "0.0",
1589
  "Samples": 254,
1590
+ "All tokens": 2296222,
1591
+ "Total input tokens": 1420494,
1592
+ "Average input tokens": 5592,
1593
+ "Total output tokens": 875728,
1594
+ "Average output tokens": 3448
1595
  }
1596
  },
1597
  "Qwen2-1.5B-Instruct": {
 
1601
  "Eval Date": "2025/1/22"
1602
  },
1603
  "gsm8k": {
1604
+ "Score": 11.75,
1605
+ "Pass rate": 0.9189,
1606
  "Cost($)": 0.0,
1607
  "Framework": "",
1608
  "X-shot": "8.0",
1609
  "Samples": 1319,
1610
+ "All tokens": 12411942,
1611
+ "Total input tokens": 9066115,
1612
+ "Average input tokens": 6873,
1613
+ "Total output tokens": 3345827,
1614
+ "Average output tokens": 2537
1615
  },
1616
  "AQuA": {
1617
+ "Score": 23.62,
1618
+ "Pass rate": 0.9646,
1619
  "Cost($)": 0.0,
1620
  "Framework": "",
1621
  "X-shot": "0.0",
1622
  "Samples": 254,
1623
+ "All tokens": 1775335,
1624
+ "Total input tokens": 1034362,
1625
+ "Average input tokens": 4072,
1626
+ "Total output tokens": 740973,
1627
+ "Average output tokens": 2917
1628
  }
1629
  },
1630
  "Qwen2-0.5B-Instruct": {
 
1634
  "Eval Date": "2025/1/22"
1635
  },
1636
  "gsm8k": {
1637
+ "Score": 1.67,
1638
+ "Pass rate": 0.9469,
1639
  "Cost($)": 0.0,
1640
  "Framework": "",
1641
  "X-shot": "8.0",
1642
  "Samples": 1319,
1643
+ "All tokens": 16465720,
1644
+ "Total input tokens": 11019864,
1645
+ "Average input tokens": 8355,
1646
+ "Total output tokens": 5445856,
1647
+ "Average output tokens": 4129
1648
  },
1649
  "AQuA": {
1650
+ "Score": 22.83,
1651
+ "Pass rate": 0.9724,
1652
  "Cost($)": 0.0,
1653
  "Framework": "",
1654
  "X-shot": "0.0",
1655
  "Samples": 254,
1656
+ "All tokens": 2215091,
1657
+ "Total input tokens": 1246929,
1658
+ "Average input tokens": 4909,
1659
+ "Total output tokens": 968162,
1660
+ "Average output tokens": 3812
1661
  }
1662
  }
1663
  }
src/detail_results.csv CHANGED
@@ -1,16 +1,16 @@
1
  Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
2
- 1,SC-CoT,AQuA,gpt-4o,2025/1/22,88.19,1.0,0.0,6.2412,,254,678811,72916,287,605895,2385
3
- 2,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,86.61,0.9921,0.0,0.5847,,254,1037124,283248,1115,753876,2968
4
- 3,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
5
- 4,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.83,0.9843,0.0,0.5576,,254,989058,241149,949,747909,2945
6
- 5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0.0,0.0742,,254,131604,25397,100,106207,418
7
- 6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0.0,0.0927,,254,164389,32555,128,131834,519
8
- 7,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0.0,0.0798,,254,141567,32809,129,108758,428
9
- 8,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0.0,0.0066,,254,94577,27978,110,66599,262
10
- 9,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0.0,1.0417,,254,123017,25123,99,97894,385
11
- 10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.5,0.9764,0.0,0.0347,,254,465846,83830,330,382016,1504
12
- 11,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,81.5,1.0,0.0,0.0,,254,1015368,278848,1098,736520,2900
13
- 12,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0.0,0.0,,254,149736,33017,130,116719,460
14
  13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
15
  14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
16
  15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0.0,0.768,,254,1362379,1119143,4406,243236,958
@@ -22,60 +22,60 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
22
  21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
23
  22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
24
  23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
25
- 24,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,70.47,0.9882,0.0,0.5578,,254,418617,70157,276,348460,1372
26
- 25,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
27
  26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
28
  27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
29
  28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0.0,0.0,,254,144435,32555,128,111880,440
30
  29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
31
- 30,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
32
- 31,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
33
- 32,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,53.15,0.9606,0.0,0.0,,254,1041346,372968,1468,668378,2631
34
  33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
35
  34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0.0,0.0,,254,133106,26459,104,106647,420
36
  35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
37
  36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
38
  37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0.0,0.0,,254,110040,30477,120,79563,313
39
- 38,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0.0,0.038,,254,42471,25701,101,16770,66
40
- 39,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
41
- 40,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0.0,0.0,,254,301962,233505,919,68457,270
42
- 41,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,35.85,0.988,0.0,0.0,,254,1240388,530701,2089,709687,2794
43
  42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0.0,0.0,,254,117339,30477,120,86862,342
44
  43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0.0,0.0,,254,298475,246560,971,51915,204
45
- 44,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,30.71,0.9843,0.0,0.0,,254,1225539,496206,1954,729333,2871
46
- 45,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.31,0.9724,0.0,0.0,,254,1157076,430703,1696,726373,2860
47
- 46,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0.0,0.0,,254,71047,27937,110,43110,170
48
- 47,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0.0,0.0,,254,110415,27937,110,82478,325
49
- 48,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
50
- 49,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0.0,0.0,,254,7170087,6344167,24977,825920,3252
51
  50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0.0,0.0,,254,322281,258867,1019,63414,250
52
- 1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.45,1.0,8.0,4.5021,,1319,7985996,5406763,4099,2579233,1955
53
  2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
54
  3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
55
- 4,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
56
- 5,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
57
- 6,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
58
- 7,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8.0,0.4709,,1319,835275,583916,443,251359,191
59
- 8,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.75,1.0,8.0,24.2428,,1319,3300971,1168927,886,2132044,1616
60
- 9,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,90.67,1.0,8.0,4.2651,,1319,7565637,5292383,4012,2273254,1723
61
- 10,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
62
- 11,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
63
- 12,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,88.32,0.9985,8.0,0.0,,1319,8173818,5668252,4297,2505566,1900
64
  13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8.0,10.1124,,1319,17937864,17038928,12918,898936,682
65
  14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
66
- 15,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
67
- 16,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
68
- 17,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8.0,0.2512,,1319,5998639,5862016,4444,136623,104
69
- 18,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,83.7,0.997,8.0,0.155,,1319,2507687,1230019,933,1277668,969
70
  19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
71
- 20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,82.56,0.9985,8.0,2.6285,,1319,2560697,1212520,919,1348177,1022
72
  21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
73
  22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
74
  23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8.0,0.0,,1319,1202163,968163,734,234000,177
75
  24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
76
  25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
77
- 26,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.21,0.9955,8.0,0.0,,1319,8444203,5334657,4044,3109546,2358
78
- 27,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8.0,3.4633,,1319,6646286,6506164,4933,140122,106
79
  28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8.0,0.9736,,1319,1727044,1126025,854,601019,456
80
  29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
81
  30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
@@ -84,7 +84,7 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
84
  33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
85
  34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8.0,0.0,,1319,1745429,550941,418,1194488,906
86
  35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
87
- 36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,41.39,0.9826,8.0,0.0,,1319,10024857,6674518,5060,3350339,2540
88
  37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
89
  38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
90
  39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
@@ -94,8 +94,8 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
94
  43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8.0,0.0,,1319,1327522,1151528,873,175994,133
95
  44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
96
  45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
97
- 46,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8.0,0.0,,1319,1113728,679302,515,434426,329
98
- 47,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.169,8.0,0.0,,1319,1389135,1151528,873,237607,180
99
- 48,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8.0,0.0,,1319,55392611,52431343,39751,2961268,2245
100
- 49,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,5.53,0.8673,8.0,0.0,,1319,8961768,5844218,4431,3117550,2364
101
- 50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,3.79,0.9484,8.0,0.0,,1319,10533815,6529832,4951,4003983,3036
 
1
  Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
2
+ 1,SC-CoT,AQuA,gpt-4o,2025/1/22,86.61,0.9882,0.0,8.1485,,254,1373206,744478,2931,628728,2475
3
+ 2,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
4
+ 3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.04,0.9921,0.0,1.0348,,254,1835669,1051218,4139,784451,3088
5
+ 4,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0.0,0.0742,,254,131604,25397,100,106207,418
6
+ 5,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0.0,0.0927,,254,164389,32555,128,131834,519
7
+ 6,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0.0,0.0798,,254,141567,32809,129,108758,428
8
+ 7,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0.0,0.0066,,254,94577,27978,110,66599,262
9
+ 8,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0.0,1.0417,,254,123017,25123,99,97894,385
10
+ 9,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.28,0.9921,0.0,1.0756,,254,1907924,1135251,4469,772673,3042
11
+ 10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.1,0.9724,0.0,0.0519,,254,885986,503751,1983,382235,1505
12
+ 11,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0.0,0.0,,254,149736,33017,130,116719,460
13
+ 12,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.92,1.0,0.0,0.0,,254,1845332,1098280,4324,747052,2941
14
  13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
15
  14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
16
  15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0.0,0.768,,254,1362379,1119143,4406,243236,958
 
22
  21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
23
  22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
24
  23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
25
+ 24,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
26
+ 25,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,66.14,0.9921,0.0,0.7888,,254,847335,482192,1898,365143,1438
27
  26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
28
  27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
29
  28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0.0,0.0,,254,144435,32555,128,111880,440
30
  29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
31
+ 30,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9724,0.0,0.0,,254,1651333,971003,3823,680330,2678
32
+ 31,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
33
+ 32,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
34
  33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
35
  34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0.0,0.0,,254,133106,26459,104,106647,420
36
  35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
37
  36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
38
  37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0.0,0.0,,254,110040,30477,120,79563,313
39
+ 38,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,39.37,0.9803,0.0,0.0,,254,2296222,1420494,5592,875728,3448
40
+ 39,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0.0,0.038,,254,42471,25701,101,16770,66
41
+ 40,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
42
+ 41,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0.0,0.0,,254,301962,233505,919,68457,270
43
  42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0.0,0.0,,254,117339,30477,120,86862,342
44
  43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0.0,0.0,,254,298475,246560,971,51915,204
45
+ 44,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0.0,0.0,,254,71047,27937,110,43110,170
46
+ 45,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0.0,0.0,,254,110415,27937,110,82478,325
47
+ 46,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
48
+ 47,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0.0,0.0,,254,7170087,6344167,24977,825920,3252
49
+ 48,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,23.62,0.9646,0.0,0.0,,254,1775335,1034362,4072,740973,2917
50
+ 49,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,22.83,0.9724,0.0,0.0,,254,2215091,1246929,4909,968162,3812
51
  50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0.0,0.0,,254,322281,258867,1019,63414,250
52
+ 1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.07,1.0,8.0,6.2005,,1319,10998794,8413717,6379,2585077,1960
53
  2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
54
  3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
55
+ 4,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,93.86,1.0,8.0,5.9858,,1319,10618008,8136223,6168,2481785,1882
56
+ 5,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
57
+ 6,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
58
+ 7,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
59
+ 8,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8.0,0.4709,,1319,835275,583916,443,251359,191
60
+ 9,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,91.13,1.0,8.0,0.0,,1319,11140985,8586888,6510,2554097,1936
61
+ 10,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.3,0.9992,8.0,31.0542,,1319,5798173,3590336,2722,2207837,1674
62
+ 11,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
63
+ 12,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
64
  13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8.0,10.1124,,1319,17937864,17038928,12918,898936,682
65
  14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
66
+ 15,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,87.26,0.9992,8.0,0.2083,,1319,3888813,2691714,2041,1197099,908
67
+ 16,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
68
+ 17,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
69
+ 18,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8.0,0.2512,,1319,5998639,5862016,4444,136623,104
70
  19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
71
+ 20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,79.91,0.9992,8.0,3.3938,,1319,4089612,2740652,2078,1348960,1023
72
  21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
73
  22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
74
  23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8.0,0.0,,1319,1202163,968163,734,234000,177
75
  24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
76
  25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
77
+ 26,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8.0,3.4633,,1319,6646286,6506164,4933,140122,106
78
+ 27,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,73.46,0.9955,8.0,0.0,,1319,11778716,8630514,6543,3148202,2387
79
  28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8.0,0.9736,,1319,1727044,1126025,854,601019,456
80
  29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
81
  30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
 
84
  33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
85
  34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8.0,0.0,,1319,1745429,550941,418,1194488,906
86
  35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
87
+ 36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,48.22,0.9841,8.0,0.0,,1319,14526431,10678792,8096,3847639,2917
88
  37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
89
  38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
90
  39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
 
94
  43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8.0,0.0,,1319,1327522,1151528,873,175994,133
95
  44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
96
  45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
97
+ 46,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,11.75,0.9189,8.0,0.0,,1319,12411942,9066115,6873,3345827,2537
98
+ 47,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8.0,0.0,,1319,1113728,679302,515,434426,329
99
+ 48,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.1691,8.0,0.0,,1319,1389135,1151528,873,237607,180
100
+ 49,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8.0,0.0,,1319,55392611,52431343,39751,2961268,2245
101
+ 50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,1.67,0.9469,8.0,0.0,,1319,16465720,11019864,8355,5445856,4129
src/overall_math_score.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "time": "2025-01-23 11:23:17",
3
  "results": {
4
  "IO": {
5
  "META": {
@@ -68,12 +68,12 @@
68
  "Eval Date": "2025/1/7"
69
  },
70
  "gsm8k": {
71
- "Score": 82.56,
72
- "Cost($)": 2.6285
73
  },
74
  "AQuA": {
75
- "Score": 70.47,
76
- "Cost($)": 0.5578
77
  }
78
  },
79
  "IO-Doubao-lite-32k": {
@@ -143,12 +143,12 @@
143
  "Eval Date": "2025/1/7"
144
  },
145
  "gsm8k": {
146
- "Score": 83.7,
147
- "Cost($)": 0.155
148
  },
149
  "AQuA": {
150
- "Score": 81.5,
151
- "Cost($)": 0.0347
152
  }
153
  },
154
  "IO-gpt-4o": {
@@ -218,12 +218,12 @@
218
  "Eval Date": "2025/1/22"
219
  },
220
  "gsm8k": {
221
- "Score": 90.75,
222
- "Cost($)": 24.2428
223
  },
224
  "AQuA": {
225
- "Score": 88.19,
226
- "Cost($)": 6.2412
227
  }
228
  },
229
  "IO-Qwen2.5-72B-Instruct": {
@@ -293,12 +293,12 @@
293
  "Eval Date": "2025/1/22"
294
  },
295
  "gsm8k": {
296
- "Score": 90.67,
297
- "Cost($)": 4.2651
298
  },
299
  "AQuA": {
300
- "Score": 85.83,
301
- "Cost($)": 0.5576
302
  }
303
  },
304
  "IO-Llama-3.3-70B-Instruct": {
@@ -368,12 +368,12 @@
368
  "Eval Date": "2025/1/22"
369
  },
370
  "gsm8k": {
371
- "Score": 95.45,
372
- "Cost($)": 4.5021
373
  },
374
  "AQuA": {
375
- "Score": 86.61,
376
- "Cost($)": 0.5847
377
  }
378
  },
379
  "IO-Qwen2.5-7B-Instruct": {
@@ -443,11 +443,11 @@
443
  "Eval Date": "2025/1/22"
444
  },
445
  "gsm8k": {
446
- "Score": 88.32,
447
  "Cost($)": 0.0
448
  },
449
  "AQuA": {
450
- "Score": 81.5,
451
  "Cost($)": 0.0
452
  }
453
  },
@@ -518,11 +518,11 @@
518
  "Eval Date": "2025/1/22"
519
  },
520
  "gsm8k": {
521
- "Score": 75.21,
522
  "Cost($)": 0.0
523
  },
524
  "AQuA": {
525
- "Score": 53.15,
526
  "Cost($)": 0.0
527
  }
528
  },
@@ -593,11 +593,11 @@
593
  "Eval Date": "2025/1/22"
594
  },
595
  "gsm8k": {
596
- "Score": 41.39,
597
  "Cost($)": 0.0
598
  },
599
  "AQuA": {
600
- "Score": 35.85,
601
  "Cost($)": 0.0
602
  }
603
  },
@@ -668,11 +668,11 @@
668
  "Eval Date": "2025/1/22"
669
  },
670
  "gsm8k": {
671
- "Score": 5.53,
672
  "Cost($)": 0.0
673
  },
674
  "AQuA": {
675
- "Score": 30.31,
676
  "Cost($)": 0.0
677
  }
678
  },
@@ -743,11 +743,11 @@
743
  "Eval Date": "2025/1/22"
744
  },
745
  "gsm8k": {
746
- "Score": 3.79,
747
  "Cost($)": 0.0
748
  },
749
  "AQuA": {
750
- "Score": 30.71,
751
  "Cost($)": 0.0
752
  }
753
  }
 
1
  {
2
+ "time": "2025-01-24 15:10:27",
3
  "results": {
4
  "IO": {
5
  "META": {
 
68
  "Eval Date": "2025/1/7"
69
  },
70
  "gsm8k": {
71
+ "Score": 79.91,
72
+ "Cost($)": 3.3938
73
  },
74
  "AQuA": {
75
+ "Score": 66.14,
76
+ "Cost($)": 0.7888
77
  }
78
  },
79
  "IO-Doubao-lite-32k": {
 
143
  "Eval Date": "2025/1/7"
144
  },
145
  "gsm8k": {
146
+ "Score": 87.26,
147
+ "Cost($)": 0.2083
148
  },
149
  "AQuA": {
150
+ "Score": 81.1,
151
+ "Cost($)": 0.0519
152
  }
153
  },
154
  "IO-gpt-4o": {
 
218
  "Eval Date": "2025/1/22"
219
  },
220
  "gsm8k": {
221
+ "Score": 90.3,
222
+ "Cost($)": 31.0542
223
  },
224
  "AQuA": {
225
+ "Score": 86.61,
226
+ "Cost($)": 8.1485
227
  }
228
  },
229
  "IO-Qwen2.5-72B-Instruct": {
 
293
  "Eval Date": "2025/1/22"
294
  },
295
  "gsm8k": {
296
+ "Score": 93.86,
297
+ "Cost($)": 5.9858
298
  },
299
  "AQuA": {
300
+ "Score": 85.04,
301
+ "Cost($)": 1.0348
302
  }
303
  },
304
  "IO-Llama-3.3-70B-Instruct": {
 
368
  "Eval Date": "2025/1/22"
369
  },
370
  "gsm8k": {
371
+ "Score": 95.07,
372
+ "Cost($)": 6.2005
373
  },
374
  "AQuA": {
375
+ "Score": 82.28,
376
+ "Cost($)": 1.0756
377
  }
378
  },
379
  "IO-Qwen2.5-7B-Instruct": {
 
443
  "Eval Date": "2025/1/22"
444
  },
445
  "gsm8k": {
446
+ "Score": 91.13,
447
  "Cost($)": 0.0
448
  },
449
  "AQuA": {
450
+ "Score": 79.92,
451
  "Cost($)": 0.0
452
  }
453
  },
 
518
  "Eval Date": "2025/1/22"
519
  },
520
  "gsm8k": {
521
+ "Score": 73.46,
522
  "Cost($)": 0.0
523
  },
524
  "AQuA": {
525
+ "Score": 59.45,
526
  "Cost($)": 0.0
527
  }
528
  },
 
593
  "Eval Date": "2025/1/22"
594
  },
595
  "gsm8k": {
596
+ "Score": 48.22,
597
  "Cost($)": 0.0
598
  },
599
  "AQuA": {
600
+ "Score": 39.37,
601
  "Cost($)": 0.0
602
  }
603
  },
 
668
  "Eval Date": "2025/1/22"
669
  },
670
  "gsm8k": {
671
+ "Score": 11.75,
672
  "Cost($)": 0.0
673
  },
674
  "AQuA": {
675
+ "Score": 23.62,
676
  "Cost($)": 0.0
677
  }
678
  },
 
743
  "Eval Date": "2025/1/22"
744
  },
745
  "gsm8k": {
746
+ "Score": 1.67,
747
  "Cost($)": 0.0
748
  },
749
  "AQuA": {
750
+ "Score": 22.83,
751
  "Cost($)": 0.0
752
  }
753
  }
src/overall_results.csv CHANGED
@@ -1,40 +1,40 @@
1
  Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($)
2
- 1.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,91.03,95.45,4.5021,86.61,0.5847
3
- 2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
4
- 3.0,SC-CoT,gpt-4o,2025/1/22,89.47,90.75,24.2428,88.19,6.2412
5
- 4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
6
- 5.0,CoT,gpt-4o,2025/1/22,88.39,94.09,4.5367,82.68,1.0417
7
- 6.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,88.25,90.67,4.2651,85.83,0.5576
8
  7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.48,92.27,0.4709,82.68,0.0798
9
  8.0,CoT,Doubao-lite-32k,2025/1/7,86.00,89.31,0.0558,82.68,0.0066
10
- 9.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
11
- 10.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,84.91,88.32,0.0,81.5,0.0
12
- 11.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.2,1.6087
13
- 12.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.2,0.1645
14
- 13.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
15
- 14.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.71,0.0
16
- 15.0,SC-CoT,Doubao-lite-32k,2025/1/7,82.60,83.7,0.155,81.5,0.0347
17
  16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
18
  17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.58,85.6,0.2512,77.56,0.0445
19
  18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.25,87.26,10.5479,73.23,0.3177
20
  19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.64,82.87,0.0,74.41,0.0
21
- 20.0,SC-CoT,gpt-3.5-turbo,2025/1/7,76.52,82.56,2.6285,70.47,0.5578
22
- 21.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.31,73.09,0.9736,79.53,0.1746
23
- 22.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.61,0.0576,71.65,0.0147
24
- 23.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
25
  24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.7,0.6788,61.02,0.0957
26
  25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.74,74.91,3.4633,64.57,0.4928
27
  26.0,PoT,gpt-3.5-turbo,2025/1/7,68.17,76.88,0.6902,59.45,0.1748
28
  27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.04,75.44,0.0,60.63,0.0
29
  28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
30
- 29.0,CoT,Internllm2_5-7B,2025/1/22,65.24,77.71,0.0,52.76,0.0
31
- 30.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,64.18,75.21,0.0,53.15,0.0
32
  31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
33
  32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.65,67.78,0.0,55.51,0.0
34
  33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.40,63.31,39.0751,57.48,2.304
35
  34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
36
  35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.03,55.5,0.0,40.55,0.0
37
- 36.0,SC-CoT,Internllm2_5-7B,2025/1/22,38.62,41.39,0.0,35.85,0.0
38
  37.0,IO,gpt-3.5-turbo,2025/1/7,38.41,37.83,0.3328,38.98,0.038
39
  38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.67,0.0,36.61,0.0
40
  39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
@@ -45,7 +45,7 @@ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA
45
  44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.61,18.5,0.0,30.71,0.0
46
  45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.91,16.68,0.0,29.13,0.0
47
  46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.94,14.71,0.0,27.17,0.0
48
- 47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.92,5.53,0.0,30.31,0.0
49
- 48.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,17.25,3.79,0.0,30.71,0.0
50
- 49.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.84,7.66,0.0,24.02,0.0
51
- 50.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
 
1
  Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($)
2
+ 1.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
3
+ 2.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,89.45,93.86,5.9858,85.04,1.0348
4
+ 3.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
5
+ 4.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,88.68,95.07,6.2005,82.28,1.0756
6
+ 5.0,SC-CoT,gpt-4o,2025/1/22,88.46,90.3,31.0542,86.61,8.1485
7
+ 6.0,CoT,gpt-4o,2025/1/22,88.39,94.09,4.5367,82.68,1.0417
8
  7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.48,92.27,0.4709,82.68,0.0798
9
  8.0,CoT,Doubao-lite-32k,2025/1/7,86.00,89.31,0.0558,82.68,0.0066
10
+ 9.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,85.53,91.13,0.0,79.92,0.0
11
+ 10.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
12
+ 11.0,SC-CoT,Doubao-lite-32k,2025/1/7,84.18,87.26,0.2083,81.1,0.0519
13
+ 12.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.2,1.6087
14
+ 13.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.2,0.1645
15
+ 14.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
16
+ 15.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.71,0.0
17
  16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
18
  17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.58,85.6,0.2512,77.56,0.0445
19
  18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.25,87.26,10.5479,73.23,0.3177
20
  19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.64,82.87,0.0,74.41,0.0
21
+ 20.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.31,73.09,0.9736,79.53,0.1746
22
+ 21.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.61,0.0576,71.65,0.0147
23
+ 22.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
24
+ 23.0,SC-CoT,gpt-3.5-turbo,2025/1/7,73.03,79.91,3.3938,66.14,0.7888
25
  24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.7,0.6788,61.02,0.0957
26
  25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.74,74.91,3.4633,64.57,0.4928
27
  26.0,PoT,gpt-3.5-turbo,2025/1/7,68.17,76.88,0.6902,59.45,0.1748
28
  27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.04,75.44,0.0,60.63,0.0
29
  28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
30
+ 29.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,66.46,73.46,0.0,59.45,0.0
31
+ 30.0,CoT,Internllm2_5-7B,2025/1/22,65.24,77.71,0.0,52.76,0.0
32
  31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
33
  32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.65,67.78,0.0,55.51,0.0
34
  33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.40,63.31,39.0751,57.48,2.304
35
  34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
36
  35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.03,55.5,0.0,40.55,0.0
37
+ 36.0,SC-CoT,Internllm2_5-7B,2025/1/22,43.80,48.22,0.0,39.37,0.0
38
  37.0,IO,gpt-3.5-turbo,2025/1/7,38.41,37.83,0.3328,38.98,0.038
39
  38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.67,0.0,36.61,0.0
40
  39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
 
45
  44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.61,18.5,0.0,30.71,0.0
46
  45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.91,16.68,0.0,29.13,0.0
47
  46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.94,14.71,0.0,27.17,0.0
48
+ 47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.69,11.75,0.0,23.62,0.0
49
+ 48.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.84,7.66,0.0,24.02,0.0
50
+ 49.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
51
+ 50.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,12.25,1.67,0.0,22.83,0.0
src/record.csv CHANGED
@@ -28,7 +28,7 @@ PoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,58.83,70.51,8,,1319,"1,145,390",868,"217
28
  PoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,38.67,55.42,8,,1319,"1,147,538",870,"243,573",185,"1,391,111",0.0000,,,,,,,,,,,,,,,,,,,,
29
  PoT,gsm8k,2025/1/22,Internllm2_5-7B,38.21,48.9,8,,1319,"1,136,843",862,"188,106",143,"1,324,949",0.0000,,,,,,,,,,,,,,,,,,,,
30
  PoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,18.5,31.01,8,,1319,"1,151,528",873,"175,994",133,"1,327,522",0.0000,,,,,,,,,,,,,,,,,,,,
31
- PoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,9.62,16.9,8,,1319,"1,151,528",873,"237,607",180,"1,389,135",0.0000,,,,,,,,,,,,,,,,,,,,
32
  CoT,gsm8k,2025/1/7,gpt-3.5-turbo,78.7,100,8,,1319,"953,242",723,"134,799",102,"1,088,041",0.6788,,,,,,,,,,,,,,,,,,,,
33
  CoT,gsm8k,2025/1/7,Doubao-lite-32k,89.31,100,8,,1319,"1,042,095",790,"159,725",121,"1,201,820",0.0558,0.4084635 (元),,,,,,,,,,,,,,,,,,,
34
  CoT,gsm8k,2025/1/22,gpt-4o,94.09,100,8,,1319,"948,668",719,"216,498",164,"1,165,166",4.5367,,,,,,,,,,,,,,,,,,,,
@@ -39,16 +39,16 @@ CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,75.44,99.92,8,,1319,"990,168",751,"258
39
  CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.7,8,,1319,"968,163",734,"234,000",177,"1,202,163",0.0000,,,,,,,,,,,,,,,,,,,,
40
  CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.5,100,8,,1319,"1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,
41
  CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,1319,"1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,
42
- SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,82.56,99.85,8,"temperature=1, path_num=5",1319,"1,212,520",919,"1,348,177","1,022","2,560,697",2.6285,,,,,,,,,,,,,,,,,,,,
43
- SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,83.7,99.7,8,"temperature=1, path_num=5",1319,"1,230,019",933,"1,277,668",969,"2,507,687",0.1550,,,,,,,,,,,,,,,,,,,,
44
- SC-CoT,gsm8k,2025/1/22,gpt-4o,90.75,100,8,"temperature=1, path_num=5",1319,"1,168,927",886,"2,132,044","1,616","3,300,971",24.2428,,,,,,,,,,,,,,,,,,,,
45
- SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,90.67,100,8,"temperature=1, path_num=5",1319,"5,292,383","4,012","2,273,254","1,723","7,565,637",4.2651,,,,,,,,,,,,,,,,,,,,
46
- SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.45,100,8,"temperature=1, path_num=5",1319,"5,406,763","4,099","2,579,233","1,955","7,985,996",4.5021,,,,,,,,,,,,,,,,,,,,
47
- SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,88.32,99.85,8,"temperature=1, path_num=5",1319,"5,668,252","4,297","2,505,566","1,900","8,173,818",0.0000,,,,,,,,,,,,,,,,,,,,
48
- SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,75.21,99.55,8,"temperature=1, path_num=5",1319,"5,334,657","4,044","3,109,546","2,358","8,444,203",0.0000,,,,,,,,,,,,,,,,,,,,
49
- SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,41.39,98.26,8,"temperature=1, path_num=5",1319,"6,674,518","5,060","3,350,339","2,540","10,024,857",0.0000,,,,,,,,,,,,,,,,,,,,
50
- SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,5.53,86.73,8,"temperature=1, path_num=5",1319,"5,844,218","4,431","3,117,550","2,364","8,961,768",0.0000,,,,,,,,,,,,,,,,,,,,
51
- SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,3.79,94.84,8,"temperature=1, path_num=5",1319,"6,529,832","4,951","4,003,983","3,036","10,533,815",0.0000,,,,,,,,,,,,,,,,,,,,
52
  IO,AQuA,2025/1/7,gpt-3.5-turbo,38.98,100,0,,254,"25,701",101,"16,770",66,"42,471",0.0380,,,,,,,,,,,,,,,,,,,,
53
  IO,AQuA,2025/1/7,Doubao-lite-32k,79.13,100,0,,254,"33,058",130,"54,684",215,"87,742",0.0058,0.0427(元),,,,,,,,,,,,,,,,,,,
54
  IO,AQuA,2025/1/22,gpt-4o,75.59,97.24,0,,254,"25,631",101,"108,121",426,"133,752",1.1453,,,,,,,,,,,,,,,,,,,,
@@ -59,7 +59,7 @@ IO,AQuA,2025/1/22,Llama-3.1-8B-Instruct,51.18,98.82,0,,254,"26,459",104,"106,647
59
  IO,AQuA,2025/1/22,Internllm2_5-7B,47.64,90.94,0,,254,"50,232",198,"134,809",531,"185,041",0.0000,,,,,,,,,,,,,,,,,,,,
60
  IO,AQuA,2025/1/22,Qwen2-1.5B-Instruct,29.13,97.64,0,,254,"27,937",110,"43,110",170,"71,047",0.0000,,,,,,,,,,,,,,,,,,,,
61
  IO,AQuA,2025/1/22,Qwen2-0.5B-Instruct,27.17,98.82,0,,254,"27,937",110,"82,478",325,"110,415",0.0000,,,,,,,,,,,,,,,,,,,,
62
- CoT,AQuA,2025/1/22,gpt-3.5-turbo,61.02,93.7,0,,254,"25,447",100,"55,346",218,"80,793",0.0957,,,,,,,,,,,,,,,,,,,,
63
  CoT,AQuA,2025/1/7,Doubao-lite-32k,82.68,97.24,0,,254,"27,978",110,"66,599",262,"94,577",0.0066,0.0483 (元),,,,,,,,,,,,,,,,,,,
64
  CoT,AQuA,2025/1/22,gpt-4o,82.68,98.03,0,,254,"25,123",99,"97,894",385,"123,017",1.0417,,,,,,,,,,,,,,,,,,,,
65
  CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,86.22,99.21,0,,254,"25,143",99,"118,146",465,"143,289",0.0808,,,,,,,,,,,,,,,,,,,,
@@ -79,16 +79,16 @@ PoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,36.61,96.85,0,,254,"240,613",947,"50,30
79
  PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270,"301,962",0.0000,,,,,,,,,,,,,,,,,,,,
80
  PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,
81
  PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,
82
- SC-CoT,AQuA,2025/1/7,gpt-3.5-turbo,70.47,98.82,0,"temperature=1, path_num=5",254,"70,157",276,"348,460","1,372","418,617",0.5578,,,,,,,,,,,,,,,,,,,,
83
- SC-CoT,AQuA,2025/1/7,Doubao-lite-32k,81.5,97.64,0,"temperature=1, path_num=5",254,"83,830",330,"382,016","1,504","465,846",0.0347,,,,,,,,,,,,,,,,,,,,
84
- SC-CoT,AQuA,2025/1/22,gpt-4o,88.19,100,0,"temperature=1, path_num=5",254,"72,916",287,"605,895","2,385","678,811",6.2412,,,,,,,,,,,,,,,,,,,,
85
- SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.83,98.43,0,"temperature=1, path_num=5",254,"241,149",949,"747,909","2,945","989,058",0.5576,,,,,,,,,,,,,,,,,,,,
86
- SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,86.61,99.21,0,"temperature=1, path_num=5",254,"283,248","1,115","753,876","2,968","1,037,124",0.5847,,,,,,,,,,,,,,,,,,,,
87
- SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,81.5,100,0,"temperature=1, path_num=5",254,"278,848","1,098","736,520","2,900","1,015,368",0.0000,,,,,,,,,,,,,,,,,,,,
88
- SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,53.15,96.06,0,"temperature=1, path_num=5",254,"372,968","1,468","668,378","2,631","1,041,346",0.0000,,,,,,,,,,,,,,,,,,,,
89
- SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,35.85,98.8,0,"temperature=1, path_num=5",254,"530,701","2,089","709,687","2,794","1,240,388",0.0000,,,,,,,,,,,,,,,,,,,,
90
- SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.31,97.24,0,"temperature=1, path_num=5",254,"430,703","1,696","726,373","2,860","1,157,076",0.0000,,,,,,,,,,,,,,,,,,,,
91
- SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,30.71,98.43,0,"temperature=1, path_num=5",254,"496,206","1,954","729,333","2,871","1,225,539",0.0000,,,,,,,,,,,,,,,,,,,,
92
  ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
93
  ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
94
  ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
@@ -98,7 +98,7 @@ ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-7B-Instruct,74.41,99.21,0,max_steps=10,254,"56
98
  ReAct-Pro*,AQuA,2025/1/22,Llama-3.1-8B-Instruct,55.51,96.85,0,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,,,,,,,,,,,,,,,,,,,,
99
  ReAct-Pro*,AQuA,2025/1/22,Internllm2_5-7B,40.94,96.85,0,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,,,,,,,,,,,,,,,,,,,,
100
  ReAct-Pro*,AQuA,2025/1/22,Qwen2-1.5B-Instruct,25.59,96.06,0,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,,,,,,,,,,,,,,,,,,,,
101
- ReAct-Pro*,AQuA,2025/1/22,Qwen2-0.5B-Instruct,24.02,96.85,0,max_steps=10,254,6344167,"24,977",825920,"3,252","7,170,087",0.0000,,,,,,,,,,,,,,,,,,,,
102
  ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103
  ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
104
  ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 
28
  PoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,38.67,55.42,8,,1319,"1,147,538",870,"243,573",185,"1,391,111",0.0000,,,,,,,,,,,,,,,,,,,,
29
  PoT,gsm8k,2025/1/22,Internllm2_5-7B,38.21,48.9,8,,1319,"1,136,843",862,"188,106",143,"1,324,949",0.0000,,,,,,,,,,,,,,,,,,,,
30
  PoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,18.5,31.01,8,,1319,"1,151,528",873,"175,994",133,"1,327,522",0.0000,,,,,,,,,,,,,,,,,,,,
31
+ PoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,9.62,16.91,8,,1319,"1,151,528",873,"237,607",180,"1,389,135",0.0000,,,,,,,,,,,,,,,,,,,,
32
  CoT,gsm8k,2025/1/7,gpt-3.5-turbo,78.7,100,8,,1319,"953,242",723,"134,799",102,"1,088,041",0.6788,,,,,,,,,,,,,,,,,,,,
33
  CoT,gsm8k,2025/1/7,Doubao-lite-32k,89.31,100,8,,1319,"1,042,095",790,"159,725",121,"1,201,820",0.0558,0.4084635 (元),,,,,,,,,,,,,,,,,,,
34
  CoT,gsm8k,2025/1/22,gpt-4o,94.09,100,8,,1319,"948,668",719,"216,498",164,"1,165,166",4.5367,,,,,,,,,,,,,,,,,,,,
 
39
  CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.7,8,,1319,"968,163",734,"234,000",177,"1,202,163",0.0000,,,,,,,,,,,,,,,,,,,,
40
  CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.5,100,8,,1319,"1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,
41
  CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,1319,"1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,
42
+ SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,79.91,99.92,8,"temperature=1, path_num=5",1319,"2,740,652","2,078","1,348,960","1,023","4,089,612",3.3938,,,,,,,,,,,,,,,,,,,,
43
+ SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,87.26,99.92,8,"temperature=1, path_num=5",1319,"2,691,714","2,041","1,197,099",908,"3,888,813",0.2083,,,,,,,,,,,,,,,,,,,,
44
+ SC-CoT,gsm8k,2025/1/22,gpt-4o,90.3,99.92,8,"temperature=1, path_num=5",1319,"3,590,336","2,722","2,207,837","1,674","5,798,173",31.0542,,,,,,,,,,,,,,,,,,,,
45
+ SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,93.86,100,8,"temperature=1, path_num=5",1319,"8,136,223","6,168","2,481,785","1,882","10,618,008",5.9858,,,,,,,,,,,,,,,,,,,,
46
+ SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.07,100,8,"temperature=1, path_num=5",1319,"8,413,717","6,379","2,585,077","1,960","10,998,794",6.2005,,,,,,,,,,,,,,,,,,,,
47
+ SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,91.13,100,8,"temperature=1, path_num=5",1319,"8,586,888","6,510","2,554,097","1,936","11,140,985",0.0000,,,,,,,,,,,,,,,,,,,,
48
+ SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,73.46,99.55,8,"temperature=1, path_num=5",1319,"8,630,514","6,543","3,148,202","2,387","11,778,716",0.0000,,,,,,,,,,,,,,,,,,,,
49
+ SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,48.22,98.41,8,"temperature=1, path_num=5",1319,"10,678,792","8,096","3,847,639","2,917","14,526,431",0.0000,,,,,,,,,,,,,,,,,,,,
50
+ SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,11.75,91.89,8,"temperature=1, path_num=5",1319,"9,066,115","6,873","3,345,827","2,537","12,411,942",0.0000,,,,,,,,,,,,,,,,,,,,
51
+ SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,1.67,94.69,8,"temperature=1, path_num=5",1319,"11,019,864","8,355","5,445,856","4,129","16,465,720",0.0000,,,,,,,,,,,,,,,,,,,,
52
  IO,AQuA,2025/1/7,gpt-3.5-turbo,38.98,100,0,,254,"25,701",101,"16,770",66,"42,471",0.0380,,,,,,,,,,,,,,,,,,,,
53
  IO,AQuA,2025/1/7,Doubao-lite-32k,79.13,100,0,,254,"33,058",130,"54,684",215,"87,742",0.0058,0.0427(元),,,,,,,,,,,,,,,,,,,
54
  IO,AQuA,2025/1/22,gpt-4o,75.59,97.24,0,,254,"25,631",101,"108,121",426,"133,752",1.1453,,,,,,,,,,,,,,,,,,,,
 
59
  IO,AQuA,2025/1/22,Internllm2_5-7B,47.64,90.94,0,,254,"50,232",198,"134,809",531,"185,041",0.0000,,,,,,,,,,,,,,,,,,,,
60
  IO,AQuA,2025/1/22,Qwen2-1.5B-Instruct,29.13,97.64,0,,254,"27,937",110,"43,110",170,"71,047",0.0000,,,,,,,,,,,,,,,,,,,,
61
  IO,AQuA,2025/1/22,Qwen2-0.5B-Instruct,27.17,98.82,0,,254,"27,937",110,"82,478",325,"110,415",0.0000,,,,,,,,,,,,,,,,,,,,
62
+ CoT,AQuA,2025/1/7,gpt-3.5-turbo,61.02,93.7,0,,254,"25,447",100,"55,346",218,"80,793",0.0957,,,,,,,,,,,,,,,,,,,,
63
  CoT,AQuA,2025/1/7,Doubao-lite-32k,82.68,97.24,0,,254,"27,978",110,"66,599",262,"94,577",0.0066,0.0483 (元),,,,,,,,,,,,,,,,,,,
64
  CoT,AQuA,2025/1/22,gpt-4o,82.68,98.03,0,,254,"25,123",99,"97,894",385,"123,017",1.0417,,,,,,,,,,,,,,,,,,,,
65
  CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,86.22,99.21,0,,254,"25,143",99,"118,146",465,"143,289",0.0808,,,,,,,,,,,,,,,,,,,,
 
79
  PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270,"301,962",0.0000,,,,,,,,,,,,,,,,,,,,
80
  PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,
81
  PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,
82
+ SC-CoT,AQuA,2025/1/7,gpt-3.5-turbo,66.14,99.21,0,"temperature=1, path_num=5",254,"482,192","1,898","365,143","1,438","847,335",0.7888,,,,,,,,,,,,,,,,,,,,
83
+ SC-CoT,AQuA,2025/1/7,Doubao-lite-32k,81.1,97.24,0,"temperature=1, path_num=5",254,"503,751","1,983","382,235","1,505","885,986",0.0519,,,,,,,,,,,,,,,,,,,,
84
+ SC-CoT,AQuA,2025/1/22,gpt-4o,86.61,98.82,0,"temperature=1, path_num=5",254,"744,478","2,931","628,728","2,475","1,373,206",8.1485,,,,,,,,,,,,,,,,,,,,
85
+ SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.04,99.21,0,"temperature=1, path_num=5",254,"1,051,218","4,139","784,451","3,088","1,835,669",1.0348,,,,,,,,,,,,,,,,,,,,
86
+ SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.28,99.21,0,"temperature=1, path_num=5",254,"1,135,251","4,469","772,673","3,042","1,907,924",1.0756,,,,,,,,,,,,,,,,,,,,
87
+ SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,79.92,100,0,"temperature=1, path_num=5",254,"1,098,280","4,324","747,052","2,941","1,845,332",0.0000,,,,,,,,,,,,,,,,,,,,
88
+ SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.45,97.24,0,"temperature=1, path_num=5",254,"971,003","3,823","680,330","2,678","1,651,333",0.0000,,,,,,,,,,,,,,,,,,,,
89
+ SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,39.37,98.03,0,"temperature=1, path_num=5",254,"1,420,494","5,592","875,728","3,448","2,296,222",0.0000,,,,,,,,,,,,,,,,,,,,
90
+ SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,23.62,96.46,0,"temperature=1, path_num=5",254,"1,034,362","4,072","740,973","2,917","1,775,335",0.0000,,,,,,,,,,,,,,,,,,,,
91
+ SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,22.83,97.24,0,"temperature=1, path_num=5",254,"1,246,929","4,909","968,162","3,812","2,215,091",0.0000,,,,,,,,,,,,,,,,,,,,
92
  ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
93
  ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
94
  ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
 
98
  ReAct-Pro*,AQuA,2025/1/22,Llama-3.1-8B-Instruct,55.51,96.85,0,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,,,,,,,,,,,,,,,,,,,,
99
  ReAct-Pro*,AQuA,2025/1/22,Internllm2_5-7B,40.94,96.85,0,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,,,,,,,,,,,,,,,,,,,,
100
  ReAct-Pro*,AQuA,2025/1/22,Qwen2-1.5B-Instruct,25.59,96.06,0,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,,,,,,,,,,,,,,,,,,,,
101
+ ReAct-Pro*,AQuA,2025/1/22,Qwen2-0.5B-Instruct,24.02,96.85,0,max_steps=10,254,"6,344,167","24,977","825,920","3,252","7,170,087",0.0000,,,,,,,,,,,,,,,,,,,,
102
  ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103
  ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
104
  ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,