Spaces:
Running
Running
liaojiajia
commited on
Commit
·
be9cdf5
1
Parent(s):
52f14c3
update sc-cot scores
Browse files- src/detail_math_score.json +147 -147
- src/detail_results.csv +50 -50
- src/overall_math_score.json +31 -31
- src/overall_results.csv +24 -24
- src/record.csv +23 -23
src/detail_math_score.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"time": "2025-01-
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"gpt-3.5-turbo": {
|
@@ -971,7 +971,7 @@
|
|
971 |
},
|
972 |
"gsm8k": {
|
973 |
"Score": 9.62,
|
974 |
-
"Pass rate": 0.
|
975 |
"Cost($)": 0.0,
|
976 |
"Framework": "",
|
977 |
"X-shot": "8.0",
|
@@ -1337,30 +1337,30 @@
|
|
1337 |
"Eval Date": "2025/1/7"
|
1338 |
},
|
1339 |
"gsm8k": {
|
1340 |
-
"Score":
|
1341 |
-
"Pass rate": 0.
|
1342 |
-
"Cost($)":
|
1343 |
"Framework": "",
|
1344 |
"X-shot": "8.0",
|
1345 |
"Samples": 1319,
|
1346 |
-
"All tokens":
|
1347 |
-
"Total input tokens":
|
1348 |
-
"Average input tokens":
|
1349 |
-
"Total output tokens":
|
1350 |
-
"Average output tokens":
|
1351 |
},
|
1352 |
"AQuA": {
|
1353 |
-
"Score":
|
1354 |
-
"Pass rate": 0.
|
1355 |
-
"Cost($)": 0.
|
1356 |
"Framework": "",
|
1357 |
"X-shot": "0.0",
|
1358 |
"Samples": 254,
|
1359 |
-
"All tokens":
|
1360 |
-
"Total input tokens":
|
1361 |
-
"Average input tokens":
|
1362 |
-
"Total output tokens":
|
1363 |
-
"Average output tokens":
|
1364 |
}
|
1365 |
},
|
1366 |
"Doubao-lite-32k": {
|
@@ -1370,30 +1370,30 @@
|
|
1370 |
"Eval Date": "2025/1/7"
|
1371 |
},
|
1372 |
"gsm8k": {
|
1373 |
-
"Score":
|
1374 |
-
"Pass rate": 0.
|
1375 |
-
"Cost($)": 0.
|
1376 |
"Framework": "",
|
1377 |
"X-shot": "8.0",
|
1378 |
"Samples": 1319,
|
1379 |
-
"All tokens":
|
1380 |
-
"Total input tokens":
|
1381 |
-
"Average input tokens":
|
1382 |
-
"Total output tokens":
|
1383 |
-
"Average output tokens":
|
1384 |
},
|
1385 |
"AQuA": {
|
1386 |
-
"Score": 81.
|
1387 |
-
"Pass rate": 0.
|
1388 |
-
"Cost($)": 0.
|
1389 |
"Framework": "",
|
1390 |
"X-shot": "0.0",
|
1391 |
"Samples": 254,
|
1392 |
-
"All tokens":
|
1393 |
-
"Total input tokens":
|
1394 |
-
"Average input tokens":
|
1395 |
-
"Total output tokens":
|
1396 |
-
"Average output tokens":
|
1397 |
}
|
1398 |
},
|
1399 |
"gpt-4o": {
|
@@ -1403,30 +1403,30 @@
|
|
1403 |
"Eval Date": "2025/1/22"
|
1404 |
},
|
1405 |
"gsm8k": {
|
1406 |
-
"Score": 90.
|
1407 |
-
"Pass rate":
|
1408 |
-
"Cost($)":
|
1409 |
"Framework": "",
|
1410 |
"X-shot": "8.0",
|
1411 |
"Samples": 1319,
|
1412 |
-
"All tokens":
|
1413 |
-
"Total input tokens":
|
1414 |
-
"Average input tokens":
|
1415 |
-
"Total output tokens":
|
1416 |
-
"Average output tokens":
|
1417 |
},
|
1418 |
"AQuA": {
|
1419 |
-
"Score":
|
1420 |
-
"Pass rate":
|
1421 |
-
"Cost($)":
|
1422 |
"Framework": "",
|
1423 |
"X-shot": "0.0",
|
1424 |
"Samples": 254,
|
1425 |
-
"All tokens":
|
1426 |
-
"Total input tokens":
|
1427 |
-
"Average input tokens":
|
1428 |
-
"Total output tokens":
|
1429 |
-
"Average output tokens":
|
1430 |
}
|
1431 |
},
|
1432 |
"Qwen2.5-72B-Instruct": {
|
@@ -1436,30 +1436,30 @@
|
|
1436 |
"Eval Date": "2025/1/22"
|
1437 |
},
|
1438 |
"gsm8k": {
|
1439 |
-
"Score":
|
1440 |
"Pass rate": 1.0,
|
1441 |
-
"Cost($)":
|
1442 |
"Framework": "",
|
1443 |
"X-shot": "8.0",
|
1444 |
"Samples": 1319,
|
1445 |
-
"All tokens":
|
1446 |
-
"Total input tokens":
|
1447 |
-
"Average input tokens":
|
1448 |
-
"Total output tokens":
|
1449 |
-
"Average output tokens":
|
1450 |
},
|
1451 |
"AQuA": {
|
1452 |
-
"Score": 85.
|
1453 |
-
"Pass rate": 0.
|
1454 |
-
"Cost($)":
|
1455 |
"Framework": "",
|
1456 |
"X-shot": "0.0",
|
1457 |
"Samples": 254,
|
1458 |
-
"All tokens":
|
1459 |
-
"Total input tokens":
|
1460 |
-
"Average input tokens":
|
1461 |
-
"Total output tokens":
|
1462 |
-
"Average output tokens":
|
1463 |
}
|
1464 |
},
|
1465 |
"Llama-3.3-70B-Instruct": {
|
@@ -1469,30 +1469,30 @@
|
|
1469 |
"Eval Date": "2025/1/22"
|
1470 |
},
|
1471 |
"gsm8k": {
|
1472 |
-
"Score": 95.
|
1473 |
"Pass rate": 1.0,
|
1474 |
-
"Cost($)":
|
1475 |
"Framework": "",
|
1476 |
"X-shot": "8.0",
|
1477 |
"Samples": 1319,
|
1478 |
-
"All tokens":
|
1479 |
-
"Total input tokens":
|
1480 |
-
"Average input tokens":
|
1481 |
-
"Total output tokens":
|
1482 |
-
"Average output tokens":
|
1483 |
},
|
1484 |
"AQuA": {
|
1485 |
-
"Score":
|
1486 |
"Pass rate": 0.9921,
|
1487 |
-
"Cost($)":
|
1488 |
"Framework": "",
|
1489 |
"X-shot": "0.0",
|
1490 |
"Samples": 254,
|
1491 |
-
"All tokens":
|
1492 |
-
"Total input tokens":
|
1493 |
-
"Average input tokens":
|
1494 |
-
"Total output tokens":
|
1495 |
-
"Average output tokens":
|
1496 |
}
|
1497 |
},
|
1498 |
"Qwen2.5-7B-Instruct": {
|
@@ -1502,30 +1502,30 @@
|
|
1502 |
"Eval Date": "2025/1/22"
|
1503 |
},
|
1504 |
"gsm8k": {
|
1505 |
-
"Score":
|
1506 |
-
"Pass rate": 0
|
1507 |
"Cost($)": 0.0,
|
1508 |
"Framework": "",
|
1509 |
"X-shot": "8.0",
|
1510 |
"Samples": 1319,
|
1511 |
-
"All tokens":
|
1512 |
-
"Total input tokens":
|
1513 |
-
"Average input tokens":
|
1514 |
-
"Total output tokens":
|
1515 |
-
"Average output tokens":
|
1516 |
},
|
1517 |
"AQuA": {
|
1518 |
-
"Score":
|
1519 |
"Pass rate": 1.0,
|
1520 |
"Cost($)": 0.0,
|
1521 |
"Framework": "",
|
1522 |
"X-shot": "0.0",
|
1523 |
"Samples": 254,
|
1524 |
-
"All tokens":
|
1525 |
-
"Total input tokens":
|
1526 |
-
"Average input tokens":
|
1527 |
-
"Total output tokens":
|
1528 |
-
"Average output tokens":
|
1529 |
}
|
1530 |
},
|
1531 |
"Llama-3.1-8B-Instruct": {
|
@@ -1535,30 +1535,30 @@
|
|
1535 |
"Eval Date": "2025/1/22"
|
1536 |
},
|
1537 |
"gsm8k": {
|
1538 |
-
"Score":
|
1539 |
"Pass rate": 0.9955,
|
1540 |
"Cost($)": 0.0,
|
1541 |
"Framework": "",
|
1542 |
"X-shot": "8.0",
|
1543 |
"Samples": 1319,
|
1544 |
-
"All tokens":
|
1545 |
-
"Total input tokens":
|
1546 |
-
"Average input tokens":
|
1547 |
-
"Total output tokens":
|
1548 |
-
"Average output tokens":
|
1549 |
},
|
1550 |
"AQuA": {
|
1551 |
-
"Score":
|
1552 |
-
"Pass rate": 0.
|
1553 |
"Cost($)": 0.0,
|
1554 |
"Framework": "",
|
1555 |
"X-shot": "0.0",
|
1556 |
"Samples": 254,
|
1557 |
-
"All tokens":
|
1558 |
-
"Total input tokens":
|
1559 |
-
"Average input tokens":
|
1560 |
-
"Total output tokens":
|
1561 |
-
"Average output tokens":
|
1562 |
}
|
1563 |
},
|
1564 |
"Internllm2_5-7B": {
|
@@ -1568,30 +1568,30 @@
|
|
1568 |
"Eval Date": "2025/1/22"
|
1569 |
},
|
1570 |
"gsm8k": {
|
1571 |
-
"Score":
|
1572 |
-
"Pass rate": 0.
|
1573 |
"Cost($)": 0.0,
|
1574 |
"Framework": "",
|
1575 |
"X-shot": "8.0",
|
1576 |
"Samples": 1319,
|
1577 |
-
"All tokens":
|
1578 |
-
"Total input tokens":
|
1579 |
-
"Average input tokens":
|
1580 |
-
"Total output tokens":
|
1581 |
-
"Average output tokens":
|
1582 |
},
|
1583 |
"AQuA": {
|
1584 |
-
"Score":
|
1585 |
-
"Pass rate": 0.
|
1586 |
"Cost($)": 0.0,
|
1587 |
"Framework": "",
|
1588 |
"X-shot": "0.0",
|
1589 |
"Samples": 254,
|
1590 |
-
"All tokens":
|
1591 |
-
"Total input tokens":
|
1592 |
-
"Average input tokens":
|
1593 |
-
"Total output tokens":
|
1594 |
-
"Average output tokens":
|
1595 |
}
|
1596 |
},
|
1597 |
"Qwen2-1.5B-Instruct": {
|
@@ -1601,30 +1601,30 @@
|
|
1601 |
"Eval Date": "2025/1/22"
|
1602 |
},
|
1603 |
"gsm8k": {
|
1604 |
-
"Score":
|
1605 |
-
"Pass rate": 0.
|
1606 |
"Cost($)": 0.0,
|
1607 |
"Framework": "",
|
1608 |
"X-shot": "8.0",
|
1609 |
"Samples": 1319,
|
1610 |
-
"All tokens":
|
1611 |
-
"Total input tokens":
|
1612 |
-
"Average input tokens":
|
1613 |
-
"Total output tokens":
|
1614 |
-
"Average output tokens":
|
1615 |
},
|
1616 |
"AQuA": {
|
1617 |
-
"Score":
|
1618 |
-
"Pass rate": 0.
|
1619 |
"Cost($)": 0.0,
|
1620 |
"Framework": "",
|
1621 |
"X-shot": "0.0",
|
1622 |
"Samples": 254,
|
1623 |
-
"All tokens":
|
1624 |
-
"Total input tokens":
|
1625 |
-
"Average input tokens":
|
1626 |
-
"Total output tokens":
|
1627 |
-
"Average output tokens":
|
1628 |
}
|
1629 |
},
|
1630 |
"Qwen2-0.5B-Instruct": {
|
@@ -1634,30 +1634,30 @@
|
|
1634 |
"Eval Date": "2025/1/22"
|
1635 |
},
|
1636 |
"gsm8k": {
|
1637 |
-
"Score":
|
1638 |
-
"Pass rate": 0.
|
1639 |
"Cost($)": 0.0,
|
1640 |
"Framework": "",
|
1641 |
"X-shot": "8.0",
|
1642 |
"Samples": 1319,
|
1643 |
-
"All tokens":
|
1644 |
-
"Total input tokens":
|
1645 |
-
"Average input tokens":
|
1646 |
-
"Total output tokens":
|
1647 |
-
"Average output tokens":
|
1648 |
},
|
1649 |
"AQuA": {
|
1650 |
-
"Score":
|
1651 |
-
"Pass rate": 0.
|
1652 |
"Cost($)": 0.0,
|
1653 |
"Framework": "",
|
1654 |
"X-shot": "0.0",
|
1655 |
"Samples": 254,
|
1656 |
-
"All tokens":
|
1657 |
-
"Total input tokens":
|
1658 |
-
"Average input tokens":
|
1659 |
-
"Total output tokens":
|
1660 |
-
"Average output tokens":
|
1661 |
}
|
1662 |
}
|
1663 |
}
|
|
|
1 |
{
|
2 |
+
"time": "2025-01-24 15:10:27",
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"gpt-3.5-turbo": {
|
|
|
971 |
},
|
972 |
"gsm8k": {
|
973 |
"Score": 9.62,
|
974 |
+
"Pass rate": 0.1691,
|
975 |
"Cost($)": 0.0,
|
976 |
"Framework": "",
|
977 |
"X-shot": "8.0",
|
|
|
1337 |
"Eval Date": "2025/1/7"
|
1338 |
},
|
1339 |
"gsm8k": {
|
1340 |
+
"Score": 79.91,
|
1341 |
+
"Pass rate": 0.9992,
|
1342 |
+
"Cost($)": 3.3938,
|
1343 |
"Framework": "",
|
1344 |
"X-shot": "8.0",
|
1345 |
"Samples": 1319,
|
1346 |
+
"All tokens": 4089612,
|
1347 |
+
"Total input tokens": 2740652,
|
1348 |
+
"Average input tokens": 2078,
|
1349 |
+
"Total output tokens": 1348960,
|
1350 |
+
"Average output tokens": 1023
|
1351 |
},
|
1352 |
"AQuA": {
|
1353 |
+
"Score": 66.14,
|
1354 |
+
"Pass rate": 0.9921,
|
1355 |
+
"Cost($)": 0.7888,
|
1356 |
"Framework": "",
|
1357 |
"X-shot": "0.0",
|
1358 |
"Samples": 254,
|
1359 |
+
"All tokens": 847335,
|
1360 |
+
"Total input tokens": 482192,
|
1361 |
+
"Average input tokens": 1898,
|
1362 |
+
"Total output tokens": 365143,
|
1363 |
+
"Average output tokens": 1438
|
1364 |
}
|
1365 |
},
|
1366 |
"Doubao-lite-32k": {
|
|
|
1370 |
"Eval Date": "2025/1/7"
|
1371 |
},
|
1372 |
"gsm8k": {
|
1373 |
+
"Score": 87.26,
|
1374 |
+
"Pass rate": 0.9992,
|
1375 |
+
"Cost($)": 0.2083,
|
1376 |
"Framework": "",
|
1377 |
"X-shot": "8.0",
|
1378 |
"Samples": 1319,
|
1379 |
+
"All tokens": 3888813,
|
1380 |
+
"Total input tokens": 2691714,
|
1381 |
+
"Average input tokens": 2041,
|
1382 |
+
"Total output tokens": 1197099,
|
1383 |
+
"Average output tokens": 908
|
1384 |
},
|
1385 |
"AQuA": {
|
1386 |
+
"Score": 81.1,
|
1387 |
+
"Pass rate": 0.9724,
|
1388 |
+
"Cost($)": 0.0519,
|
1389 |
"Framework": "",
|
1390 |
"X-shot": "0.0",
|
1391 |
"Samples": 254,
|
1392 |
+
"All tokens": 885986,
|
1393 |
+
"Total input tokens": 503751,
|
1394 |
+
"Average input tokens": 1983,
|
1395 |
+
"Total output tokens": 382235,
|
1396 |
+
"Average output tokens": 1505
|
1397 |
}
|
1398 |
},
|
1399 |
"gpt-4o": {
|
|
|
1403 |
"Eval Date": "2025/1/22"
|
1404 |
},
|
1405 |
"gsm8k": {
|
1406 |
+
"Score": 90.3,
|
1407 |
+
"Pass rate": 0.9992,
|
1408 |
+
"Cost($)": 31.0542,
|
1409 |
"Framework": "",
|
1410 |
"X-shot": "8.0",
|
1411 |
"Samples": 1319,
|
1412 |
+
"All tokens": 5798173,
|
1413 |
+
"Total input tokens": 3590336,
|
1414 |
+
"Average input tokens": 2722,
|
1415 |
+
"Total output tokens": 2207837,
|
1416 |
+
"Average output tokens": 1674
|
1417 |
},
|
1418 |
"AQuA": {
|
1419 |
+
"Score": 86.61,
|
1420 |
+
"Pass rate": 0.9882,
|
1421 |
+
"Cost($)": 8.1485,
|
1422 |
"Framework": "",
|
1423 |
"X-shot": "0.0",
|
1424 |
"Samples": 254,
|
1425 |
+
"All tokens": 1373206,
|
1426 |
+
"Total input tokens": 744478,
|
1427 |
+
"Average input tokens": 2931,
|
1428 |
+
"Total output tokens": 628728,
|
1429 |
+
"Average output tokens": 2475
|
1430 |
}
|
1431 |
},
|
1432 |
"Qwen2.5-72B-Instruct": {
|
|
|
1436 |
"Eval Date": "2025/1/22"
|
1437 |
},
|
1438 |
"gsm8k": {
|
1439 |
+
"Score": 93.86,
|
1440 |
"Pass rate": 1.0,
|
1441 |
+
"Cost($)": 5.9858,
|
1442 |
"Framework": "",
|
1443 |
"X-shot": "8.0",
|
1444 |
"Samples": 1319,
|
1445 |
+
"All tokens": 10618008,
|
1446 |
+
"Total input tokens": 8136223,
|
1447 |
+
"Average input tokens": 6168,
|
1448 |
+
"Total output tokens": 2481785,
|
1449 |
+
"Average output tokens": 1882
|
1450 |
},
|
1451 |
"AQuA": {
|
1452 |
+
"Score": 85.04,
|
1453 |
+
"Pass rate": 0.9921,
|
1454 |
+
"Cost($)": 1.0348,
|
1455 |
"Framework": "",
|
1456 |
"X-shot": "0.0",
|
1457 |
"Samples": 254,
|
1458 |
+
"All tokens": 1835669,
|
1459 |
+
"Total input tokens": 1051218,
|
1460 |
+
"Average input tokens": 4139,
|
1461 |
+
"Total output tokens": 784451,
|
1462 |
+
"Average output tokens": 3088
|
1463 |
}
|
1464 |
},
|
1465 |
"Llama-3.3-70B-Instruct": {
|
|
|
1469 |
"Eval Date": "2025/1/22"
|
1470 |
},
|
1471 |
"gsm8k": {
|
1472 |
+
"Score": 95.07,
|
1473 |
"Pass rate": 1.0,
|
1474 |
+
"Cost($)": 6.2005,
|
1475 |
"Framework": "",
|
1476 |
"X-shot": "8.0",
|
1477 |
"Samples": 1319,
|
1478 |
+
"All tokens": 10998794,
|
1479 |
+
"Total input tokens": 8413717,
|
1480 |
+
"Average input tokens": 6379,
|
1481 |
+
"Total output tokens": 2585077,
|
1482 |
+
"Average output tokens": 1960
|
1483 |
},
|
1484 |
"AQuA": {
|
1485 |
+
"Score": 82.28,
|
1486 |
"Pass rate": 0.9921,
|
1487 |
+
"Cost($)": 1.0756,
|
1488 |
"Framework": "",
|
1489 |
"X-shot": "0.0",
|
1490 |
"Samples": 254,
|
1491 |
+
"All tokens": 1907924,
|
1492 |
+
"Total input tokens": 1135251,
|
1493 |
+
"Average input tokens": 4469,
|
1494 |
+
"Total output tokens": 772673,
|
1495 |
+
"Average output tokens": 3042
|
1496 |
}
|
1497 |
},
|
1498 |
"Qwen2.5-7B-Instruct": {
|
|
|
1502 |
"Eval Date": "2025/1/22"
|
1503 |
},
|
1504 |
"gsm8k": {
|
1505 |
+
"Score": 91.13,
|
1506 |
+
"Pass rate": 1.0,
|
1507 |
"Cost($)": 0.0,
|
1508 |
"Framework": "",
|
1509 |
"X-shot": "8.0",
|
1510 |
"Samples": 1319,
|
1511 |
+
"All tokens": 11140985,
|
1512 |
+
"Total input tokens": 8586888,
|
1513 |
+
"Average input tokens": 6510,
|
1514 |
+
"Total output tokens": 2554097,
|
1515 |
+
"Average output tokens": 1936
|
1516 |
},
|
1517 |
"AQuA": {
|
1518 |
+
"Score": 79.92,
|
1519 |
"Pass rate": 1.0,
|
1520 |
"Cost($)": 0.0,
|
1521 |
"Framework": "",
|
1522 |
"X-shot": "0.0",
|
1523 |
"Samples": 254,
|
1524 |
+
"All tokens": 1845332,
|
1525 |
+
"Total input tokens": 1098280,
|
1526 |
+
"Average input tokens": 4324,
|
1527 |
+
"Total output tokens": 747052,
|
1528 |
+
"Average output tokens": 2941
|
1529 |
}
|
1530 |
},
|
1531 |
"Llama-3.1-8B-Instruct": {
|
|
|
1535 |
"Eval Date": "2025/1/22"
|
1536 |
},
|
1537 |
"gsm8k": {
|
1538 |
+
"Score": 73.46,
|
1539 |
"Pass rate": 0.9955,
|
1540 |
"Cost($)": 0.0,
|
1541 |
"Framework": "",
|
1542 |
"X-shot": "8.0",
|
1543 |
"Samples": 1319,
|
1544 |
+
"All tokens": 11778716,
|
1545 |
+
"Total input tokens": 8630514,
|
1546 |
+
"Average input tokens": 6543,
|
1547 |
+
"Total output tokens": 3148202,
|
1548 |
+
"Average output tokens": 2387
|
1549 |
},
|
1550 |
"AQuA": {
|
1551 |
+
"Score": 59.45,
|
1552 |
+
"Pass rate": 0.9724,
|
1553 |
"Cost($)": 0.0,
|
1554 |
"Framework": "",
|
1555 |
"X-shot": "0.0",
|
1556 |
"Samples": 254,
|
1557 |
+
"All tokens": 1651333,
|
1558 |
+
"Total input tokens": 971003,
|
1559 |
+
"Average input tokens": 3823,
|
1560 |
+
"Total output tokens": 680330,
|
1561 |
+
"Average output tokens": 2678
|
1562 |
}
|
1563 |
},
|
1564 |
"Internllm2_5-7B": {
|
|
|
1568 |
"Eval Date": "2025/1/22"
|
1569 |
},
|
1570 |
"gsm8k": {
|
1571 |
+
"Score": 48.22,
|
1572 |
+
"Pass rate": 0.9841,
|
1573 |
"Cost($)": 0.0,
|
1574 |
"Framework": "",
|
1575 |
"X-shot": "8.0",
|
1576 |
"Samples": 1319,
|
1577 |
+
"All tokens": 14526431,
|
1578 |
+
"Total input tokens": 10678792,
|
1579 |
+
"Average input tokens": 8096,
|
1580 |
+
"Total output tokens": 3847639,
|
1581 |
+
"Average output tokens": 2917
|
1582 |
},
|
1583 |
"AQuA": {
|
1584 |
+
"Score": 39.37,
|
1585 |
+
"Pass rate": 0.9803,
|
1586 |
"Cost($)": 0.0,
|
1587 |
"Framework": "",
|
1588 |
"X-shot": "0.0",
|
1589 |
"Samples": 254,
|
1590 |
+
"All tokens": 2296222,
|
1591 |
+
"Total input tokens": 1420494,
|
1592 |
+
"Average input tokens": 5592,
|
1593 |
+
"Total output tokens": 875728,
|
1594 |
+
"Average output tokens": 3448
|
1595 |
}
|
1596 |
},
|
1597 |
"Qwen2-1.5B-Instruct": {
|
|
|
1601 |
"Eval Date": "2025/1/22"
|
1602 |
},
|
1603 |
"gsm8k": {
|
1604 |
+
"Score": 11.75,
|
1605 |
+
"Pass rate": 0.9189,
|
1606 |
"Cost($)": 0.0,
|
1607 |
"Framework": "",
|
1608 |
"X-shot": "8.0",
|
1609 |
"Samples": 1319,
|
1610 |
+
"All tokens": 12411942,
|
1611 |
+
"Total input tokens": 9066115,
|
1612 |
+
"Average input tokens": 6873,
|
1613 |
+
"Total output tokens": 3345827,
|
1614 |
+
"Average output tokens": 2537
|
1615 |
},
|
1616 |
"AQuA": {
|
1617 |
+
"Score": 23.62,
|
1618 |
+
"Pass rate": 0.9646,
|
1619 |
"Cost($)": 0.0,
|
1620 |
"Framework": "",
|
1621 |
"X-shot": "0.0",
|
1622 |
"Samples": 254,
|
1623 |
+
"All tokens": 1775335,
|
1624 |
+
"Total input tokens": 1034362,
|
1625 |
+
"Average input tokens": 4072,
|
1626 |
+
"Total output tokens": 740973,
|
1627 |
+
"Average output tokens": 2917
|
1628 |
}
|
1629 |
},
|
1630 |
"Qwen2-0.5B-Instruct": {
|
|
|
1634 |
"Eval Date": "2025/1/22"
|
1635 |
},
|
1636 |
"gsm8k": {
|
1637 |
+
"Score": 1.67,
|
1638 |
+
"Pass rate": 0.9469,
|
1639 |
"Cost($)": 0.0,
|
1640 |
"Framework": "",
|
1641 |
"X-shot": "8.0",
|
1642 |
"Samples": 1319,
|
1643 |
+
"All tokens": 16465720,
|
1644 |
+
"Total input tokens": 11019864,
|
1645 |
+
"Average input tokens": 8355,
|
1646 |
+
"Total output tokens": 5445856,
|
1647 |
+
"Average output tokens": 4129
|
1648 |
},
|
1649 |
"AQuA": {
|
1650 |
+
"Score": 22.83,
|
1651 |
+
"Pass rate": 0.9724,
|
1652 |
"Cost($)": 0.0,
|
1653 |
"Framework": "",
|
1654 |
"X-shot": "0.0",
|
1655 |
"Samples": 254,
|
1656 |
+
"All tokens": 2215091,
|
1657 |
+
"Total input tokens": 1246929,
|
1658 |
+
"Average input tokens": 4909,
|
1659 |
+
"Total output tokens": 968162,
|
1660 |
+
"Average output tokens": 3812
|
1661 |
}
|
1662 |
}
|
1663 |
}
|
src/detail_results.csv
CHANGED
@@ -1,16 +1,16 @@
|
|
1 |
Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
|
2 |
-
1,SC-CoT,AQuA,gpt-4o,2025/1/22,
|
3 |
-
2,
|
4 |
-
3,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,
|
5 |
-
4,
|
6 |
-
5,
|
7 |
-
6,
|
8 |
-
7,
|
9 |
-
8,CoT,AQuA,
|
10 |
-
9,CoT,AQuA,
|
11 |
-
10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.
|
12 |
-
11,
|
13 |
-
12,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,
|
14 |
13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
|
15 |
14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
|
16 |
15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0.0,0.768,,254,1362379,1119143,4406,243236,958
|
@@ -22,60 +22,60 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
|
|
22 |
21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
|
23 |
22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
|
24 |
23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
|
25 |
-
24,
|
26 |
-
25,
|
27 |
26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
|
28 |
27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
|
29 |
28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0.0,0.0,,254,144435,32555,128,111880,440
|
30 |
29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
|
31 |
-
30,
|
32 |
-
31,ReAct-Pro*,AQuA,
|
33 |
-
32,
|
34 |
33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
|
35 |
34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0.0,0.0,,254,133106,26459,104,106647,420
|
36 |
35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
|
37 |
36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
|
38 |
37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0.0,0.0,,254,110040,30477,120,79563,313
|
39 |
-
38,
|
40 |
-
39,
|
41 |
-
40,PoT,AQuA,
|
42 |
-
41,
|
43 |
42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0.0,0.0,,254,117339,30477,120,86862,342
|
44 |
43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0.0,0.0,,254,298475,246560,971,51915,204
|
45 |
-
44,
|
46 |
-
45,
|
47 |
-
46,
|
48 |
-
47,
|
49 |
-
48,
|
50 |
-
49,
|
51 |
50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0.0,0.0,,254,322281,258867,1019,63414,250
|
52 |
-
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.
|
53 |
2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
|
54 |
3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
|
55 |
-
4,
|
56 |
-
5,
|
57 |
-
6,
|
58 |
-
7,
|
59 |
-
8,
|
60 |
-
9,SC-CoT,gsm8k,Qwen2.5-
|
61 |
-
10,CoT,gsm8k,
|
62 |
-
11,
|
63 |
-
12,
|
64 |
13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8.0,10.1124,,1319,17937864,17038928,12918,898936,682
|
65 |
14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
|
66 |
-
15,
|
67 |
-
16,
|
68 |
-
17,
|
69 |
-
18,
|
70 |
19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
|
71 |
-
20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,
|
72 |
21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
|
73 |
22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
|
74 |
23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8.0,0.0,,1319,1202163,968163,734,234000,177
|
75 |
24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
|
76 |
25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
|
77 |
-
26,
|
78 |
-
27,
|
79 |
28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8.0,0.9736,,1319,1727044,1126025,854,601019,456
|
80 |
29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
|
81 |
30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
|
@@ -84,7 +84,7 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
|
|
84 |
33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
|
85 |
34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8.0,0.0,,1319,1745429,550941,418,1194488,906
|
86 |
35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
|
87 |
-
36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,
|
88 |
37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
|
89 |
38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
|
90 |
39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
|
@@ -94,8 +94,8 @@ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Sa
|
|
94 |
43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8.0,0.0,,1319,1327522,1151528,873,175994,133
|
95 |
44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
|
96 |
45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
|
97 |
-
46,
|
98 |
-
47,
|
99 |
-
48,
|
100 |
-
49,
|
101 |
-
50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,
|
|
|
1 |
Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
|
2 |
+
1,SC-CoT,AQuA,gpt-4o,2025/1/22,86.61,0.9882,0.0,8.1485,,254,1373206,744478,2931,628728,2475
|
3 |
+
2,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
|
4 |
+
3,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.04,0.9921,0.0,1.0348,,254,1835669,1051218,4139,784451,3088
|
5 |
+
4,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.9961,0.0,0.0742,,254,131604,25397,100,106207,418
|
6 |
+
5,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9843,0.0,0.0927,,254,164389,32555,128,131834,519
|
7 |
+
6,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.68,0.9921,0.0,0.0798,,254,141567,32809,129,108758,428
|
8 |
+
7,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.68,0.9724,0.0,0.0066,,254,94577,27978,110,66599,262
|
9 |
+
8,CoT,AQuA,gpt-4o,2025/1/22,82.68,0.9803,0.0,1.0417,,254,123017,25123,99,97894,385
|
10 |
+
9,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.28,0.9921,0.0,1.0756,,254,1907924,1135251,4469,772673,3042
|
11 |
+
10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.1,0.9724,0.0,0.0519,,254,885986,503751,1983,382235,1505
|
12 |
+
11,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.71,0.9961,0.0,0.0,,254,149736,33017,130,116719,460
|
13 |
+
12,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,79.92,1.0,0.0,0.0,,254,1845332,1098280,4324,747052,2941
|
14 |
13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.53,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
|
15 |
14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
|
16 |
15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.9961,0.0,0.768,,254,1362379,1119143,4406,243236,958
|
|
|
22 |
21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.41,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
|
23 |
22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.23,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
|
24 |
23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
|
25 |
+
24,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
|
26 |
+
25,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,66.14,0.9921,0.0,0.7888,,254,847335,482192,1898,365143,1438
|
27 |
26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.57,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
|
28 |
27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
|
29 |
28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.63,1.0,0.0,0.0,,254,144435,32555,128,111880,440
|
30 |
29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.45,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
|
31 |
+
30,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,59.45,0.9724,0.0,0.0,,254,1651333,971003,3823,680330,2678
|
32 |
+
31,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
|
33 |
+
32,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
|
34 |
33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.76,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
|
35 |
34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9882,0.0,0.0,,254,133106,26459,104,106647,420
|
36 |
35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.64,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
|
37 |
36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
|
38 |
37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9882,0.0,0.0,,254,110040,30477,120,79563,313
|
39 |
+
38,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,39.37,0.9803,0.0,0.0,,254,2296222,1420494,5592,875728,3448
|
40 |
+
39,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.98,1.0,0.0,0.038,,254,42471,25701,101,16770,66
|
41 |
+
40,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
|
42 |
+
41,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9882,0.0,0.0,,254,301962,233505,919,68457,270
|
43 |
42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9882,0.0,0.0,,254,117339,30477,120,86862,342
|
44 |
43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.71,0.9646,0.0,0.0,,254,298475,246560,971,51915,204
|
45 |
+
44,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9764,0.0,0.0,,254,71047,27937,110,43110,170
|
46 |
+
45,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.17,0.9882,0.0,0.0,,254,110415,27937,110,82478,325
|
47 |
+
46,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
|
48 |
+
47,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.02,0.9685,0.0,0.0,,254,7170087,6344167,24977,825920,3252
|
49 |
+
48,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,23.62,0.9646,0.0,0.0,,254,1775335,1034362,4072,740973,2917
|
50 |
+
49,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,22.83,0.9724,0.0,0.0,,254,2215091,1246929,4909,968162,3812
|
51 |
50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9213,0.0,0.0,,254,322281,258867,1019,63414,250
|
52 |
+
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.07,1.0,8.0,6.2005,,1319,10998794,8413717,6379,2585077,1960
|
53 |
2,CoT,gsm8k,gpt-4o,2025/1/22,94.09,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
|
54 |
3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
|
55 |
+
4,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,93.86,1.0,8.0,5.9858,,1319,10618008,8136223,6168,2481785,1882
|
56 |
+
5,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
|
57 |
+
6,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
|
58 |
+
7,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
|
59 |
+
8,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.27,1.0,8.0,0.4709,,1319,835275,583916,443,251359,191
|
60 |
+
9,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,91.13,1.0,8.0,0.0,,1319,11140985,8586888,6510,2554097,1936
|
61 |
+
10,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.3,0.9992,8.0,31.0542,,1319,5798173,3590336,2722,2207837,1674
|
62 |
+
11,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
|
63 |
+
12,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
|
64 |
13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,8.0,10.1124,,1319,17937864,17038928,12918,898936,682
|
65 |
14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
|
66 |
+
15,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,87.26,0.9992,8.0,0.2083,,1319,3888813,2691714,2041,1197099,908
|
67 |
+
16,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
|
68 |
+
17,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
|
69 |
+
18,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.6,0.9962,8.0,0.2512,,1319,5998639,5862016,4444,136623,104
|
70 |
19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.87,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
|
71 |
+
20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,79.91,0.9992,8.0,3.3938,,1319,4089612,2740652,2078,1348960,1023
|
72 |
21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.61,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
|
73 |
22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.7,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
|
74 |
23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.997,8.0,0.0,,1319,1202163,968163,734,234000,177
|
75 |
24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.88,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
|
76 |
25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.44,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
|
77 |
+
26,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.91,0.9939,8.0,3.4633,,1319,6646286,6506164,4933,140122,106
|
78 |
+
27,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,73.46,0.9955,8.0,0.0,,1319,11778716,8630514,6543,3148202,2387
|
79 |
28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.09,0.7961,8.0,0.9736,,1319,1727044,1126025,854,601019,456
|
80 |
29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
|
81 |
30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.78,0.9856,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
|
|
|
84 |
33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
|
85 |
34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9955,8.0,0.0,,1319,1745429,550941,418,1194488,906
|
86 |
35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.5,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
|
87 |
+
36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,48.22,0.9841,8.0,0.0,,1319,14526431,10678792,8096,3847639,2917
|
88 |
37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.67,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
|
89 |
38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
|
90 |
39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
|
|
|
94 |
43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.5,0.3101,8.0,0.0,,1319,1327522,1151528,873,175994,133
|
95 |
44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.68,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
|
96 |
45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.71,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
|
97 |
+
46,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,11.75,0.9189,8.0,0.0,,1319,12411942,9066115,6873,3345827,2537
|
98 |
+
47,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.6,0.9795,8.0,0.0,,1319,1113728,679302,515,434426,329
|
99 |
+
48,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.1691,8.0,0.0,,1319,1389135,1151528,873,237607,180
|
100 |
+
49,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.66,0.9522,8.0,0.0,,1319,55392611,52431343,39751,2961268,2245
|
101 |
+
50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,1.67,0.9469,8.0,0.0,,1319,16465720,11019864,8355,5445856,4129
|
src/overall_math_score.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"time": "2025-01-
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"META": {
|
@@ -68,12 +68,12 @@
|
|
68 |
"Eval Date": "2025/1/7"
|
69 |
},
|
70 |
"gsm8k": {
|
71 |
-
"Score":
|
72 |
-
"Cost($)":
|
73 |
},
|
74 |
"AQuA": {
|
75 |
-
"Score":
|
76 |
-
"Cost($)": 0.
|
77 |
}
|
78 |
},
|
79 |
"IO-Doubao-lite-32k": {
|
@@ -143,12 +143,12 @@
|
|
143 |
"Eval Date": "2025/1/7"
|
144 |
},
|
145 |
"gsm8k": {
|
146 |
-
"Score":
|
147 |
-
"Cost($)": 0.
|
148 |
},
|
149 |
"AQuA": {
|
150 |
-
"Score": 81.
|
151 |
-
"Cost($)": 0.
|
152 |
}
|
153 |
},
|
154 |
"IO-gpt-4o": {
|
@@ -218,12 +218,12 @@
|
|
218 |
"Eval Date": "2025/1/22"
|
219 |
},
|
220 |
"gsm8k": {
|
221 |
-
"Score": 90.
|
222 |
-
"Cost($)":
|
223 |
},
|
224 |
"AQuA": {
|
225 |
-
"Score":
|
226 |
-
"Cost($)":
|
227 |
}
|
228 |
},
|
229 |
"IO-Qwen2.5-72B-Instruct": {
|
@@ -293,12 +293,12 @@
|
|
293 |
"Eval Date": "2025/1/22"
|
294 |
},
|
295 |
"gsm8k": {
|
296 |
-
"Score":
|
297 |
-
"Cost($)":
|
298 |
},
|
299 |
"AQuA": {
|
300 |
-
"Score": 85.
|
301 |
-
"Cost($)":
|
302 |
}
|
303 |
},
|
304 |
"IO-Llama-3.3-70B-Instruct": {
|
@@ -368,12 +368,12 @@
|
|
368 |
"Eval Date": "2025/1/22"
|
369 |
},
|
370 |
"gsm8k": {
|
371 |
-
"Score": 95.
|
372 |
-
"Cost($)":
|
373 |
},
|
374 |
"AQuA": {
|
375 |
-
"Score":
|
376 |
-
"Cost($)":
|
377 |
}
|
378 |
},
|
379 |
"IO-Qwen2.5-7B-Instruct": {
|
@@ -443,11 +443,11 @@
|
|
443 |
"Eval Date": "2025/1/22"
|
444 |
},
|
445 |
"gsm8k": {
|
446 |
-
"Score":
|
447 |
"Cost($)": 0.0
|
448 |
},
|
449 |
"AQuA": {
|
450 |
-
"Score":
|
451 |
"Cost($)": 0.0
|
452 |
}
|
453 |
},
|
@@ -518,11 +518,11 @@
|
|
518 |
"Eval Date": "2025/1/22"
|
519 |
},
|
520 |
"gsm8k": {
|
521 |
-
"Score":
|
522 |
"Cost($)": 0.0
|
523 |
},
|
524 |
"AQuA": {
|
525 |
-
"Score":
|
526 |
"Cost($)": 0.0
|
527 |
}
|
528 |
},
|
@@ -593,11 +593,11 @@
|
|
593 |
"Eval Date": "2025/1/22"
|
594 |
},
|
595 |
"gsm8k": {
|
596 |
-
"Score":
|
597 |
"Cost($)": 0.0
|
598 |
},
|
599 |
"AQuA": {
|
600 |
-
"Score":
|
601 |
"Cost($)": 0.0
|
602 |
}
|
603 |
},
|
@@ -668,11 +668,11 @@
|
|
668 |
"Eval Date": "2025/1/22"
|
669 |
},
|
670 |
"gsm8k": {
|
671 |
-
"Score":
|
672 |
"Cost($)": 0.0
|
673 |
},
|
674 |
"AQuA": {
|
675 |
-
"Score":
|
676 |
"Cost($)": 0.0
|
677 |
}
|
678 |
},
|
@@ -743,11 +743,11 @@
|
|
743 |
"Eval Date": "2025/1/22"
|
744 |
},
|
745 |
"gsm8k": {
|
746 |
-
"Score":
|
747 |
"Cost($)": 0.0
|
748 |
},
|
749 |
"AQuA": {
|
750 |
-
"Score":
|
751 |
"Cost($)": 0.0
|
752 |
}
|
753 |
}
|
|
|
1 |
{
|
2 |
+
"time": "2025-01-24 15:10:27",
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"META": {
|
|
|
68 |
"Eval Date": "2025/1/7"
|
69 |
},
|
70 |
"gsm8k": {
|
71 |
+
"Score": 79.91,
|
72 |
+
"Cost($)": 3.3938
|
73 |
},
|
74 |
"AQuA": {
|
75 |
+
"Score": 66.14,
|
76 |
+
"Cost($)": 0.7888
|
77 |
}
|
78 |
},
|
79 |
"IO-Doubao-lite-32k": {
|
|
|
143 |
"Eval Date": "2025/1/7"
|
144 |
},
|
145 |
"gsm8k": {
|
146 |
+
"Score": 87.26,
|
147 |
+
"Cost($)": 0.2083
|
148 |
},
|
149 |
"AQuA": {
|
150 |
+
"Score": 81.1,
|
151 |
+
"Cost($)": 0.0519
|
152 |
}
|
153 |
},
|
154 |
"IO-gpt-4o": {
|
|
|
218 |
"Eval Date": "2025/1/22"
|
219 |
},
|
220 |
"gsm8k": {
|
221 |
+
"Score": 90.3,
|
222 |
+
"Cost($)": 31.0542
|
223 |
},
|
224 |
"AQuA": {
|
225 |
+
"Score": 86.61,
|
226 |
+
"Cost($)": 8.1485
|
227 |
}
|
228 |
},
|
229 |
"IO-Qwen2.5-72B-Instruct": {
|
|
|
293 |
"Eval Date": "2025/1/22"
|
294 |
},
|
295 |
"gsm8k": {
|
296 |
+
"Score": 93.86,
|
297 |
+
"Cost($)": 5.9858
|
298 |
},
|
299 |
"AQuA": {
|
300 |
+
"Score": 85.04,
|
301 |
+
"Cost($)": 1.0348
|
302 |
}
|
303 |
},
|
304 |
"IO-Llama-3.3-70B-Instruct": {
|
|
|
368 |
"Eval Date": "2025/1/22"
|
369 |
},
|
370 |
"gsm8k": {
|
371 |
+
"Score": 95.07,
|
372 |
+
"Cost($)": 6.2005
|
373 |
},
|
374 |
"AQuA": {
|
375 |
+
"Score": 82.28,
|
376 |
+
"Cost($)": 1.0756
|
377 |
}
|
378 |
},
|
379 |
"IO-Qwen2.5-7B-Instruct": {
|
|
|
443 |
"Eval Date": "2025/1/22"
|
444 |
},
|
445 |
"gsm8k": {
|
446 |
+
"Score": 91.13,
|
447 |
"Cost($)": 0.0
|
448 |
},
|
449 |
"AQuA": {
|
450 |
+
"Score": 79.92,
|
451 |
"Cost($)": 0.0
|
452 |
}
|
453 |
},
|
|
|
518 |
"Eval Date": "2025/1/22"
|
519 |
},
|
520 |
"gsm8k": {
|
521 |
+
"Score": 73.46,
|
522 |
"Cost($)": 0.0
|
523 |
},
|
524 |
"AQuA": {
|
525 |
+
"Score": 59.45,
|
526 |
"Cost($)": 0.0
|
527 |
}
|
528 |
},
|
|
|
593 |
"Eval Date": "2025/1/22"
|
594 |
},
|
595 |
"gsm8k": {
|
596 |
+
"Score": 48.22,
|
597 |
"Cost($)": 0.0
|
598 |
},
|
599 |
"AQuA": {
|
600 |
+
"Score": 39.37,
|
601 |
"Cost($)": 0.0
|
602 |
}
|
603 |
},
|
|
|
668 |
"Eval Date": "2025/1/22"
|
669 |
},
|
670 |
"gsm8k": {
|
671 |
+
"Score": 11.75,
|
672 |
"Cost($)": 0.0
|
673 |
},
|
674 |
"AQuA": {
|
675 |
+
"Score": 23.62,
|
676 |
"Cost($)": 0.0
|
677 |
}
|
678 |
},
|
|
|
743 |
"Eval Date": "2025/1/22"
|
744 |
},
|
745 |
"gsm8k": {
|
746 |
+
"Score": 1.67,
|
747 |
"Cost($)": 0.0
|
748 |
},
|
749 |
"AQuA": {
|
750 |
+
"Score": 22.83,
|
751 |
"Cost($)": 0.0
|
752 |
}
|
753 |
}
|
src/overall_results.csv
CHANGED
@@ -1,40 +1,40 @@
|
|
1 |
Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($)
|
2 |
-
1.0,
|
3 |
-
2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.
|
4 |
-
3.0,
|
5 |
-
4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.
|
6 |
-
5.0,CoT,gpt-4o,2025/1/22,88.
|
7 |
-
6.0,
|
8 |
7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.48,92.27,0.4709,82.68,0.0798
|
9 |
8.0,CoT,Doubao-lite-32k,2025/1/7,86.00,89.31,0.0558,82.68,0.0066
|
10 |
-
9.0,
|
11 |
-
10.0,
|
12 |
-
11.0,
|
13 |
-
12.0,PoT,
|
14 |
-
13.0,
|
15 |
-
14.0,
|
16 |
-
15.0,
|
17 |
16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
|
18 |
17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.58,85.6,0.2512,77.56,0.0445
|
19 |
18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.25,87.26,10.5479,73.23,0.3177
|
20 |
19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.64,82.87,0.0,74.41,0.0
|
21 |
-
20.0,
|
22 |
-
21.0,PoT,
|
23 |
-
22.0,
|
24 |
-
23.0,
|
25 |
24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.7,0.6788,61.02,0.0957
|
26 |
25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.74,74.91,3.4633,64.57,0.4928
|
27 |
26.0,PoT,gpt-3.5-turbo,2025/1/7,68.17,76.88,0.6902,59.45,0.1748
|
28 |
27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.04,75.44,0.0,60.63,0.0
|
29 |
28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
|
30 |
-
29.0,CoT,
|
31 |
-
30.0,
|
32 |
31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
|
33 |
32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.65,67.78,0.0,55.51,0.0
|
34 |
33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.40,63.31,39.0751,57.48,2.304
|
35 |
34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
|
36 |
35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.03,55.5,0.0,40.55,0.0
|
37 |
-
36.0,SC-CoT,Internllm2_5-7B,2025/1/22,
|
38 |
37.0,IO,gpt-3.5-turbo,2025/1/7,38.41,37.83,0.3328,38.98,0.038
|
39 |
38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.67,0.0,36.61,0.0
|
40 |
39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
|
@@ -45,7 +45,7 @@ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA
|
|
45 |
44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.61,18.5,0.0,30.71,0.0
|
46 |
45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.91,16.68,0.0,29.13,0.0
|
47 |
46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.94,14.71,0.0,27.17,0.0
|
48 |
-
47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.
|
49 |
-
48.0,
|
50 |
-
49.0,
|
51 |
-
50.0,
|
|
|
1 |
Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($)
|
2 |
+
1.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
|
3 |
+
2.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,89.45,93.86,5.9858,85.04,1.0348
|
4 |
+
3.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
|
5 |
+
4.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,88.68,95.07,6.2005,82.28,1.0756
|
6 |
+
5.0,SC-CoT,gpt-4o,2025/1/22,88.46,90.3,31.0542,86.61,8.1485
|
7 |
+
6.0,CoT,gpt-4o,2025/1/22,88.39,94.09,4.5367,82.68,1.0417
|
8 |
7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.48,92.27,0.4709,82.68,0.0798
|
9 |
8.0,CoT,Doubao-lite-32k,2025/1/7,86.00,89.31,0.0558,82.68,0.0066
|
10 |
+
9.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,85.53,91.13,0.0,79.92,0.0
|
11 |
+
10.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
|
12 |
+
11.0,SC-CoT,Doubao-lite-32k,2025/1/7,84.18,87.26,0.2083,81.1,0.0519
|
13 |
+
12.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.2,1.6087
|
14 |
+
13.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.2,0.1645
|
15 |
+
14.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
|
16 |
+
15.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.71,0.0
|
17 |
16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
|
18 |
17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.58,85.6,0.2512,77.56,0.0445
|
19 |
18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.25,87.26,10.5479,73.23,0.3177
|
20 |
19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.64,82.87,0.0,74.41,0.0
|
21 |
+
20.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.31,73.09,0.9736,79.53,0.1746
|
22 |
+
21.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.61,0.0576,71.65,0.0147
|
23 |
+
22.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
|
24 |
+
23.0,SC-CoT,gpt-3.5-turbo,2025/1/7,73.03,79.91,3.3938,66.14,0.7888
|
25 |
24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.7,0.6788,61.02,0.0957
|
26 |
25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.74,74.91,3.4633,64.57,0.4928
|
27 |
26.0,PoT,gpt-3.5-turbo,2025/1/7,68.17,76.88,0.6902,59.45,0.1748
|
28 |
27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.04,75.44,0.0,60.63,0.0
|
29 |
28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
|
30 |
+
29.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,66.46,73.46,0.0,59.45,0.0
|
31 |
+
30.0,CoT,Internllm2_5-7B,2025/1/22,65.24,77.71,0.0,52.76,0.0
|
32 |
31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
|
33 |
32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.65,67.78,0.0,55.51,0.0
|
34 |
33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.40,63.31,39.0751,57.48,2.304
|
35 |
34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
|
36 |
35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.03,55.5,0.0,40.55,0.0
|
37 |
+
36.0,SC-CoT,Internllm2_5-7B,2025/1/22,43.80,48.22,0.0,39.37,0.0
|
38 |
37.0,IO,gpt-3.5-turbo,2025/1/7,38.41,37.83,0.3328,38.98,0.038
|
39 |
38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.67,0.0,36.61,0.0
|
40 |
39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
|
|
|
45 |
44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.61,18.5,0.0,30.71,0.0
|
46 |
45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.91,16.68,0.0,29.13,0.0
|
47 |
46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.94,14.71,0.0,27.17,0.0
|
48 |
+
47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.69,11.75,0.0,23.62,0.0
|
49 |
+
48.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.84,7.66,0.0,24.02,0.0
|
50 |
+
49.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
|
51 |
+
50.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,12.25,1.67,0.0,22.83,0.0
|
src/record.csv
CHANGED
@@ -28,7 +28,7 @@ PoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,58.83,70.51,8,,1319,"1,145,390",868,"217
|
|
28 |
PoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,38.67,55.42,8,,1319,"1,147,538",870,"243,573",185,"1,391,111",0.0000,,,,,,,,,,,,,,,,,,,,
|
29 |
PoT,gsm8k,2025/1/22,Internllm2_5-7B,38.21,48.9,8,,1319,"1,136,843",862,"188,106",143,"1,324,949",0.0000,,,,,,,,,,,,,,,,,,,,
|
30 |
PoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,18.5,31.01,8,,1319,"1,151,528",873,"175,994",133,"1,327,522",0.0000,,,,,,,,,,,,,,,,,,,,
|
31 |
-
PoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,9.62,16.
|
32 |
CoT,gsm8k,2025/1/7,gpt-3.5-turbo,78.7,100,8,,1319,"953,242",723,"134,799",102,"1,088,041",0.6788,,,,,,,,,,,,,,,,,,,,
|
33 |
CoT,gsm8k,2025/1/7,Doubao-lite-32k,89.31,100,8,,1319,"1,042,095",790,"159,725",121,"1,201,820",0.0558,0.4084635 (元),,,,,,,,,,,,,,,,,,,
|
34 |
CoT,gsm8k,2025/1/22,gpt-4o,94.09,100,8,,1319,"948,668",719,"216,498",164,"1,165,166",4.5367,,,,,,,,,,,,,,,,,,,,
|
@@ -39,16 +39,16 @@ CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,75.44,99.92,8,,1319,"990,168",751,"258
|
|
39 |
CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.7,8,,1319,"968,163",734,"234,000",177,"1,202,163",0.0000,,,,,,,,,,,,,,,,,,,,
|
40 |
CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.5,100,8,,1319,"1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,
|
41 |
CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,1319,"1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,
|
42 |
-
SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,
|
43 |
-
SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,
|
44 |
-
SC-CoT,gsm8k,2025/1/22,gpt-4o,90.
|
45 |
-
SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,
|
46 |
-
SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.
|
47 |
-
SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,
|
48 |
-
SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,
|
49 |
-
SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,
|
50 |
-
SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,
|
51 |
-
SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,
|
52 |
IO,AQuA,2025/1/7,gpt-3.5-turbo,38.98,100,0,,254,"25,701",101,"16,770",66,"42,471",0.0380,,,,,,,,,,,,,,,,,,,,
|
53 |
IO,AQuA,2025/1/7,Doubao-lite-32k,79.13,100,0,,254,"33,058",130,"54,684",215,"87,742",0.0058,0.0427(元),,,,,,,,,,,,,,,,,,,
|
54 |
IO,AQuA,2025/1/22,gpt-4o,75.59,97.24,0,,254,"25,631",101,"108,121",426,"133,752",1.1453,,,,,,,,,,,,,,,,,,,,
|
@@ -59,7 +59,7 @@ IO,AQuA,2025/1/22,Llama-3.1-8B-Instruct,51.18,98.82,0,,254,"26,459",104,"106,647
|
|
59 |
IO,AQuA,2025/1/22,Internllm2_5-7B,47.64,90.94,0,,254,"50,232",198,"134,809",531,"185,041",0.0000,,,,,,,,,,,,,,,,,,,,
|
60 |
IO,AQuA,2025/1/22,Qwen2-1.5B-Instruct,29.13,97.64,0,,254,"27,937",110,"43,110",170,"71,047",0.0000,,,,,,,,,,,,,,,,,,,,
|
61 |
IO,AQuA,2025/1/22,Qwen2-0.5B-Instruct,27.17,98.82,0,,254,"27,937",110,"82,478",325,"110,415",0.0000,,,,,,,,,,,,,,,,,,,,
|
62 |
-
CoT,AQuA,2025/1/
|
63 |
CoT,AQuA,2025/1/7,Doubao-lite-32k,82.68,97.24,0,,254,"27,978",110,"66,599",262,"94,577",0.0066,0.0483 (元),,,,,,,,,,,,,,,,,,,
|
64 |
CoT,AQuA,2025/1/22,gpt-4o,82.68,98.03,0,,254,"25,123",99,"97,894",385,"123,017",1.0417,,,,,,,,,,,,,,,,,,,,
|
65 |
CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,86.22,99.21,0,,254,"25,143",99,"118,146",465,"143,289",0.0808,,,,,,,,,,,,,,,,,,,,
|
@@ -79,16 +79,16 @@ PoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,36.61,96.85,0,,254,"240,613",947,"50,30
|
|
79 |
PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270,"301,962",0.0000,,,,,,,,,,,,,,,,,,,,
|
80 |
PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,
|
81 |
PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,
|
82 |
-
SC-CoT,AQuA,2025/1/7,gpt-3.5-turbo,
|
83 |
-
SC-CoT,AQuA,2025/1/7,Doubao-lite-32k,81.
|
84 |
-
SC-CoT,AQuA,2025/1/22,gpt-4o,
|
85 |
-
SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.
|
86 |
-
SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,
|
87 |
-
SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,
|
88 |
-
SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,
|
89 |
-
SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,
|
90 |
-
SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,
|
91 |
-
SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,
|
92 |
ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
93 |
ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
94 |
ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
@@ -98,7 +98,7 @@ ReAct-Pro*,AQuA,2025/1/22,Qwen2.5-7B-Instruct,74.41,99.21,0,max_steps=10,254,"56
|
|
98 |
ReAct-Pro*,AQuA,2025/1/22,Llama-3.1-8B-Instruct,55.51,96.85,0,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,,,,,,,,,,,,,,,,,,,,
|
99 |
ReAct-Pro*,AQuA,2025/1/22,Internllm2_5-7B,40.94,96.85,0,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,,,,,,,,,,,,,,,,,,,,
|
100 |
ReAct-Pro*,AQuA,2025/1/22,Qwen2-1.5B-Instruct,25.59,96.06,0,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,,,,,,,,,,,,,,,,,,,,
|
101 |
-
ReAct-Pro*,AQuA,2025/1/22,Qwen2-0.5B-Instruct,24.02,96.85,0,max_steps=10,254,
|
102 |
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
103 |
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
104 |
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
|
|
28 |
PoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,38.67,55.42,8,,1319,"1,147,538",870,"243,573",185,"1,391,111",0.0000,,,,,,,,,,,,,,,,,,,,
|
29 |
PoT,gsm8k,2025/1/22,Internllm2_5-7B,38.21,48.9,8,,1319,"1,136,843",862,"188,106",143,"1,324,949",0.0000,,,,,,,,,,,,,,,,,,,,
|
30 |
PoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,18.5,31.01,8,,1319,"1,151,528",873,"175,994",133,"1,327,522",0.0000,,,,,,,,,,,,,,,,,,,,
|
31 |
+
PoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,9.62,16.91,8,,1319,"1,151,528",873,"237,607",180,"1,389,135",0.0000,,,,,,,,,,,,,,,,,,,,
|
32 |
CoT,gsm8k,2025/1/7,gpt-3.5-turbo,78.7,100,8,,1319,"953,242",723,"134,799",102,"1,088,041",0.6788,,,,,,,,,,,,,,,,,,,,
|
33 |
CoT,gsm8k,2025/1/7,Doubao-lite-32k,89.31,100,8,,1319,"1,042,095",790,"159,725",121,"1,201,820",0.0558,0.4084635 (元),,,,,,,,,,,,,,,,,,,
|
34 |
CoT,gsm8k,2025/1/22,gpt-4o,94.09,100,8,,1319,"948,668",719,"216,498",164,"1,165,166",4.5367,,,,,,,,,,,,,,,,,,,,
|
|
|
39 |
CoT,gsm8k,2025/1/22,Internllm2_5-7B,77.71,99.7,8,,1319,"968,163",734,"234,000",177,"1,202,163",0.0000,,,,,,,,,,,,,,,,,,,,
|
40 |
CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,55.5,100,8,,1319,"1,032,818",783,"185,707",141,"1,218,525",0.0000,,,,,,,,,,,,,,,,,,,,
|
41 |
CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,35.94,99.92,8,,1319,"1,032,818",783,"190,641",145,"1,223,459",0.0000,,,,,,,,,,,,,,,,,,,,
|
42 |
+
SC-CoT,gsm8k,2025/1/7,gpt-3.5-turbo,79.91,99.92,8,"temperature=1, path_num=5",1319,"2,740,652","2,078","1,348,960","1,023","4,089,612",3.3938,,,,,,,,,,,,,,,,,,,,
|
43 |
+
SC-CoT,gsm8k,2025/1/7,Doubao-lite-32k,87.26,99.92,8,"temperature=1, path_num=5",1319,"2,691,714","2,041","1,197,099",908,"3,888,813",0.2083,,,,,,,,,,,,,,,,,,,,
|
44 |
+
SC-CoT,gsm8k,2025/1/22,gpt-4o,90.3,99.92,8,"temperature=1, path_num=5",1319,"3,590,336","2,722","2,207,837","1,674","5,798,173",31.0542,,,,,,,,,,,,,,,,,,,,
|
45 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2.5-72B-Instruct,93.86,100,8,"temperature=1, path_num=5",1319,"8,136,223","6,168","2,481,785","1,882","10,618,008",5.9858,,,,,,,,,,,,,,,,,,,,
|
46 |
+
SC-CoT,gsm8k,2025/1/22,Llama-3.3-70B-Instruct,95.07,100,8,"temperature=1, path_num=5",1319,"8,413,717","6,379","2,585,077","1,960","10,998,794",6.2005,,,,,,,,,,,,,,,,,,,,
|
47 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2.5-7B-Instruct,91.13,100,8,"temperature=1, path_num=5",1319,"8,586,888","6,510","2,554,097","1,936","11,140,985",0.0000,,,,,,,,,,,,,,,,,,,,
|
48 |
+
SC-CoT,gsm8k,2025/1/22,Llama-3.1-8B-Instruct,73.46,99.55,8,"temperature=1, path_num=5",1319,"8,630,514","6,543","3,148,202","2,387","11,778,716",0.0000,,,,,,,,,,,,,,,,,,,,
|
49 |
+
SC-CoT,gsm8k,2025/1/22,Internllm2_5-7B,48.22,98.41,8,"temperature=1, path_num=5",1319,"10,678,792","8,096","3,847,639","2,917","14,526,431",0.0000,,,,,,,,,,,,,,,,,,,,
|
50 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2-1.5B-Instruct,11.75,91.89,8,"temperature=1, path_num=5",1319,"9,066,115","6,873","3,345,827","2,537","12,411,942",0.0000,,,,,,,,,,,,,,,,,,,,
|
51 |
+
SC-CoT,gsm8k,2025/1/22,Qwen2-0.5B-Instruct,1.67,94.69,8,"temperature=1, path_num=5",1319,"11,019,864","8,355","5,445,856","4,129","16,465,720",0.0000,,,,,,,,,,,,,,,,,,,,
|
52 |
IO,AQuA,2025/1/7,gpt-3.5-turbo,38.98,100,0,,254,"25,701",101,"16,770",66,"42,471",0.0380,,,,,,,,,,,,,,,,,,,,
|
53 |
IO,AQuA,2025/1/7,Doubao-lite-32k,79.13,100,0,,254,"33,058",130,"54,684",215,"87,742",0.0058,0.0427(元),,,,,,,,,,,,,,,,,,,
|
54 |
IO,AQuA,2025/1/22,gpt-4o,75.59,97.24,0,,254,"25,631",101,"108,121",426,"133,752",1.1453,,,,,,,,,,,,,,,,,,,,
|
|
|
59 |
IO,AQuA,2025/1/22,Internllm2_5-7B,47.64,90.94,0,,254,"50,232",198,"134,809",531,"185,041",0.0000,,,,,,,,,,,,,,,,,,,,
|
60 |
IO,AQuA,2025/1/22,Qwen2-1.5B-Instruct,29.13,97.64,0,,254,"27,937",110,"43,110",170,"71,047",0.0000,,,,,,,,,,,,,,,,,,,,
|
61 |
IO,AQuA,2025/1/22,Qwen2-0.5B-Instruct,27.17,98.82,0,,254,"27,937",110,"82,478",325,"110,415",0.0000,,,,,,,,,,,,,,,,,,,,
|
62 |
+
CoT,AQuA,2025/1/7,gpt-3.5-turbo,61.02,93.7,0,,254,"25,447",100,"55,346",218,"80,793",0.0957,,,,,,,,,,,,,,,,,,,,
|
63 |
CoT,AQuA,2025/1/7,Doubao-lite-32k,82.68,97.24,0,,254,"27,978",110,"66,599",262,"94,577",0.0066,0.0483 (元),,,,,,,,,,,,,,,,,,,
|
64 |
CoT,AQuA,2025/1/22,gpt-4o,82.68,98.03,0,,254,"25,123",99,"97,894",385,"123,017",1.0417,,,,,,,,,,,,,,,,,,,,
|
65 |
CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,86.22,99.21,0,,254,"25,143",99,"118,146",465,"143,289",0.0808,,,,,,,,,,,,,,,,,,,,
|
|
|
79 |
PoT,AQuA,2025/1/22,Internllm2_5-7B,36.61,98.82,0,,254,"233,505",919,"68,457",270,"301,962",0.0000,,,,,,,,,,,,,,,,,,,,
|
80 |
PoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,30.71,96.46,0,,254,"246,560",971,"51,915",204,"298,475",0.0000,,,,,,,,,,,,,,,,,,,,
|
81 |
PoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,17.32,92.13,0,,254,"258,867","1,019","63,414",250,"322,281",0.0000,,,,,,,,,,,,,,,,,,,,
|
82 |
+
SC-CoT,AQuA,2025/1/7,gpt-3.5-turbo,66.14,99.21,0,"temperature=1, path_num=5",254,"482,192","1,898","365,143","1,438","847,335",0.7888,,,,,,,,,,,,,,,,,,,,
|
83 |
+
SC-CoT,AQuA,2025/1/7,Doubao-lite-32k,81.1,97.24,0,"temperature=1, path_num=5",254,"503,751","1,983","382,235","1,505","885,986",0.0519,,,,,,,,,,,,,,,,,,,,
|
84 |
+
SC-CoT,AQuA,2025/1/22,gpt-4o,86.61,98.82,0,"temperature=1, path_num=5",254,"744,478","2,931","628,728","2,475","1,373,206",8.1485,,,,,,,,,,,,,,,,,,,,
|
85 |
+
SC-CoT,AQuA,2025/1/22,Qwen2.5-72B-Instruct,85.04,99.21,0,"temperature=1, path_num=5",254,"1,051,218","4,139","784,451","3,088","1,835,669",1.0348,,,,,,,,,,,,,,,,,,,,
|
86 |
+
SC-CoT,AQuA,2025/1/22,Llama-3.3-70B-Instruct,82.28,99.21,0,"temperature=1, path_num=5",254,"1,135,251","4,469","772,673","3,042","1,907,924",1.0756,,,,,,,,,,,,,,,,,,,,
|
87 |
+
SC-CoT,AQuA,2025/1/22,Qwen2.5-7B-Instruct,79.92,100,0,"temperature=1, path_num=5",254,"1,098,280","4,324","747,052","2,941","1,845,332",0.0000,,,,,,,,,,,,,,,,,,,,
|
88 |
+
SC-CoT,AQuA,2025/1/22,Llama-3.1-8B-Instruct,59.45,97.24,0,"temperature=1, path_num=5",254,"971,003","3,823","680,330","2,678","1,651,333",0.0000,,,,,,,,,,,,,,,,,,,,
|
89 |
+
SC-CoT,AQuA,2025/1/22,Internllm2_5-7B,39.37,98.03,0,"temperature=1, path_num=5",254,"1,420,494","5,592","875,728","3,448","2,296,222",0.0000,,,,,,,,,,,,,,,,,,,,
|
90 |
+
SC-CoT,AQuA,2025/1/22,Qwen2-1.5B-Instruct,23.62,96.46,0,"temperature=1, path_num=5",254,"1,034,362","4,072","740,973","2,917","1,775,335",0.0000,,,,,,,,,,,,,,,,,,,,
|
91 |
+
SC-CoT,AQuA,2025/1/22,Qwen2-0.5B-Instruct,22.83,97.24,0,"temperature=1, path_num=5",254,"1,246,929","4,909","968,162","3,812","2,215,091",0.0000,,,,,,,,,,,,,,,,,,,,
|
92 |
ReAct-Pro*,AQuA,2025/1/7,gpt-3.5-turbo,64.57,98.03,0,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
93 |
ReAct-Pro*,AQuA,2025/1/7,Doubao-lite-32k,77.56,96.06,0,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
94 |
ReAct-Pro*,AQuA,2025/1/22,gpt-4o,57.48,97.24,0,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
|
|
98 |
ReAct-Pro*,AQuA,2025/1/22,Llama-3.1-8B-Instruct,55.51,96.85,0,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,,,,,,,,,,,,,,,,,,,,
|
99 |
ReAct-Pro*,AQuA,2025/1/22,Internllm2_5-7B,40.94,96.85,0,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,,,,,,,,,,,,,,,,,,,,
|
100 |
ReAct-Pro*,AQuA,2025/1/22,Qwen2-1.5B-Instruct,25.59,96.06,0,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,,,,,,,,,,,,,,,,,,,,
|
101 |
+
ReAct-Pro*,AQuA,2025/1/22,Qwen2-0.5B-Instruct,24.02,96.85,0,max_steps=10,254,"6,344,167","24,977","825,920","3,252","7,170,087",0.0000,,,,,,,,,,,,,,,,,,,,
|
102 |
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
103 |
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
104 |
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|