Femboyuwu2000 commited on
Commit
439e609
1 Parent(s): 9e769aa

Training in progress, step 5280, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35a69ca220b03dc6504efc110f15929f9ae57c96d280eb843d1af6a4264874a6
3
  size 13982248
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f5bfad4f42d19e9da696cbdb43e4409d0f807b2b877c6a089cfd0a74bf2771f
3
  size 13982248
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df293182e1e9c9d15f7f8f232a05259e88feaf1f67100eb23fec82413de6cd98
3
  size 7062522
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc987e09473dd7b3d086e9ed9bc1748d7d1b3108088d88eddfdaf6972cbf4189
3
  size 7062522
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d0bcdc2595dd39333bdd13b49d9cd082e2eb8110dac33ed099c7f11efef73d4
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1dbdd57232626eddcfa10c081f5c18623de1c4663a61f927134c8ad55d47712
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87ec9f1ba3ae289c7a0ef682226dbd37810689a9b5a9a4bec38e00e5e276301e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b342905fe7ee06e2011340421596fa9b2d4facdf3e6fe1f5ce5617922a76da7c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.3216,
5
  "eval_steps": 500,
6
- "global_step": 4020,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1414,6 +1414,447 @@
1414
  "learning_rate": 2.775776814817928e-05,
1415
  "loss": 3.4266,
1416
  "step": 4020
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1417
  }
1418
  ],
1419
  "logging_steps": 20,
@@ -1421,7 +1862,7 @@
1421
  "num_input_tokens_seen": 0,
1422
  "num_train_epochs": 2,
1423
  "save_steps": 20,
1424
- "total_flos": 9487404305154048.0,
1425
  "train_batch_size": 8,
1426
  "trial_name": null,
1427
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.4224,
5
  "eval_steps": 500,
6
+ "global_step": 5280,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1414
  "learning_rate": 2.775776814817928e-05,
1415
  "loss": 3.4266,
1416
  "step": 4020
1417
+ },
1418
+ {
1419
+ "epoch": 0.32,
1420
+ "grad_norm": 37.957637786865234,
1421
+ "learning_rate": 2.7732150118568016e-05,
1422
+ "loss": 3.6768,
1423
+ "step": 4040
1424
+ },
1425
+ {
1426
+ "epoch": 0.32,
1427
+ "grad_norm": 42.639320373535156,
1428
+ "learning_rate": 2.770639853472676e-05,
1429
+ "loss": 3.5102,
1430
+ "step": 4060
1431
+ },
1432
+ {
1433
+ "epoch": 0.33,
1434
+ "grad_norm": 29.558870315551758,
1435
+ "learning_rate": 2.768051366677744e-05,
1436
+ "loss": 3.5354,
1437
+ "step": 4080
1438
+ },
1439
+ {
1440
+ "epoch": 0.33,
1441
+ "grad_norm": 23.12784194946289,
1442
+ "learning_rate": 2.765449578624007e-05,
1443
+ "loss": 3.5432,
1444
+ "step": 4100
1445
+ },
1446
+ {
1447
+ "epoch": 0.33,
1448
+ "grad_norm": 27.601444244384766,
1449
+ "learning_rate": 2.7628345166029907e-05,
1450
+ "loss": 3.5672,
1451
+ "step": 4120
1452
+ },
1453
+ {
1454
+ "epoch": 0.33,
1455
+ "grad_norm": 26.26235008239746,
1456
+ "learning_rate": 2.760206208045458e-05,
1457
+ "loss": 3.5635,
1458
+ "step": 4140
1459
+ },
1460
+ {
1461
+ "epoch": 0.33,
1462
+ "grad_norm": 57.84916305541992,
1463
+ "learning_rate": 2.7575646805211224e-05,
1464
+ "loss": 3.5254,
1465
+ "step": 4160
1466
+ },
1467
+ {
1468
+ "epoch": 0.33,
1469
+ "grad_norm": 25.554025650024414,
1470
+ "learning_rate": 2.7549099617383573e-05,
1471
+ "loss": 3.5142,
1472
+ "step": 4180
1473
+ },
1474
+ {
1475
+ "epoch": 0.34,
1476
+ "grad_norm": 38.82815170288086,
1477
+ "learning_rate": 2.7522420795439067e-05,
1478
+ "loss": 3.6104,
1479
+ "step": 4200
1480
+ },
1481
+ {
1482
+ "epoch": 0.34,
1483
+ "grad_norm": 28.093948364257812,
1484
+ "learning_rate": 2.7495610619225925e-05,
1485
+ "loss": 3.5265,
1486
+ "step": 4220
1487
+ },
1488
+ {
1489
+ "epoch": 0.34,
1490
+ "grad_norm": 26.187891006469727,
1491
+ "learning_rate": 2.746866936997021e-05,
1492
+ "loss": 3.4307,
1493
+ "step": 4240
1494
+ },
1495
+ {
1496
+ "epoch": 0.34,
1497
+ "grad_norm": 35.642738342285156,
1498
+ "learning_rate": 2.7441597330272874e-05,
1499
+ "loss": 3.5501,
1500
+ "step": 4260
1501
+ },
1502
+ {
1503
+ "epoch": 0.34,
1504
+ "grad_norm": 32.99201965332031,
1505
+ "learning_rate": 2.7414394784106812e-05,
1506
+ "loss": 3.4463,
1507
+ "step": 4280
1508
+ },
1509
+ {
1510
+ "epoch": 0.34,
1511
+ "grad_norm": 28.848899841308594,
1512
+ "learning_rate": 2.7387062016813845e-05,
1513
+ "loss": 3.5128,
1514
+ "step": 4300
1515
+ },
1516
+ {
1517
+ "epoch": 0.35,
1518
+ "grad_norm": 30.502288818359375,
1519
+ "learning_rate": 2.7359599315101788e-05,
1520
+ "loss": 3.4909,
1521
+ "step": 4320
1522
+ },
1523
+ {
1524
+ "epoch": 0.35,
1525
+ "grad_norm": 27.91356658935547,
1526
+ "learning_rate": 2.7332006967041373e-05,
1527
+ "loss": 3.53,
1528
+ "step": 4340
1529
+ },
1530
+ {
1531
+ "epoch": 0.35,
1532
+ "grad_norm": 47.296627044677734,
1533
+ "learning_rate": 2.7304285262063274e-05,
1534
+ "loss": 3.4793,
1535
+ "step": 4360
1536
+ },
1537
+ {
1538
+ "epoch": 0.35,
1539
+ "grad_norm": 33.32771682739258,
1540
+ "learning_rate": 2.7276434490955074e-05,
1541
+ "loss": 3.4695,
1542
+ "step": 4380
1543
+ },
1544
+ {
1545
+ "epoch": 0.35,
1546
+ "grad_norm": 36.65375518798828,
1547
+ "learning_rate": 2.7248454945858164e-05,
1548
+ "loss": 3.4502,
1549
+ "step": 4400
1550
+ },
1551
+ {
1552
+ "epoch": 0.35,
1553
+ "grad_norm": 62.65798568725586,
1554
+ "learning_rate": 2.7220346920264743e-05,
1555
+ "loss": 3.5893,
1556
+ "step": 4420
1557
+ },
1558
+ {
1559
+ "epoch": 0.36,
1560
+ "grad_norm": 26.921863555908203,
1561
+ "learning_rate": 2.71921107090147e-05,
1562
+ "loss": 3.4381,
1563
+ "step": 4440
1564
+ },
1565
+ {
1566
+ "epoch": 0.36,
1567
+ "grad_norm": 35.91081619262695,
1568
+ "learning_rate": 2.7163746608292525e-05,
1569
+ "loss": 3.5292,
1570
+ "step": 4460
1571
+ },
1572
+ {
1573
+ "epoch": 0.36,
1574
+ "grad_norm": 42.172306060791016,
1575
+ "learning_rate": 2.7135254915624213e-05,
1576
+ "loss": 3.5314,
1577
+ "step": 4480
1578
+ },
1579
+ {
1580
+ "epoch": 0.36,
1581
+ "grad_norm": 65.17137908935547,
1582
+ "learning_rate": 2.710663592987414e-05,
1583
+ "loss": 3.518,
1584
+ "step": 4500
1585
+ },
1586
+ {
1587
+ "epoch": 0.36,
1588
+ "grad_norm": 32.53944396972656,
1589
+ "learning_rate": 2.7077889951241924e-05,
1590
+ "loss": 3.5562,
1591
+ "step": 4520
1592
+ },
1593
+ {
1594
+ "epoch": 0.36,
1595
+ "grad_norm": 25.663211822509766,
1596
+ "learning_rate": 2.704901728125928e-05,
1597
+ "loss": 3.5537,
1598
+ "step": 4540
1599
+ },
1600
+ {
1601
+ "epoch": 0.36,
1602
+ "grad_norm": 23.626951217651367,
1603
+ "learning_rate": 2.702001822278685e-05,
1604
+ "loss": 3.5525,
1605
+ "step": 4560
1606
+ },
1607
+ {
1608
+ "epoch": 0.37,
1609
+ "grad_norm": 30.527162551879883,
1610
+ "learning_rate": 2.699089308001104e-05,
1611
+ "loss": 3.4913,
1612
+ "step": 4580
1613
+ },
1614
+ {
1615
+ "epoch": 0.37,
1616
+ "grad_norm": 37.62814712524414,
1617
+ "learning_rate": 2.696164215844081e-05,
1618
+ "loss": 3.5342,
1619
+ "step": 4600
1620
+ },
1621
+ {
1622
+ "epoch": 0.37,
1623
+ "grad_norm": 26.47550392150879,
1624
+ "learning_rate": 2.6932265764904494e-05,
1625
+ "loss": 3.4708,
1626
+ "step": 4620
1627
+ },
1628
+ {
1629
+ "epoch": 0.37,
1630
+ "grad_norm": 30.779155731201172,
1631
+ "learning_rate": 2.6902764207546553e-05,
1632
+ "loss": 3.5078,
1633
+ "step": 4640
1634
+ },
1635
+ {
1636
+ "epoch": 0.37,
1637
+ "grad_norm": 34.16841506958008,
1638
+ "learning_rate": 2.6873137795824367e-05,
1639
+ "loss": 3.4754,
1640
+ "step": 4660
1641
+ },
1642
+ {
1643
+ "epoch": 0.37,
1644
+ "grad_norm": 36.18644714355469,
1645
+ "learning_rate": 2.6843386840504972e-05,
1646
+ "loss": 3.4413,
1647
+ "step": 4680
1648
+ },
1649
+ {
1650
+ "epoch": 0.38,
1651
+ "grad_norm": 34.17078399658203,
1652
+ "learning_rate": 2.6813511653661817e-05,
1653
+ "loss": 3.4916,
1654
+ "step": 4700
1655
+ },
1656
+ {
1657
+ "epoch": 0.38,
1658
+ "grad_norm": 24.693265914916992,
1659
+ "learning_rate": 2.678351254867147e-05,
1660
+ "loss": 3.4072,
1661
+ "step": 4720
1662
+ },
1663
+ {
1664
+ "epoch": 0.38,
1665
+ "grad_norm": 27.831270217895508,
1666
+ "learning_rate": 2.675338984021035e-05,
1667
+ "loss": 3.5353,
1668
+ "step": 4740
1669
+ },
1670
+ {
1671
+ "epoch": 0.38,
1672
+ "grad_norm": 18.52642059326172,
1673
+ "learning_rate": 2.672314384425142e-05,
1674
+ "loss": 3.4582,
1675
+ "step": 4760
1676
+ },
1677
+ {
1678
+ "epoch": 0.38,
1679
+ "grad_norm": 44.86159133911133,
1680
+ "learning_rate": 2.669277487806085e-05,
1681
+ "loss": 3.4384,
1682
+ "step": 4780
1683
+ },
1684
+ {
1685
+ "epoch": 0.38,
1686
+ "grad_norm": 28.123258590698242,
1687
+ "learning_rate": 2.6662283260194743e-05,
1688
+ "loss": 3.5766,
1689
+ "step": 4800
1690
+ },
1691
+ {
1692
+ "epoch": 0.39,
1693
+ "grad_norm": 27.150848388671875,
1694
+ "learning_rate": 2.6631669310495725e-05,
1695
+ "loss": 3.5095,
1696
+ "step": 4820
1697
+ },
1698
+ {
1699
+ "epoch": 0.39,
1700
+ "grad_norm": 43.018043518066406,
1701
+ "learning_rate": 2.660093335008966e-05,
1702
+ "loss": 3.4795,
1703
+ "step": 4840
1704
+ },
1705
+ {
1706
+ "epoch": 0.39,
1707
+ "grad_norm": 29.27479362487793,
1708
+ "learning_rate": 2.6570075701382213e-05,
1709
+ "loss": 3.5236,
1710
+ "step": 4860
1711
+ },
1712
+ {
1713
+ "epoch": 0.39,
1714
+ "grad_norm": 19.501262664794922,
1715
+ "learning_rate": 2.653909668805553e-05,
1716
+ "loss": 3.5479,
1717
+ "step": 4880
1718
+ },
1719
+ {
1720
+ "epoch": 0.39,
1721
+ "grad_norm": 57.28257369995117,
1722
+ "learning_rate": 2.6507996635064792e-05,
1723
+ "loss": 3.5156,
1724
+ "step": 4900
1725
+ },
1726
+ {
1727
+ "epoch": 0.39,
1728
+ "grad_norm": 27.764036178588867,
1729
+ "learning_rate": 2.647677586863484e-05,
1730
+ "loss": 3.5222,
1731
+ "step": 4920
1732
+ },
1733
+ {
1734
+ "epoch": 0.4,
1735
+ "grad_norm": 33.74861526489258,
1736
+ "learning_rate": 2.644543471625675e-05,
1737
+ "loss": 3.4773,
1738
+ "step": 4940
1739
+ },
1740
+ {
1741
+ "epoch": 0.4,
1742
+ "grad_norm": 25.404314041137695,
1743
+ "learning_rate": 2.6413973506684366e-05,
1744
+ "loss": 3.4646,
1745
+ "step": 4960
1746
+ },
1747
+ {
1748
+ "epoch": 0.4,
1749
+ "grad_norm": 33.307674407958984,
1750
+ "learning_rate": 2.63823925699309e-05,
1751
+ "loss": 3.4975,
1752
+ "step": 4980
1753
+ },
1754
+ {
1755
+ "epoch": 0.4,
1756
+ "grad_norm": 28.22442054748535,
1757
+ "learning_rate": 2.6350692237265428e-05,
1758
+ "loss": 3.4797,
1759
+ "step": 5000
1760
+ },
1761
+ {
1762
+ "epoch": 0.4,
1763
+ "grad_norm": 26.52558135986328,
1764
+ "learning_rate": 2.6318872841209446e-05,
1765
+ "loss": 3.4309,
1766
+ "step": 5020
1767
+ },
1768
+ {
1769
+ "epoch": 0.4,
1770
+ "grad_norm": 36.679386138916016,
1771
+ "learning_rate": 2.6286934715533353e-05,
1772
+ "loss": 3.585,
1773
+ "step": 5040
1774
+ },
1775
+ {
1776
+ "epoch": 0.4,
1777
+ "grad_norm": 42.78778839111328,
1778
+ "learning_rate": 2.6254878195252985e-05,
1779
+ "loss": 3.4239,
1780
+ "step": 5060
1781
+ },
1782
+ {
1783
+ "epoch": 0.41,
1784
+ "grad_norm": 34.719482421875,
1785
+ "learning_rate": 2.622270361662606e-05,
1786
+ "loss": 3.4777,
1787
+ "step": 5080
1788
+ },
1789
+ {
1790
+ "epoch": 0.41,
1791
+ "grad_norm": 33.207427978515625,
1792
+ "learning_rate": 2.619041131714869e-05,
1793
+ "loss": 3.5593,
1794
+ "step": 5100
1795
+ },
1796
+ {
1797
+ "epoch": 0.41,
1798
+ "grad_norm": 35.62514877319336,
1799
+ "learning_rate": 2.6158001635551818e-05,
1800
+ "loss": 3.5606,
1801
+ "step": 5120
1802
+ },
1803
+ {
1804
+ "epoch": 0.41,
1805
+ "grad_norm": 31.691574096679688,
1806
+ "learning_rate": 2.6125474911797664e-05,
1807
+ "loss": 3.4959,
1808
+ "step": 5140
1809
+ },
1810
+ {
1811
+ "epoch": 0.41,
1812
+ "grad_norm": 34.012420654296875,
1813
+ "learning_rate": 2.6092831487076163e-05,
1814
+ "loss": 3.57,
1815
+ "step": 5160
1816
+ },
1817
+ {
1818
+ "epoch": 0.41,
1819
+ "grad_norm": 36.75544357299805,
1820
+ "learning_rate": 2.6060071703801406e-05,
1821
+ "loss": 3.4718,
1822
+ "step": 5180
1823
+ },
1824
+ {
1825
+ "epoch": 0.42,
1826
+ "grad_norm": 37.18219757080078,
1827
+ "learning_rate": 2.6027195905608006e-05,
1828
+ "loss": 3.5332,
1829
+ "step": 5200
1830
+ },
1831
+ {
1832
+ "epoch": 0.42,
1833
+ "grad_norm": 32.344398498535156,
1834
+ "learning_rate": 2.599420443734754e-05,
1835
+ "loss": 3.5154,
1836
+ "step": 5220
1837
+ },
1838
+ {
1839
+ "epoch": 0.42,
1840
+ "grad_norm": 26.169748306274414,
1841
+ "learning_rate": 2.596109764508489e-05,
1842
+ "loss": 3.5462,
1843
+ "step": 5240
1844
+ },
1845
+ {
1846
+ "epoch": 0.42,
1847
+ "grad_norm": 33.38447570800781,
1848
+ "learning_rate": 2.592787587609465e-05,
1849
+ "loss": 3.5658,
1850
+ "step": 5260
1851
+ },
1852
+ {
1853
+ "epoch": 0.42,
1854
+ "grad_norm": 43.4962158203125,
1855
+ "learning_rate": 2.589453947885745e-05,
1856
+ "loss": 3.5018,
1857
+ "step": 5280
1858
  }
1859
  ],
1860
  "logging_steps": 20,
 
1862
  "num_input_tokens_seen": 0,
1863
  "num_train_epochs": 2,
1864
  "save_steps": 20,
1865
+ "total_flos": 1.2497937290428416e+16,
1866
  "train_batch_size": 8,
1867
  "trial_name": null,
1868
  "trial_params": null