CocoRoF commited on
Commit
3d200e5
·
verified ·
1 Parent(s): b25209a

Training in progress, step 110000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a3b4fea81f3900bfa69110b0134fccc266c3f6ab24b41591a22031c6dc0bf38
3
  size 962707376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc49b3d9b0679248cd15f65908e78a9fcfd11827d5389ba5cb0b2ebbdaf749cd
3
  size 962707376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abda76448b02ed8e0515fa61dee3b896a7eb5b7a285486480cdc210629933bb2
3
  size 61873722
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f366356ff3cc9f946b2c19095ca1bd83dff58b6c74b4b7f4158a23546ef43c24
3
  size 61873722
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1fc40fd4217bb1769994a01649f5a56abbdb75dddc3799ff60503d0cb999b01
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f85fa1b5bcd6943c9f9c56d121f3e4dacc1a19e6d2b661b5087ef101c441a108
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d537560203eb7afe3f6a8a44879b439cea0a94e1a4b8761fd98f99d53cacdea
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcdc426f94a24625315b78c0cd3d5550941c2561f271869bd8aa30687554b3ef
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c29e1df3583637c5354ea07c22896e12d461364bcdcfbc1cbd8878bc3c491cdf
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e95c808f21d4e49616ae42a547aa089261876cc46d0c0199694ad4579fe5b25
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e34e43ccaf5787f5a424c1b44fbe292f707243ea8dc15124de1140db9b0f5d4
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d8d792fb2135a74aa62ac4384051c5b425a53ddaef874b5e263803483382478
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5e9475c739c6dcbf77fbb3156414a1f7621b69f01b559ea426b746f7680f932
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:411cd335eac4c1185a1ba16ecc82a91f7993f0100f715329705d14d99f0dda72
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.362331748008728,
3
- "best_model_checkpoint": "/workspace/plateer_classifier_v0.1_result/checkpoint-55000",
4
- "epoch": 0.32206354899391737,
5
  "eval_steps": 55000,
6
- "global_step": 55000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1592,6 +1592,1555 @@
1592
  "eval_samples_per_second": 210.262,
1593
  "eval_steps_per_second": 6.571,
1594
  "step": 55000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1595
  }
1596
  ],
1597
  "logging_steps": 250,
@@ -1611,7 +3160,7 @@
1611
  "attributes": {}
1612
  }
1613
  },
1614
- "total_flos": 1.4252945436741468e+19,
1615
  "train_batch_size": 8,
1616
  "trial_name": null,
1617
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.3242824375629425,
3
+ "best_model_checkpoint": "/workspace/plateer_classifier_v0.1_result/checkpoint-110000",
4
+ "epoch": 0.6441270979878347,
5
  "eval_steps": 55000,
6
+ "global_step": 110000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1592
  "eval_samples_per_second": 210.262,
1593
  "eval_steps_per_second": 6.571,
1594
  "step": 55000
1595
+ },
1596
+ {
1597
+ "epoch": 0.32352747421661704,
1598
+ "grad_norm": 7.128218650817871,
1599
+ "learning_rate": 0.00014375796931076737,
1600
+ "loss": 0.357,
1601
+ "step": 55250
1602
+ },
1603
+ {
1604
+ "epoch": 0.32499139943931665,
1605
+ "grad_norm": 4.943136692047119,
1606
+ "learning_rate": 0.00014344697181740715,
1607
+ "loss": 0.3576,
1608
+ "step": 55500
1609
+ },
1610
+ {
1611
+ "epoch": 0.32645532466201627,
1612
+ "grad_norm": 7.633016109466553,
1613
+ "learning_rate": 0.00014313597432404694,
1614
+ "loss": 0.3655,
1615
+ "step": 55750
1616
+ },
1617
+ {
1618
+ "epoch": 0.3279192498847159,
1619
+ "grad_norm": 9.49149227142334,
1620
+ "learning_rate": 0.00014282497683068673,
1621
+ "loss": 0.3687,
1622
+ "step": 56000
1623
+ },
1624
+ {
1625
+ "epoch": 0.3293831751074155,
1626
+ "grad_norm": 7.4215521812438965,
1627
+ "learning_rate": 0.00014251397933732655,
1628
+ "loss": 0.3705,
1629
+ "step": 56250
1630
+ },
1631
+ {
1632
+ "epoch": 0.33084710033011516,
1633
+ "grad_norm": 5.638499736785889,
1634
+ "learning_rate": 0.00014220298184396634,
1635
+ "loss": 0.3709,
1636
+ "step": 56500
1637
+ },
1638
+ {
1639
+ "epoch": 0.3323110255528148,
1640
+ "grad_norm": 9.440450668334961,
1641
+ "learning_rate": 0.00014189198435060613,
1642
+ "loss": 0.35,
1643
+ "step": 56750
1644
+ },
1645
+ {
1646
+ "epoch": 0.3337749507755144,
1647
+ "grad_norm": 7.706991195678711,
1648
+ "learning_rate": 0.00014158098685724594,
1649
+ "loss": 0.3601,
1650
+ "step": 57000
1651
+ },
1652
+ {
1653
+ "epoch": 0.335238875998214,
1654
+ "grad_norm": 8.154605865478516,
1655
+ "learning_rate": 0.00014126998936388573,
1656
+ "loss": 0.3625,
1657
+ "step": 57250
1658
+ },
1659
+ {
1660
+ "epoch": 0.3367028012209136,
1661
+ "grad_norm": 7.608438491821289,
1662
+ "learning_rate": 0.00014095899187052552,
1663
+ "loss": 0.3588,
1664
+ "step": 57500
1665
+ },
1666
+ {
1667
+ "epoch": 0.3381667264436133,
1668
+ "grad_norm": 5.466573715209961,
1669
+ "learning_rate": 0.00014064799437716534,
1670
+ "loss": 0.3528,
1671
+ "step": 57750
1672
+ },
1673
+ {
1674
+ "epoch": 0.3396306516663129,
1675
+ "grad_norm": 7.514803409576416,
1676
+ "learning_rate": 0.00014033699688380512,
1677
+ "loss": 0.3624,
1678
+ "step": 58000
1679
+ },
1680
+ {
1681
+ "epoch": 0.3410945768890125,
1682
+ "grad_norm": 4.846391677856445,
1683
+ "learning_rate": 0.00014002599939044491,
1684
+ "loss": 0.3525,
1685
+ "step": 58250
1686
+ },
1687
+ {
1688
+ "epoch": 0.3425585021117121,
1689
+ "grad_norm": 6.116271018981934,
1690
+ "learning_rate": 0.0001397150018970847,
1691
+ "loss": 0.3556,
1692
+ "step": 58500
1693
+ },
1694
+ {
1695
+ "epoch": 0.34402242733441174,
1696
+ "grad_norm": 7.234938621520996,
1697
+ "learning_rate": 0.00013940400440372452,
1698
+ "loss": 0.3723,
1699
+ "step": 58750
1700
+ },
1701
+ {
1702
+ "epoch": 0.3454863525571114,
1703
+ "grad_norm": 8.690266609191895,
1704
+ "learning_rate": 0.0001390930069103643,
1705
+ "loss": 0.3671,
1706
+ "step": 59000
1707
+ },
1708
+ {
1709
+ "epoch": 0.346950277779811,
1710
+ "grad_norm": 5.558066368103027,
1711
+ "learning_rate": 0.0001387820094170041,
1712
+ "loss": 0.3563,
1713
+ "step": 59250
1714
+ },
1715
+ {
1716
+ "epoch": 0.34841420300251064,
1717
+ "grad_norm": 5.277857303619385,
1718
+ "learning_rate": 0.0001384710119236439,
1719
+ "loss": 0.3633,
1720
+ "step": 59500
1721
+ },
1722
+ {
1723
+ "epoch": 0.34987812822521025,
1724
+ "grad_norm": 4.810859680175781,
1725
+ "learning_rate": 0.00013816125842025712,
1726
+ "loss": 0.3615,
1727
+ "step": 59750
1728
+ },
1729
+ {
1730
+ "epoch": 0.35134205344790986,
1731
+ "grad_norm": 6.860721111297607,
1732
+ "learning_rate": 0.00013785026092689694,
1733
+ "loss": 0.3561,
1734
+ "step": 60000
1735
+ },
1736
+ {
1737
+ "epoch": 0.35280597867060953,
1738
+ "grad_norm": 6.673612117767334,
1739
+ "learning_rate": 0.00013753926343353673,
1740
+ "loss": 0.3513,
1741
+ "step": 60250
1742
+ },
1743
+ {
1744
+ "epoch": 0.35426990389330915,
1745
+ "grad_norm": 6.9296956062316895,
1746
+ "learning_rate": 0.00013722826594017652,
1747
+ "loss": 0.3563,
1748
+ "step": 60500
1749
+ },
1750
+ {
1751
+ "epoch": 0.35573382911600876,
1752
+ "grad_norm": 6.235531806945801,
1753
+ "learning_rate": 0.0001369172684468163,
1754
+ "loss": 0.3586,
1755
+ "step": 60750
1756
+ },
1757
+ {
1758
+ "epoch": 0.3571977543387084,
1759
+ "grad_norm": 6.549998760223389,
1760
+ "learning_rate": 0.00013660627095345612,
1761
+ "loss": 0.3572,
1762
+ "step": 61000
1763
+ },
1764
+ {
1765
+ "epoch": 0.358661679561408,
1766
+ "grad_norm": 6.800797939300537,
1767
+ "learning_rate": 0.0001362952734600959,
1768
+ "loss": 0.3687,
1769
+ "step": 61250
1770
+ },
1771
+ {
1772
+ "epoch": 0.3601256047841076,
1773
+ "grad_norm": 5.545276641845703,
1774
+ "learning_rate": 0.0001359842759667357,
1775
+ "loss": 0.3539,
1776
+ "step": 61500
1777
+ },
1778
+ {
1779
+ "epoch": 0.36158953000680727,
1780
+ "grad_norm": 8.63070011138916,
1781
+ "learning_rate": 0.00013567327847337552,
1782
+ "loss": 0.3605,
1783
+ "step": 61750
1784
+ },
1785
+ {
1786
+ "epoch": 0.3630534552295069,
1787
+ "grad_norm": 5.199543476104736,
1788
+ "learning_rate": 0.0001353622809800153,
1789
+ "loss": 0.3559,
1790
+ "step": 62000
1791
+ },
1792
+ {
1793
+ "epoch": 0.3645173804522065,
1794
+ "grad_norm": 27.297420501708984,
1795
+ "learning_rate": 0.0001350512834866551,
1796
+ "loss": 0.3676,
1797
+ "step": 62250
1798
+ },
1799
+ {
1800
+ "epoch": 0.3659813056749061,
1801
+ "grad_norm": 8.235854148864746,
1802
+ "learning_rate": 0.00013474152998326833,
1803
+ "loss": 0.3583,
1804
+ "step": 62500
1805
+ },
1806
+ {
1807
+ "epoch": 0.3674452308976057,
1808
+ "grad_norm": 6.224372386932373,
1809
+ "learning_rate": 0.00013443053248990812,
1810
+ "loss": 0.3623,
1811
+ "step": 62750
1812
+ },
1813
+ {
1814
+ "epoch": 0.3689091561203054,
1815
+ "grad_norm": 8.013957977294922,
1816
+ "learning_rate": 0.0001341195349965479,
1817
+ "loss": 0.3619,
1818
+ "step": 63000
1819
+ },
1820
+ {
1821
+ "epoch": 0.370373081343005,
1822
+ "grad_norm": 6.442314147949219,
1823
+ "learning_rate": 0.00013380853750318773,
1824
+ "loss": 0.3586,
1825
+ "step": 63250
1826
+ },
1827
+ {
1828
+ "epoch": 0.3718370065657046,
1829
+ "grad_norm": 6.883063793182373,
1830
+ "learning_rate": 0.00013349754000982752,
1831
+ "loss": 0.3635,
1832
+ "step": 63500
1833
+ },
1834
+ {
1835
+ "epoch": 0.37330093178840423,
1836
+ "grad_norm": 5.502562999725342,
1837
+ "learning_rate": 0.0001331865425164673,
1838
+ "loss": 0.3525,
1839
+ "step": 63750
1840
+ },
1841
+ {
1842
+ "epoch": 0.37476485701110385,
1843
+ "grad_norm": 6.841543197631836,
1844
+ "learning_rate": 0.00013287554502310712,
1845
+ "loss": 0.3564,
1846
+ "step": 64000
1847
+ },
1848
+ {
1849
+ "epoch": 0.3762287822338035,
1850
+ "grad_norm": 6.850903034210205,
1851
+ "learning_rate": 0.0001325645475297469,
1852
+ "loss": 0.3549,
1853
+ "step": 64250
1854
+ },
1855
+ {
1856
+ "epoch": 0.37769270745650313,
1857
+ "grad_norm": 5.823826313018799,
1858
+ "learning_rate": 0.00013225479402636015,
1859
+ "loss": 0.3488,
1860
+ "step": 64500
1861
+ },
1862
+ {
1863
+ "epoch": 0.37915663267920274,
1864
+ "grad_norm": 9.849250793457031,
1865
+ "learning_rate": 0.00013194379653299997,
1866
+ "loss": 0.3526,
1867
+ "step": 64750
1868
+ },
1869
+ {
1870
+ "epoch": 0.38062055790190236,
1871
+ "grad_norm": 7.8498992919921875,
1872
+ "learning_rate": 0.00013163279903963975,
1873
+ "loss": 0.3596,
1874
+ "step": 65000
1875
+ },
1876
+ {
1877
+ "epoch": 0.38208448312460197,
1878
+ "grad_norm": 7.845436096191406,
1879
+ "learning_rate": 0.00013132180154627954,
1880
+ "loss": 0.3497,
1881
+ "step": 65250
1882
+ },
1883
+ {
1884
+ "epoch": 0.38354840834730164,
1885
+ "grad_norm": 10.533845901489258,
1886
+ "learning_rate": 0.00013101080405291933,
1887
+ "loss": 0.3523,
1888
+ "step": 65500
1889
+ },
1890
+ {
1891
+ "epoch": 0.38501233357000125,
1892
+ "grad_norm": 9.09399127960205,
1893
+ "learning_rate": 0.00013069980655955912,
1894
+ "loss": 0.347,
1895
+ "step": 65750
1896
+ },
1897
+ {
1898
+ "epoch": 0.38647625879270087,
1899
+ "grad_norm": 7.205333232879639,
1900
+ "learning_rate": 0.00013038880906619894,
1901
+ "loss": 0.355,
1902
+ "step": 66000
1903
+ },
1904
+ {
1905
+ "epoch": 0.3879401840154005,
1906
+ "grad_norm": 6.770249843597412,
1907
+ "learning_rate": 0.00013007781157283873,
1908
+ "loss": 0.3549,
1909
+ "step": 66250
1910
+ },
1911
+ {
1912
+ "epoch": 0.3894041092381001,
1913
+ "grad_norm": 8.14482593536377,
1914
+ "learning_rate": 0.00012976681407947851,
1915
+ "loss": 0.3537,
1916
+ "step": 66500
1917
+ },
1918
+ {
1919
+ "epoch": 0.39086803446079976,
1920
+ "grad_norm": 5.998184680938721,
1921
+ "learning_rate": 0.0001294558165861183,
1922
+ "loss": 0.3562,
1923
+ "step": 66750
1924
+ },
1925
+ {
1926
+ "epoch": 0.3923319596834994,
1927
+ "grad_norm": 5.583696365356445,
1928
+ "learning_rate": 0.00012914481909275812,
1929
+ "loss": 0.3499,
1930
+ "step": 67000
1931
+ },
1932
+ {
1933
+ "epoch": 0.393795884906199,
1934
+ "grad_norm": 6.899207592010498,
1935
+ "learning_rate": 0.0001288338215993979,
1936
+ "loss": 0.3506,
1937
+ "step": 67250
1938
+ },
1939
+ {
1940
+ "epoch": 0.3952598101288986,
1941
+ "grad_norm": 6.205395221710205,
1942
+ "learning_rate": 0.0001285228241060377,
1943
+ "loss": 0.3512,
1944
+ "step": 67500
1945
+ },
1946
+ {
1947
+ "epoch": 0.3967237353515982,
1948
+ "grad_norm": 9.125551223754883,
1949
+ "learning_rate": 0.0001282118266126775,
1950
+ "loss": 0.3585,
1951
+ "step": 67750
1952
+ },
1953
+ {
1954
+ "epoch": 0.3981876605742979,
1955
+ "grad_norm": 6.943772792816162,
1956
+ "learning_rate": 0.0001279008291193173,
1957
+ "loss": 0.362,
1958
+ "step": 68000
1959
+ },
1960
+ {
1961
+ "epoch": 0.3996515857969975,
1962
+ "grad_norm": 6.106304168701172,
1963
+ "learning_rate": 0.0001275898316259571,
1964
+ "loss": 0.3545,
1965
+ "step": 68250
1966
+ },
1967
+ {
1968
+ "epoch": 0.4011155110196971,
1969
+ "grad_norm": 6.197811126708984,
1970
+ "learning_rate": 0.00012728007812257036,
1971
+ "loss": 0.3524,
1972
+ "step": 68500
1973
+ },
1974
+ {
1975
+ "epoch": 0.4025794362423967,
1976
+ "grad_norm": 8.07652759552002,
1977
+ "learning_rate": 0.00012696908062921015,
1978
+ "loss": 0.3467,
1979
+ "step": 68750
1980
+ },
1981
+ {
1982
+ "epoch": 0.40404336146509634,
1983
+ "grad_norm": 7.444363117218018,
1984
+ "learning_rate": 0.00012665808313584994,
1985
+ "loss": 0.3541,
1986
+ "step": 69000
1987
+ },
1988
+ {
1989
+ "epoch": 0.405507286687796,
1990
+ "grad_norm": 6.2395782470703125,
1991
+ "learning_rate": 0.00012634708564248972,
1992
+ "loss": 0.3488,
1993
+ "step": 69250
1994
+ },
1995
+ {
1996
+ "epoch": 0.4069712119104956,
1997
+ "grad_norm": 7.489956378936768,
1998
+ "learning_rate": 0.00012603608814912954,
1999
+ "loss": 0.3595,
2000
+ "step": 69500
2001
+ },
2002
+ {
2003
+ "epoch": 0.40843513713319524,
2004
+ "grad_norm": 6.762283802032471,
2005
+ "learning_rate": 0.00012572509065576933,
2006
+ "loss": 0.3555,
2007
+ "step": 69750
2008
+ },
2009
+ {
2010
+ "epoch": 0.40989906235589485,
2011
+ "grad_norm": 10.423229217529297,
2012
+ "learning_rate": 0.00012541409316240912,
2013
+ "loss": 0.3474,
2014
+ "step": 70000
2015
+ },
2016
+ {
2017
+ "epoch": 0.41136298757859446,
2018
+ "grad_norm": 7.812709331512451,
2019
+ "learning_rate": 0.0001251030956690489,
2020
+ "loss": 0.3588,
2021
+ "step": 70250
2022
+ },
2023
+ {
2024
+ "epoch": 0.41282691280129413,
2025
+ "grad_norm": 8.506246566772461,
2026
+ "learning_rate": 0.00012479334216566215,
2027
+ "loss": 0.3473,
2028
+ "step": 70500
2029
+ },
2030
+ {
2031
+ "epoch": 0.41429083802399375,
2032
+ "grad_norm": 6.0005784034729,
2033
+ "learning_rate": 0.00012448234467230196,
2034
+ "loss": 0.3423,
2035
+ "step": 70750
2036
+ },
2037
+ {
2038
+ "epoch": 0.41575476324669336,
2039
+ "grad_norm": 7.6112494468688965,
2040
+ "learning_rate": 0.00012417134717894175,
2041
+ "loss": 0.3469,
2042
+ "step": 71000
2043
+ },
2044
+ {
2045
+ "epoch": 0.417218688469393,
2046
+ "grad_norm": 6.460068225860596,
2047
+ "learning_rate": 0.00012386034968558154,
2048
+ "loss": 0.3514,
2049
+ "step": 71250
2050
+ },
2051
+ {
2052
+ "epoch": 0.4186826136920926,
2053
+ "grad_norm": 25.509037017822266,
2054
+ "learning_rate": 0.00012354935219222136,
2055
+ "loss": 0.3538,
2056
+ "step": 71500
2057
+ },
2058
+ {
2059
+ "epoch": 0.42014653891479226,
2060
+ "grad_norm": 5.778562068939209,
2061
+ "learning_rate": 0.00012323835469886114,
2062
+ "loss": 0.3409,
2063
+ "step": 71750
2064
+ },
2065
+ {
2066
+ "epoch": 0.42161046413749187,
2067
+ "grad_norm": 10.19543170928955,
2068
+ "learning_rate": 0.00012292735720550093,
2069
+ "loss": 0.3487,
2070
+ "step": 72000
2071
+ },
2072
+ {
2073
+ "epoch": 0.4230743893601915,
2074
+ "grad_norm": 7.6341633796691895,
2075
+ "learning_rate": 0.00012261635971214072,
2076
+ "loss": 0.3477,
2077
+ "step": 72250
2078
+ },
2079
+ {
2080
+ "epoch": 0.4245383145828911,
2081
+ "grad_norm": 5.656210422515869,
2082
+ "learning_rate": 0.00012230536221878054,
2083
+ "loss": 0.353,
2084
+ "step": 72500
2085
+ },
2086
+ {
2087
+ "epoch": 0.4260022398055907,
2088
+ "grad_norm": 7.81094217300415,
2089
+ "learning_rate": 0.00012199436472542031,
2090
+ "loss": 0.3589,
2091
+ "step": 72750
2092
+ },
2093
+ {
2094
+ "epoch": 0.4274661650282904,
2095
+ "grad_norm": 5.924116611480713,
2096
+ "learning_rate": 0.0001216833672320601,
2097
+ "loss": 0.346,
2098
+ "step": 73000
2099
+ },
2100
+ {
2101
+ "epoch": 0.42893009025099,
2102
+ "grad_norm": 6.293444633483887,
2103
+ "learning_rate": 0.00012137236973869992,
2104
+ "loss": 0.3496,
2105
+ "step": 73250
2106
+ },
2107
+ {
2108
+ "epoch": 0.4303940154736896,
2109
+ "grad_norm": 9.766921997070312,
2110
+ "learning_rate": 0.00012106137224533971,
2111
+ "loss": 0.347,
2112
+ "step": 73500
2113
+ },
2114
+ {
2115
+ "epoch": 0.4318579406963892,
2116
+ "grad_norm": 5.998900890350342,
2117
+ "learning_rate": 0.0001207503747519795,
2118
+ "loss": 0.3465,
2119
+ "step": 73750
2120
+ },
2121
+ {
2122
+ "epoch": 0.43332186591908883,
2123
+ "grad_norm": 8.364704132080078,
2124
+ "learning_rate": 0.00012043937725861929,
2125
+ "loss": 0.3429,
2126
+ "step": 74000
2127
+ },
2128
+ {
2129
+ "epoch": 0.4347857911417885,
2130
+ "grad_norm": 5.508989334106445,
2131
+ "learning_rate": 0.0001201283797652591,
2132
+ "loss": 0.355,
2133
+ "step": 74250
2134
+ },
2135
+ {
2136
+ "epoch": 0.4362497163644881,
2137
+ "grad_norm": 6.357595443725586,
2138
+ "learning_rate": 0.00011981738227189889,
2139
+ "loss": 0.3504,
2140
+ "step": 74500
2141
+ },
2142
+ {
2143
+ "epoch": 0.43771364158718773,
2144
+ "grad_norm": 8.691376686096191,
2145
+ "learning_rate": 0.00011950762876851213,
2146
+ "loss": 0.3471,
2147
+ "step": 74750
2148
+ },
2149
+ {
2150
+ "epoch": 0.43917756680988734,
2151
+ "grad_norm": 11.246256828308105,
2152
+ "learning_rate": 0.00011919663127515193,
2153
+ "loss": 0.3487,
2154
+ "step": 75000
2155
+ },
2156
+ {
2157
+ "epoch": 0.44064149203258696,
2158
+ "grad_norm": 6.3526811599731445,
2159
+ "learning_rate": 0.00011888563378179172,
2160
+ "loss": 0.3414,
2161
+ "step": 75250
2162
+ },
2163
+ {
2164
+ "epoch": 0.4421054172552866,
2165
+ "grad_norm": 9.6268310546875,
2166
+ "learning_rate": 0.00011857463628843152,
2167
+ "loss": 0.3457,
2168
+ "step": 75500
2169
+ },
2170
+ {
2171
+ "epoch": 0.44356934247798624,
2172
+ "grad_norm": 8.093045234680176,
2173
+ "learning_rate": 0.00011826363879507131,
2174
+ "loss": 0.3515,
2175
+ "step": 75750
2176
+ },
2177
+ {
2178
+ "epoch": 0.44503326770068585,
2179
+ "grad_norm": 7.497385025024414,
2180
+ "learning_rate": 0.00011795264130171111,
2181
+ "loss": 0.3361,
2182
+ "step": 76000
2183
+ },
2184
+ {
2185
+ "epoch": 0.44649719292338547,
2186
+ "grad_norm": 8.374622344970703,
2187
+ "learning_rate": 0.00011764164380835092,
2188
+ "loss": 0.3552,
2189
+ "step": 76250
2190
+ },
2191
+ {
2192
+ "epoch": 0.4479611181460851,
2193
+ "grad_norm": 8.583603858947754,
2194
+ "learning_rate": 0.0001173306463149907,
2195
+ "loss": 0.3395,
2196
+ "step": 76500
2197
+ },
2198
+ {
2199
+ "epoch": 0.44942504336878475,
2200
+ "grad_norm": 5.933279991149902,
2201
+ "learning_rate": 0.0001170196488216305,
2202
+ "loss": 0.3539,
2203
+ "step": 76750
2204
+ },
2205
+ {
2206
+ "epoch": 0.45088896859148436,
2207
+ "grad_norm": 7.1400556564331055,
2208
+ "learning_rate": 0.00011670989531824375,
2209
+ "loss": 0.3556,
2210
+ "step": 77000
2211
+ },
2212
+ {
2213
+ "epoch": 0.452352893814184,
2214
+ "grad_norm": 6.4177374839782715,
2215
+ "learning_rate": 0.00011639889782488354,
2216
+ "loss": 0.34,
2217
+ "step": 77250
2218
+ },
2219
+ {
2220
+ "epoch": 0.4538168190368836,
2221
+ "grad_norm": 8.248872756958008,
2222
+ "learning_rate": 0.00011608790033152333,
2223
+ "loss": 0.3454,
2224
+ "step": 77500
2225
+ },
2226
+ {
2227
+ "epoch": 0.4552807442595832,
2228
+ "grad_norm": 6.789691925048828,
2229
+ "learning_rate": 0.00011577690283816314,
2230
+ "loss": 0.3506,
2231
+ "step": 77750
2232
+ },
2233
+ {
2234
+ "epoch": 0.4567446694822829,
2235
+ "grad_norm": 7.519604206085205,
2236
+ "learning_rate": 0.00011546590534480293,
2237
+ "loss": 0.3438,
2238
+ "step": 78000
2239
+ },
2240
+ {
2241
+ "epoch": 0.4582085947049825,
2242
+ "grad_norm": 11.287620544433594,
2243
+ "learning_rate": 0.00011515490785144272,
2244
+ "loss": 0.3536,
2245
+ "step": 78250
2246
+ },
2247
+ {
2248
+ "epoch": 0.4596725199276821,
2249
+ "grad_norm": 5.6864914894104,
2250
+ "learning_rate": 0.00011484391035808254,
2251
+ "loss": 0.348,
2252
+ "step": 78500
2253
+ },
2254
+ {
2255
+ "epoch": 0.4611364451503817,
2256
+ "grad_norm": 7.405890941619873,
2257
+ "learning_rate": 0.00011453291286472232,
2258
+ "loss": 0.3395,
2259
+ "step": 78750
2260
+ },
2261
+ {
2262
+ "epoch": 0.4626003703730813,
2263
+ "grad_norm": 5.379487991333008,
2264
+ "learning_rate": 0.00011422315936133556,
2265
+ "loss": 0.3463,
2266
+ "step": 79000
2267
+ },
2268
+ {
2269
+ "epoch": 0.46406429559578094,
2270
+ "grad_norm": 7.769617080688477,
2271
+ "learning_rate": 0.00011391216186797535,
2272
+ "loss": 0.3458,
2273
+ "step": 79250
2274
+ },
2275
+ {
2276
+ "epoch": 0.4655282208184806,
2277
+ "grad_norm": 9.26171875,
2278
+ "learning_rate": 0.00011360116437461514,
2279
+ "loss": 0.3394,
2280
+ "step": 79500
2281
+ },
2282
+ {
2283
+ "epoch": 0.4669921460411802,
2284
+ "grad_norm": 9.037941932678223,
2285
+ "learning_rate": 0.00011329016688125493,
2286
+ "loss": 0.349,
2287
+ "step": 79750
2288
+ },
2289
+ {
2290
+ "epoch": 0.46845607126387984,
2291
+ "grad_norm": 8.776792526245117,
2292
+ "learning_rate": 0.00011297916938789475,
2293
+ "loss": 0.3384,
2294
+ "step": 80000
2295
+ },
2296
+ {
2297
+ "epoch": 0.46991999648657945,
2298
+ "grad_norm": 6.737313270568848,
2299
+ "learning_rate": 0.00011266817189453454,
2300
+ "loss": 0.3472,
2301
+ "step": 80250
2302
+ },
2303
+ {
2304
+ "epoch": 0.47138392170927906,
2305
+ "grad_norm": 7.2374114990234375,
2306
+ "learning_rate": 0.00011235717440117432,
2307
+ "loss": 0.3434,
2308
+ "step": 80500
2309
+ },
2310
+ {
2311
+ "epoch": 0.47284784693197873,
2312
+ "grad_norm": 6.939677715301514,
2313
+ "learning_rate": 0.00011204617690781414,
2314
+ "loss": 0.3451,
2315
+ "step": 80750
2316
+ },
2317
+ {
2318
+ "epoch": 0.47431177215467835,
2319
+ "grad_norm": 4.702803611755371,
2320
+ "learning_rate": 0.00011173517941445393,
2321
+ "loss": 0.3508,
2322
+ "step": 81000
2323
+ },
2324
+ {
2325
+ "epoch": 0.47577569737737796,
2326
+ "grad_norm": 7.359582901000977,
2327
+ "learning_rate": 0.00011142418192109372,
2328
+ "loss": 0.3415,
2329
+ "step": 81250
2330
+ },
2331
+ {
2332
+ "epoch": 0.4772396226000776,
2333
+ "grad_norm": 8.404651641845703,
2334
+ "learning_rate": 0.00011111442841770696,
2335
+ "loss": 0.3438,
2336
+ "step": 81500
2337
+ },
2338
+ {
2339
+ "epoch": 0.4787035478227772,
2340
+ "grad_norm": 6.176925182342529,
2341
+ "learning_rate": 0.00011080343092434675,
2342
+ "loss": 0.3484,
2343
+ "step": 81750
2344
+ },
2345
+ {
2346
+ "epoch": 0.48016747304547686,
2347
+ "grad_norm": 8.614276885986328,
2348
+ "learning_rate": 0.00011049243343098655,
2349
+ "loss": 0.3525,
2350
+ "step": 82000
2351
+ },
2352
+ {
2353
+ "epoch": 0.48163139826817647,
2354
+ "grad_norm": 5.756929874420166,
2355
+ "learning_rate": 0.00011018143593762635,
2356
+ "loss": 0.3432,
2357
+ "step": 82250
2358
+ },
2359
+ {
2360
+ "epoch": 0.4830953234908761,
2361
+ "grad_norm": 7.686267852783203,
2362
+ "learning_rate": 0.00010987043844426614,
2363
+ "loss": 0.3508,
2364
+ "step": 82500
2365
+ },
2366
+ {
2367
+ "epoch": 0.4845592487135757,
2368
+ "grad_norm": 6.590146541595459,
2369
+ "learning_rate": 0.00010955944095090593,
2370
+ "loss": 0.3357,
2371
+ "step": 82750
2372
+ },
2373
+ {
2374
+ "epoch": 0.4860231739362753,
2375
+ "grad_norm": 7.363981246948242,
2376
+ "learning_rate": 0.00010924968744751918,
2377
+ "loss": 0.3469,
2378
+ "step": 83000
2379
+ },
2380
+ {
2381
+ "epoch": 0.487487099158975,
2382
+ "grad_norm": 5.942411422729492,
2383
+ "learning_rate": 0.00010893868995415897,
2384
+ "loss": 0.3464,
2385
+ "step": 83250
2386
+ },
2387
+ {
2388
+ "epoch": 0.4889510243816746,
2389
+ "grad_norm": 8.531744003295898,
2390
+ "learning_rate": 0.00010862769246079879,
2391
+ "loss": 0.3349,
2392
+ "step": 83500
2393
+ },
2394
+ {
2395
+ "epoch": 0.4904149496043742,
2396
+ "grad_norm": 20.821125030517578,
2397
+ "learning_rate": 0.00010831669496743858,
2398
+ "loss": 0.3434,
2399
+ "step": 83750
2400
+ },
2401
+ {
2402
+ "epoch": 0.4918788748270738,
2403
+ "grad_norm": 9.569067001342773,
2404
+ "learning_rate": 0.00010800569747407836,
2405
+ "loss": 0.3421,
2406
+ "step": 84000
2407
+ },
2408
+ {
2409
+ "epoch": 0.49334280004977343,
2410
+ "grad_norm": 7.6851725578308105,
2411
+ "learning_rate": 0.00010769469998071815,
2412
+ "loss": 0.3407,
2413
+ "step": 84250
2414
+ },
2415
+ {
2416
+ "epoch": 0.4948067252724731,
2417
+ "grad_norm": 9.591890335083008,
2418
+ "learning_rate": 0.00010738370248735797,
2419
+ "loss": 0.347,
2420
+ "step": 84500
2421
+ },
2422
+ {
2423
+ "epoch": 0.4962706504951727,
2424
+ "grad_norm": 5.16259765625,
2425
+ "learning_rate": 0.00010707270499399776,
2426
+ "loss": 0.3383,
2427
+ "step": 84750
2428
+ },
2429
+ {
2430
+ "epoch": 0.49773457571787233,
2431
+ "grad_norm": 4.6993794441223145,
2432
+ "learning_rate": 0.00010676170750063755,
2433
+ "loss": 0.3392,
2434
+ "step": 85000
2435
+ },
2436
+ {
2437
+ "epoch": 0.49919850094057194,
2438
+ "grad_norm": 6.331507682800293,
2439
+ "learning_rate": 0.00010645071000727735,
2440
+ "loss": 0.351,
2441
+ "step": 85250
2442
+ },
2443
+ {
2444
+ "epoch": 0.5006624261632716,
2445
+ "grad_norm": 7.329137325286865,
2446
+ "learning_rate": 0.00010613971251391714,
2447
+ "loss": 0.3486,
2448
+ "step": 85500
2449
+ },
2450
+ {
2451
+ "epoch": 0.5021263513859712,
2452
+ "grad_norm": 6.907947540283203,
2453
+ "learning_rate": 0.00010582871502055694,
2454
+ "loss": 0.3443,
2455
+ "step": 85750
2456
+ },
2457
+ {
2458
+ "epoch": 0.5035902766086708,
2459
+ "grad_norm": 4.780885696411133,
2460
+ "learning_rate": 0.00010551771752719674,
2461
+ "loss": 0.3401,
2462
+ "step": 86000
2463
+ },
2464
+ {
2465
+ "epoch": 0.5050542018313705,
2466
+ "grad_norm": 9.042526245117188,
2467
+ "learning_rate": 0.00010520672003383653,
2468
+ "loss": 0.3402,
2469
+ "step": 86250
2470
+ },
2471
+ {
2472
+ "epoch": 0.5065181270540701,
2473
+ "grad_norm": 5.397533416748047,
2474
+ "learning_rate": 0.00010489572254047632,
2475
+ "loss": 0.3392,
2476
+ "step": 86500
2477
+ },
2478
+ {
2479
+ "epoch": 0.5079820522767697,
2480
+ "grad_norm": 7.72251033782959,
2481
+ "learning_rate": 0.00010458472504711612,
2482
+ "loss": 0.3337,
2483
+ "step": 86750
2484
+ },
2485
+ {
2486
+ "epoch": 0.5094459774994693,
2487
+ "grad_norm": 7.379674434661865,
2488
+ "learning_rate": 0.00010427497154372936,
2489
+ "loss": 0.3457,
2490
+ "step": 87000
2491
+ },
2492
+ {
2493
+ "epoch": 0.510909902722169,
2494
+ "grad_norm": 7.123027801513672,
2495
+ "learning_rate": 0.00010396397405036915,
2496
+ "loss": 0.3311,
2497
+ "step": 87250
2498
+ },
2499
+ {
2500
+ "epoch": 0.5123738279448685,
2501
+ "grad_norm": 6.388451099395752,
2502
+ "learning_rate": 0.00010365297655700897,
2503
+ "loss": 0.3386,
2504
+ "step": 87500
2505
+ },
2506
+ {
2507
+ "epoch": 0.5138377531675682,
2508
+ "grad_norm": 8.933717727661133,
2509
+ "learning_rate": 0.00010334197906364876,
2510
+ "loss": 0.3377,
2511
+ "step": 87750
2512
+ },
2513
+ {
2514
+ "epoch": 0.5153016783902679,
2515
+ "grad_norm": 5.813757419586182,
2516
+ "learning_rate": 0.000103032225560262,
2517
+ "loss": 0.3368,
2518
+ "step": 88000
2519
+ },
2520
+ {
2521
+ "epoch": 0.5167656036129674,
2522
+ "grad_norm": 10.707741737365723,
2523
+ "learning_rate": 0.00010272122806690178,
2524
+ "loss": 0.3429,
2525
+ "step": 88250
2526
+ },
2527
+ {
2528
+ "epoch": 0.5182295288356671,
2529
+ "grad_norm": 7.433245658874512,
2530
+ "learning_rate": 0.00010241023057354157,
2531
+ "loss": 0.3457,
2532
+ "step": 88500
2533
+ },
2534
+ {
2535
+ "epoch": 0.5196934540583666,
2536
+ "grad_norm": 6.408331394195557,
2537
+ "learning_rate": 0.00010209923308018139,
2538
+ "loss": 0.3409,
2539
+ "step": 88750
2540
+ },
2541
+ {
2542
+ "epoch": 0.5211573792810663,
2543
+ "grad_norm": 7.5843987464904785,
2544
+ "learning_rate": 0.00010178823558682118,
2545
+ "loss": 0.3347,
2546
+ "step": 89000
2547
+ },
2548
+ {
2549
+ "epoch": 0.522621304503766,
2550
+ "grad_norm": 9.049858093261719,
2551
+ "learning_rate": 0.00010147723809346097,
2552
+ "loss": 0.3392,
2553
+ "step": 89250
2554
+ },
2555
+ {
2556
+ "epoch": 0.5240852297264655,
2557
+ "grad_norm": 8.207107543945312,
2558
+ "learning_rate": 0.00010116624060010076,
2559
+ "loss": 0.334,
2560
+ "step": 89500
2561
+ },
2562
+ {
2563
+ "epoch": 0.5255491549491652,
2564
+ "grad_norm": 6.511790752410889,
2565
+ "learning_rate": 0.00010085648709671401,
2566
+ "loss": 0.3462,
2567
+ "step": 89750
2568
+ },
2569
+ {
2570
+ "epoch": 0.5270130801718648,
2571
+ "grad_norm": 5.541443824768066,
2572
+ "learning_rate": 0.0001005454896033538,
2573
+ "loss": 0.3318,
2574
+ "step": 90000
2575
+ },
2576
+ {
2577
+ "epoch": 0.5284770053945644,
2578
+ "grad_norm": 6.216821670532227,
2579
+ "learning_rate": 0.0001002344921099936,
2580
+ "loss": 0.338,
2581
+ "step": 90250
2582
+ },
2583
+ {
2584
+ "epoch": 0.5299409306172641,
2585
+ "grad_norm": 5.138360977172852,
2586
+ "learning_rate": 9.992349461663339e-05,
2587
+ "loss": 0.3457,
2588
+ "step": 90500
2589
+ },
2590
+ {
2591
+ "epoch": 0.5314048558399637,
2592
+ "grad_norm": 8.401073455810547,
2593
+ "learning_rate": 9.961249712327319e-05,
2594
+ "loss": 0.3523,
2595
+ "step": 90750
2596
+ },
2597
+ {
2598
+ "epoch": 0.5328687810626633,
2599
+ "grad_norm": 8.749157905578613,
2600
+ "learning_rate": 9.930149962991298e-05,
2601
+ "loss": 0.3391,
2602
+ "step": 91000
2603
+ },
2604
+ {
2605
+ "epoch": 0.5343327062853629,
2606
+ "grad_norm": 7.809004783630371,
2607
+ "learning_rate": 9.899050213655278e-05,
2608
+ "loss": 0.3422,
2609
+ "step": 91250
2610
+ },
2611
+ {
2612
+ "epoch": 0.5357966315080626,
2613
+ "grad_norm": 7.649618148803711,
2614
+ "learning_rate": 9.867950464319257e-05,
2615
+ "loss": 0.3512,
2616
+ "step": 91500
2617
+ },
2618
+ {
2619
+ "epoch": 0.5372605567307622,
2620
+ "grad_norm": 8.770468711853027,
2621
+ "learning_rate": 9.836850714983237e-05,
2622
+ "loss": 0.3367,
2623
+ "step": 91750
2624
+ },
2625
+ {
2626
+ "epoch": 0.5387244819534618,
2627
+ "grad_norm": 8.32112979888916,
2628
+ "learning_rate": 9.805750965647216e-05,
2629
+ "loss": 0.3384,
2630
+ "step": 92000
2631
+ },
2632
+ {
2633
+ "epoch": 0.5401884071761615,
2634
+ "grad_norm": 9.602888107299805,
2635
+ "learning_rate": 9.774651216311197e-05,
2636
+ "loss": 0.3344,
2637
+ "step": 92250
2638
+ },
2639
+ {
2640
+ "epoch": 0.541652332398861,
2641
+ "grad_norm": 3.2295093536376953,
2642
+ "learning_rate": 9.743551466975177e-05,
2643
+ "loss": 0.3314,
2644
+ "step": 92500
2645
+ },
2646
+ {
2647
+ "epoch": 0.5431162576215607,
2648
+ "grad_norm": 5.456012725830078,
2649
+ "learning_rate": 9.712451717639156e-05,
2650
+ "loss": 0.3313,
2651
+ "step": 92750
2652
+ },
2653
+ {
2654
+ "epoch": 0.5445801828442604,
2655
+ "grad_norm": 7.777164936065674,
2656
+ "learning_rate": 9.681351968303136e-05,
2657
+ "loss": 0.3417,
2658
+ "step": 93000
2659
+ },
2660
+ {
2661
+ "epoch": 0.5460441080669599,
2662
+ "grad_norm": 10.10175895690918,
2663
+ "learning_rate": 9.650252218967115e-05,
2664
+ "loss": 0.3357,
2665
+ "step": 93250
2666
+ },
2667
+ {
2668
+ "epoch": 0.5475080332896596,
2669
+ "grad_norm": 8.296233177185059,
2670
+ "learning_rate": 9.619152469631095e-05,
2671
+ "loss": 0.3368,
2672
+ "step": 93500
2673
+ },
2674
+ {
2675
+ "epoch": 0.5489719585123591,
2676
+ "grad_norm": 5.55683708190918,
2677
+ "learning_rate": 9.588052720295075e-05,
2678
+ "loss": 0.3338,
2679
+ "step": 93750
2680
+ },
2681
+ {
2682
+ "epoch": 0.5504358837350588,
2683
+ "grad_norm": 5.92700719833374,
2684
+ "learning_rate": 9.556952970959054e-05,
2685
+ "loss": 0.3431,
2686
+ "step": 94000
2687
+ },
2688
+ {
2689
+ "epoch": 0.5518998089577585,
2690
+ "grad_norm": 5.411899089813232,
2691
+ "learning_rate": 9.525853221623034e-05,
2692
+ "loss": 0.3393,
2693
+ "step": 94250
2694
+ },
2695
+ {
2696
+ "epoch": 0.553363734180458,
2697
+ "grad_norm": 6.517271995544434,
2698
+ "learning_rate": 9.494753472287013e-05,
2699
+ "loss": 0.3332,
2700
+ "step": 94500
2701
+ },
2702
+ {
2703
+ "epoch": 0.5548276594031577,
2704
+ "grad_norm": 9.099715232849121,
2705
+ "learning_rate": 9.463653722950994e-05,
2706
+ "loss": 0.3343,
2707
+ "step": 94750
2708
+ },
2709
+ {
2710
+ "epoch": 0.5562915846258573,
2711
+ "grad_norm": 4.845067501068115,
2712
+ "learning_rate": 9.432553973614972e-05,
2713
+ "loss": 0.3344,
2714
+ "step": 95000
2715
+ },
2716
+ {
2717
+ "epoch": 0.5577555098485569,
2718
+ "grad_norm": 8.56153392791748,
2719
+ "learning_rate": 9.401454224278953e-05,
2720
+ "loss": 0.33,
2721
+ "step": 95250
2722
+ },
2723
+ {
2724
+ "epoch": 0.5592194350712566,
2725
+ "grad_norm": 7.1542439460754395,
2726
+ "learning_rate": 9.370354474942933e-05,
2727
+ "loss": 0.3186,
2728
+ "step": 95500
2729
+ },
2730
+ {
2731
+ "epoch": 0.5606833602939562,
2732
+ "grad_norm": 7.00217342376709,
2733
+ "learning_rate": 9.339254725606912e-05,
2734
+ "loss": 0.335,
2735
+ "step": 95750
2736
+ },
2737
+ {
2738
+ "epoch": 0.5621472855166558,
2739
+ "grad_norm": 7.365664482116699,
2740
+ "learning_rate": 9.308279375268236e-05,
2741
+ "loss": 0.3303,
2742
+ "step": 96000
2743
+ },
2744
+ {
2745
+ "epoch": 0.5636112107393554,
2746
+ "grad_norm": 8.063042640686035,
2747
+ "learning_rate": 9.277179625932215e-05,
2748
+ "loss": 0.3441,
2749
+ "step": 96250
2750
+ },
2751
+ {
2752
+ "epoch": 0.565075135962055,
2753
+ "grad_norm": 5.403791904449463,
2754
+ "learning_rate": 9.246079876596195e-05,
2755
+ "loss": 0.3318,
2756
+ "step": 96500
2757
+ },
2758
+ {
2759
+ "epoch": 0.5665390611847547,
2760
+ "grad_norm": 5.911950588226318,
2761
+ "learning_rate": 9.215104526257519e-05,
2762
+ "loss": 0.3327,
2763
+ "step": 96750
2764
+ },
2765
+ {
2766
+ "epoch": 0.5680029864074543,
2767
+ "grad_norm": 5.484018802642822,
2768
+ "learning_rate": 9.184004776921499e-05,
2769
+ "loss": 0.3384,
2770
+ "step": 97000
2771
+ },
2772
+ {
2773
+ "epoch": 0.569466911630154,
2774
+ "grad_norm": 4.785627365112305,
2775
+ "learning_rate": 9.152905027585478e-05,
2776
+ "loss": 0.3437,
2777
+ "step": 97250
2778
+ },
2779
+ {
2780
+ "epoch": 0.5709308368528535,
2781
+ "grad_norm": 7.17230749130249,
2782
+ "learning_rate": 9.121805278249458e-05,
2783
+ "loss": 0.3331,
2784
+ "step": 97500
2785
+ },
2786
+ {
2787
+ "epoch": 0.5723947620755532,
2788
+ "grad_norm": 7.777104377746582,
2789
+ "learning_rate": 9.090705528913437e-05,
2790
+ "loss": 0.3371,
2791
+ "step": 97750
2792
+ },
2793
+ {
2794
+ "epoch": 0.5738586872982528,
2795
+ "grad_norm": 6.8572001457214355,
2796
+ "learning_rate": 9.059605779577417e-05,
2797
+ "loss": 0.3397,
2798
+ "step": 98000
2799
+ },
2800
+ {
2801
+ "epoch": 0.5753226125209524,
2802
+ "grad_norm": 9.132293701171875,
2803
+ "learning_rate": 9.028506030241398e-05,
2804
+ "loss": 0.3421,
2805
+ "step": 98250
2806
+ },
2807
+ {
2808
+ "epoch": 0.5767865377436521,
2809
+ "grad_norm": 7.351444244384766,
2810
+ "learning_rate": 8.997406280905376e-05,
2811
+ "loss": 0.3315,
2812
+ "step": 98500
2813
+ },
2814
+ {
2815
+ "epoch": 0.5782504629663516,
2816
+ "grad_norm": 5.444695949554443,
2817
+ "learning_rate": 8.966306531569357e-05,
2818
+ "loss": 0.3313,
2819
+ "step": 98750
2820
+ },
2821
+ {
2822
+ "epoch": 0.5797143881890513,
2823
+ "grad_norm": 6.229501724243164,
2824
+ "learning_rate": 8.935206782233336e-05,
2825
+ "loss": 0.3321,
2826
+ "step": 99000
2827
+ },
2828
+ {
2829
+ "epoch": 0.581178313411751,
2830
+ "grad_norm": 4.431236743927002,
2831
+ "learning_rate": 8.904107032897316e-05,
2832
+ "loss": 0.3326,
2833
+ "step": 99250
2834
+ },
2835
+ {
2836
+ "epoch": 0.5826422386344505,
2837
+ "grad_norm": 4.78348445892334,
2838
+ "learning_rate": 8.873007283561296e-05,
2839
+ "loss": 0.3362,
2840
+ "step": 99500
2841
+ },
2842
+ {
2843
+ "epoch": 0.5841061638571502,
2844
+ "grad_norm": 5.964051723480225,
2845
+ "learning_rate": 8.841907534225275e-05,
2846
+ "loss": 0.3408,
2847
+ "step": 99750
2848
+ },
2849
+ {
2850
+ "epoch": 0.5855700890798498,
2851
+ "grad_norm": 5.310559272766113,
2852
+ "learning_rate": 8.810807784889255e-05,
2853
+ "loss": 0.3328,
2854
+ "step": 100000
2855
+ },
2856
+ {
2857
+ "epoch": 0.5870340143025494,
2858
+ "grad_norm": 4.985818862915039,
2859
+ "learning_rate": 8.779708035553234e-05,
2860
+ "loss": 0.337,
2861
+ "step": 100250
2862
+ },
2863
+ {
2864
+ "epoch": 0.5884979395252491,
2865
+ "grad_norm": 4.851356506347656,
2866
+ "learning_rate": 8.748608286217213e-05,
2867
+ "loss": 0.3314,
2868
+ "step": 100500
2869
+ },
2870
+ {
2871
+ "epoch": 0.5899618647479486,
2872
+ "grad_norm": 6.863201141357422,
2873
+ "learning_rate": 8.717508536881193e-05,
2874
+ "loss": 0.3231,
2875
+ "step": 100750
2876
+ },
2877
+ {
2878
+ "epoch": 0.5914257899706483,
2879
+ "grad_norm": 6.387337684631348,
2880
+ "learning_rate": 8.686533186542517e-05,
2881
+ "loss": 0.322,
2882
+ "step": 101000
2883
+ },
2884
+ {
2885
+ "epoch": 0.5928897151933479,
2886
+ "grad_norm": 7.897363662719727,
2887
+ "learning_rate": 8.655433437206496e-05,
2888
+ "loss": 0.3361,
2889
+ "step": 101250
2890
+ },
2891
+ {
2892
+ "epoch": 0.5943536404160475,
2893
+ "grad_norm": 5.876019477844238,
2894
+ "learning_rate": 8.624333687870476e-05,
2895
+ "loss": 0.3211,
2896
+ "step": 101500
2897
+ },
2898
+ {
2899
+ "epoch": 0.5958175656387472,
2900
+ "grad_norm": 4.175768852233887,
2901
+ "learning_rate": 8.593233938534457e-05,
2902
+ "loss": 0.3317,
2903
+ "step": 101750
2904
+ },
2905
+ {
2906
+ "epoch": 0.5972814908614468,
2907
+ "grad_norm": 6.496226787567139,
2908
+ "learning_rate": 8.562134189198435e-05,
2909
+ "loss": 0.3289,
2910
+ "step": 102000
2911
+ },
2912
+ {
2913
+ "epoch": 0.5987454160841464,
2914
+ "grad_norm": 7.092103004455566,
2915
+ "learning_rate": 8.531034439862416e-05,
2916
+ "loss": 0.3329,
2917
+ "step": 102250
2918
+ },
2919
+ {
2920
+ "epoch": 0.600209341306846,
2921
+ "grad_norm": 7.335963726043701,
2922
+ "learning_rate": 8.499934690526395e-05,
2923
+ "loss": 0.3305,
2924
+ "step": 102500
2925
+ },
2926
+ {
2927
+ "epoch": 0.6016732665295457,
2928
+ "grad_norm": 6.620415687561035,
2929
+ "learning_rate": 8.468834941190375e-05,
2930
+ "loss": 0.3324,
2931
+ "step": 102750
2932
+ },
2933
+ {
2934
+ "epoch": 0.6031371917522453,
2935
+ "grad_norm": 6.866759777069092,
2936
+ "learning_rate": 8.437735191854355e-05,
2937
+ "loss": 0.3395,
2938
+ "step": 103000
2939
+ },
2940
+ {
2941
+ "epoch": 0.6046011169749449,
2942
+ "grad_norm": 7.7242045402526855,
2943
+ "learning_rate": 8.406759841515678e-05,
2944
+ "loss": 0.3368,
2945
+ "step": 103250
2946
+ },
2947
+ {
2948
+ "epoch": 0.6060650421976446,
2949
+ "grad_norm": 6.402958869934082,
2950
+ "learning_rate": 8.375660092179658e-05,
2951
+ "loss": 0.3366,
2952
+ "step": 103500
2953
+ },
2954
+ {
2955
+ "epoch": 0.6075289674203441,
2956
+ "grad_norm": 6.456150531768799,
2957
+ "learning_rate": 8.344560342843637e-05,
2958
+ "loss": 0.3372,
2959
+ "step": 103750
2960
+ },
2961
+ {
2962
+ "epoch": 0.6089928926430438,
2963
+ "grad_norm": 7.6825971603393555,
2964
+ "learning_rate": 8.313460593507617e-05,
2965
+ "loss": 0.3331,
2966
+ "step": 104000
2967
+ },
2968
+ {
2969
+ "epoch": 0.6104568178657435,
2970
+ "grad_norm": 11.974824905395508,
2971
+ "learning_rate": 8.282360844171596e-05,
2972
+ "loss": 0.3317,
2973
+ "step": 104250
2974
+ },
2975
+ {
2976
+ "epoch": 0.611920743088443,
2977
+ "grad_norm": 5.445409774780273,
2978
+ "learning_rate": 8.251261094835576e-05,
2979
+ "loss": 0.3303,
2980
+ "step": 104500
2981
+ },
2982
+ {
2983
+ "epoch": 0.6133846683111427,
2984
+ "grad_norm": 8.099034309387207,
2985
+ "learning_rate": 8.220161345499555e-05,
2986
+ "loss": 0.3317,
2987
+ "step": 104750
2988
+ },
2989
+ {
2990
+ "epoch": 0.6148485935338422,
2991
+ "grad_norm": 21.789043426513672,
2992
+ "learning_rate": 8.189061596163535e-05,
2993
+ "loss": 0.3146,
2994
+ "step": 105000
2995
+ },
2996
+ {
2997
+ "epoch": 0.6163125187565419,
2998
+ "grad_norm": 6.879361152648926,
2999
+ "learning_rate": 8.158086245824859e-05,
3000
+ "loss": 0.3346,
3001
+ "step": 105250
3002
+ },
3003
+ {
3004
+ "epoch": 0.6177764439792416,
3005
+ "grad_norm": 5.477085113525391,
3006
+ "learning_rate": 8.126986496488838e-05,
3007
+ "loss": 0.3274,
3008
+ "step": 105500
3009
+ },
3010
+ {
3011
+ "epoch": 0.6192403692019411,
3012
+ "grad_norm": 6.2816667556762695,
3013
+ "learning_rate": 8.095886747152818e-05,
3014
+ "loss": 0.3271,
3015
+ "step": 105750
3016
+ },
3017
+ {
3018
+ "epoch": 0.6207042944246408,
3019
+ "grad_norm": 9.089285850524902,
3020
+ "learning_rate": 8.064786997816797e-05,
3021
+ "loss": 0.3351,
3022
+ "step": 106000
3023
+ },
3024
+ {
3025
+ "epoch": 0.6221682196473404,
3026
+ "grad_norm": 6.114886283874512,
3027
+ "learning_rate": 8.033687248480777e-05,
3028
+ "loss": 0.3296,
3029
+ "step": 106250
3030
+ },
3031
+ {
3032
+ "epoch": 0.62363214487004,
3033
+ "grad_norm": 7.2542548179626465,
3034
+ "learning_rate": 8.002587499144756e-05,
3035
+ "loss": 0.3246,
3036
+ "step": 106500
3037
+ },
3038
+ {
3039
+ "epoch": 0.6250960700927397,
3040
+ "grad_norm": 5.58528995513916,
3041
+ "learning_rate": 7.971487749808737e-05,
3042
+ "loss": 0.3327,
3043
+ "step": 106750
3044
+ },
3045
+ {
3046
+ "epoch": 0.6265599953154393,
3047
+ "grad_norm": 3.898178815841675,
3048
+ "learning_rate": 7.940388000472715e-05,
3049
+ "loss": 0.3291,
3050
+ "step": 107000
3051
+ },
3052
+ {
3053
+ "epoch": 0.6280239205381389,
3054
+ "grad_norm": 5.644820690155029,
3055
+ "learning_rate": 7.909288251136696e-05,
3056
+ "loss": 0.3281,
3057
+ "step": 107250
3058
+ },
3059
+ {
3060
+ "epoch": 0.6294878457608385,
3061
+ "grad_norm": 6.363776206970215,
3062
+ "learning_rate": 7.878188501800676e-05,
3063
+ "loss": 0.3304,
3064
+ "step": 107500
3065
+ },
3066
+ {
3067
+ "epoch": 0.6309517709835382,
3068
+ "grad_norm": 5.209687232971191,
3069
+ "learning_rate": 7.847213151462e-05,
3070
+ "loss": 0.3224,
3071
+ "step": 107750
3072
+ },
3073
+ {
3074
+ "epoch": 0.6324156962062378,
3075
+ "grad_norm": 6.911553382873535,
3076
+ "learning_rate": 7.81611340212598e-05,
3077
+ "loss": 0.3246,
3078
+ "step": 108000
3079
+ },
3080
+ {
3081
+ "epoch": 0.6338796214289374,
3082
+ "grad_norm": 7.6557111740112305,
3083
+ "learning_rate": 7.785013652789959e-05,
3084
+ "loss": 0.322,
3085
+ "step": 108250
3086
+ },
3087
+ {
3088
+ "epoch": 0.6353435466516371,
3089
+ "grad_norm": 7.857481002807617,
3090
+ "learning_rate": 7.753913903453939e-05,
3091
+ "loss": 0.3318,
3092
+ "step": 108500
3093
+ },
3094
+ {
3095
+ "epoch": 0.6368074718743366,
3096
+ "grad_norm": 5.911120891571045,
3097
+ "learning_rate": 7.722814154117918e-05,
3098
+ "loss": 0.325,
3099
+ "step": 108750
3100
+ },
3101
+ {
3102
+ "epoch": 0.6382713970970363,
3103
+ "grad_norm": 8.592209815979004,
3104
+ "learning_rate": 7.691714404781898e-05,
3105
+ "loss": 0.3209,
3106
+ "step": 109000
3107
+ },
3108
+ {
3109
+ "epoch": 0.639735322319736,
3110
+ "grad_norm": 6.824602127075195,
3111
+ "learning_rate": 7.660614655445879e-05,
3112
+ "loss": 0.3331,
3113
+ "step": 109250
3114
+ },
3115
+ {
3116
+ "epoch": 0.6411992475424355,
3117
+ "grad_norm": 6.813981056213379,
3118
+ "learning_rate": 7.629514906109858e-05,
3119
+ "loss": 0.3313,
3120
+ "step": 109500
3121
+ },
3122
+ {
3123
+ "epoch": 0.6426631727651352,
3124
+ "grad_norm": 5.7169671058654785,
3125
+ "learning_rate": 7.598539555771181e-05,
3126
+ "loss": 0.3206,
3127
+ "step": 109750
3128
+ },
3129
+ {
3130
+ "epoch": 0.6441270979878347,
3131
+ "grad_norm": 5.429720401763916,
3132
+ "learning_rate": 7.56743980643516e-05,
3133
+ "loss": 0.3192,
3134
+ "step": 110000
3135
+ },
3136
+ {
3137
+ "epoch": 0.6441270979878347,
3138
+ "eval_accuracy": 0.8997983351325891,
3139
+ "eval_loss": 0.3242824375629425,
3140
+ "eval_runtime": 11546.6804,
3141
+ "eval_samples_per_second": 210.345,
3142
+ "eval_steps_per_second": 6.573,
3143
+ "step": 110000
3144
  }
3145
  ],
3146
  "logging_steps": 250,
 
3160
  "attributes": {}
3161
  }
3162
  },
3163
+ "total_flos": 2.8505890873482936e+19,
3164
  "train_batch_size": 8,
3165
  "trial_name": null,
3166
  "trial_params": null