{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 22.995433789954337,
  "eval_steps": 500,
  "global_step": 2518,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.182648401826484,
      "grad_norm": 0.4084379971027374,
      "learning_rate": 0.0001,
      "loss": 0.8312,
      "step": 20
    },
    {
      "epoch": 0.365296803652968,
      "grad_norm": 0.24725936353206635,
      "learning_rate": 0.0001,
      "loss": 0.1547,
      "step": 40
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 0.1690889149904251,
      "learning_rate": 0.0001,
      "loss": 0.0644,
      "step": 60
    },
    {
      "epoch": 0.730593607305936,
      "grad_norm": 0.09192364662885666,
      "learning_rate": 0.0001,
      "loss": 0.0466,
      "step": 80
    },
    {
      "epoch": 0.91324200913242,
      "grad_norm": 0.08266641944646835,
      "learning_rate": 0.0001,
      "loss": 0.0385,
      "step": 100
    },
    {
      "epoch": 1.095890410958904,
      "grad_norm": 0.10168185085058212,
      "learning_rate": 0.0001,
      "loss": 0.0379,
      "step": 120
    },
    {
      "epoch": 1.278538812785388,
      "grad_norm": 0.10715723037719727,
      "learning_rate": 0.0001,
      "loss": 0.0337,
      "step": 140
    },
    {
      "epoch": 1.461187214611872,
      "grad_norm": 0.08185174316167831,
      "learning_rate": 0.0001,
      "loss": 0.0304,
      "step": 160
    },
    {
      "epoch": 1.643835616438356,
      "grad_norm": 0.0720980241894722,
      "learning_rate": 0.0001,
      "loss": 0.0342,
      "step": 180
    },
    {
      "epoch": 1.82648401826484,
      "grad_norm": 0.07974616438150406,
      "learning_rate": 0.0001,
      "loss": 0.0312,
      "step": 200
    },
    {
      "epoch": 2.009132420091324,
      "grad_norm": 0.08611268550157547,
      "learning_rate": 0.0001,
      "loss": 0.0315,
      "step": 220
    },
    {
      "epoch": 2.191780821917808,
      "grad_norm": 0.06699004024267197,
      "learning_rate": 0.0001,
      "loss": 0.0267,
      "step": 240
    },
    {
      "epoch": 2.374429223744292,
      "grad_norm": 0.1077587902545929,
      "learning_rate": 0.0001,
      "loss": 0.0246,
      "step": 260
    },
    {
      "epoch": 2.557077625570776,
      "grad_norm": 0.10352851450443268,
      "learning_rate": 0.0001,
      "loss": 0.0267,
      "step": 280
    },
    {
      "epoch": 2.73972602739726,
      "grad_norm": 0.08488716930150986,
      "learning_rate": 0.0001,
      "loss": 0.0297,
      "step": 300
    },
    {
      "epoch": 2.922374429223744,
      "grad_norm": 0.08407847583293915,
      "learning_rate": 0.0001,
      "loss": 0.0269,
      "step": 320
    },
    {
      "epoch": 3.105022831050228,
      "grad_norm": 0.0976366400718689,
      "learning_rate": 0.0001,
      "loss": 0.0251,
      "step": 340
    },
    {
      "epoch": 3.287671232876712,
      "grad_norm": 0.08240761607885361,
      "learning_rate": 0.0001,
      "loss": 0.0229,
      "step": 360
    },
    {
      "epoch": 3.470319634703196,
      "grad_norm": 0.0689239650964737,
      "learning_rate": 0.0001,
      "loss": 0.0232,
      "step": 380
    },
    {
      "epoch": 3.65296803652968,
      "grad_norm": 0.0607539638876915,
      "learning_rate": 0.0001,
      "loss": 0.0231,
      "step": 400
    },
    {
      "epoch": 3.8356164383561646,
      "grad_norm": 0.06858925521373749,
      "learning_rate": 0.0001,
      "loss": 0.023,
      "step": 420
    },
    {
      "epoch": 4.018264840182648,
      "grad_norm": 0.04049643874168396,
      "learning_rate": 0.0001,
      "loss": 0.0231,
      "step": 440
    },
    {
      "epoch": 4.200913242009133,
      "grad_norm": 0.08556920289993286,
      "learning_rate": 0.0001,
      "loss": 0.018,
      "step": 460
    },
    {
      "epoch": 4.383561643835616,
      "grad_norm": 0.05961354076862335,
      "learning_rate": 0.0001,
      "loss": 0.0183,
      "step": 480
    },
    {
      "epoch": 4.566210045662101,
      "grad_norm": 0.05691586434841156,
      "learning_rate": 0.0001,
      "loss": 0.02,
      "step": 500
    },
    {
      "epoch": 4.748858447488584,
      "grad_norm": 0.05423538759350777,
      "learning_rate": 0.0001,
      "loss": 0.0196,
      "step": 520
    },
    {
      "epoch": 4.931506849315069,
      "grad_norm": 0.10058747231960297,
      "learning_rate": 0.0001,
      "loss": 0.0206,
      "step": 540
    },
    {
      "epoch": 5.114155251141552,
      "grad_norm": 0.064676932990551,
      "learning_rate": 0.0001,
      "loss": 0.0177,
      "step": 560
    },
    {
      "epoch": 5.296803652968037,
      "grad_norm": 0.08128379285335541,
      "learning_rate": 0.0001,
      "loss": 0.0157,
      "step": 580
    },
    {
      "epoch": 5.47945205479452,
      "grad_norm": 0.10474538058042526,
      "learning_rate": 0.0001,
      "loss": 0.0169,
      "step": 600
    },
    {
      "epoch": 5.662100456621005,
      "grad_norm": 0.09420209378004074,
      "learning_rate": 0.0001,
      "loss": 0.0207,
      "step": 620
    },
    {
      "epoch": 5.844748858447488,
      "grad_norm": 0.07704417407512665,
      "learning_rate": 0.0001,
      "loss": 0.018,
      "step": 640
    },
    {
      "epoch": 6.027397260273973,
      "grad_norm": 0.044411078095436096,
      "learning_rate": 0.0001,
      "loss": 0.0168,
      "step": 660
    },
    {
      "epoch": 6.210045662100456,
      "grad_norm": 0.09763959795236588,
      "learning_rate": 0.0001,
      "loss": 0.0131,
      "step": 680
    },
    {
      "epoch": 6.392694063926941,
      "grad_norm": 0.08706251531839371,
      "learning_rate": 0.0001,
      "loss": 0.0146,
      "step": 700
    },
    {
      "epoch": 6.575342465753424,
      "grad_norm": 0.10404196381568909,
      "learning_rate": 0.0001,
      "loss": 0.0169,
      "step": 720
    },
    {
      "epoch": 6.757990867579909,
      "grad_norm": 0.1037658154964447,
      "learning_rate": 0.0001,
      "loss": 0.0165,
      "step": 740
    },
    {
      "epoch": 6.940639269406392,
      "grad_norm": 0.07572110742330551,
      "learning_rate": 0.0001,
      "loss": 0.0168,
      "step": 760
    },
    {
      "epoch": 7.123287671232877,
      "grad_norm": 0.06740553677082062,
      "learning_rate": 0.0001,
      "loss": 0.0139,
      "step": 780
    },
    {
      "epoch": 7.30593607305936,
      "grad_norm": 0.08043979108333588,
      "learning_rate": 0.0001,
      "loss": 0.014,
      "step": 800
    },
    {
      "epoch": 7.488584474885845,
      "grad_norm": 0.06607798486948013,
      "learning_rate": 0.0001,
      "loss": 0.0136,
      "step": 820
    },
    {
      "epoch": 7.671232876712329,
      "grad_norm": 0.11705009639263153,
      "learning_rate": 0.0001,
      "loss": 0.0146,
      "step": 840
    },
    {
      "epoch": 7.853881278538813,
      "grad_norm": 0.04560132324695587,
      "learning_rate": 0.0001,
      "loss": 0.0154,
      "step": 860
    },
    {
      "epoch": 8.036529680365296,
      "grad_norm": 0.05037812143564224,
      "learning_rate": 0.0001,
      "loss": 0.0129,
      "step": 880
    },
    {
      "epoch": 8.219178082191782,
      "grad_norm": 0.07135117053985596,
      "learning_rate": 0.0001,
      "loss": 0.0109,
      "step": 900
    },
    {
      "epoch": 8.401826484018265,
      "grad_norm": 0.05977578088641167,
      "learning_rate": 0.0001,
      "loss": 0.0117,
      "step": 920
    },
    {
      "epoch": 8.584474885844749,
      "grad_norm": 0.07411223649978638,
      "learning_rate": 0.0001,
      "loss": 0.0111,
      "step": 940
    },
    {
      "epoch": 8.767123287671232,
      "grad_norm": 0.08515261113643646,
      "learning_rate": 0.0001,
      "loss": 0.0122,
      "step": 960
    },
    {
      "epoch": 8.949771689497716,
      "grad_norm": 0.07383166998624802,
      "learning_rate": 0.0001,
      "loss": 0.0125,
      "step": 980
    },
    {
      "epoch": 9.132420091324201,
      "grad_norm": 0.041954681277275085,
      "learning_rate": 0.0001,
      "loss": 0.0105,
      "step": 1000
    },
    {
      "epoch": 9.315068493150685,
      "grad_norm": 0.09089387208223343,
      "learning_rate": 0.0001,
      "loss": 0.0105,
      "step": 1020
    },
    {
      "epoch": 9.497716894977168,
      "grad_norm": 0.08716876059770584,
      "learning_rate": 0.0001,
      "loss": 0.011,
      "step": 1040
    },
    {
      "epoch": 9.680365296803654,
      "grad_norm": 0.04927799850702286,
      "learning_rate": 0.0001,
      "loss": 0.0106,
      "step": 1060
    },
    {
      "epoch": 9.863013698630137,
      "grad_norm": 0.05259260907769203,
      "learning_rate": 0.0001,
      "loss": 0.0111,
      "step": 1080
    },
    {
      "epoch": 10.045662100456621,
      "grad_norm": 0.04412449151277542,
      "learning_rate": 0.0001,
      "loss": 0.0106,
      "step": 1100
    },
    {
      "epoch": 10.228310502283104,
      "grad_norm": 0.05673637241125107,
      "learning_rate": 0.0001,
      "loss": 0.0087,
      "step": 1120
    },
    {
      "epoch": 10.41095890410959,
      "grad_norm": 0.04577219486236572,
      "learning_rate": 0.0001,
      "loss": 0.0094,
      "step": 1140
    },
    {
      "epoch": 10.593607305936073,
      "grad_norm": 0.05691211298108101,
      "learning_rate": 0.0001,
      "loss": 0.0098,
      "step": 1160
    },
    {
      "epoch": 10.776255707762557,
      "grad_norm": 0.05354565382003784,
      "learning_rate": 0.0001,
      "loss": 0.01,
      "step": 1180
    },
    {
      "epoch": 10.95890410958904,
      "grad_norm": 0.06758158653974533,
      "learning_rate": 0.0001,
      "loss": 0.0104,
      "step": 1200
    },
    {
      "epoch": 11.141552511415526,
      "grad_norm": 0.05417347326874733,
      "learning_rate": 0.0001,
      "loss": 0.009,
      "step": 1220
    },
    {
      "epoch": 11.32420091324201,
      "grad_norm": 0.05120660364627838,
      "learning_rate": 0.0001,
      "loss": 0.0104,
      "step": 1240
    },
    {
      "epoch": 11.506849315068493,
      "grad_norm": 0.051275257021188736,
      "learning_rate": 0.0001,
      "loss": 0.0095,
      "step": 1260
    },
    {
      "epoch": 11.689497716894977,
      "grad_norm": 0.044794872403144836,
      "learning_rate": 0.0001,
      "loss": 0.0088,
      "step": 1280
    },
    {
      "epoch": 11.872146118721462,
      "grad_norm": 0.09698979556560516,
      "learning_rate": 0.0001,
      "loss": 0.009,
      "step": 1300
    },
    {
      "epoch": 12.054794520547945,
      "grad_norm": 0.060981862246990204,
      "learning_rate": 0.0001,
      "loss": 0.0091,
      "step": 1320
    },
    {
      "epoch": 12.237442922374429,
      "grad_norm": 0.0480022057890892,
      "learning_rate": 0.0001,
      "loss": 0.0085,
      "step": 1340
    },
    {
      "epoch": 12.420091324200913,
      "grad_norm": 0.05448669195175171,
      "learning_rate": 0.0001,
      "loss": 0.0082,
      "step": 1360
    },
    {
      "epoch": 12.602739726027398,
      "grad_norm": 0.06578750908374786,
      "learning_rate": 0.0001,
      "loss": 0.0086,
      "step": 1380
    },
    {
      "epoch": 12.785388127853881,
      "grad_norm": 0.0766950324177742,
      "learning_rate": 0.0001,
      "loss": 0.0089,
      "step": 1400
    },
    {
      "epoch": 12.968036529680365,
      "grad_norm": 0.05735301971435547,
      "learning_rate": 0.0001,
      "loss": 0.0094,
      "step": 1420
    },
    {
      "epoch": 13.150684931506849,
      "grad_norm": 0.05431370809674263,
      "learning_rate": 0.0001,
      "loss": 0.0094,
      "step": 1440
    },
    {
      "epoch": 13.333333333333334,
      "grad_norm": 0.04852620139718056,
      "learning_rate": 0.0001,
      "loss": 0.008,
      "step": 1460
    },
    {
      "epoch": 13.515981735159817,
      "grad_norm": 0.022798724472522736,
      "learning_rate": 0.0001,
      "loss": 0.0083,
      "step": 1480
    },
    {
      "epoch": 13.698630136986301,
      "grad_norm": 0.040976837277412415,
      "learning_rate": 0.0001,
      "loss": 0.0083,
      "step": 1500
    },
    {
      "epoch": 13.881278538812785,
      "grad_norm": 0.04464666545391083,
      "learning_rate": 0.0001,
      "loss": 0.0087,
      "step": 1520
    },
    {
      "epoch": 14.06392694063927,
      "grad_norm": 0.03699250891804695,
      "learning_rate": 0.0001,
      "loss": 0.0075,
      "step": 1540
    },
    {
      "epoch": 14.246575342465754,
      "grad_norm": 0.03735646605491638,
      "learning_rate": 0.0001,
      "loss": 0.008,
      "step": 1560
    },
    {
      "epoch": 14.429223744292237,
      "grad_norm": 0.04999354109168053,
      "learning_rate": 0.0001,
      "loss": 0.0077,
      "step": 1580
    },
    {
      "epoch": 14.61187214611872,
      "grad_norm": 0.032685693353414536,
      "learning_rate": 0.0001,
      "loss": 0.0078,
      "step": 1600
    },
    {
      "epoch": 14.794520547945206,
      "grad_norm": 0.051545485854148865,
      "learning_rate": 0.0001,
      "loss": 0.0076,
      "step": 1620
    },
    {
      "epoch": 14.97716894977169,
      "grad_norm": 0.046069201081991196,
      "learning_rate": 0.0001,
      "loss": 0.0088,
      "step": 1640
    },
    {
      "epoch": 15.159817351598173,
      "grad_norm": 0.07814318686723709,
      "learning_rate": 0.0001,
      "loss": 0.0088,
      "step": 1660
    },
    {
      "epoch": 15.342465753424657,
      "grad_norm": 0.05025108903646469,
      "learning_rate": 0.0001,
      "loss": 0.0076,
      "step": 1680
    },
    {
      "epoch": 15.525114155251142,
      "grad_norm": 0.05556974932551384,
      "learning_rate": 0.0001,
      "loss": 0.0079,
      "step": 1700
    },
    {
      "epoch": 15.707762557077626,
      "grad_norm": 0.16063013672828674,
      "learning_rate": 0.0001,
      "loss": 0.0091,
      "step": 1720
    },
    {
      "epoch": 15.89041095890411,
      "grad_norm": 0.05164189264178276,
      "learning_rate": 0.0001,
      "loss": 0.008,
      "step": 1740
    },
    {
      "epoch": 16.073059360730593,
      "grad_norm": 0.07564544677734375,
      "learning_rate": 0.0001,
      "loss": 0.0082,
      "step": 1760
    },
    {
      "epoch": 16.255707762557076,
      "grad_norm": 0.13074898719787598,
      "learning_rate": 0.0001,
      "loss": 0.0081,
      "step": 1780
    },
    {
      "epoch": 16.438356164383563,
      "grad_norm": 0.05894935131072998,
      "learning_rate": 0.0001,
      "loss": 0.0079,
      "step": 1800
    },
    {
      "epoch": 16.621004566210047,
      "grad_norm": 0.05766492336988449,
      "learning_rate": 0.0001,
      "loss": 0.0082,
      "step": 1820
    },
    {
      "epoch": 16.80365296803653,
      "grad_norm": 0.04949938878417015,
      "learning_rate": 0.0001,
      "loss": 0.0081,
      "step": 1840
    },
    {
      "epoch": 16.986301369863014,
      "grad_norm": 0.0840422660112381,
      "learning_rate": 0.0001,
      "loss": 0.0084,
      "step": 1860
    },
    {
      "epoch": 17.168949771689498,
      "grad_norm": 0.059242211282253265,
      "learning_rate": 0.0001,
      "loss": 0.0067,
      "step": 1880
    },
    {
      "epoch": 17.35159817351598,
      "grad_norm": 0.08060181140899658,
      "learning_rate": 0.0001,
      "loss": 0.0076,
      "step": 1900
    },
    {
      "epoch": 17.534246575342465,
      "grad_norm": 0.04570634663105011,
      "learning_rate": 0.0001,
      "loss": 0.0077,
      "step": 1920
    },
    {
      "epoch": 17.71689497716895,
      "grad_norm": 0.05903254821896553,
      "learning_rate": 0.0001,
      "loss": 0.0078,
      "step": 1940
    },
    {
      "epoch": 17.899543378995435,
      "grad_norm": 0.04280728101730347,
      "learning_rate": 0.0001,
      "loss": 0.0076,
      "step": 1960
    },
    {
      "epoch": 18.08219178082192,
      "grad_norm": 0.04099351540207863,
      "learning_rate": 0.0001,
      "loss": 0.0074,
      "step": 1980
    },
    {
      "epoch": 18.264840182648403,
      "grad_norm": 0.03019886650145054,
      "learning_rate": 0.0001,
      "loss": 0.0069,
      "step": 2000
    },
    {
      "epoch": 18.447488584474886,
      "grad_norm": 0.033835116773843765,
      "learning_rate": 0.0001,
      "loss": 0.0068,
      "step": 2020
    },
    {
      "epoch": 18.63013698630137,
      "grad_norm": 0.04315301775932312,
      "learning_rate": 0.0001,
      "loss": 0.0075,
      "step": 2040
    },
    {
      "epoch": 18.812785388127853,
      "grad_norm": 0.07251162081956863,
      "learning_rate": 0.0001,
      "loss": 0.0072,
      "step": 2060
    },
    {
      "epoch": 18.995433789954337,
      "grad_norm": 0.0530848354101181,
      "learning_rate": 0.0001,
      "loss": 0.0077,
      "step": 2080
    },
    {
      "epoch": 19.17808219178082,
      "grad_norm": 0.03507260978221893,
      "learning_rate": 0.0001,
      "loss": 0.0065,
      "step": 2100
    },
    {
      "epoch": 19.360730593607308,
      "grad_norm": 0.038022469729185104,
      "learning_rate": 0.0001,
      "loss": 0.0065,
      "step": 2120
    },
    {
      "epoch": 19.54337899543379,
      "grad_norm": 0.07193304598331451,
      "learning_rate": 0.0001,
      "loss": 0.0073,
      "step": 2140
    },
    {
      "epoch": 19.726027397260275,
      "grad_norm": 0.03509294614195824,
      "learning_rate": 0.0001,
      "loss": 0.0077,
      "step": 2160
    },
    {
      "epoch": 19.908675799086758,
      "grad_norm": 0.04408028721809387,
      "learning_rate": 0.0001,
      "loss": 0.0073,
      "step": 2180
    },
    {
      "epoch": 20.091324200913242,
      "grad_norm": 0.043653570115566254,
      "learning_rate": 0.0001,
      "loss": 0.0066,
      "step": 2200
    },
    {
      "epoch": 20.273972602739725,
      "grad_norm": 0.04543660581111908,
      "learning_rate": 0.0001,
      "loss": 0.0064,
      "step": 2220
    },
    {
      "epoch": 20.45662100456621,
      "grad_norm": 0.020475788041949272,
      "learning_rate": 0.0001,
      "loss": 0.0067,
      "step": 2240
    },
    {
      "epoch": 20.639269406392692,
      "grad_norm": 0.029115352779626846,
      "learning_rate": 0.0001,
      "loss": 0.0069,
      "step": 2260
    },
    {
      "epoch": 20.82191780821918,
      "grad_norm": 0.03670508787035942,
      "learning_rate": 0.0001,
      "loss": 0.0075,
      "step": 2280
    },
    {
      "epoch": 21.004566210045663,
      "grad_norm": 0.03692976012825966,
      "learning_rate": 0.0001,
      "loss": 0.007,
      "step": 2300
    },
    {
      "epoch": 21.187214611872147,
      "grad_norm": 0.02499617636203766,
      "learning_rate": 0.0001,
      "loss": 0.0064,
      "step": 2320
    },
    {
      "epoch": 21.36986301369863,
      "grad_norm": 0.02078641578555107,
      "learning_rate": 0.0001,
      "loss": 0.0061,
      "step": 2340
    },
    {
      "epoch": 21.552511415525114,
      "grad_norm": 0.03682105615735054,
      "learning_rate": 0.0001,
      "loss": 0.0065,
      "step": 2360
    },
    {
      "epoch": 21.735159817351597,
      "grad_norm": 0.02736218087375164,
      "learning_rate": 0.0001,
      "loss": 0.0069,
      "step": 2380
    },
    {
      "epoch": 21.91780821917808,
      "grad_norm": 0.04047287255525589,
      "learning_rate": 0.0001,
      "loss": 0.007,
      "step": 2400
    },
    {
      "epoch": 22.100456621004565,
      "grad_norm": 0.01508075650781393,
      "learning_rate": 0.0001,
      "loss": 0.0069,
      "step": 2420
    },
    {
      "epoch": 22.28310502283105,
      "grad_norm": 0.06898848712444305,
      "learning_rate": 0.0001,
      "loss": 0.0071,
      "step": 2440
    },
    {
      "epoch": 22.465753424657535,
      "grad_norm": 0.037652187049388885,
      "learning_rate": 0.0001,
      "loss": 0.0064,
      "step": 2460
    },
    {
      "epoch": 22.64840182648402,
      "grad_norm": 0.03481518477201462,
      "learning_rate": 0.0001,
      "loss": 0.0069,
      "step": 2480
    },
    {
      "epoch": 22.831050228310502,
      "grad_norm": 0.03326406702399254,
      "learning_rate": 0.0001,
      "loss": 0.007,
      "step": 2500
    }
  ],
  "logging_steps": 20,
  "max_steps": 10900,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.910639205385347e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}