{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9945945945945946,
  "eval_steps": 500,
  "global_step": 115,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008648648648648649,
      "grad_norm": 2.249741315841675,
      "learning_rate": 0.0001,
      "loss": 1.8319,
      "step": 1
    },
    {
      "epoch": 0.017297297297297298,
      "grad_norm": 2.1813502311706543,
      "learning_rate": 0.0002,
      "loss": 1.4027,
      "step": 2
    },
    {
      "epoch": 0.025945945945945945,
      "grad_norm": 0.8601759672164917,
      "learning_rate": 0.00019823008849557524,
      "loss": 1.1102,
      "step": 3
    },
    {
      "epoch": 0.034594594594594595,
      "grad_norm": 1.7297605276107788,
      "learning_rate": 0.00019646017699115044,
      "loss": 1.3774,
      "step": 4
    },
    {
      "epoch": 0.043243243243243246,
      "grad_norm": 1.0936262607574463,
      "learning_rate": 0.00019469026548672567,
      "loss": 0.895,
      "step": 5
    },
    {
      "epoch": 0.05189189189189189,
      "grad_norm": 0.6946480870246887,
      "learning_rate": 0.00019292035398230087,
      "loss": 0.7451,
      "step": 6
    },
    {
      "epoch": 0.06054054054054054,
      "grad_norm": 0.45863592624664307,
      "learning_rate": 0.00019115044247787613,
      "loss": 0.876,
      "step": 7
    },
    {
      "epoch": 0.06918918918918919,
      "grad_norm": 0.5447478890419006,
      "learning_rate": 0.00018938053097345133,
      "loss": 0.7719,
      "step": 8
    },
    {
      "epoch": 0.07783783783783783,
      "grad_norm": 0.45514124631881714,
      "learning_rate": 0.00018761061946902656,
      "loss": 0.5759,
      "step": 9
    },
    {
      "epoch": 0.08648648648648649,
      "grad_norm": 0.4590395987033844,
      "learning_rate": 0.0001858407079646018,
      "loss": 0.5838,
      "step": 10
    },
    {
      "epoch": 0.09513513513513513,
      "grad_norm": 0.5425634384155273,
      "learning_rate": 0.000184070796460177,
      "loss": 0.6641,
      "step": 11
    },
    {
      "epoch": 0.10378378378378378,
      "grad_norm": 1.0379027128219604,
      "learning_rate": 0.00018230088495575222,
      "loss": 0.9623,
      "step": 12
    },
    {
      "epoch": 0.11243243243243244,
      "grad_norm": 0.5286022424697876,
      "learning_rate": 0.00018053097345132742,
      "loss": 0.4761,
      "step": 13
    },
    {
      "epoch": 0.12108108108108108,
      "grad_norm": 0.6451830267906189,
      "learning_rate": 0.00017876106194690265,
      "loss": 0.547,
      "step": 14
    },
    {
      "epoch": 0.12972972972972974,
      "grad_norm": 0.6369953751564026,
      "learning_rate": 0.0001769911504424779,
      "loss": 0.5872,
      "step": 15
    },
    {
      "epoch": 0.13837837837837838,
      "grad_norm": 0.4720052182674408,
      "learning_rate": 0.0001752212389380531,
      "loss": 0.3248,
      "step": 16
    },
    {
      "epoch": 0.14702702702702702,
      "grad_norm": 0.5918360352516174,
      "learning_rate": 0.00017345132743362834,
      "loss": 0.6277,
      "step": 17
    },
    {
      "epoch": 0.15567567567567567,
      "grad_norm": 0.5242601037025452,
      "learning_rate": 0.00017168141592920354,
      "loss": 0.5645,
      "step": 18
    },
    {
      "epoch": 0.1643243243243243,
      "grad_norm": 0.474292129278183,
      "learning_rate": 0.00016991150442477877,
      "loss": 0.2115,
      "step": 19
    },
    {
      "epoch": 0.17297297297297298,
      "grad_norm": 0.6523647904396057,
      "learning_rate": 0.000168141592920354,
      "loss": 0.5803,
      "step": 20
    },
    {
      "epoch": 0.18162162162162163,
      "grad_norm": 0.521297812461853,
      "learning_rate": 0.0001663716814159292,
      "loss": 0.4483,
      "step": 21
    },
    {
      "epoch": 0.19027027027027027,
      "grad_norm": 0.5689568519592285,
      "learning_rate": 0.00016460176991150443,
      "loss": 0.6231,
      "step": 22
    },
    {
      "epoch": 0.1989189189189189,
      "grad_norm": 0.4570567011833191,
      "learning_rate": 0.00016283185840707966,
      "loss": 0.2368,
      "step": 23
    },
    {
      "epoch": 0.20756756756756756,
      "grad_norm": 0.414307564496994,
      "learning_rate": 0.0001610619469026549,
      "loss": 0.4674,
      "step": 24
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 0.5027227997779846,
      "learning_rate": 0.0001592920353982301,
      "loss": 0.3558,
      "step": 25
    },
    {
      "epoch": 0.22486486486486487,
      "grad_norm": 0.4441507160663605,
      "learning_rate": 0.00015752212389380532,
      "loss": 0.437,
      "step": 26
    },
    {
      "epoch": 0.23351351351351352,
      "grad_norm": 0.4098701477050781,
      "learning_rate": 0.00015575221238938055,
      "loss": 0.3553,
      "step": 27
    },
    {
      "epoch": 0.24216216216216216,
      "grad_norm": 0.3602244257926941,
      "learning_rate": 0.00015398230088495575,
      "loss": 0.3689,
      "step": 28
    },
    {
      "epoch": 0.2508108108108108,
      "grad_norm": 0.4340718984603882,
      "learning_rate": 0.00015221238938053098,
      "loss": 0.318,
      "step": 29
    },
    {
      "epoch": 0.2594594594594595,
      "grad_norm": 0.44470590353012085,
      "learning_rate": 0.00015044247787610618,
      "loss": 0.4992,
      "step": 30
    },
    {
      "epoch": 0.2681081081081081,
      "grad_norm": 0.43699413537979126,
      "learning_rate": 0.00014867256637168144,
      "loss": 0.3362,
      "step": 31
    },
    {
      "epoch": 0.27675675675675676,
      "grad_norm": 0.4950752258300781,
      "learning_rate": 0.00014690265486725664,
      "loss": 0.4464,
      "step": 32
    },
    {
      "epoch": 0.28540540540540543,
      "grad_norm": 0.4312315881252289,
      "learning_rate": 0.00014513274336283187,
      "loss": 0.4786,
      "step": 33
    },
    {
      "epoch": 0.29405405405405405,
      "grad_norm": 0.45234543085098267,
      "learning_rate": 0.0001433628318584071,
      "loss": 0.5572,
      "step": 34
    },
    {
      "epoch": 0.3027027027027027,
      "grad_norm": 0.4373219311237335,
      "learning_rate": 0.0001415929203539823,
      "loss": 0.3873,
      "step": 35
    },
    {
      "epoch": 0.31135135135135134,
      "grad_norm": 0.35862988233566284,
      "learning_rate": 0.00013982300884955753,
      "loss": 0.2902,
      "step": 36
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.41014787554740906,
      "learning_rate": 0.00013805309734513276,
      "loss": 0.3806,
      "step": 37
    },
    {
      "epoch": 0.3286486486486486,
      "grad_norm": 0.4181463420391083,
      "learning_rate": 0.00013628318584070796,
      "loss": 0.3036,
      "step": 38
    },
    {
      "epoch": 0.3372972972972973,
      "grad_norm": 0.3663095235824585,
      "learning_rate": 0.00013451327433628321,
      "loss": 0.1979,
      "step": 39
    },
    {
      "epoch": 0.34594594594594597,
      "grad_norm": 0.46295005083084106,
      "learning_rate": 0.00013274336283185842,
      "loss": 0.4204,
      "step": 40
    },
    {
      "epoch": 0.3545945945945946,
      "grad_norm": 0.39596325159072876,
      "learning_rate": 0.00013097345132743365,
      "loss": 0.3512,
      "step": 41
    },
    {
      "epoch": 0.36324324324324325,
      "grad_norm": 0.7628335952758789,
      "learning_rate": 0.00012920353982300885,
      "loss": 0.4965,
      "step": 42
    },
    {
      "epoch": 0.37189189189189187,
      "grad_norm": 0.5216770172119141,
      "learning_rate": 0.00012743362831858408,
      "loss": 0.4658,
      "step": 43
    },
    {
      "epoch": 0.38054054054054054,
      "grad_norm": 0.38578447699546814,
      "learning_rate": 0.0001256637168141593,
      "loss": 0.2661,
      "step": 44
    },
    {
      "epoch": 0.3891891891891892,
      "grad_norm": 0.2811882197856903,
      "learning_rate": 0.0001238938053097345,
      "loss": 0.1545,
      "step": 45
    },
    {
      "epoch": 0.3978378378378378,
      "grad_norm": 0.3812131881713867,
      "learning_rate": 0.00012212389380530974,
      "loss": 0.3295,
      "step": 46
    },
    {
      "epoch": 0.4064864864864865,
      "grad_norm": 0.3791070878505707,
      "learning_rate": 0.00012035398230088497,
      "loss": 0.2472,
      "step": 47
    },
    {
      "epoch": 0.4151351351351351,
      "grad_norm": 0.38515138626098633,
      "learning_rate": 0.0001185840707964602,
      "loss": 0.4042,
      "step": 48
    },
    {
      "epoch": 0.4237837837837838,
      "grad_norm": 0.5093116164207458,
      "learning_rate": 0.00011681415929203541,
      "loss": 0.8376,
      "step": 49
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 0.2971178889274597,
      "learning_rate": 0.00011504424778761063,
      "loss": 0.4082,
      "step": 50
    },
    {
      "epoch": 0.4410810810810811,
      "grad_norm": 0.30018818378448486,
      "learning_rate": 0.00011327433628318584,
      "loss": 0.129,
      "step": 51
    },
    {
      "epoch": 0.44972972972972974,
      "grad_norm": 0.4631483256816864,
      "learning_rate": 0.00011150442477876106,
      "loss": 0.3752,
      "step": 52
    },
    {
      "epoch": 0.45837837837837836,
      "grad_norm": 0.3890452980995178,
      "learning_rate": 0.00010973451327433629,
      "loss": 0.4054,
      "step": 53
    },
    {
      "epoch": 0.46702702702702703,
      "grad_norm": 0.3566686511039734,
      "learning_rate": 0.0001079646017699115,
      "loss": 0.2452,
      "step": 54
    },
    {
      "epoch": 0.4756756756756757,
      "grad_norm": 0.4903372526168823,
      "learning_rate": 0.00010619469026548674,
      "loss": 0.4505,
      "step": 55
    },
    {
      "epoch": 0.4843243243243243,
      "grad_norm": 0.3836239278316498,
      "learning_rate": 0.00010442477876106196,
      "loss": 0.3952,
      "step": 56
    },
    {
      "epoch": 0.492972972972973,
      "grad_norm": 0.42047417163848877,
      "learning_rate": 0.00010265486725663717,
      "loss": 0.5074,
      "step": 57
    },
    {
      "epoch": 0.5016216216216216,
      "grad_norm": 0.24409635365009308,
      "learning_rate": 0.00010088495575221239,
      "loss": 0.1389,
      "step": 58
    },
    {
      "epoch": 0.5102702702702703,
      "grad_norm": 0.3819220960140228,
      "learning_rate": 9.911504424778762e-05,
      "loss": 0.3945,
      "step": 59
    },
    {
      "epoch": 0.518918918918919,
      "grad_norm": 0.31148406863212585,
      "learning_rate": 9.734513274336283e-05,
      "loss": 0.5203,
      "step": 60
    },
    {
      "epoch": 0.5275675675675676,
      "grad_norm": 0.3157011866569519,
      "learning_rate": 9.557522123893806e-05,
      "loss": 0.262,
      "step": 61
    },
    {
      "epoch": 0.5362162162162162,
      "grad_norm": 0.40180379152297974,
      "learning_rate": 9.380530973451328e-05,
      "loss": 0.2404,
      "step": 62
    },
    {
      "epoch": 0.5448648648648649,
      "grad_norm": 0.4064180552959442,
      "learning_rate": 9.20353982300885e-05,
      "loss": 0.6118,
      "step": 63
    },
    {
      "epoch": 0.5535135135135135,
      "grad_norm": 0.3912467956542969,
      "learning_rate": 9.026548672566371e-05,
      "loss": 0.271,
      "step": 64
    },
    {
      "epoch": 0.5621621621621622,
      "grad_norm": 0.31059980392456055,
      "learning_rate": 8.849557522123895e-05,
      "loss": 0.2373,
      "step": 65
    },
    {
      "epoch": 0.5708108108108109,
      "grad_norm": 0.30928152799606323,
      "learning_rate": 8.672566371681417e-05,
      "loss": 0.4169,
      "step": 66
    },
    {
      "epoch": 0.5794594594594594,
      "grad_norm": 0.40631791949272156,
      "learning_rate": 8.495575221238938e-05,
      "loss": 0.4175,
      "step": 67
    },
    {
      "epoch": 0.5881081081081081,
      "grad_norm": 0.40440961718559265,
      "learning_rate": 8.31858407079646e-05,
      "loss": 0.3269,
      "step": 68
    },
    {
      "epoch": 0.5967567567567568,
      "grad_norm": 0.4534294009208679,
      "learning_rate": 8.141592920353983e-05,
      "loss": 0.2242,
      "step": 69
    },
    {
      "epoch": 0.6054054054054054,
      "grad_norm": 0.41317978501319885,
      "learning_rate": 7.964601769911504e-05,
      "loss": 0.2633,
      "step": 70
    },
    {
      "epoch": 0.614054054054054,
      "grad_norm": 0.272535115480423,
      "learning_rate": 7.787610619469027e-05,
      "loss": 0.1455,
      "step": 71
    },
    {
      "epoch": 0.6227027027027027,
      "grad_norm": 0.4280416667461395,
      "learning_rate": 7.610619469026549e-05,
      "loss": 0.5289,
      "step": 72
    },
    {
      "epoch": 0.6313513513513513,
      "grad_norm": 0.4870530664920807,
      "learning_rate": 7.433628318584072e-05,
      "loss": 0.5633,
      "step": 73
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.38074707984924316,
      "learning_rate": 7.256637168141593e-05,
      "loss": 0.4738,
      "step": 74
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 0.32775411009788513,
      "learning_rate": 7.079646017699115e-05,
      "loss": 0.2764,
      "step": 75
    },
    {
      "epoch": 0.6572972972972972,
      "grad_norm": 0.3663316071033478,
      "learning_rate": 6.902654867256638e-05,
      "loss": 0.4794,
      "step": 76
    },
    {
      "epoch": 0.6659459459459459,
      "grad_norm": 0.36854031682014465,
      "learning_rate": 6.725663716814161e-05,
      "loss": 0.1809,
      "step": 77
    },
    {
      "epoch": 0.6745945945945946,
      "grad_norm": 0.37296342849731445,
      "learning_rate": 6.548672566371682e-05,
      "loss": 0.4067,
      "step": 78
    },
    {
      "epoch": 0.6832432432432433,
      "grad_norm": 0.4202044606208801,
      "learning_rate": 6.371681415929204e-05,
      "loss": 0.2752,
      "step": 79
    },
    {
      "epoch": 0.6918918918918919,
      "grad_norm": 0.29250282049179077,
      "learning_rate": 6.194690265486725e-05,
      "loss": 0.1461,
      "step": 80
    },
    {
      "epoch": 0.7005405405405405,
      "grad_norm": 0.37763354182243347,
      "learning_rate": 6.017699115044248e-05,
      "loss": 0.2817,
      "step": 81
    },
    {
      "epoch": 0.7091891891891892,
      "grad_norm": 0.30031171441078186,
      "learning_rate": 5.8407079646017705e-05,
      "loss": 0.1572,
      "step": 82
    },
    {
      "epoch": 0.7178378378378378,
      "grad_norm": 0.4519175887107849,
      "learning_rate": 5.663716814159292e-05,
      "loss": 0.3046,
      "step": 83
    },
    {
      "epoch": 0.7264864864864865,
      "grad_norm": 0.3103352189064026,
      "learning_rate": 5.486725663716814e-05,
      "loss": 0.1347,
      "step": 84
    },
    {
      "epoch": 0.7351351351351352,
      "grad_norm": 0.7960600852966309,
      "learning_rate": 5.309734513274337e-05,
      "loss": 0.3168,
      "step": 85
    },
    {
      "epoch": 0.7437837837837837,
      "grad_norm": 0.3281419277191162,
      "learning_rate": 5.132743362831859e-05,
      "loss": 0.2045,
      "step": 86
    },
    {
      "epoch": 0.7524324324324324,
      "grad_norm": 0.35785752534866333,
      "learning_rate": 4.955752212389381e-05,
      "loss": 0.4077,
      "step": 87
    },
    {
      "epoch": 0.7610810810810811,
      "grad_norm": 0.37461650371551514,
      "learning_rate": 4.778761061946903e-05,
      "loss": 0.3227,
      "step": 88
    },
    {
      "epoch": 0.7697297297297298,
      "grad_norm": 0.3365744352340698,
      "learning_rate": 4.601769911504425e-05,
      "loss": 0.2306,
      "step": 89
    },
    {
      "epoch": 0.7783783783783784,
      "grad_norm": 0.29543980956077576,
      "learning_rate": 4.4247787610619477e-05,
      "loss": 0.3661,
      "step": 90
    },
    {
      "epoch": 0.787027027027027,
      "grad_norm": 0.3135324716567993,
      "learning_rate": 4.247787610619469e-05,
      "loss": 0.2503,
      "step": 91
    },
    {
      "epoch": 0.7956756756756757,
      "grad_norm": 0.23556429147720337,
      "learning_rate": 4.0707964601769914e-05,
      "loss": 0.1044,
      "step": 92
    },
    {
      "epoch": 0.8043243243243243,
      "grad_norm": 0.2718769907951355,
      "learning_rate": 3.893805309734514e-05,
      "loss": 0.1471,
      "step": 93
    },
    {
      "epoch": 0.812972972972973,
      "grad_norm": 0.25528448820114136,
      "learning_rate": 3.716814159292036e-05,
      "loss": 0.1126,
      "step": 94
    },
    {
      "epoch": 0.8216216216216217,
      "grad_norm": 0.514164388179779,
      "learning_rate": 3.5398230088495574e-05,
      "loss": 0.3423,
      "step": 95
    },
    {
      "epoch": 0.8302702702702702,
      "grad_norm": 0.33162716031074524,
      "learning_rate": 3.3628318584070804e-05,
      "loss": 0.3637,
      "step": 96
    },
    {
      "epoch": 0.8389189189189189,
      "grad_norm": 0.25161704421043396,
      "learning_rate": 3.185840707964602e-05,
      "loss": 0.1284,
      "step": 97
    },
    {
      "epoch": 0.8475675675675676,
      "grad_norm": 0.32825589179992676,
      "learning_rate": 3.008849557522124e-05,
      "loss": 0.2171,
      "step": 98
    },
    {
      "epoch": 0.8562162162162162,
      "grad_norm": 0.23435255885124207,
      "learning_rate": 2.831858407079646e-05,
      "loss": 0.16,
      "step": 99
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 0.2661581337451935,
      "learning_rate": 2.6548672566371686e-05,
      "loss": 0.2421,
      "step": 100
    },
    {
      "epoch": 0.8735135135135135,
      "grad_norm": 0.2724602222442627,
      "learning_rate": 2.4778761061946905e-05,
      "loss": 0.1246,
      "step": 101
    },
    {
      "epoch": 0.8821621621621621,
      "grad_norm": 0.47894561290740967,
      "learning_rate": 2.3008849557522124e-05,
      "loss": 0.4472,
      "step": 102
    },
    {
      "epoch": 0.8908108108108108,
      "grad_norm": 0.3064163327217102,
      "learning_rate": 2.1238938053097346e-05,
      "loss": 0.2987,
      "step": 103
    },
    {
      "epoch": 0.8994594594594595,
      "grad_norm": 0.4226900637149811,
      "learning_rate": 1.946902654867257e-05,
      "loss": 0.4185,
      "step": 104
    },
    {
      "epoch": 0.9081081081081082,
      "grad_norm": 0.34745219349861145,
      "learning_rate": 1.7699115044247787e-05,
      "loss": 0.2572,
      "step": 105
    },
    {
      "epoch": 0.9167567567567567,
      "grad_norm": 0.35236531496047974,
      "learning_rate": 1.592920353982301e-05,
      "loss": 0.3427,
      "step": 106
    },
    {
      "epoch": 0.9254054054054054,
      "grad_norm": 0.37095391750335693,
      "learning_rate": 1.415929203539823e-05,
      "loss": 0.4018,
      "step": 107
    },
    {
      "epoch": 0.9340540540540541,
      "grad_norm": 0.3331229090690613,
      "learning_rate": 1.2389380530973452e-05,
      "loss": 0.2038,
      "step": 108
    },
    {
      "epoch": 0.9427027027027027,
      "grad_norm": 0.2652183175086975,
      "learning_rate": 1.0619469026548673e-05,
      "loss": 0.1072,
      "step": 109
    },
    {
      "epoch": 0.9513513513513514,
      "grad_norm": 0.29123690724372864,
      "learning_rate": 8.849557522123894e-06,
      "loss": 0.1406,
      "step": 110
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.3317340612411499,
      "learning_rate": 7.079646017699115e-06,
      "loss": 0.2202,
      "step": 111
    },
    {
      "epoch": 0.9686486486486486,
      "grad_norm": 0.47986647486686707,
      "learning_rate": 5.3097345132743365e-06,
      "loss": 0.3464,
      "step": 112
    },
    {
      "epoch": 0.9772972972972973,
      "grad_norm": 0.2612822949886322,
      "learning_rate": 3.5398230088495575e-06,
      "loss": 0.1271,
      "step": 113
    },
    {
      "epoch": 0.985945945945946,
      "grad_norm": 0.26845863461494446,
      "learning_rate": 1.7699115044247788e-06,
      "loss": 0.1044,
      "step": 114
    },
    {
      "epoch": 0.9945945945945946,
      "grad_norm": 0.2526237368583679,
      "learning_rate": 0.0,
      "loss": 0.1158,
      "step": 115
    },
    {
      "epoch": 0.9945945945945946,
      "step": 115,
      "total_flos": 1.3431114641260646e+17,
      "train_loss": 0.4029887131374815,
      "train_runtime": 1125.7865,
      "train_samples_per_second": 0.822,
      "train_steps_per_second": 0.102
    }
  ],
  "logging_steps": 1,
  "max_steps": 115,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3431114641260646e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}