t5-base-lora-finetune-tweetsumm / trainer_state.json
samuellimabraz's picture
End of training
ca8816b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022727272727272726,
"grad_norm": null,
"learning_rate": 0.0001,
"loss": 3.1855,
"step": 1
},
{
"epoch": 0.004545454545454545,
"grad_norm": null,
"learning_rate": 0.0001,
"loss": 4.3007,
"step": 2
},
{
"epoch": 0.006818181818181818,
"grad_norm": null,
"learning_rate": 0.0001,
"loss": 4.3188,
"step": 3
},
{
"epoch": 0.00909090909090909,
"grad_norm": 26.366512298583984,
"learning_rate": 9.992424242424244e-05,
"loss": 3.6582,
"step": 4
},
{
"epoch": 0.011363636363636364,
"grad_norm": 29.344751358032227,
"learning_rate": 9.984848484848486e-05,
"loss": 4.2964,
"step": 5
},
{
"epoch": 0.013636363636363636,
"grad_norm": 29.519277572631836,
"learning_rate": 9.977272727272728e-05,
"loss": 4.0004,
"step": 6
},
{
"epoch": 0.015909090909090907,
"grad_norm": 24.204898834228516,
"learning_rate": 9.96969696969697e-05,
"loss": 3.2453,
"step": 7
},
{
"epoch": 0.01818181818181818,
"grad_norm": 23.69887351989746,
"learning_rate": 9.962121212121213e-05,
"loss": 2.7972,
"step": 8
},
{
"epoch": 0.020454545454545454,
"grad_norm": 52.371498107910156,
"learning_rate": 9.954545454545455e-05,
"loss": 2.5971,
"step": 9
},
{
"epoch": 0.022727272727272728,
"grad_norm": 41.59567642211914,
"learning_rate": 9.946969696969698e-05,
"loss": 3.3081,
"step": 10
},
{
"epoch": 0.025,
"grad_norm": 27.913963317871094,
"learning_rate": 9.939393939393939e-05,
"loss": 3.5977,
"step": 11
},
{
"epoch": 0.02727272727272727,
"grad_norm": 21.261117935180664,
"learning_rate": 9.931818181818182e-05,
"loss": 3.3403,
"step": 12
},
{
"epoch": 0.029545454545454545,
"grad_norm": 20.344589233398438,
"learning_rate": 9.924242424242425e-05,
"loss": 2.0478,
"step": 13
},
{
"epoch": 0.031818181818181815,
"grad_norm": 32.50373077392578,
"learning_rate": 9.916666666666667e-05,
"loss": 3.0773,
"step": 14
},
{
"epoch": 0.03409090909090909,
"grad_norm": 21.426048278808594,
"learning_rate": 9.909090909090911e-05,
"loss": 2.8572,
"step": 15
},
{
"epoch": 0.03636363636363636,
"grad_norm": 27.847314834594727,
"learning_rate": 9.901515151515151e-05,
"loss": 3.129,
"step": 16
},
{
"epoch": 0.038636363636363635,
"grad_norm": 23.516616821289062,
"learning_rate": 9.893939393939395e-05,
"loss": 3.3971,
"step": 17
},
{
"epoch": 0.04090909090909091,
"grad_norm": 29.170352935791016,
"learning_rate": 9.886363636363637e-05,
"loss": 3.6325,
"step": 18
},
{
"epoch": 0.04318181818181818,
"grad_norm": 21.103153228759766,
"learning_rate": 9.87878787878788e-05,
"loss": 2.7935,
"step": 19
},
{
"epoch": 0.045454545454545456,
"grad_norm": 25.863285064697266,
"learning_rate": 9.871212121212122e-05,
"loss": 2.0675,
"step": 20
},
{
"epoch": 0.04772727272727273,
"grad_norm": 25.554828643798828,
"learning_rate": 9.863636363636364e-05,
"loss": 2.8331,
"step": 21
},
{
"epoch": 0.05,
"grad_norm": 26.424827575683594,
"learning_rate": 9.856060606060607e-05,
"loss": 4.0934,
"step": 22
},
{
"epoch": 0.05227272727272727,
"grad_norm": 40.84152603149414,
"learning_rate": 9.848484848484849e-05,
"loss": 2.7315,
"step": 23
},
{
"epoch": 0.05454545454545454,
"grad_norm": 17.789630889892578,
"learning_rate": 9.840909090909092e-05,
"loss": 2.5798,
"step": 24
},
{
"epoch": 0.056818181818181816,
"grad_norm": 15.23817253112793,
"learning_rate": 9.833333333333333e-05,
"loss": 2.3981,
"step": 25
},
{
"epoch": 0.05909090909090909,
"grad_norm": 17.333356857299805,
"learning_rate": 9.825757575757576e-05,
"loss": 2.0097,
"step": 26
},
{
"epoch": 0.06136363636363636,
"grad_norm": 17.358461380004883,
"learning_rate": 9.818181818181818e-05,
"loss": 1.5636,
"step": 27
},
{
"epoch": 0.06363636363636363,
"grad_norm": 15.479598999023438,
"learning_rate": 9.810606060606061e-05,
"loss": 2.3064,
"step": 28
},
{
"epoch": 0.0659090909090909,
"grad_norm": 18.889394760131836,
"learning_rate": 9.803030303030303e-05,
"loss": 1.6592,
"step": 29
},
{
"epoch": 0.06818181818181818,
"grad_norm": 19.264772415161133,
"learning_rate": 9.795454545454545e-05,
"loss": 2.9327,
"step": 30
},
{
"epoch": 0.07045454545454545,
"grad_norm": 19.369556427001953,
"learning_rate": 9.787878787878789e-05,
"loss": 3.2685,
"step": 31
},
{
"epoch": 0.07272727272727272,
"grad_norm": 20.017459869384766,
"learning_rate": 9.78030303030303e-05,
"loss": 3.4532,
"step": 32
},
{
"epoch": 0.075,
"grad_norm": 18.956012725830078,
"learning_rate": 9.772727272727274e-05,
"loss": 2.2143,
"step": 33
},
{
"epoch": 0.07727272727272727,
"grad_norm": 15.438785552978516,
"learning_rate": 9.765151515151516e-05,
"loss": 2.407,
"step": 34
},
{
"epoch": 0.07954545454545454,
"grad_norm": 22.79155921936035,
"learning_rate": 9.757575757575758e-05,
"loss": 3.1064,
"step": 35
},
{
"epoch": 0.08181818181818182,
"grad_norm": 15.908382415771484,
"learning_rate": 9.75e-05,
"loss": 2.9192,
"step": 36
},
{
"epoch": 0.08409090909090909,
"grad_norm": 21.536775588989258,
"learning_rate": 9.742424242424243e-05,
"loss": 2.8127,
"step": 37
},
{
"epoch": 0.08636363636363636,
"grad_norm": 19.644390106201172,
"learning_rate": 9.734848484848485e-05,
"loss": 1.704,
"step": 38
},
{
"epoch": 0.08863636363636364,
"grad_norm": 20.067602157592773,
"learning_rate": 9.727272727272728e-05,
"loss": 2.3733,
"step": 39
},
{
"epoch": 0.09090909090909091,
"grad_norm": 16.551055908203125,
"learning_rate": 9.71969696969697e-05,
"loss": 2.2413,
"step": 40
},
{
"epoch": 0.09318181818181819,
"grad_norm": 18.292987823486328,
"learning_rate": 9.712121212121212e-05,
"loss": 2.6103,
"step": 41
},
{
"epoch": 0.09545454545454546,
"grad_norm": 15.751124382019043,
"learning_rate": 9.704545454545456e-05,
"loss": 1.5648,
"step": 42
},
{
"epoch": 0.09772727272727273,
"grad_norm": 25.068395614624023,
"learning_rate": 9.696969696969698e-05,
"loss": 1.8226,
"step": 43
},
{
"epoch": 0.1,
"grad_norm": 25.069040298461914,
"learning_rate": 9.689393939393941e-05,
"loss": 3.8825,
"step": 44
},
{
"epoch": 0.10227272727272728,
"grad_norm": 20.751232147216797,
"learning_rate": 9.681818181818181e-05,
"loss": 2.9331,
"step": 45
},
{
"epoch": 0.10454545454545454,
"grad_norm": 23.918386459350586,
"learning_rate": 9.674242424242425e-05,
"loss": 3.0365,
"step": 46
},
{
"epoch": 0.10681818181818181,
"grad_norm": 16.94843864440918,
"learning_rate": 9.666666666666667e-05,
"loss": 2.0012,
"step": 47
},
{
"epoch": 0.10909090909090909,
"grad_norm": 38.2060432434082,
"learning_rate": 9.65909090909091e-05,
"loss": 2.4087,
"step": 48
},
{
"epoch": 0.11136363636363636,
"grad_norm": 15.836068153381348,
"learning_rate": 9.651515151515152e-05,
"loss": 2.7204,
"step": 49
},
{
"epoch": 0.11363636363636363,
"grad_norm": 20.13130760192871,
"learning_rate": 9.643939393939394e-05,
"loss": 1.8803,
"step": 50
},
{
"epoch": 0.1159090909090909,
"grad_norm": 21.58964729309082,
"learning_rate": 9.636363636363637e-05,
"loss": 2.2448,
"step": 51
},
{
"epoch": 0.11818181818181818,
"grad_norm": 15.996927261352539,
"learning_rate": 9.628787878787879e-05,
"loss": 2.456,
"step": 52
},
{
"epoch": 0.12045454545454545,
"grad_norm": 15.738017082214355,
"learning_rate": 9.621212121212123e-05,
"loss": 2.0494,
"step": 53
},
{
"epoch": 0.12272727272727273,
"grad_norm": 20.54029655456543,
"learning_rate": 9.613636363636363e-05,
"loss": 2.8584,
"step": 54
},
{
"epoch": 0.125,
"grad_norm": 20.11783790588379,
"learning_rate": 9.606060606060606e-05,
"loss": 2.9836,
"step": 55
},
{
"epoch": 0.12727272727272726,
"grad_norm": 15.297281265258789,
"learning_rate": 9.598484848484848e-05,
"loss": 1.8828,
"step": 56
},
{
"epoch": 0.12954545454545455,
"grad_norm": 15.26744270324707,
"learning_rate": 9.590909090909092e-05,
"loss": 1.2548,
"step": 57
},
{
"epoch": 0.1318181818181818,
"grad_norm": 18.839954376220703,
"learning_rate": 9.583333333333334e-05,
"loss": 3.7553,
"step": 58
},
{
"epoch": 0.1340909090909091,
"grad_norm": 17.30214500427246,
"learning_rate": 9.575757575757576e-05,
"loss": 2.2297,
"step": 59
},
{
"epoch": 0.13636363636363635,
"grad_norm": 25.153942108154297,
"learning_rate": 9.568181818181819e-05,
"loss": 2.6817,
"step": 60
},
{
"epoch": 0.13863636363636364,
"grad_norm": 17.55406379699707,
"learning_rate": 9.560606060606061e-05,
"loss": 2.8551,
"step": 61
},
{
"epoch": 0.1409090909090909,
"grad_norm": null,
"learning_rate": 9.560606060606061e-05,
"loss": 2.4352,
"step": 62
},
{
"epoch": 0.1431818181818182,
"grad_norm": 18.4881649017334,
"learning_rate": 9.553030303030304e-05,
"loss": 2.1839,
"step": 63
},
{
"epoch": 0.14545454545454545,
"grad_norm": 15.114643096923828,
"learning_rate": 9.545454545454546e-05,
"loss": 1.768,
"step": 64
},
{
"epoch": 0.14772727272727273,
"grad_norm": 17.272735595703125,
"learning_rate": 9.537878787878788e-05,
"loss": 2.4241,
"step": 65
},
{
"epoch": 0.15,
"grad_norm": 18.25682258605957,
"learning_rate": 9.53030303030303e-05,
"loss": 1.8703,
"step": 66
},
{
"epoch": 0.15227272727272728,
"grad_norm": 20.255084991455078,
"learning_rate": 9.522727272727273e-05,
"loss": 2.3706,
"step": 67
},
{
"epoch": 0.15454545454545454,
"grad_norm": 16.153093338012695,
"learning_rate": 9.515151515151515e-05,
"loss": 2.3896,
"step": 68
},
{
"epoch": 0.15681818181818183,
"grad_norm": 14.229001998901367,
"learning_rate": 9.507575757575759e-05,
"loss": 2.5261,
"step": 69
},
{
"epoch": 0.1590909090909091,
"grad_norm": 14.036202430725098,
"learning_rate": 9.5e-05,
"loss": 1.8918,
"step": 70
},
{
"epoch": 0.16136363636363638,
"grad_norm": 16.262582778930664,
"learning_rate": 9.492424242424242e-05,
"loss": 2.7854,
"step": 71
},
{
"epoch": 0.16363636363636364,
"grad_norm": 17.119918823242188,
"learning_rate": 9.484848484848486e-05,
"loss": 2.1371,
"step": 72
},
{
"epoch": 0.16590909090909092,
"grad_norm": 19.72575569152832,
"learning_rate": 9.477272727272728e-05,
"loss": 2.6801,
"step": 73
},
{
"epoch": 0.16818181818181818,
"grad_norm": 17.036550521850586,
"learning_rate": 9.469696969696971e-05,
"loss": 2.6403,
"step": 74
},
{
"epoch": 0.17045454545454544,
"grad_norm": 14.31810188293457,
"learning_rate": 9.462121212121212e-05,
"loss": 1.9865,
"step": 75
},
{
"epoch": 0.17272727272727273,
"grad_norm": 18.39834213256836,
"learning_rate": 9.454545454545455e-05,
"loss": 2.418,
"step": 76
},
{
"epoch": 0.175,
"grad_norm": 18.37046241760254,
"learning_rate": 9.446969696969697e-05,
"loss": 2.2905,
"step": 77
},
{
"epoch": 0.17727272727272728,
"grad_norm": 14.999472618103027,
"learning_rate": 9.43939393939394e-05,
"loss": 2.2521,
"step": 78
},
{
"epoch": 0.17954545454545454,
"grad_norm": 11.88487434387207,
"learning_rate": 9.431818181818182e-05,
"loss": 2.2871,
"step": 79
},
{
"epoch": 0.18181818181818182,
"grad_norm": 21.745532989501953,
"learning_rate": 9.424242424242424e-05,
"loss": 2.6415,
"step": 80
},
{
"epoch": 0.18409090909090908,
"grad_norm": 13.109172821044922,
"learning_rate": 9.416666666666667e-05,
"loss": 1.9554,
"step": 81
},
{
"epoch": 0.18636363636363637,
"grad_norm": 17.222652435302734,
"learning_rate": 9.40909090909091e-05,
"loss": 2.0914,
"step": 82
},
{
"epoch": 0.18863636363636363,
"grad_norm": 17.833839416503906,
"learning_rate": 9.401515151515153e-05,
"loss": 1.6032,
"step": 83
},
{
"epoch": 0.19090909090909092,
"grad_norm": 22.737525939941406,
"learning_rate": 9.393939393939395e-05,
"loss": 3.5915,
"step": 84
},
{
"epoch": 0.19318181818181818,
"grad_norm": 14.926959037780762,
"learning_rate": 9.386363636363637e-05,
"loss": 2.2499,
"step": 85
},
{
"epoch": 0.19545454545454546,
"grad_norm": 13.586040496826172,
"learning_rate": 9.378787878787879e-05,
"loss": 1.8228,
"step": 86
},
{
"epoch": 0.19772727272727272,
"grad_norm": 19.175617218017578,
"learning_rate": 9.371212121212122e-05,
"loss": 2.7846,
"step": 87
},
{
"epoch": 0.2,
"grad_norm": 21.078235626220703,
"learning_rate": 9.363636363636364e-05,
"loss": 2.7906,
"step": 88
},
{
"epoch": 0.20227272727272727,
"grad_norm": 17.618940353393555,
"learning_rate": 9.356060606060606e-05,
"loss": 2.3022,
"step": 89
},
{
"epoch": 0.20454545454545456,
"grad_norm": 16.79983139038086,
"learning_rate": 9.348484848484849e-05,
"loss": 1.8126,
"step": 90
},
{
"epoch": 0.20681818181818182,
"grad_norm": 20.444580078125,
"learning_rate": 9.340909090909091e-05,
"loss": 2.055,
"step": 91
},
{
"epoch": 0.20909090909090908,
"grad_norm": 18.694856643676758,
"learning_rate": 9.333333333333334e-05,
"loss": 2.6534,
"step": 92
},
{
"epoch": 0.21136363636363636,
"grad_norm": 11.254834175109863,
"learning_rate": 9.325757575757576e-05,
"loss": 1.6695,
"step": 93
},
{
"epoch": 0.21363636363636362,
"grad_norm": 14.369203567504883,
"learning_rate": 9.318181818181818e-05,
"loss": 2.3469,
"step": 94
},
{
"epoch": 0.2159090909090909,
"grad_norm": 17.27039909362793,
"learning_rate": 9.31060606060606e-05,
"loss": 1.9188,
"step": 95
},
{
"epoch": 0.21818181818181817,
"grad_norm": 12.644415855407715,
"learning_rate": 9.303030303030303e-05,
"loss": 1.3295,
"step": 96
},
{
"epoch": 0.22045454545454546,
"grad_norm": 20.46677589416504,
"learning_rate": 9.295454545454545e-05,
"loss": 2.4697,
"step": 97
},
{
"epoch": 0.22272727272727272,
"grad_norm": 15.218058586120605,
"learning_rate": 9.287878787878789e-05,
"loss": 2.4472,
"step": 98
},
{
"epoch": 0.225,
"grad_norm": 14.982362747192383,
"learning_rate": 9.280303030303031e-05,
"loss": 1.881,
"step": 99
},
{
"epoch": 0.22727272727272727,
"grad_norm": 20.168306350708008,
"learning_rate": 9.272727272727273e-05,
"loss": 1.6077,
"step": 100
},
{
"epoch": 0.22954545454545455,
"grad_norm": 13.462889671325684,
"learning_rate": 9.265151515151516e-05,
"loss": 1.6057,
"step": 101
},
{
"epoch": 0.2318181818181818,
"grad_norm": 12.3695068359375,
"learning_rate": 9.257575757575758e-05,
"loss": 2.0871,
"step": 102
},
{
"epoch": 0.2340909090909091,
"grad_norm": 15.381841659545898,
"learning_rate": 9.250000000000001e-05,
"loss": 2.0592,
"step": 103
},
{
"epoch": 0.23636363636363636,
"grad_norm": 18.213014602661133,
"learning_rate": 9.242424242424242e-05,
"loss": 2.2397,
"step": 104
},
{
"epoch": 0.23863636363636365,
"grad_norm": 19.589962005615234,
"learning_rate": 9.234848484848485e-05,
"loss": 2.8305,
"step": 105
},
{
"epoch": 0.2409090909090909,
"grad_norm": 21.765127182006836,
"learning_rate": 9.227272727272727e-05,
"loss": 1.8691,
"step": 106
},
{
"epoch": 0.2431818181818182,
"grad_norm": 21.66250228881836,
"learning_rate": 9.21969696969697e-05,
"loss": 2.7176,
"step": 107
},
{
"epoch": 0.24545454545454545,
"grad_norm": 16.438037872314453,
"learning_rate": 9.212121212121214e-05,
"loss": 3.0262,
"step": 108
},
{
"epoch": 0.24772727272727274,
"grad_norm": 18.32391357421875,
"learning_rate": 9.204545454545454e-05,
"loss": 2.4011,
"step": 109
},
{
"epoch": 0.25,
"grad_norm": 18.3424015045166,
"learning_rate": 9.196969696969698e-05,
"loss": 3.3481,
"step": 110
},
{
"epoch": 0.25227272727272726,
"grad_norm": 12.168206214904785,
"learning_rate": 9.18939393939394e-05,
"loss": 1.5084,
"step": 111
},
{
"epoch": 0.2545454545454545,
"grad_norm": 16.183521270751953,
"learning_rate": 9.181818181818183e-05,
"loss": 3.3444,
"step": 112
},
{
"epoch": 0.25681818181818183,
"grad_norm": 17.887187957763672,
"learning_rate": 9.174242424242425e-05,
"loss": 2.4529,
"step": 113
},
{
"epoch": 0.2590909090909091,
"grad_norm": 18.000579833984375,
"learning_rate": 9.166666666666667e-05,
"loss": 2.3228,
"step": 114
},
{
"epoch": 0.26136363636363635,
"grad_norm": 15.579062461853027,
"learning_rate": 9.159090909090909e-05,
"loss": 3.2008,
"step": 115
},
{
"epoch": 0.2636363636363636,
"grad_norm": 14.111518859863281,
"learning_rate": 9.151515151515152e-05,
"loss": 2.2286,
"step": 116
},
{
"epoch": 0.26590909090909093,
"grad_norm": 13.755249977111816,
"learning_rate": 9.143939393939395e-05,
"loss": 1.9561,
"step": 117
},
{
"epoch": 0.2681818181818182,
"grad_norm": 14.665258407592773,
"learning_rate": 9.136363636363637e-05,
"loss": 2.5016,
"step": 118
},
{
"epoch": 0.27045454545454545,
"grad_norm": 14.470067024230957,
"learning_rate": 9.128787878787879e-05,
"loss": 2.3301,
"step": 119
},
{
"epoch": 0.2727272727272727,
"grad_norm": 15.108169555664062,
"learning_rate": 9.121212121212121e-05,
"loss": 2.6079,
"step": 120
},
{
"epoch": 0.275,
"grad_norm": 15.080549240112305,
"learning_rate": 9.113636363636365e-05,
"loss": 2.6349,
"step": 121
},
{
"epoch": 0.2772727272727273,
"grad_norm": 17.71773910522461,
"learning_rate": 9.106060606060606e-05,
"loss": 1.9447,
"step": 122
},
{
"epoch": 0.27954545454545454,
"grad_norm": 11.128664016723633,
"learning_rate": 9.098484848484848e-05,
"loss": 2.2076,
"step": 123
},
{
"epoch": 0.2818181818181818,
"grad_norm": 19.131866455078125,
"learning_rate": 9.090909090909092e-05,
"loss": 1.5932,
"step": 124
},
{
"epoch": 0.2840909090909091,
"grad_norm": 11.3361177444458,
"learning_rate": 9.083333333333334e-05,
"loss": 2.5923,
"step": 125
},
{
"epoch": 0.2863636363636364,
"grad_norm": 16.97115707397461,
"learning_rate": 9.075757575757577e-05,
"loss": 1.828,
"step": 126
},
{
"epoch": 0.28863636363636364,
"grad_norm": 11.52206802368164,
"learning_rate": 9.068181818181819e-05,
"loss": 2.3389,
"step": 127
},
{
"epoch": 0.2909090909090909,
"grad_norm": 18.27076530456543,
"learning_rate": 9.060606060606061e-05,
"loss": 3.1892,
"step": 128
},
{
"epoch": 0.29318181818181815,
"grad_norm": 15.098003387451172,
"learning_rate": 9.053030303030303e-05,
"loss": 2.3429,
"step": 129
},
{
"epoch": 0.29545454545454547,
"grad_norm": 13.432772636413574,
"learning_rate": 9.045454545454546e-05,
"loss": 1.7032,
"step": 130
},
{
"epoch": 0.29772727272727273,
"grad_norm": 21.96811866760254,
"learning_rate": 9.037878787878788e-05,
"loss": 3.3135,
"step": 131
},
{
"epoch": 0.3,
"grad_norm": 17.522789001464844,
"learning_rate": 9.030303030303031e-05,
"loss": 2.0827,
"step": 132
},
{
"epoch": 0.30227272727272725,
"grad_norm": 16.18021011352539,
"learning_rate": 9.022727272727273e-05,
"loss": 2.6956,
"step": 133
},
{
"epoch": 0.30454545454545456,
"grad_norm": 17.834138870239258,
"learning_rate": 9.015151515151515e-05,
"loss": 2.3929,
"step": 134
},
{
"epoch": 0.3068181818181818,
"grad_norm": 18.146596908569336,
"learning_rate": 9.007575757575759e-05,
"loss": 3.0074,
"step": 135
},
{
"epoch": 0.3090909090909091,
"grad_norm": 11.941591262817383,
"learning_rate": 9e-05,
"loss": 1.6793,
"step": 136
},
{
"epoch": 0.31136363636363634,
"grad_norm": 15.524669647216797,
"learning_rate": 8.992424242424244e-05,
"loss": 2.3193,
"step": 137
},
{
"epoch": 0.31363636363636366,
"grad_norm": 17.986879348754883,
"learning_rate": 8.984848484848484e-05,
"loss": 3.1335,
"step": 138
},
{
"epoch": 0.3159090909090909,
"grad_norm": 19.568361282348633,
"learning_rate": 8.977272727272728e-05,
"loss": 2.6232,
"step": 139
},
{
"epoch": 0.3181818181818182,
"grad_norm": 15.213788986206055,
"learning_rate": 8.96969696969697e-05,
"loss": 1.6936,
"step": 140
},
{
"epoch": 0.32045454545454544,
"grad_norm": 16.093795776367188,
"learning_rate": 8.962121212121213e-05,
"loss": 2.38,
"step": 141
},
{
"epoch": 0.32272727272727275,
"grad_norm": 17.010087966918945,
"learning_rate": 8.954545454545455e-05,
"loss": 2.0467,
"step": 142
},
{
"epoch": 0.325,
"grad_norm": 20.31732749938965,
"learning_rate": 8.946969696969697e-05,
"loss": 2.062,
"step": 143
},
{
"epoch": 0.32727272727272727,
"grad_norm": 15.800658226013184,
"learning_rate": 8.93939393939394e-05,
"loss": 1.4575,
"step": 144
},
{
"epoch": 0.32954545454545453,
"grad_norm": 15.116626739501953,
"learning_rate": 8.931818181818182e-05,
"loss": 2.314,
"step": 145
},
{
"epoch": 0.33181818181818185,
"grad_norm": 25.464197158813477,
"learning_rate": 8.924242424242426e-05,
"loss": 2.0073,
"step": 146
},
{
"epoch": 0.3340909090909091,
"grad_norm": 13.291275978088379,
"learning_rate": 8.916666666666667e-05,
"loss": 2.151,
"step": 147
},
{
"epoch": 0.33636363636363636,
"grad_norm": 13.530828475952148,
"learning_rate": 8.90909090909091e-05,
"loss": 2.3051,
"step": 148
},
{
"epoch": 0.3386363636363636,
"grad_norm": 15.941877365112305,
"learning_rate": 8.901515151515151e-05,
"loss": 2.6671,
"step": 149
},
{
"epoch": 0.3409090909090909,
"grad_norm": 16.19255828857422,
"learning_rate": 8.893939393939395e-05,
"loss": 2.4137,
"step": 150
},
{
"epoch": 0.3431818181818182,
"grad_norm": 25.39113998413086,
"learning_rate": 8.886363636363637e-05,
"loss": 3.1836,
"step": 151
},
{
"epoch": 0.34545454545454546,
"grad_norm": 14.128908157348633,
"learning_rate": 8.87878787878788e-05,
"loss": 2.4864,
"step": 152
},
{
"epoch": 0.3477272727272727,
"grad_norm": 14.206392288208008,
"learning_rate": 8.871212121212122e-05,
"loss": 1.3842,
"step": 153
},
{
"epoch": 0.35,
"grad_norm": 11.746234893798828,
"learning_rate": 8.863636363636364e-05,
"loss": 1.69,
"step": 154
},
{
"epoch": 0.3522727272727273,
"grad_norm": 14.249229431152344,
"learning_rate": 8.856060606060607e-05,
"loss": 2.962,
"step": 155
},
{
"epoch": 0.35454545454545455,
"grad_norm": 13.884110450744629,
"learning_rate": 8.848484848484849e-05,
"loss": 1.9429,
"step": 156
},
{
"epoch": 0.3568181818181818,
"grad_norm": 15.577651023864746,
"learning_rate": 8.840909090909091e-05,
"loss": 2.0814,
"step": 157
},
{
"epoch": 0.35909090909090907,
"grad_norm": 13.055503845214844,
"learning_rate": 8.833333333333333e-05,
"loss": 2.286,
"step": 158
},
{
"epoch": 0.3613636363636364,
"grad_norm": 14.148711204528809,
"learning_rate": 8.825757575757576e-05,
"loss": 1.7243,
"step": 159
},
{
"epoch": 0.36363636363636365,
"grad_norm": 18.32880210876465,
"learning_rate": 8.818181818181818e-05,
"loss": 2.0912,
"step": 160
},
{
"epoch": 0.3659090909090909,
"grad_norm": 19.306982040405273,
"learning_rate": 8.810606060606062e-05,
"loss": 2.1032,
"step": 161
},
{
"epoch": 0.36818181818181817,
"grad_norm": 18.99219512939453,
"learning_rate": 8.803030303030304e-05,
"loss": 2.3527,
"step": 162
},
{
"epoch": 0.3704545454545455,
"grad_norm": 14.297601699829102,
"learning_rate": 8.795454545454545e-05,
"loss": 2.8786,
"step": 163
},
{
"epoch": 0.37272727272727274,
"grad_norm": 19.273303985595703,
"learning_rate": 8.787878787878789e-05,
"loss": 2.4364,
"step": 164
},
{
"epoch": 0.375,
"grad_norm": 11.870357513427734,
"learning_rate": 8.780303030303031e-05,
"loss": 2.1716,
"step": 165
},
{
"epoch": 0.37727272727272726,
"grad_norm": 11.26362133026123,
"learning_rate": 8.772727272727274e-05,
"loss": 3.1212,
"step": 166
},
{
"epoch": 0.3795454545454545,
"grad_norm": 12.994135856628418,
"learning_rate": 8.765151515151515e-05,
"loss": 2.4722,
"step": 167
},
{
"epoch": 0.38181818181818183,
"grad_norm": 13.474489212036133,
"learning_rate": 8.757575757575758e-05,
"loss": 2.9132,
"step": 168
},
{
"epoch": 0.3840909090909091,
"grad_norm": 16.456457138061523,
"learning_rate": 8.75e-05,
"loss": 2.1006,
"step": 169
},
{
"epoch": 0.38636363636363635,
"grad_norm": 16.236146926879883,
"learning_rate": 8.742424242424243e-05,
"loss": 2.1458,
"step": 170
},
{
"epoch": 0.3886363636363636,
"grad_norm": 13.122529983520508,
"learning_rate": 8.734848484848485e-05,
"loss": 2.7045,
"step": 171
},
{
"epoch": 0.39090909090909093,
"grad_norm": 12.385522842407227,
"learning_rate": 8.727272727272727e-05,
"loss": 2.2677,
"step": 172
},
{
"epoch": 0.3931818181818182,
"grad_norm": 14.4050931930542,
"learning_rate": 8.71969696969697e-05,
"loss": 1.3401,
"step": 173
},
{
"epoch": 0.39545454545454545,
"grad_norm": 21.25592803955078,
"learning_rate": 8.712121212121212e-05,
"loss": 1.8591,
"step": 174
},
{
"epoch": 0.3977272727272727,
"grad_norm": 13.744414329528809,
"learning_rate": 8.704545454545456e-05,
"loss": 1.8915,
"step": 175
},
{
"epoch": 0.4,
"grad_norm": 14.040199279785156,
"learning_rate": 8.696969696969698e-05,
"loss": 2.1142,
"step": 176
},
{
"epoch": 0.4022727272727273,
"grad_norm": 13.779399871826172,
"learning_rate": 8.68939393939394e-05,
"loss": 1.6946,
"step": 177
},
{
"epoch": 0.40454545454545454,
"grad_norm": 12.878482818603516,
"learning_rate": 8.681818181818182e-05,
"loss": 2.0229,
"step": 178
},
{
"epoch": 0.4068181818181818,
"grad_norm": 10.951014518737793,
"learning_rate": 8.674242424242425e-05,
"loss": 2.2302,
"step": 179
},
{
"epoch": 0.4090909090909091,
"grad_norm": 15.133676528930664,
"learning_rate": 8.666666666666667e-05,
"loss": 1.7796,
"step": 180
},
{
"epoch": 0.4113636363636364,
"grad_norm": 11.56503677368164,
"learning_rate": 8.65909090909091e-05,
"loss": 2.0587,
"step": 181
},
{
"epoch": 0.41363636363636364,
"grad_norm": 12.170353889465332,
"learning_rate": 8.651515151515152e-05,
"loss": 1.9297,
"step": 182
},
{
"epoch": 0.4159090909090909,
"grad_norm": 14.984827995300293,
"learning_rate": 8.643939393939394e-05,
"loss": 1.3361,
"step": 183
},
{
"epoch": 0.41818181818181815,
"grad_norm": 12.686882972717285,
"learning_rate": 8.636363636363637e-05,
"loss": 2.3203,
"step": 184
},
{
"epoch": 0.42045454545454547,
"grad_norm": 19.53303337097168,
"learning_rate": 8.628787878787879e-05,
"loss": 2.1686,
"step": 185
},
{
"epoch": 0.42272727272727273,
"grad_norm": 13.246541976928711,
"learning_rate": 8.621212121212121e-05,
"loss": 2.154,
"step": 186
},
{
"epoch": 0.425,
"grad_norm": 18.38794708251953,
"learning_rate": 8.613636363636363e-05,
"loss": 2.3975,
"step": 187
},
{
"epoch": 0.42727272727272725,
"grad_norm": 19.281801223754883,
"learning_rate": 8.606060606060606e-05,
"loss": 3.1559,
"step": 188
},
{
"epoch": 0.42954545454545456,
"grad_norm": 16.43345069885254,
"learning_rate": 8.598484848484848e-05,
"loss": 2.4324,
"step": 189
},
{
"epoch": 0.4318181818181818,
"grad_norm": 22.686885833740234,
"learning_rate": 8.590909090909092e-05,
"loss": 2.4541,
"step": 190
},
{
"epoch": 0.4340909090909091,
"grad_norm": 16.799205780029297,
"learning_rate": 8.583333333333334e-05,
"loss": 1.9834,
"step": 191
},
{
"epoch": 0.43636363636363634,
"grad_norm": 12.861906051635742,
"learning_rate": 8.575757575757576e-05,
"loss": 1.4132,
"step": 192
},
{
"epoch": 0.43863636363636366,
"grad_norm": 14.350102424621582,
"learning_rate": 8.568181818181819e-05,
"loss": 2.5181,
"step": 193
},
{
"epoch": 0.4409090909090909,
"grad_norm": 9.91285228729248,
"learning_rate": 8.560606060606061e-05,
"loss": 1.1131,
"step": 194
},
{
"epoch": 0.4431818181818182,
"grad_norm": 12.768558502197266,
"learning_rate": 8.553030303030304e-05,
"loss": 1.6889,
"step": 195
},
{
"epoch": 0.44545454545454544,
"grad_norm": 11.671558380126953,
"learning_rate": 8.545454545454545e-05,
"loss": 2.4559,
"step": 196
},
{
"epoch": 0.44772727272727275,
"grad_norm": 12.10418701171875,
"learning_rate": 8.537878787878788e-05,
"loss": 2.2951,
"step": 197
},
{
"epoch": 0.45,
"grad_norm": 12.047237396240234,
"learning_rate": 8.53030303030303e-05,
"loss": 1.7895,
"step": 198
},
{
"epoch": 0.45227272727272727,
"grad_norm": 13.83714485168457,
"learning_rate": 8.522727272727273e-05,
"loss": 2.1267,
"step": 199
},
{
"epoch": 0.45454545454545453,
"grad_norm": 17.289377212524414,
"learning_rate": 8.515151515151515e-05,
"loss": 3.4595,
"step": 200
},
{
"epoch": 0.45681818181818185,
"grad_norm": 16.056198120117188,
"learning_rate": 8.507575757575757e-05,
"loss": 2.2333,
"step": 201
},
{
"epoch": 0.4590909090909091,
"grad_norm": 12.874887466430664,
"learning_rate": 8.5e-05,
"loss": 2.3555,
"step": 202
},
{
"epoch": 0.46136363636363636,
"grad_norm": 11.859071731567383,
"learning_rate": 8.492424242424243e-05,
"loss": 2.0893,
"step": 203
},
{
"epoch": 0.4636363636363636,
"grad_norm": 11.99448013305664,
"learning_rate": 8.484848484848486e-05,
"loss": 2.4165,
"step": 204
},
{
"epoch": 0.4659090909090909,
"grad_norm": 14.352676391601562,
"learning_rate": 8.477272727272728e-05,
"loss": 2.58,
"step": 205
},
{
"epoch": 0.4681818181818182,
"grad_norm": 10.942952156066895,
"learning_rate": 8.46969696969697e-05,
"loss": 2.1313,
"step": 206
},
{
"epoch": 0.47045454545454546,
"grad_norm": 13.232431411743164,
"learning_rate": 8.462121212121212e-05,
"loss": 2.8598,
"step": 207
},
{
"epoch": 0.4727272727272727,
"grad_norm": 14.74603271484375,
"learning_rate": 8.454545454545455e-05,
"loss": 2.5221,
"step": 208
},
{
"epoch": 0.475,
"grad_norm": 11.541604042053223,
"learning_rate": 8.446969696969697e-05,
"loss": 2.6656,
"step": 209
},
{
"epoch": 0.4772727272727273,
"grad_norm": 22.731273651123047,
"learning_rate": 8.43939393939394e-05,
"loss": 1.9391,
"step": 210
},
{
"epoch": 0.47954545454545455,
"grad_norm": 16.327220916748047,
"learning_rate": 8.431818181818182e-05,
"loss": 2.1225,
"step": 211
},
{
"epoch": 0.4818181818181818,
"grad_norm": 15.646464347839355,
"learning_rate": 8.424242424242424e-05,
"loss": 2.1468,
"step": 212
},
{
"epoch": 0.48409090909090907,
"grad_norm": 16.69521141052246,
"learning_rate": 8.416666666666668e-05,
"loss": 2.4979,
"step": 213
},
{
"epoch": 0.4863636363636364,
"grad_norm": 12.17435073852539,
"learning_rate": 8.40909090909091e-05,
"loss": 1.915,
"step": 214
},
{
"epoch": 0.48863636363636365,
"grad_norm": 15.295214653015137,
"learning_rate": 8.401515151515153e-05,
"loss": 2.6765,
"step": 215
},
{
"epoch": 0.4909090909090909,
"grad_norm": 14.532336235046387,
"learning_rate": 8.393939393939393e-05,
"loss": 2.1649,
"step": 216
},
{
"epoch": 0.49318181818181817,
"grad_norm": 9.738990783691406,
"learning_rate": 8.386363636363637e-05,
"loss": 1.7751,
"step": 217
},
{
"epoch": 0.4954545454545455,
"grad_norm": 13.893047332763672,
"learning_rate": 8.378787878787879e-05,
"loss": 2.3839,
"step": 218
},
{
"epoch": 0.49772727272727274,
"grad_norm": 10.604107856750488,
"learning_rate": 8.371212121212122e-05,
"loss": 1.839,
"step": 219
},
{
"epoch": 0.5,
"grad_norm": 14.21572208404541,
"learning_rate": 8.363636363636364e-05,
"loss": 2.4181,
"step": 220
},
{
"epoch": 0.5022727272727273,
"grad_norm": 12.247942924499512,
"learning_rate": 8.356060606060606e-05,
"loss": 1.6214,
"step": 221
},
{
"epoch": 0.5045454545454545,
"grad_norm": 11.43807601928711,
"learning_rate": 8.348484848484849e-05,
"loss": 1.7002,
"step": 222
},
{
"epoch": 0.5068181818181818,
"grad_norm": 12.532363891601562,
"learning_rate": 8.340909090909091e-05,
"loss": 1.6798,
"step": 223
},
{
"epoch": 0.509090909090909,
"grad_norm": 21.122955322265625,
"learning_rate": 8.333333333333334e-05,
"loss": 2.3791,
"step": 224
},
{
"epoch": 0.5113636363636364,
"grad_norm": 15.643569946289062,
"learning_rate": 8.325757575757575e-05,
"loss": 2.2841,
"step": 225
},
{
"epoch": 0.5136363636363637,
"grad_norm": 13.66476821899414,
"learning_rate": 8.318181818181818e-05,
"loss": 2.7105,
"step": 226
},
{
"epoch": 0.5159090909090909,
"grad_norm": 15.538378715515137,
"learning_rate": 8.310606060606062e-05,
"loss": 2.5573,
"step": 227
},
{
"epoch": 0.5181818181818182,
"grad_norm": 14.432341575622559,
"learning_rate": 8.303030303030304e-05,
"loss": 1.6926,
"step": 228
},
{
"epoch": 0.5204545454545455,
"grad_norm": 14.326302528381348,
"learning_rate": 8.295454545454547e-05,
"loss": 1.9976,
"step": 229
},
{
"epoch": 0.5227272727272727,
"grad_norm": 16.38084602355957,
"learning_rate": 8.287878787878787e-05,
"loss": 2.8438,
"step": 230
},
{
"epoch": 0.525,
"grad_norm": 14.56826114654541,
"learning_rate": 8.280303030303031e-05,
"loss": 2.3643,
"step": 231
},
{
"epoch": 0.5272727272727272,
"grad_norm": 10.183893203735352,
"learning_rate": 8.272727272727273e-05,
"loss": 1.9476,
"step": 232
},
{
"epoch": 0.5295454545454545,
"grad_norm": 15.575922012329102,
"learning_rate": 8.265151515151516e-05,
"loss": 2.3493,
"step": 233
},
{
"epoch": 0.5318181818181819,
"grad_norm": 12.653141021728516,
"learning_rate": 8.257575757575758e-05,
"loss": 2.0519,
"step": 234
},
{
"epoch": 0.5340909090909091,
"grad_norm": 12.279047966003418,
"learning_rate": 8.25e-05,
"loss": 2.0694,
"step": 235
},
{
"epoch": 0.5363636363636364,
"grad_norm": 12.395997047424316,
"learning_rate": 8.242424242424243e-05,
"loss": 2.1307,
"step": 236
},
{
"epoch": 0.5386363636363637,
"grad_norm": 10.851142883300781,
"learning_rate": 8.234848484848485e-05,
"loss": 1.9883,
"step": 237
},
{
"epoch": 0.5409090909090909,
"grad_norm": 14.103243827819824,
"learning_rate": 8.227272727272729e-05,
"loss": 2.6901,
"step": 238
},
{
"epoch": 0.5431818181818182,
"grad_norm": 9.63924789428711,
"learning_rate": 8.21969696969697e-05,
"loss": 1.2228,
"step": 239
},
{
"epoch": 0.5454545454545454,
"grad_norm": 13.430061340332031,
"learning_rate": 8.212121212121212e-05,
"loss": 1.7877,
"step": 240
},
{
"epoch": 0.5477272727272727,
"grad_norm": 15.428567886352539,
"learning_rate": 8.204545454545454e-05,
"loss": 2.0201,
"step": 241
},
{
"epoch": 0.55,
"grad_norm": 15.405593872070312,
"learning_rate": 8.196969696969698e-05,
"loss": 2.8325,
"step": 242
},
{
"epoch": 0.5522727272727272,
"grad_norm": 22.855867385864258,
"learning_rate": 8.18939393939394e-05,
"loss": 3.045,
"step": 243
},
{
"epoch": 0.5545454545454546,
"grad_norm": 14.374544143676758,
"learning_rate": 8.181818181818183e-05,
"loss": 2.0002,
"step": 244
},
{
"epoch": 0.5568181818181818,
"grad_norm": 13.37702465057373,
"learning_rate": 8.174242424242425e-05,
"loss": 1.6496,
"step": 245
},
{
"epoch": 0.5590909090909091,
"grad_norm": 13.321274757385254,
"learning_rate": 8.166666666666667e-05,
"loss": 1.9746,
"step": 246
},
{
"epoch": 0.5613636363636364,
"grad_norm": 13.79466438293457,
"learning_rate": 8.15909090909091e-05,
"loss": 2.0699,
"step": 247
},
{
"epoch": 0.5636363636363636,
"grad_norm": 12.355722427368164,
"learning_rate": 8.151515151515152e-05,
"loss": 2.2207,
"step": 248
},
{
"epoch": 0.5659090909090909,
"grad_norm": 14.220561981201172,
"learning_rate": 8.143939393939395e-05,
"loss": 2.1695,
"step": 249
},
{
"epoch": 0.5681818181818182,
"grad_norm": 12.587940216064453,
"learning_rate": 8.136363636363636e-05,
"loss": 1.8604,
"step": 250
},
{
"epoch": 0.5704545454545454,
"grad_norm": 9.54430103302002,
"learning_rate": 8.12878787878788e-05,
"loss": 1.6446,
"step": 251
},
{
"epoch": 0.5727272727272728,
"grad_norm": 14.440407752990723,
"learning_rate": 8.121212121212121e-05,
"loss": 2.4646,
"step": 252
},
{
"epoch": 0.575,
"grad_norm": 14.50412368774414,
"learning_rate": 8.113636363636365e-05,
"loss": 1.5263,
"step": 253
},
{
"epoch": 0.5772727272727273,
"grad_norm": 18.535612106323242,
"learning_rate": 8.106060606060607e-05,
"loss": 2.7942,
"step": 254
},
{
"epoch": 0.5795454545454546,
"grad_norm": 11.250702857971191,
"learning_rate": 8.098484848484848e-05,
"loss": 1.5575,
"step": 255
},
{
"epoch": 0.5818181818181818,
"grad_norm": 12.534632682800293,
"learning_rate": 8.090909090909092e-05,
"loss": 1.9031,
"step": 256
},
{
"epoch": 0.5840909090909091,
"grad_norm": 14.82848834991455,
"learning_rate": 8.083333333333334e-05,
"loss": 1.4666,
"step": 257
},
{
"epoch": 0.5863636363636363,
"grad_norm": 15.74230670928955,
"learning_rate": 8.075757575757577e-05,
"loss": 2.3956,
"step": 258
},
{
"epoch": 0.5886363636363636,
"grad_norm": 13.576948165893555,
"learning_rate": 8.068181818181818e-05,
"loss": 1.9797,
"step": 259
},
{
"epoch": 0.5909090909090909,
"grad_norm": 12.77927303314209,
"learning_rate": 8.060606060606061e-05,
"loss": 2.0894,
"step": 260
},
{
"epoch": 0.5931818181818181,
"grad_norm": 17.75493621826172,
"learning_rate": 8.053030303030303e-05,
"loss": 2.6691,
"step": 261
},
{
"epoch": 0.5954545454545455,
"grad_norm": 12.445291519165039,
"learning_rate": 8.045454545454546e-05,
"loss": 1.9188,
"step": 262
},
{
"epoch": 0.5977272727272728,
"grad_norm": 12.350727081298828,
"learning_rate": 8.037878787878788e-05,
"loss": 1.9648,
"step": 263
},
{
"epoch": 0.6,
"grad_norm": 10.37759780883789,
"learning_rate": 8.03030303030303e-05,
"loss": 1.5221,
"step": 264
},
{
"epoch": 0.6022727272727273,
"grad_norm": 13.281451225280762,
"learning_rate": 8.022727272727273e-05,
"loss": 3.2337,
"step": 265
},
{
"epoch": 0.6045454545454545,
"grad_norm": 11.684523582458496,
"learning_rate": 8.015151515151515e-05,
"loss": 1.7641,
"step": 266
},
{
"epoch": 0.6068181818181818,
"grad_norm": 15.161863327026367,
"learning_rate": 8.007575757575759e-05,
"loss": 3.5694,
"step": 267
},
{
"epoch": 0.6090909090909091,
"grad_norm": 13.221097946166992,
"learning_rate": 8e-05,
"loss": 2.5334,
"step": 268
},
{
"epoch": 0.6113636363636363,
"grad_norm": 15.834603309631348,
"learning_rate": 7.992424242424243e-05,
"loss": 2.5292,
"step": 269
},
{
"epoch": 0.6136363636363636,
"grad_norm": 15.016695976257324,
"learning_rate": 7.984848484848485e-05,
"loss": 1.9177,
"step": 270
},
{
"epoch": 0.615909090909091,
"grad_norm": 18.896211624145508,
"learning_rate": 7.977272727272728e-05,
"loss": 2.2495,
"step": 271
},
{
"epoch": 0.6181818181818182,
"grad_norm": 17.597623825073242,
"learning_rate": 7.96969696969697e-05,
"loss": 2.1252,
"step": 272
},
{
"epoch": 0.6204545454545455,
"grad_norm": 14.346769332885742,
"learning_rate": 7.962121212121213e-05,
"loss": 2.0273,
"step": 273
},
{
"epoch": 0.6227272727272727,
"grad_norm": 13.852729797363281,
"learning_rate": 7.954545454545455e-05,
"loss": 2.7319,
"step": 274
},
{
"epoch": 0.625,
"grad_norm": 12.906790733337402,
"learning_rate": 7.946969696969697e-05,
"loss": 1.6674,
"step": 275
},
{
"epoch": 0.6272727272727273,
"grad_norm": 10.031960487365723,
"learning_rate": 7.93939393939394e-05,
"loss": 1.5017,
"step": 276
},
{
"epoch": 0.6295454545454545,
"grad_norm": 12.02971363067627,
"learning_rate": 7.931818181818182e-05,
"loss": 2.1617,
"step": 277
},
{
"epoch": 0.6318181818181818,
"grad_norm": 12.239229202270508,
"learning_rate": 7.924242424242426e-05,
"loss": 1.285,
"step": 278
},
{
"epoch": 0.634090909090909,
"grad_norm": 12.207528114318848,
"learning_rate": 7.916666666666666e-05,
"loss": 1.4661,
"step": 279
},
{
"epoch": 0.6363636363636364,
"grad_norm": 21.659215927124023,
"learning_rate": 7.90909090909091e-05,
"loss": 1.8808,
"step": 280
},
{
"epoch": 0.6386363636363637,
"grad_norm": 14.419612884521484,
"learning_rate": 7.901515151515151e-05,
"loss": 2.6502,
"step": 281
},
{
"epoch": 0.6409090909090909,
"grad_norm": NaN,
"learning_rate": 7.901515151515151e-05,
"loss": 0.0,
"step": 282
},
{
"epoch": 0.6431818181818182,
"grad_norm": 11.444130897521973,
"learning_rate": 7.893939393939395e-05,
"loss": 1.5987,
"step": 283
},
{
"epoch": 0.6454545454545455,
"grad_norm": 10.316890716552734,
"learning_rate": 7.886363636363637e-05,
"loss": 1.5173,
"step": 284
},
{
"epoch": 0.6477272727272727,
"grad_norm": 13.772204399108887,
"learning_rate": 7.878787878787879e-05,
"loss": 3.0357,
"step": 285
},
{
"epoch": 0.65,
"grad_norm": 12.452784538269043,
"learning_rate": 7.871212121212122e-05,
"loss": 2.2077,
"step": 286
},
{
"epoch": 0.6522727272727272,
"grad_norm": 15.323153495788574,
"learning_rate": 7.863636363636364e-05,
"loss": 1.8941,
"step": 287
},
{
"epoch": 0.6545454545454545,
"grad_norm": 10.558858871459961,
"learning_rate": 7.856060606060607e-05,
"loss": 1.5262,
"step": 288
},
{
"epoch": 0.6568181818181819,
"grad_norm": 15.232844352722168,
"learning_rate": 7.848484848484848e-05,
"loss": 3.1486,
"step": 289
},
{
"epoch": 0.6590909090909091,
"grad_norm": 11.309487342834473,
"learning_rate": 7.840909090909091e-05,
"loss": 1.8324,
"step": 290
},
{
"epoch": 0.6613636363636364,
"grad_norm": 11.427604675292969,
"learning_rate": 7.833333333333333e-05,
"loss": 1.0609,
"step": 291
},
{
"epoch": 0.6636363636363637,
"grad_norm": 15.115833282470703,
"learning_rate": 7.825757575757576e-05,
"loss": 2.8888,
"step": 292
},
{
"epoch": 0.6659090909090909,
"grad_norm": 14.701318740844727,
"learning_rate": 7.818181818181818e-05,
"loss": 2.823,
"step": 293
},
{
"epoch": 0.6681818181818182,
"grad_norm": 10.650053024291992,
"learning_rate": 7.81060606060606e-05,
"loss": 1.8724,
"step": 294
},
{
"epoch": 0.6704545454545454,
"grad_norm": 12.72999382019043,
"learning_rate": 7.803030303030304e-05,
"loss": 1.9267,
"step": 295
},
{
"epoch": 0.6727272727272727,
"grad_norm": 16.98598861694336,
"learning_rate": 7.795454545454546e-05,
"loss": 2.325,
"step": 296
},
{
"epoch": 0.675,
"grad_norm": 12.848193168640137,
"learning_rate": 7.787878787878789e-05,
"loss": 3.1965,
"step": 297
},
{
"epoch": 0.6772727272727272,
"grad_norm": 8.765904426574707,
"learning_rate": 7.780303030303031e-05,
"loss": 1.8081,
"step": 298
},
{
"epoch": 0.6795454545454546,
"grad_norm": 14.633967399597168,
"learning_rate": 7.772727272727273e-05,
"loss": 1.8056,
"step": 299
},
{
"epoch": 0.6818181818181818,
"grad_norm": 9.972925186157227,
"learning_rate": 7.765151515151515e-05,
"loss": 1.8835,
"step": 300
},
{
"epoch": 0.6840909090909091,
"grad_norm": 11.186135292053223,
"learning_rate": 7.757575757575758e-05,
"loss": 1.6734,
"step": 301
},
{
"epoch": 0.6863636363636364,
"grad_norm": 15.052450180053711,
"learning_rate": 7.75e-05,
"loss": 2.2574,
"step": 302
},
{
"epoch": 0.6886363636363636,
"grad_norm": 12.664848327636719,
"learning_rate": 7.742424242424243e-05,
"loss": 1.5916,
"step": 303
},
{
"epoch": 0.6909090909090909,
"grad_norm": 14.287535667419434,
"learning_rate": 7.734848484848485e-05,
"loss": 1.8552,
"step": 304
},
{
"epoch": 0.6931818181818182,
"grad_norm": 14.354594230651855,
"learning_rate": 7.727272727272727e-05,
"loss": 3.0925,
"step": 305
},
{
"epoch": 0.6954545454545454,
"grad_norm": 12.003613471984863,
"learning_rate": 7.71969696969697e-05,
"loss": 1.6642,
"step": 306
},
{
"epoch": 0.6977272727272728,
"grad_norm": 11.559938430786133,
"learning_rate": 7.712121212121212e-05,
"loss": 1.5997,
"step": 307
},
{
"epoch": 0.7,
"grad_norm": 13.42446517944336,
"learning_rate": 7.704545454545456e-05,
"loss": 1.7934,
"step": 308
},
{
"epoch": 0.7022727272727273,
"grad_norm": 11.831766128540039,
"learning_rate": 7.696969696969696e-05,
"loss": 1.7729,
"step": 309
},
{
"epoch": 0.7045454545454546,
"grad_norm": 11.884734153747559,
"learning_rate": 7.68939393939394e-05,
"loss": 1.9489,
"step": 310
},
{
"epoch": 0.7068181818181818,
"grad_norm": 15.816669464111328,
"learning_rate": 7.681818181818182e-05,
"loss": 2.4105,
"step": 311
},
{
"epoch": 0.7090909090909091,
"grad_norm": 12.010058403015137,
"learning_rate": 7.674242424242425e-05,
"loss": 1.9247,
"step": 312
},
{
"epoch": 0.7113636363636363,
"grad_norm": 9.436304092407227,
"learning_rate": 7.666666666666667e-05,
"loss": 1.9038,
"step": 313
},
{
"epoch": 0.7136363636363636,
"grad_norm": 9.153775215148926,
"learning_rate": 7.659090909090909e-05,
"loss": 1.241,
"step": 314
},
{
"epoch": 0.7159090909090909,
"grad_norm": 13.067652702331543,
"learning_rate": 7.651515151515152e-05,
"loss": 2.7662,
"step": 315
},
{
"epoch": 0.7181818181818181,
"grad_norm": 16.106948852539062,
"learning_rate": 7.643939393939394e-05,
"loss": 2.0783,
"step": 316
},
{
"epoch": 0.7204545454545455,
"grad_norm": 13.585596084594727,
"learning_rate": 7.636363636363637e-05,
"loss": 1.919,
"step": 317
},
{
"epoch": 0.7227272727272728,
"grad_norm": 13.833767890930176,
"learning_rate": 7.62878787878788e-05,
"loss": 1.1069,
"step": 318
},
{
"epoch": 0.725,
"grad_norm": 12.201956748962402,
"learning_rate": 7.621212121212121e-05,
"loss": 1.9548,
"step": 319
},
{
"epoch": 0.7272727272727273,
"grad_norm": 15.562934875488281,
"learning_rate": 7.613636363636363e-05,
"loss": 1.9211,
"step": 320
},
{
"epoch": 0.7295454545454545,
"grad_norm": 14.389630317687988,
"learning_rate": 7.606060606060607e-05,
"loss": 1.821,
"step": 321
},
{
"epoch": 0.7318181818181818,
"grad_norm": 14.584891319274902,
"learning_rate": 7.598484848484849e-05,
"loss": 2.5068,
"step": 322
},
{
"epoch": 0.7340909090909091,
"grad_norm": 14.5166654586792,
"learning_rate": 7.59090909090909e-05,
"loss": 1.9124,
"step": 323
},
{
"epoch": 0.7363636363636363,
"grad_norm": 46.67388916015625,
"learning_rate": 7.583333333333334e-05,
"loss": 1.6895,
"step": 324
},
{
"epoch": 0.7386363636363636,
"grad_norm": 12.92702865600586,
"learning_rate": 7.575757575757576e-05,
"loss": 1.7526,
"step": 325
},
{
"epoch": 0.740909090909091,
"grad_norm": 8.52035140991211,
"learning_rate": 7.568181818181819e-05,
"loss": 1.4144,
"step": 326
},
{
"epoch": 0.7431818181818182,
"grad_norm": 13.630702018737793,
"learning_rate": 7.560606060606061e-05,
"loss": 2.2018,
"step": 327
},
{
"epoch": 0.7454545454545455,
"grad_norm": 14.379950523376465,
"learning_rate": 7.553030303030303e-05,
"loss": 2.8618,
"step": 328
},
{
"epoch": 0.7477272727272727,
"grad_norm": 14.78795051574707,
"learning_rate": 7.545454545454545e-05,
"loss": 1.9749,
"step": 329
},
{
"epoch": 0.75,
"grad_norm": 10.462140083312988,
"learning_rate": 7.537878787878788e-05,
"loss": 2.3666,
"step": 330
},
{
"epoch": 0.7522727272727273,
"grad_norm": 11.336270332336426,
"learning_rate": 7.530303030303032e-05,
"loss": 1.4712,
"step": 331
},
{
"epoch": 0.7545454545454545,
"grad_norm": 17.15682029724121,
"learning_rate": 7.522727272727273e-05,
"loss": 3.2442,
"step": 332
},
{
"epoch": 0.7568181818181818,
"grad_norm": 14.129326820373535,
"learning_rate": 7.515151515151515e-05,
"loss": 1.9768,
"step": 333
},
{
"epoch": 0.759090909090909,
"grad_norm": 14.239521026611328,
"learning_rate": 7.507575757575757e-05,
"loss": 1.9933,
"step": 334
},
{
"epoch": 0.7613636363636364,
"grad_norm": 10.573707580566406,
"learning_rate": 7.500000000000001e-05,
"loss": 1.3049,
"step": 335
},
{
"epoch": 0.7636363636363637,
"grad_norm": 15.881331443786621,
"learning_rate": 7.492424242424243e-05,
"loss": 2.7249,
"step": 336
},
{
"epoch": 0.7659090909090909,
"grad_norm": 11.606864929199219,
"learning_rate": 7.484848484848486e-05,
"loss": 1.4883,
"step": 337
},
{
"epoch": 0.7681818181818182,
"grad_norm": 8.834245681762695,
"learning_rate": 7.477272727272727e-05,
"loss": 1.3757,
"step": 338
},
{
"epoch": 0.7704545454545455,
"grad_norm": 10.011686325073242,
"learning_rate": 7.46969696969697e-05,
"loss": 1.4306,
"step": 339
},
{
"epoch": 0.7727272727272727,
"grad_norm": 13.084802627563477,
"learning_rate": 7.462121212121213e-05,
"loss": 2.1676,
"step": 340
},
{
"epoch": 0.775,
"grad_norm": 12.480827331542969,
"learning_rate": 7.454545454545455e-05,
"loss": 2.2564,
"step": 341
},
{
"epoch": 0.7772727272727272,
"grad_norm": 12.32083797454834,
"learning_rate": 7.446969696969698e-05,
"loss": 1.4576,
"step": 342
},
{
"epoch": 0.7795454545454545,
"grad_norm": 13.759376525878906,
"learning_rate": 7.439393939393939e-05,
"loss": 2.5308,
"step": 343
},
{
"epoch": 0.7818181818181819,
"grad_norm": 17.70578384399414,
"learning_rate": 7.431818181818182e-05,
"loss": 3.0816,
"step": 344
},
{
"epoch": 0.7840909090909091,
"grad_norm": 13.809745788574219,
"learning_rate": 7.424242424242424e-05,
"loss": 2.6903,
"step": 345
},
{
"epoch": 0.7863636363636364,
"grad_norm": 13.484768867492676,
"learning_rate": 7.416666666666668e-05,
"loss": 1.6094,
"step": 346
},
{
"epoch": 0.7886363636363637,
"grad_norm": 10.424938201904297,
"learning_rate": 7.40909090909091e-05,
"loss": 1.3566,
"step": 347
},
{
"epoch": 0.7909090909090909,
"grad_norm": 15.058128356933594,
"learning_rate": 7.401515151515152e-05,
"loss": 1.945,
"step": 348
},
{
"epoch": 0.7931818181818182,
"grad_norm": 11.48098373413086,
"learning_rate": 7.393939393939395e-05,
"loss": 2.9329,
"step": 349
},
{
"epoch": 0.7954545454545454,
"grad_norm": 15.027339935302734,
"learning_rate": 7.386363636363637e-05,
"loss": 3.3324,
"step": 350
},
{
"epoch": 0.7977272727272727,
"grad_norm": 12.786996841430664,
"learning_rate": 7.37878787878788e-05,
"loss": 2.7898,
"step": 351
},
{
"epoch": 0.8,
"grad_norm": 14.68897819519043,
"learning_rate": 7.37121212121212e-05,
"loss": 2.1318,
"step": 352
},
{
"epoch": 0.8022727272727272,
"grad_norm": 15.081788063049316,
"learning_rate": 7.363636363636364e-05,
"loss": 2.544,
"step": 353
},
{
"epoch": 0.8045454545454546,
"grad_norm": 13.604434967041016,
"learning_rate": 7.356060606060606e-05,
"loss": 3.242,
"step": 354
},
{
"epoch": 0.8068181818181818,
"grad_norm": 10.167998313903809,
"learning_rate": 7.348484848484849e-05,
"loss": 1.7378,
"step": 355
},
{
"epoch": 0.8090909090909091,
"grad_norm": 11.878591537475586,
"learning_rate": 7.340909090909091e-05,
"loss": 1.9651,
"step": 356
},
{
"epoch": 0.8113636363636364,
"grad_norm": 10.606021881103516,
"learning_rate": 7.333333333333333e-05,
"loss": 1.6922,
"step": 357
},
{
"epoch": 0.8136363636363636,
"grad_norm": 36.99083709716797,
"learning_rate": 7.325757575757576e-05,
"loss": 2.7004,
"step": 358
},
{
"epoch": 0.8159090909090909,
"grad_norm": 12.748845100402832,
"learning_rate": 7.318181818181818e-05,
"loss": 2.0722,
"step": 359
},
{
"epoch": 0.8181818181818182,
"grad_norm": 13.374279975891113,
"learning_rate": 7.310606060606062e-05,
"loss": 2.3361,
"step": 360
},
{
"epoch": 0.8204545454545454,
"grad_norm": 10.289033889770508,
"learning_rate": 7.303030303030304e-05,
"loss": 1.6377,
"step": 361
},
{
"epoch": 0.8227272727272728,
"grad_norm": 10.585772514343262,
"learning_rate": 7.295454545454546e-05,
"loss": 1.6941,
"step": 362
},
{
"epoch": 0.825,
"grad_norm": 13.439225196838379,
"learning_rate": 7.287878787878788e-05,
"loss": 1.9242,
"step": 363
},
{
"epoch": 0.8272727272727273,
"grad_norm": 12.649117469787598,
"learning_rate": 7.280303030303031e-05,
"loss": 3.5932,
"step": 364
},
{
"epoch": 0.8295454545454546,
"grad_norm": 13.014269828796387,
"learning_rate": 7.272727272727273e-05,
"loss": 1.6747,
"step": 365
},
{
"epoch": 0.8318181818181818,
"grad_norm": 10.855698585510254,
"learning_rate": 7.265151515151516e-05,
"loss": 2.2644,
"step": 366
},
{
"epoch": 0.8340909090909091,
"grad_norm": 9.967236518859863,
"learning_rate": 7.257575757575758e-05,
"loss": 1.7373,
"step": 367
},
{
"epoch": 0.8363636363636363,
"grad_norm": 12.029590606689453,
"learning_rate": 7.25e-05,
"loss": 1.7012,
"step": 368
},
{
"epoch": 0.8386363636363636,
"grad_norm": 18.046247482299805,
"learning_rate": 7.242424242424243e-05,
"loss": 2.7507,
"step": 369
},
{
"epoch": 0.8409090909090909,
"grad_norm": 12.02083969116211,
"learning_rate": 7.234848484848485e-05,
"loss": 1.4928,
"step": 370
},
{
"epoch": 0.8431818181818181,
"grad_norm": 14.034537315368652,
"learning_rate": 7.227272727272729e-05,
"loss": 1.5557,
"step": 371
},
{
"epoch": 0.8454545454545455,
"grad_norm": 11.5894775390625,
"learning_rate": 7.219696969696969e-05,
"loss": 2.0848,
"step": 372
},
{
"epoch": 0.8477272727272728,
"grad_norm": 10.489690780639648,
"learning_rate": 7.212121212121213e-05,
"loss": 2.1963,
"step": 373
},
{
"epoch": 0.85,
"grad_norm": 14.684807777404785,
"learning_rate": 7.204545454545454e-05,
"loss": 1.6653,
"step": 374
},
{
"epoch": 0.8522727272727273,
"grad_norm": 10.650580406188965,
"learning_rate": 7.196969696969698e-05,
"loss": 1.5813,
"step": 375
},
{
"epoch": 0.8545454545454545,
"grad_norm": 14.406346321105957,
"learning_rate": 7.18939393939394e-05,
"loss": 1.6018,
"step": 376
},
{
"epoch": 0.8568181818181818,
"grad_norm": 10.684210777282715,
"learning_rate": 7.181818181818182e-05,
"loss": 1.16,
"step": 377
},
{
"epoch": 0.8590909090909091,
"grad_norm": 11.588654518127441,
"learning_rate": 7.174242424242425e-05,
"loss": 1.52,
"step": 378
},
{
"epoch": 0.8613636363636363,
"grad_norm": 13.342896461486816,
"learning_rate": 7.166666666666667e-05,
"loss": 1.3069,
"step": 379
},
{
"epoch": 0.8636363636363636,
"grad_norm": 10.33123779296875,
"learning_rate": 7.15909090909091e-05,
"loss": 2.097,
"step": 380
},
{
"epoch": 0.865909090909091,
"grad_norm": 13.286327362060547,
"learning_rate": 7.151515151515152e-05,
"loss": 1.6996,
"step": 381
},
{
"epoch": 0.8681818181818182,
"grad_norm": 12.737727165222168,
"learning_rate": 7.143939393939394e-05,
"loss": 1.8533,
"step": 382
},
{
"epoch": 0.8704545454545455,
"grad_norm": 10.602120399475098,
"learning_rate": 7.136363636363636e-05,
"loss": 0.9764,
"step": 383
},
{
"epoch": 0.8727272727272727,
"grad_norm": 13.362771034240723,
"learning_rate": 7.12878787878788e-05,
"loss": 2.6888,
"step": 384
},
{
"epoch": 0.875,
"grad_norm": 15.875019073486328,
"learning_rate": 7.121212121212121e-05,
"loss": 1.3865,
"step": 385
},
{
"epoch": 0.8772727272727273,
"grad_norm": 11.602843284606934,
"learning_rate": 7.113636363636363e-05,
"loss": 1.489,
"step": 386
},
{
"epoch": 0.8795454545454545,
"grad_norm": 10.052959442138672,
"learning_rate": 7.106060606060607e-05,
"loss": 1.423,
"step": 387
},
{
"epoch": 0.8818181818181818,
"grad_norm": 15.898283004760742,
"learning_rate": 7.098484848484849e-05,
"loss": 2.0401,
"step": 388
},
{
"epoch": 0.884090909090909,
"grad_norm": 14.83981990814209,
"learning_rate": 7.090909090909092e-05,
"loss": 2.9656,
"step": 389
},
{
"epoch": 0.8863636363636364,
"grad_norm": 12.542622566223145,
"learning_rate": 7.083333333333334e-05,
"loss": 1.7818,
"step": 390
},
{
"epoch": 0.8886363636363637,
"grad_norm": 10.65149974822998,
"learning_rate": 7.075757575757576e-05,
"loss": 1.4115,
"step": 391
},
{
"epoch": 0.8909090909090909,
"grad_norm": 14.208708763122559,
"learning_rate": 7.068181818181818e-05,
"loss": 2.5107,
"step": 392
},
{
"epoch": 0.8931818181818182,
"grad_norm": 13.435481071472168,
"learning_rate": 7.060606060606061e-05,
"loss": 2.0141,
"step": 393
},
{
"epoch": 0.8954545454545455,
"grad_norm": 14.987428665161133,
"learning_rate": 7.053030303030303e-05,
"loss": 1.6295,
"step": 394
},
{
"epoch": 0.8977272727272727,
"grad_norm": 15.590865135192871,
"learning_rate": 7.045454545454546e-05,
"loss": 2.5029,
"step": 395
},
{
"epoch": 0.9,
"grad_norm": 12.00338077545166,
"learning_rate": 7.037878787878788e-05,
"loss": 1.5399,
"step": 396
},
{
"epoch": 0.9022727272727272,
"grad_norm": 10.2390718460083,
"learning_rate": 7.03030303030303e-05,
"loss": 1.2943,
"step": 397
},
{
"epoch": 0.9045454545454545,
"grad_norm": 13.09786319732666,
"learning_rate": 7.022727272727274e-05,
"loss": 1.951,
"step": 398
},
{
"epoch": 0.9068181818181819,
"grad_norm": 14.016656875610352,
"learning_rate": 7.015151515151515e-05,
"loss": 2.4783,
"step": 399
},
{
"epoch": 0.9090909090909091,
"grad_norm": 14.135820388793945,
"learning_rate": 7.007575757575759e-05,
"loss": 1.8109,
"step": 400
},
{
"epoch": 0.9113636363636364,
"grad_norm": 15.545958518981934,
"learning_rate": 7e-05,
"loss": 2.2156,
"step": 401
},
{
"epoch": 0.9136363636363637,
"grad_norm": 15.512310028076172,
"learning_rate": 6.992424242424243e-05,
"loss": 1.8199,
"step": 402
},
{
"epoch": 0.9159090909090909,
"grad_norm": 12.54996109008789,
"learning_rate": 6.984848484848485e-05,
"loss": 2.0134,
"step": 403
},
{
"epoch": 0.9181818181818182,
"grad_norm": 10.554512023925781,
"learning_rate": 6.977272727272728e-05,
"loss": 1.5173,
"step": 404
},
{
"epoch": 0.9204545454545454,
"grad_norm": 13.31303882598877,
"learning_rate": 6.96969696969697e-05,
"loss": 1.7694,
"step": 405
},
{
"epoch": 0.9227272727272727,
"grad_norm": 18.840511322021484,
"learning_rate": 6.962121212121212e-05,
"loss": 3.0551,
"step": 406
},
{
"epoch": 0.925,
"grad_norm": 13.331717491149902,
"learning_rate": 6.954545454545455e-05,
"loss": 2.0296,
"step": 407
},
{
"epoch": 0.9272727272727272,
"grad_norm": 11.75788688659668,
"learning_rate": 6.946969696969697e-05,
"loss": 1.8544,
"step": 408
},
{
"epoch": 0.9295454545454546,
"grad_norm": 14.479559898376465,
"learning_rate": 6.93939393939394e-05,
"loss": 2.4435,
"step": 409
},
{
"epoch": 0.9318181818181818,
"grad_norm": 14.522322654724121,
"learning_rate": 6.931818181818182e-05,
"loss": 2.3013,
"step": 410
},
{
"epoch": 0.9340909090909091,
"grad_norm": 12.853972434997559,
"learning_rate": 6.924242424242424e-05,
"loss": 2.4637,
"step": 411
},
{
"epoch": 0.9363636363636364,
"grad_norm": 10.978107452392578,
"learning_rate": 6.916666666666666e-05,
"loss": 1.5277,
"step": 412
},
{
"epoch": 0.9386363636363636,
"grad_norm": 14.109042167663574,
"learning_rate": 6.90909090909091e-05,
"loss": 1.9601,
"step": 413
},
{
"epoch": 0.9409090909090909,
"grad_norm": 10.699783325195312,
"learning_rate": 6.901515151515152e-05,
"loss": 2.2143,
"step": 414
},
{
"epoch": 0.9431818181818182,
"grad_norm": 10.57825756072998,
"learning_rate": 6.893939393939395e-05,
"loss": 2.0557,
"step": 415
},
{
"epoch": 0.9454545454545454,
"grad_norm": 12.432737350463867,
"learning_rate": 6.886363636363637e-05,
"loss": 1.7554,
"step": 416
},
{
"epoch": 0.9477272727272728,
"grad_norm": 12.157960891723633,
"learning_rate": 6.878787878787879e-05,
"loss": 2.1302,
"step": 417
},
{
"epoch": 0.95,
"grad_norm": 15.89067554473877,
"learning_rate": 6.871212121212122e-05,
"loss": 2.1424,
"step": 418
},
{
"epoch": 0.9522727272727273,
"grad_norm": 10.453248977661133,
"learning_rate": 6.863636363636364e-05,
"loss": 1.8215,
"step": 419
},
{
"epoch": 0.9545454545454546,
"grad_norm": 8.481575012207031,
"learning_rate": 6.856060606060606e-05,
"loss": 1.5999,
"step": 420
},
{
"epoch": 0.9568181818181818,
"grad_norm": 10.795332908630371,
"learning_rate": 6.848484848484848e-05,
"loss": 1.4623,
"step": 421
},
{
"epoch": 0.9590909090909091,
"grad_norm": 18.586315155029297,
"learning_rate": 6.840909090909091e-05,
"loss": 2.1875,
"step": 422
},
{
"epoch": 0.9613636363636363,
"grad_norm": 15.387242317199707,
"learning_rate": 6.833333333333333e-05,
"loss": 2.1544,
"step": 423
},
{
"epoch": 0.9636363636363636,
"grad_norm": 11.277326583862305,
"learning_rate": 6.825757575757576e-05,
"loss": 1.8575,
"step": 424
},
{
"epoch": 0.9659090909090909,
"grad_norm": 9.451603889465332,
"learning_rate": 6.818181818181818e-05,
"loss": 1.6149,
"step": 425
},
{
"epoch": 0.9681818181818181,
"grad_norm": 14.108964920043945,
"learning_rate": 6.81060606060606e-05,
"loss": 2.0166,
"step": 426
},
{
"epoch": 0.9704545454545455,
"grad_norm": 8.922270774841309,
"learning_rate": 6.803030303030304e-05,
"loss": 1.3486,
"step": 427
},
{
"epoch": 0.9727272727272728,
"grad_norm": 9.383979797363281,
"learning_rate": 6.795454545454546e-05,
"loss": 1.0425,
"step": 428
},
{
"epoch": 0.975,
"grad_norm": 13.076512336730957,
"learning_rate": 6.787878787878789e-05,
"loss": 1.7828,
"step": 429
},
{
"epoch": 0.9772727272727273,
"grad_norm": 14.815391540527344,
"learning_rate": 6.78030303030303e-05,
"loss": 1.893,
"step": 430
},
{
"epoch": 0.9795454545454545,
"grad_norm": 10.523706436157227,
"learning_rate": 6.772727272727273e-05,
"loss": 1.5307,
"step": 431
},
{
"epoch": 0.9818181818181818,
"grad_norm": 16.938919067382812,
"learning_rate": 6.765151515151515e-05,
"loss": 1.9001,
"step": 432
},
{
"epoch": 0.9840909090909091,
"grad_norm": 11.781875610351562,
"learning_rate": 6.757575757575758e-05,
"loss": 2.183,
"step": 433
},
{
"epoch": 0.9863636363636363,
"grad_norm": 14.539305686950684,
"learning_rate": 6.750000000000001e-05,
"loss": 2.2021,
"step": 434
},
{
"epoch": 0.9886363636363636,
"grad_norm": 15.532546997070312,
"learning_rate": 6.742424242424242e-05,
"loss": 2.1856,
"step": 435
},
{
"epoch": 0.990909090909091,
"grad_norm": 12.917964935302734,
"learning_rate": 6.734848484848485e-05,
"loss": 2.8732,
"step": 436
},
{
"epoch": 0.9931818181818182,
"grad_norm": 12.498353958129883,
"learning_rate": 6.727272727272727e-05,
"loss": 1.9246,
"step": 437
},
{
"epoch": 0.9954545454545455,
"grad_norm": 14.181402206420898,
"learning_rate": 6.71969696969697e-05,
"loss": 2.3863,
"step": 438
},
{
"epoch": 0.9977272727272727,
"grad_norm": 12.139135360717773,
"learning_rate": 6.712121212121213e-05,
"loss": 2.5505,
"step": 439
},
{
"epoch": 1.0,
"grad_norm": 18.971040725708008,
"learning_rate": 6.704545454545455e-05,
"loss": 2.3566,
"step": 440
},
{
"epoch": 1.0,
"eval_f1": 0.8942,
"eval_gen_len": 41.6727,
"eval_loss": 1.852333426475525,
"eval_precision": 0.8938,
"eval_recall": 0.8947,
"eval_rouge1": 0.4801,
"eval_rouge2": 0.2302,
"eval_rougeL": 0.4078,
"eval_rougeLsum": 0.4472,
"eval_runtime": 28.5976,
"eval_samples_per_second": 3.846,
"eval_steps_per_second": 0.49,
"step": 440
},
{
"epoch": 1.0022727272727272,
"grad_norm": 9.610616683959961,
"learning_rate": 6.696969696969696e-05,
"loss": 1.3656,
"step": 441
},
{
"epoch": 1.0045454545454546,
"grad_norm": 13.653773307800293,
"learning_rate": 6.68939393939394e-05,
"loss": 3.0115,
"step": 442
},
{
"epoch": 1.0068181818181818,
"grad_norm": 10.243281364440918,
"learning_rate": 6.681818181818183e-05,
"loss": 1.7598,
"step": 443
},
{
"epoch": 1.009090909090909,
"grad_norm": 12.79389762878418,
"learning_rate": 6.674242424242425e-05,
"loss": 1.7768,
"step": 444
},
{
"epoch": 1.0113636363636365,
"grad_norm": 8.748100280761719,
"learning_rate": 6.666666666666667e-05,
"loss": 1.4368,
"step": 445
},
{
"epoch": 1.0136363636363637,
"grad_norm": 9.42500114440918,
"learning_rate": 6.659090909090909e-05,
"loss": 1.0754,
"step": 446
},
{
"epoch": 1.0159090909090909,
"grad_norm": 11.976570129394531,
"learning_rate": 6.651515151515152e-05,
"loss": 2.07,
"step": 447
},
{
"epoch": 1.018181818181818,
"grad_norm": 9.448553085327148,
"learning_rate": 6.643939393939394e-05,
"loss": 1.5004,
"step": 448
},
{
"epoch": 1.0204545454545455,
"grad_norm": 10.295342445373535,
"learning_rate": 6.636363636363638e-05,
"loss": 1.6393,
"step": 449
},
{
"epoch": 1.0227272727272727,
"grad_norm": 9.445040702819824,
"learning_rate": 6.628787878787878e-05,
"loss": 1.7432,
"step": 450
},
{
"epoch": 1.025,
"grad_norm": 16.851524353027344,
"learning_rate": 6.621212121212121e-05,
"loss": 2.2318,
"step": 451
},
{
"epoch": 1.0272727272727273,
"grad_norm": 10.721171379089355,
"learning_rate": 6.613636363636365e-05,
"loss": 1.7857,
"step": 452
},
{
"epoch": 1.0295454545454545,
"grad_norm": 10.074830055236816,
"learning_rate": 6.606060606060607e-05,
"loss": 1.5901,
"step": 453
},
{
"epoch": 1.0318181818181817,
"grad_norm": 20.14990234375,
"learning_rate": 6.598484848484849e-05,
"loss": 2.6518,
"step": 454
},
{
"epoch": 1.0340909090909092,
"grad_norm": 10.911235809326172,
"learning_rate": 6.59090909090909e-05,
"loss": 2.1865,
"step": 455
},
{
"epoch": 1.0363636363636364,
"grad_norm": 18.03226089477539,
"learning_rate": 6.583333333333334e-05,
"loss": 2.4383,
"step": 456
},
{
"epoch": 1.0386363636363636,
"grad_norm": 9.279253959655762,
"learning_rate": 6.575757575757576e-05,
"loss": 0.9629,
"step": 457
},
{
"epoch": 1.040909090909091,
"grad_norm": 11.864253997802734,
"learning_rate": 6.568181818181819e-05,
"loss": 2.1734,
"step": 458
},
{
"epoch": 1.0431818181818182,
"grad_norm": 13.346138954162598,
"learning_rate": 6.560606060606061e-05,
"loss": 1.4337,
"step": 459
},
{
"epoch": 1.0454545454545454,
"grad_norm": 8.396434783935547,
"learning_rate": 6.553030303030303e-05,
"loss": 1.54,
"step": 460
},
{
"epoch": 1.0477272727272728,
"grad_norm": 9.705253601074219,
"learning_rate": 6.545454545454546e-05,
"loss": 1.9016,
"step": 461
},
{
"epoch": 1.05,
"grad_norm": 9.6156005859375,
"learning_rate": 6.537878787878788e-05,
"loss": 1.3029,
"step": 462
},
{
"epoch": 1.0522727272727272,
"grad_norm": 16.548994064331055,
"learning_rate": 6.530303030303032e-05,
"loss": 3.5641,
"step": 463
},
{
"epoch": 1.0545454545454545,
"grad_norm": 11.045211791992188,
"learning_rate": 6.522727272727272e-05,
"loss": 1.3876,
"step": 464
},
{
"epoch": 1.0568181818181819,
"grad_norm": 10.465343475341797,
"learning_rate": 6.515151515151516e-05,
"loss": 1.5871,
"step": 465
},
{
"epoch": 1.059090909090909,
"grad_norm": 10.053452491760254,
"learning_rate": 6.507575757575757e-05,
"loss": 1.4177,
"step": 466
},
{
"epoch": 1.0613636363636363,
"grad_norm": 12.043208122253418,
"learning_rate": 6.500000000000001e-05,
"loss": 1.5364,
"step": 467
},
{
"epoch": 1.0636363636363637,
"grad_norm": 11.853958129882812,
"learning_rate": 6.492424242424243e-05,
"loss": 1.3952,
"step": 468
},
{
"epoch": 1.065909090909091,
"grad_norm": 8.25589656829834,
"learning_rate": 6.484848484848485e-05,
"loss": 1.5497,
"step": 469
},
{
"epoch": 1.0681818181818181,
"grad_norm": 13.430974960327148,
"learning_rate": 6.477272727272728e-05,
"loss": 2.4184,
"step": 470
},
{
"epoch": 1.0704545454545455,
"grad_norm": 10.576482772827148,
"learning_rate": 6.46969696969697e-05,
"loss": 1.4223,
"step": 471
},
{
"epoch": 1.0727272727272728,
"grad_norm": 11.786113739013672,
"learning_rate": 6.462121212121213e-05,
"loss": 2.0499,
"step": 472
},
{
"epoch": 1.075,
"grad_norm": 12.00688362121582,
"learning_rate": 6.454545454545455e-05,
"loss": 2.9764,
"step": 473
},
{
"epoch": 1.0772727272727272,
"grad_norm": 10.834086418151855,
"learning_rate": 6.446969696969697e-05,
"loss": 2.0765,
"step": 474
},
{
"epoch": 1.0795454545454546,
"grad_norm": 10.710877418518066,
"learning_rate": 6.439393939393939e-05,
"loss": 1.4314,
"step": 475
},
{
"epoch": 1.0818181818181818,
"grad_norm": 12.800888061523438,
"learning_rate": 6.431818181818182e-05,
"loss": 1.4847,
"step": 476
},
{
"epoch": 1.084090909090909,
"grad_norm": 10.365299224853516,
"learning_rate": 6.424242424242424e-05,
"loss": 1.6775,
"step": 477
},
{
"epoch": 1.0863636363636364,
"grad_norm": 10.344579696655273,
"learning_rate": 6.416666666666668e-05,
"loss": 2.3473,
"step": 478
},
{
"epoch": 1.0886363636363636,
"grad_norm": 13.791784286499023,
"learning_rate": 6.40909090909091e-05,
"loss": 2.5763,
"step": 479
},
{
"epoch": 1.0909090909090908,
"grad_norm": 13.133481979370117,
"learning_rate": 6.401515151515152e-05,
"loss": 1.7025,
"step": 480
},
{
"epoch": 1.0931818181818183,
"grad_norm": 13.444737434387207,
"learning_rate": 6.393939393939395e-05,
"loss": 2.341,
"step": 481
},
{
"epoch": 1.0954545454545455,
"grad_norm": 15.245584487915039,
"learning_rate": 6.386363636363637e-05,
"loss": 1.929,
"step": 482
},
{
"epoch": 1.0977272727272727,
"grad_norm": 10.724458694458008,
"learning_rate": 6.37878787878788e-05,
"loss": 1.4099,
"step": 483
},
{
"epoch": 1.1,
"grad_norm": 11.243814468383789,
"learning_rate": 6.371212121212121e-05,
"loss": 1.5886,
"step": 484
},
{
"epoch": 1.1022727272727273,
"grad_norm": 11.731426239013672,
"learning_rate": 6.363636363636364e-05,
"loss": 1.9571,
"step": 485
},
{
"epoch": 1.1045454545454545,
"grad_norm": 10.820639610290527,
"learning_rate": 6.356060606060606e-05,
"loss": 1.113,
"step": 486
},
{
"epoch": 1.106818181818182,
"grad_norm": 14.63482666015625,
"learning_rate": 6.34848484848485e-05,
"loss": 1.9765,
"step": 487
},
{
"epoch": 1.1090909090909091,
"grad_norm": 12.746257781982422,
"learning_rate": 6.340909090909091e-05,
"loss": 1.5906,
"step": 488
},
{
"epoch": 1.1113636363636363,
"grad_norm": 14.916450500488281,
"learning_rate": 6.333333333333333e-05,
"loss": 1.6616,
"step": 489
},
{
"epoch": 1.1136363636363635,
"grad_norm": 11.509872436523438,
"learning_rate": 6.325757575757577e-05,
"loss": 2.5105,
"step": 490
},
{
"epoch": 1.115909090909091,
"grad_norm": 11.517654418945312,
"learning_rate": 6.318181818181818e-05,
"loss": 1.3542,
"step": 491
},
{
"epoch": 1.1181818181818182,
"grad_norm": 13.984039306640625,
"learning_rate": 6.310606060606062e-05,
"loss": 2.1356,
"step": 492
},
{
"epoch": 1.1204545454545454,
"grad_norm": 13.018148422241211,
"learning_rate": 6.303030303030302e-05,
"loss": 1.5024,
"step": 493
},
{
"epoch": 1.1227272727272728,
"grad_norm": 13.609540939331055,
"learning_rate": 6.295454545454546e-05,
"loss": 2.1359,
"step": 494
},
{
"epoch": 1.125,
"grad_norm": 13.505942344665527,
"learning_rate": 6.287878787878788e-05,
"loss": 2.8486,
"step": 495
},
{
"epoch": 1.1272727272727272,
"grad_norm": 11.420187950134277,
"learning_rate": 6.280303030303031e-05,
"loss": 1.5044,
"step": 496
},
{
"epoch": 1.1295454545454546,
"grad_norm": 14.127695083618164,
"learning_rate": 6.272727272727273e-05,
"loss": 2.6676,
"step": 497
},
{
"epoch": 1.1318181818181818,
"grad_norm": 9.813878059387207,
"learning_rate": 6.265151515151515e-05,
"loss": 1.4169,
"step": 498
},
{
"epoch": 1.134090909090909,
"grad_norm": 9.80479621887207,
"learning_rate": 6.257575757575758e-05,
"loss": 1.5349,
"step": 499
},
{
"epoch": 1.1363636363636362,
"grad_norm": 10.739019393920898,
"learning_rate": 6.25e-05,
"loss": 1.5255,
"step": 500
},
{
"epoch": 1.1386363636363637,
"grad_norm": 11.327676773071289,
"learning_rate": 6.242424242424243e-05,
"loss": 1.3854,
"step": 501
},
{
"epoch": 1.1409090909090909,
"grad_norm": 9.645312309265137,
"learning_rate": 6.234848484848485e-05,
"loss": 1.6148,
"step": 502
},
{
"epoch": 1.143181818181818,
"grad_norm": 12.285623550415039,
"learning_rate": 6.227272727272727e-05,
"loss": 1.9336,
"step": 503
},
{
"epoch": 1.1454545454545455,
"grad_norm": 15.579854011535645,
"learning_rate": 6.219696969696969e-05,
"loss": 2.1064,
"step": 504
},
{
"epoch": 1.1477272727272727,
"grad_norm": 17.76817512512207,
"learning_rate": 6.212121212121213e-05,
"loss": 1.4266,
"step": 505
},
{
"epoch": 1.15,
"grad_norm": 10.037004470825195,
"learning_rate": 6.204545454545455e-05,
"loss": 1.5432,
"step": 506
},
{
"epoch": 1.1522727272727273,
"grad_norm": 10.46380615234375,
"learning_rate": 6.196969696969698e-05,
"loss": 2.1057,
"step": 507
},
{
"epoch": 1.1545454545454545,
"grad_norm": 12.883086204528809,
"learning_rate": 6.18939393939394e-05,
"loss": 2.1955,
"step": 508
},
{
"epoch": 1.1568181818181817,
"grad_norm": 10.667054176330566,
"learning_rate": 6.181818181818182e-05,
"loss": 1.8041,
"step": 509
},
{
"epoch": 1.1590909090909092,
"grad_norm": 13.076772689819336,
"learning_rate": 6.174242424242425e-05,
"loss": 1.9923,
"step": 510
},
{
"epoch": 1.1613636363636364,
"grad_norm": 13.195068359375,
"learning_rate": 6.166666666666667e-05,
"loss": 2.2575,
"step": 511
},
{
"epoch": 1.1636363636363636,
"grad_norm": 25.86856460571289,
"learning_rate": 6.15909090909091e-05,
"loss": 0.9713,
"step": 512
},
{
"epoch": 1.165909090909091,
"grad_norm": 13.29697322845459,
"learning_rate": 6.151515151515151e-05,
"loss": 1.9724,
"step": 513
},
{
"epoch": 1.1681818181818182,
"grad_norm": 11.164151191711426,
"learning_rate": 6.143939393939394e-05,
"loss": 1.7574,
"step": 514
},
{
"epoch": 1.1704545454545454,
"grad_norm": 11.621664047241211,
"learning_rate": 6.136363636363636e-05,
"loss": 2.0349,
"step": 515
},
{
"epoch": 1.1727272727272728,
"grad_norm": 13.135611534118652,
"learning_rate": 6.12878787878788e-05,
"loss": 2.1065,
"step": 516
},
{
"epoch": 1.175,
"grad_norm": 13.730208396911621,
"learning_rate": 6.121212121212121e-05,
"loss": 2.2205,
"step": 517
},
{
"epoch": 1.1772727272727272,
"grad_norm": 11.453598022460938,
"learning_rate": 6.113636363636363e-05,
"loss": 2.2924,
"step": 518
},
{
"epoch": 1.1795454545454545,
"grad_norm": 10.924808502197266,
"learning_rate": 6.106060606060607e-05,
"loss": 1.2283,
"step": 519
},
{
"epoch": 1.1818181818181819,
"grad_norm": 16.08315658569336,
"learning_rate": 6.098484848484849e-05,
"loss": 2.5927,
"step": 520
},
{
"epoch": 1.184090909090909,
"grad_norm": 8.260347366333008,
"learning_rate": 6.090909090909091e-05,
"loss": 1.3534,
"step": 521
},
{
"epoch": 1.1863636363636363,
"grad_norm": 12.075833320617676,
"learning_rate": 6.083333333333333e-05,
"loss": 2.0813,
"step": 522
},
{
"epoch": 1.1886363636363637,
"grad_norm": 10.575677871704102,
"learning_rate": 6.075757575757576e-05,
"loss": 1.4781,
"step": 523
},
{
"epoch": 1.190909090909091,
"grad_norm": 12.236503601074219,
"learning_rate": 6.0681818181818185e-05,
"loss": 2.003,
"step": 524
},
{
"epoch": 1.1931818181818181,
"grad_norm": 12.172025680541992,
"learning_rate": 6.060606060606061e-05,
"loss": 1.4951,
"step": 525
},
{
"epoch": 1.1954545454545455,
"grad_norm": 12.456896781921387,
"learning_rate": 6.053030303030304e-05,
"loss": 1.8737,
"step": 526
},
{
"epoch": 1.1977272727272728,
"grad_norm": 13.824838638305664,
"learning_rate": 6.045454545454545e-05,
"loss": 1.7923,
"step": 527
},
{
"epoch": 1.2,
"grad_norm": 10.863786697387695,
"learning_rate": 6.037878787878788e-05,
"loss": 2.144,
"step": 528
},
{
"epoch": 1.2022727272727272,
"grad_norm": 17.319700241088867,
"learning_rate": 6.03030303030303e-05,
"loss": 1.9297,
"step": 529
},
{
"epoch": 1.2045454545454546,
"grad_norm": 8.89411449432373,
"learning_rate": 6.022727272727273e-05,
"loss": 1.3583,
"step": 530
},
{
"epoch": 1.2068181818181818,
"grad_norm": 16.971437454223633,
"learning_rate": 6.0151515151515156e-05,
"loss": 2.5184,
"step": 531
},
{
"epoch": 1.209090909090909,
"grad_norm": 11.486995697021484,
"learning_rate": 6.0075757575757575e-05,
"loss": 1.5296,
"step": 532
},
{
"epoch": 1.2113636363636364,
"grad_norm": 17.541278839111328,
"learning_rate": 6e-05,
"loss": 2.108,
"step": 533
},
{
"epoch": 1.2136363636363636,
"grad_norm": 13.599751472473145,
"learning_rate": 5.992424242424243e-05,
"loss": 2.0622,
"step": 534
},
{
"epoch": 1.2159090909090908,
"grad_norm": 10.884852409362793,
"learning_rate": 5.9848484848484854e-05,
"loss": 1.5018,
"step": 535
},
{
"epoch": 1.2181818181818183,
"grad_norm": 10.407668113708496,
"learning_rate": 5.977272727272728e-05,
"loss": 1.5013,
"step": 536
},
{
"epoch": 1.2204545454545455,
"grad_norm": 9.911277770996094,
"learning_rate": 5.969696969696969e-05,
"loss": 1.9855,
"step": 537
},
{
"epoch": 1.2227272727272727,
"grad_norm": 11.939435958862305,
"learning_rate": 5.962121212121212e-05,
"loss": 2.359,
"step": 538
},
{
"epoch": 1.225,
"grad_norm": 11.17503547668457,
"learning_rate": 5.9545454545454546e-05,
"loss": 1.4952,
"step": 539
},
{
"epoch": 1.2272727272727273,
"grad_norm": 15.073485374450684,
"learning_rate": 5.946969696969697e-05,
"loss": 2.1802,
"step": 540
},
{
"epoch": 1.2295454545454545,
"grad_norm": 12.413151741027832,
"learning_rate": 5.93939393939394e-05,
"loss": 1.9444,
"step": 541
},
{
"epoch": 1.231818181818182,
"grad_norm": 12.741022109985352,
"learning_rate": 5.931818181818182e-05,
"loss": 1.894,
"step": 542
},
{
"epoch": 1.2340909090909091,
"grad_norm": 11.041027069091797,
"learning_rate": 5.9242424242424244e-05,
"loss": 1.748,
"step": 543
},
{
"epoch": 1.2363636363636363,
"grad_norm": 10.045198440551758,
"learning_rate": 5.916666666666667e-05,
"loss": 1.7848,
"step": 544
},
{
"epoch": 1.2386363636363638,
"grad_norm": 10.759014129638672,
"learning_rate": 5.90909090909091e-05,
"loss": 1.7836,
"step": 545
},
{
"epoch": 1.240909090909091,
"grad_norm": 10.296431541442871,
"learning_rate": 5.901515151515152e-05,
"loss": 1.088,
"step": 546
},
{
"epoch": 1.2431818181818182,
"grad_norm": 11.159008026123047,
"learning_rate": 5.8939393939393936e-05,
"loss": 1.3126,
"step": 547
},
{
"epoch": 1.2454545454545454,
"grad_norm": 7.6021270751953125,
"learning_rate": 5.886363636363636e-05,
"loss": 1.137,
"step": 548
},
{
"epoch": 1.2477272727272728,
"grad_norm": 11.449591636657715,
"learning_rate": 5.878787878787879e-05,
"loss": 1.7471,
"step": 549
},
{
"epoch": 1.25,
"grad_norm": 14.451662063598633,
"learning_rate": 5.871212121212122e-05,
"loss": 2.014,
"step": 550
},
{
"epoch": 1.2522727272727272,
"grad_norm": 11.24593448638916,
"learning_rate": 5.8636363636363634e-05,
"loss": 1.5885,
"step": 551
},
{
"epoch": 1.2545454545454544,
"grad_norm": 10.326696395874023,
"learning_rate": 5.856060606060606e-05,
"loss": 1.5146,
"step": 552
},
{
"epoch": 1.2568181818181818,
"grad_norm": 11.736088752746582,
"learning_rate": 5.848484848484849e-05,
"loss": 2.1627,
"step": 553
},
{
"epoch": 1.259090909090909,
"grad_norm": 14.25733757019043,
"learning_rate": 5.840909090909091e-05,
"loss": 1.8419,
"step": 554
},
{
"epoch": 1.2613636363636362,
"grad_norm": 10.154618263244629,
"learning_rate": 5.833333333333334e-05,
"loss": 1.8319,
"step": 555
},
{
"epoch": 1.2636363636363637,
"grad_norm": 14.464015007019043,
"learning_rate": 5.825757575757575e-05,
"loss": 1.7117,
"step": 556
},
{
"epoch": 1.2659090909090909,
"grad_norm": 9.713830947875977,
"learning_rate": 5.818181818181818e-05,
"loss": 1.4495,
"step": 557
},
{
"epoch": 1.268181818181818,
"grad_norm": 21.958648681640625,
"learning_rate": 5.810606060606061e-05,
"loss": 3.0762,
"step": 558
},
{
"epoch": 1.2704545454545455,
"grad_norm": 11.349808692932129,
"learning_rate": 5.803030303030304e-05,
"loss": 1.9419,
"step": 559
},
{
"epoch": 1.2727272727272727,
"grad_norm": 12.586771965026855,
"learning_rate": 5.7954545454545464e-05,
"loss": 2.1826,
"step": 560
},
{
"epoch": 1.275,
"grad_norm": 10.261626243591309,
"learning_rate": 5.787878787878788e-05,
"loss": 2.0422,
"step": 561
},
{
"epoch": 1.2772727272727273,
"grad_norm": 11.65180492401123,
"learning_rate": 5.78030303030303e-05,
"loss": 1.5295,
"step": 562
},
{
"epoch": 1.2795454545454545,
"grad_norm": 12.369877815246582,
"learning_rate": 5.772727272727273e-05,
"loss": 1.8935,
"step": 563
},
{
"epoch": 1.2818181818181817,
"grad_norm": 10.670714378356934,
"learning_rate": 5.7651515151515156e-05,
"loss": 2.0215,
"step": 564
},
{
"epoch": 1.2840909090909092,
"grad_norm": 13.76659870147705,
"learning_rate": 5.757575757575758e-05,
"loss": 2.2982,
"step": 565
},
{
"epoch": 1.2863636363636364,
"grad_norm": 9.004195213317871,
"learning_rate": 5.7499999999999995e-05,
"loss": 1.6066,
"step": 566
},
{
"epoch": 1.2886363636363636,
"grad_norm": 10.873322486877441,
"learning_rate": 5.742424242424243e-05,
"loss": 1.5999,
"step": 567
},
{
"epoch": 1.290909090909091,
"grad_norm": 11.641073226928711,
"learning_rate": 5.7348484848484854e-05,
"loss": 1.3272,
"step": 568
},
{
"epoch": 1.2931818181818182,
"grad_norm": 9.68420124053955,
"learning_rate": 5.727272727272728e-05,
"loss": 1.413,
"step": 569
},
{
"epoch": 1.2954545454545454,
"grad_norm": 13.477838516235352,
"learning_rate": 5.719696969696971e-05,
"loss": 2.5129,
"step": 570
},
{
"epoch": 1.2977272727272728,
"grad_norm": 11.720010757446289,
"learning_rate": 5.712121212121212e-05,
"loss": 2.1576,
"step": 571
},
{
"epoch": 1.3,
"grad_norm": 13.136527061462402,
"learning_rate": 5.7045454545454546e-05,
"loss": 1.9311,
"step": 572
},
{
"epoch": 1.3022727272727272,
"grad_norm": 8.095415115356445,
"learning_rate": 5.696969696969697e-05,
"loss": 0.8927,
"step": 573
},
{
"epoch": 1.3045454545454547,
"grad_norm": 11.233893394470215,
"learning_rate": 5.68939393939394e-05,
"loss": 2.0108,
"step": 574
},
{
"epoch": 1.3068181818181819,
"grad_norm": 11.203099250793457,
"learning_rate": 5.6818181818181825e-05,
"loss": 1.9241,
"step": 575
},
{
"epoch": 1.309090909090909,
"grad_norm": 9.640209197998047,
"learning_rate": 5.6742424242424244e-05,
"loss": 1.3841,
"step": 576
},
{
"epoch": 1.3113636363636363,
"grad_norm": 10.882938385009766,
"learning_rate": 5.666666666666667e-05,
"loss": 1.4184,
"step": 577
},
{
"epoch": 1.3136363636363637,
"grad_norm": 10.470818519592285,
"learning_rate": 5.65909090909091e-05,
"loss": 2.0096,
"step": 578
},
{
"epoch": 1.315909090909091,
"grad_norm": 12.759695053100586,
"learning_rate": 5.651515151515152e-05,
"loss": 2.138,
"step": 579
},
{
"epoch": 1.3181818181818181,
"grad_norm": 26.707128524780273,
"learning_rate": 5.643939393939395e-05,
"loss": 2.8215,
"step": 580
},
{
"epoch": 1.3204545454545453,
"grad_norm": 11.116402626037598,
"learning_rate": 5.636363636363636e-05,
"loss": 2.4158,
"step": 581
},
{
"epoch": 1.3227272727272728,
"grad_norm": 14.136595726013184,
"learning_rate": 5.628787878787879e-05,
"loss": 1.6545,
"step": 582
},
{
"epoch": 1.325,
"grad_norm": 11.88375473022461,
"learning_rate": 5.6212121212121215e-05,
"loss": 2.1069,
"step": 583
},
{
"epoch": 1.3272727272727272,
"grad_norm": 11.863356590270996,
"learning_rate": 5.613636363636364e-05,
"loss": 1.535,
"step": 584
},
{
"epoch": 1.3295454545454546,
"grad_norm": 11.284381866455078,
"learning_rate": 5.606060606060606e-05,
"loss": 2.3407,
"step": 585
},
{
"epoch": 1.3318181818181818,
"grad_norm": 11.79831600189209,
"learning_rate": 5.598484848484849e-05,
"loss": 1.6409,
"step": 586
},
{
"epoch": 1.334090909090909,
"grad_norm": 11.130000114440918,
"learning_rate": 5.5909090909090913e-05,
"loss": 1.6426,
"step": 587
},
{
"epoch": 1.3363636363636364,
"grad_norm": 9.1551513671875,
"learning_rate": 5.583333333333334e-05,
"loss": 1.8466,
"step": 588
},
{
"epoch": 1.3386363636363636,
"grad_norm": 14.405865669250488,
"learning_rate": 5.5757575757575766e-05,
"loss": 2.066,
"step": 589
},
{
"epoch": 1.3409090909090908,
"grad_norm": 53.46037673950195,
"learning_rate": 5.568181818181818e-05,
"loss": 2.4224,
"step": 590
},
{
"epoch": 1.3431818181818183,
"grad_norm": 11.6724271774292,
"learning_rate": 5.5606060606060605e-05,
"loss": 1.7148,
"step": 591
},
{
"epoch": 1.3454545454545455,
"grad_norm": 15.849516868591309,
"learning_rate": 5.553030303030303e-05,
"loss": 2.0981,
"step": 592
},
{
"epoch": 1.3477272727272727,
"grad_norm": 13.421188354492188,
"learning_rate": 5.545454545454546e-05,
"loss": 1.722,
"step": 593
},
{
"epoch": 1.35,
"grad_norm": 14.319283485412598,
"learning_rate": 5.5378787878787884e-05,
"loss": 1.7284,
"step": 594
},
{
"epoch": 1.3522727272727273,
"grad_norm": 12.210022926330566,
"learning_rate": 5.5303030303030304e-05,
"loss": 1.4507,
"step": 595
},
{
"epoch": 1.3545454545454545,
"grad_norm": 11.60317325592041,
"learning_rate": 5.522727272727273e-05,
"loss": 1.7749,
"step": 596
},
{
"epoch": 1.356818181818182,
"grad_norm": 12.895737648010254,
"learning_rate": 5.5151515151515156e-05,
"loss": 1.5555,
"step": 597
},
{
"epoch": 1.3590909090909091,
"grad_norm": 11.198805809020996,
"learning_rate": 5.507575757575758e-05,
"loss": 1.7624,
"step": 598
},
{
"epoch": 1.3613636363636363,
"grad_norm": 13.309189796447754,
"learning_rate": 5.500000000000001e-05,
"loss": 1.8765,
"step": 599
},
{
"epoch": 1.3636363636363638,
"grad_norm": 10.177202224731445,
"learning_rate": 5.492424242424242e-05,
"loss": 1.1895,
"step": 600
},
{
"epoch": 1.365909090909091,
"grad_norm": 11.205484390258789,
"learning_rate": 5.484848484848485e-05,
"loss": 1.1661,
"step": 601
},
{
"epoch": 1.3681818181818182,
"grad_norm": 12.091497421264648,
"learning_rate": 5.4772727272727274e-05,
"loss": 1.9972,
"step": 602
},
{
"epoch": 1.3704545454545456,
"grad_norm": 11.2894926071167,
"learning_rate": 5.46969696969697e-05,
"loss": 1.7121,
"step": 603
},
{
"epoch": 1.3727272727272728,
"grad_norm": 15.034446716308594,
"learning_rate": 5.462121212121213e-05,
"loss": 2.8078,
"step": 604
},
{
"epoch": 1.375,
"grad_norm": 8.075346946716309,
"learning_rate": 5.4545454545454546e-05,
"loss": 1.0453,
"step": 605
},
{
"epoch": 1.3772727272727272,
"grad_norm": 10.377656936645508,
"learning_rate": 5.446969696969697e-05,
"loss": 1.7973,
"step": 606
},
{
"epoch": 1.3795454545454544,
"grad_norm": 10.147284507751465,
"learning_rate": 5.43939393939394e-05,
"loss": 2.1848,
"step": 607
},
{
"epoch": 1.3818181818181818,
"grad_norm": 11.856623649597168,
"learning_rate": 5.4318181818181825e-05,
"loss": 1.9857,
"step": 608
},
{
"epoch": 1.384090909090909,
"grad_norm": 10.355262756347656,
"learning_rate": 5.424242424242425e-05,
"loss": 1.4383,
"step": 609
},
{
"epoch": 1.3863636363636362,
"grad_norm": 9.085455894470215,
"learning_rate": 5.4166666666666664e-05,
"loss": 1.382,
"step": 610
},
{
"epoch": 1.3886363636363637,
"grad_norm": 13.221922874450684,
"learning_rate": 5.409090909090909e-05,
"loss": 2.3278,
"step": 611
},
{
"epoch": 1.3909090909090909,
"grad_norm": 14.725556373596191,
"learning_rate": 5.401515151515152e-05,
"loss": 2.0181,
"step": 612
},
{
"epoch": 1.393181818181818,
"grad_norm": 11.90503978729248,
"learning_rate": 5.393939393939394e-05,
"loss": 2.5601,
"step": 613
},
{
"epoch": 1.3954545454545455,
"grad_norm": 10.583837509155273,
"learning_rate": 5.386363636363637e-05,
"loss": 1.4886,
"step": 614
},
{
"epoch": 1.3977272727272727,
"grad_norm": 12.369796752929688,
"learning_rate": 5.378787878787879e-05,
"loss": 1.2716,
"step": 615
},
{
"epoch": 1.4,
"grad_norm": 12.412566184997559,
"learning_rate": 5.3712121212121215e-05,
"loss": 2.0391,
"step": 616
},
{
"epoch": 1.4022727272727273,
"grad_norm": 12.033483505249023,
"learning_rate": 5.363636363636364e-05,
"loss": 1.2044,
"step": 617
},
{
"epoch": 1.4045454545454545,
"grad_norm": 11.291866302490234,
"learning_rate": 5.356060606060607e-05,
"loss": 2.3266,
"step": 618
},
{
"epoch": 1.4068181818181817,
"grad_norm": 17.745227813720703,
"learning_rate": 5.348484848484848e-05,
"loss": 1.7097,
"step": 619
},
{
"epoch": 1.4090909090909092,
"grad_norm": 11.858403205871582,
"learning_rate": 5.340909090909091e-05,
"loss": 1.9088,
"step": 620
},
{
"epoch": 1.4113636363636364,
"grad_norm": 14.968146324157715,
"learning_rate": 5.333333333333333e-05,
"loss": 2.009,
"step": 621
},
{
"epoch": 1.4136363636363636,
"grad_norm": 13.16178035736084,
"learning_rate": 5.325757575757576e-05,
"loss": 1.6262,
"step": 622
},
{
"epoch": 1.415909090909091,
"grad_norm": 11.63772201538086,
"learning_rate": 5.3181818181818186e-05,
"loss": 1.481,
"step": 623
},
{
"epoch": 1.4181818181818182,
"grad_norm": 13.266715049743652,
"learning_rate": 5.3106060606060605e-05,
"loss": 2.3015,
"step": 624
},
{
"epoch": 1.4204545454545454,
"grad_norm": 11.690614700317383,
"learning_rate": 5.303030303030303e-05,
"loss": 1.7226,
"step": 625
},
{
"epoch": 1.4227272727272728,
"grad_norm": 10.599973678588867,
"learning_rate": 5.295454545454546e-05,
"loss": 1.0261,
"step": 626
},
{
"epoch": 1.425,
"grad_norm": 17.117259979248047,
"learning_rate": 5.2878787878787884e-05,
"loss": 1.7164,
"step": 627
},
{
"epoch": 1.4272727272727272,
"grad_norm": 11.62483024597168,
"learning_rate": 5.280303030303031e-05,
"loss": 1.3686,
"step": 628
},
{
"epoch": 1.4295454545454547,
"grad_norm": 10.503996849060059,
"learning_rate": 5.272727272727272e-05,
"loss": 1.6085,
"step": 629
},
{
"epoch": 1.4318181818181819,
"grad_norm": 14.493663787841797,
"learning_rate": 5.265151515151515e-05,
"loss": 2.0943,
"step": 630
},
{
"epoch": 1.434090909090909,
"grad_norm": 11.125360488891602,
"learning_rate": 5.2575757575757576e-05,
"loss": 1.8284,
"step": 631
},
{
"epoch": 1.4363636363636363,
"grad_norm": 10.438358306884766,
"learning_rate": 5.25e-05,
"loss": 2.1436,
"step": 632
},
{
"epoch": 1.4386363636363637,
"grad_norm": 13.013614654541016,
"learning_rate": 5.242424242424243e-05,
"loss": 1.6999,
"step": 633
},
{
"epoch": 1.440909090909091,
"grad_norm": 14.21478271484375,
"learning_rate": 5.234848484848485e-05,
"loss": 3.268,
"step": 634
},
{
"epoch": 1.4431818181818181,
"grad_norm": 10.756131172180176,
"learning_rate": 5.2272727272727274e-05,
"loss": 1.1294,
"step": 635
},
{
"epoch": 1.4454545454545453,
"grad_norm": 14.409692764282227,
"learning_rate": 5.21969696969697e-05,
"loss": 1.391,
"step": 636
},
{
"epoch": 1.4477272727272728,
"grad_norm": 9.839500427246094,
"learning_rate": 5.212121212121213e-05,
"loss": 1.4028,
"step": 637
},
{
"epoch": 1.45,
"grad_norm": 13.601579666137695,
"learning_rate": 5.204545454545455e-05,
"loss": 1.6384,
"step": 638
},
{
"epoch": 1.4522727272727272,
"grad_norm": 12.721500396728516,
"learning_rate": 5.1969696969696966e-05,
"loss": 1.9382,
"step": 639
},
{
"epoch": 1.4545454545454546,
"grad_norm": 11.373588562011719,
"learning_rate": 5.189393939393939e-05,
"loss": 2.7324,
"step": 640
},
{
"epoch": 1.4568181818181818,
"grad_norm": 11.873559951782227,
"learning_rate": 5.181818181818182e-05,
"loss": 1.6583,
"step": 641
},
{
"epoch": 1.459090909090909,
"grad_norm": 10.649148941040039,
"learning_rate": 5.1742424242424245e-05,
"loss": 1.7733,
"step": 642
},
{
"epoch": 1.4613636363636364,
"grad_norm": 12.14698314666748,
"learning_rate": 5.166666666666667e-05,
"loss": 1.6434,
"step": 643
},
{
"epoch": 1.4636363636363636,
"grad_norm": 9.80806827545166,
"learning_rate": 5.159090909090909e-05,
"loss": 1.9463,
"step": 644
},
{
"epoch": 1.4659090909090908,
"grad_norm": 7.273732662200928,
"learning_rate": 5.151515151515152e-05,
"loss": 0.8156,
"step": 645
},
{
"epoch": 1.4681818181818183,
"grad_norm": 12.560272216796875,
"learning_rate": 5.143939393939394e-05,
"loss": 2.2347,
"step": 646
},
{
"epoch": 1.4704545454545455,
"grad_norm": 10.116893768310547,
"learning_rate": 5.136363636363637e-05,
"loss": 1.2157,
"step": 647
},
{
"epoch": 1.4727272727272727,
"grad_norm": 11.09861946105957,
"learning_rate": 5.1287878787878796e-05,
"loss": 1.2521,
"step": 648
},
{
"epoch": 1.475,
"grad_norm": 11.454336166381836,
"learning_rate": 5.121212121212121e-05,
"loss": 1.6148,
"step": 649
},
{
"epoch": 1.4772727272727273,
"grad_norm": 11.669930458068848,
"learning_rate": 5.1136363636363635e-05,
"loss": 2.4559,
"step": 650
},
{
"epoch": 1.4795454545454545,
"grad_norm": 10.853449821472168,
"learning_rate": 5.106060606060606e-05,
"loss": 1.6519,
"step": 651
},
{
"epoch": 1.481818181818182,
"grad_norm": 23.87467384338379,
"learning_rate": 5.098484848484849e-05,
"loss": 3.9198,
"step": 652
},
{
"epoch": 1.4840909090909091,
"grad_norm": 15.731586456298828,
"learning_rate": 5.090909090909091e-05,
"loss": 2.4425,
"step": 653
},
{
"epoch": 1.4863636363636363,
"grad_norm": 10.91791820526123,
"learning_rate": 5.0833333333333333e-05,
"loss": 1.4977,
"step": 654
},
{
"epoch": 1.4886363636363638,
"grad_norm": 11.515501022338867,
"learning_rate": 5.075757575757576e-05,
"loss": 1.4377,
"step": 655
},
{
"epoch": 1.490909090909091,
"grad_norm": 9.79021167755127,
"learning_rate": 5.0681818181818186e-05,
"loss": 1.208,
"step": 656
},
{
"epoch": 1.4931818181818182,
"grad_norm": 7.424502849578857,
"learning_rate": 5.060606060606061e-05,
"loss": 1.368,
"step": 657
},
{
"epoch": 1.4954545454545456,
"grad_norm": 9.132887840270996,
"learning_rate": 5.0530303030303025e-05,
"loss": 1.0296,
"step": 658
},
{
"epoch": 1.4977272727272728,
"grad_norm": 14.063539505004883,
"learning_rate": 5.045454545454545e-05,
"loss": 1.9923,
"step": 659
},
{
"epoch": 1.5,
"grad_norm": 10.994144439697266,
"learning_rate": 5.037878787878788e-05,
"loss": 1.5963,
"step": 660
},
{
"epoch": 1.5022727272727274,
"grad_norm": 11.193540573120117,
"learning_rate": 5.030303030303031e-05,
"loss": 2.6418,
"step": 661
},
{
"epoch": 1.5045454545454544,
"grad_norm": 11.344916343688965,
"learning_rate": 5.022727272727274e-05,
"loss": 0.9847,
"step": 662
},
{
"epoch": 1.5068181818181818,
"grad_norm": 16.028928756713867,
"learning_rate": 5.015151515151515e-05,
"loss": 2.7095,
"step": 663
},
{
"epoch": 1.509090909090909,
"grad_norm": 10.2492036819458,
"learning_rate": 5.0075757575757576e-05,
"loss": 1.4351,
"step": 664
},
{
"epoch": 1.5113636363636362,
"grad_norm": 12.819211959838867,
"learning_rate": 5e-05,
"loss": 2.2236,
"step": 665
},
{
"epoch": 1.5136363636363637,
"grad_norm": 9.43850326538086,
"learning_rate": 4.992424242424243e-05,
"loss": 0.988,
"step": 666
},
{
"epoch": 1.5159090909090909,
"grad_norm": 12.35922622680664,
"learning_rate": 4.984848484848485e-05,
"loss": 1.9395,
"step": 667
},
{
"epoch": 1.518181818181818,
"grad_norm": 12.175325393676758,
"learning_rate": 4.9772727272727275e-05,
"loss": 2.0219,
"step": 668
},
{
"epoch": 1.5204545454545455,
"grad_norm": 16.44111442565918,
"learning_rate": 4.9696969696969694e-05,
"loss": 1.7191,
"step": 669
},
{
"epoch": 1.5227272727272727,
"grad_norm": 12.413610458374023,
"learning_rate": 4.962121212121213e-05,
"loss": 2.2003,
"step": 670
},
{
"epoch": 1.525,
"grad_norm": 7.922098159790039,
"learning_rate": 4.9545454545454553e-05,
"loss": 1.1514,
"step": 671
},
{
"epoch": 1.5272727272727273,
"grad_norm": 11.402259826660156,
"learning_rate": 4.946969696969697e-05,
"loss": 1.6611,
"step": 672
},
{
"epoch": 1.5295454545454545,
"grad_norm": 10.548962593078613,
"learning_rate": 4.93939393939394e-05,
"loss": 1.6242,
"step": 673
},
{
"epoch": 1.5318181818181817,
"grad_norm": 14.536432266235352,
"learning_rate": 4.931818181818182e-05,
"loss": 2.2415,
"step": 674
},
{
"epoch": 1.5340909090909092,
"grad_norm": 12.954751014709473,
"learning_rate": 4.9242424242424245e-05,
"loss": 1.8463,
"step": 675
},
{
"epoch": 1.5363636363636364,
"grad_norm": 12.143820762634277,
"learning_rate": 4.9166666666666665e-05,
"loss": 1.97,
"step": 676
},
{
"epoch": 1.5386363636363636,
"grad_norm": 10.134570121765137,
"learning_rate": 4.909090909090909e-05,
"loss": 0.9264,
"step": 677
},
{
"epoch": 1.540909090909091,
"grad_norm": 12.558758735656738,
"learning_rate": 4.901515151515152e-05,
"loss": 1.4608,
"step": 678
},
{
"epoch": 1.5431818181818182,
"grad_norm": 10.165045738220215,
"learning_rate": 4.8939393939393944e-05,
"loss": 1.3453,
"step": 679
},
{
"epoch": 1.5454545454545454,
"grad_norm": 11.995816230773926,
"learning_rate": 4.886363636363637e-05,
"loss": 2.1228,
"step": 680
},
{
"epoch": 1.5477272727272728,
"grad_norm": 10.822747230529785,
"learning_rate": 4.878787878787879e-05,
"loss": 2.0378,
"step": 681
},
{
"epoch": 1.55,
"grad_norm": 16.348892211914062,
"learning_rate": 4.8712121212121216e-05,
"loss": 1.7209,
"step": 682
},
{
"epoch": 1.5522727272727272,
"grad_norm": 9.395282745361328,
"learning_rate": 4.863636363636364e-05,
"loss": 1.4529,
"step": 683
},
{
"epoch": 1.5545454545454547,
"grad_norm": 16.89964485168457,
"learning_rate": 4.856060606060606e-05,
"loss": 2.8833,
"step": 684
},
{
"epoch": 1.5568181818181817,
"grad_norm": 10.703327178955078,
"learning_rate": 4.848484848484849e-05,
"loss": 1.7938,
"step": 685
},
{
"epoch": 1.559090909090909,
"grad_norm": 19.770193099975586,
"learning_rate": 4.840909090909091e-05,
"loss": 1.6041,
"step": 686
},
{
"epoch": 1.5613636363636365,
"grad_norm": 11.777501106262207,
"learning_rate": 4.8333333333333334e-05,
"loss": 2.0716,
"step": 687
},
{
"epoch": 1.5636363636363635,
"grad_norm": 10.248165130615234,
"learning_rate": 4.825757575757576e-05,
"loss": 1.5853,
"step": 688
},
{
"epoch": 1.565909090909091,
"grad_norm": 10.732747077941895,
"learning_rate": 4.8181818181818186e-05,
"loss": 1.2683,
"step": 689
},
{
"epoch": 1.5681818181818183,
"grad_norm": 11.304749488830566,
"learning_rate": 4.810606060606061e-05,
"loss": 2.2432,
"step": 690
},
{
"epoch": 1.5704545454545453,
"grad_norm": 13.820841789245605,
"learning_rate": 4.803030303030303e-05,
"loss": 1.8117,
"step": 691
},
{
"epoch": 1.5727272727272728,
"grad_norm": 9.33556079864502,
"learning_rate": 4.795454545454546e-05,
"loss": 1.0837,
"step": 692
},
{
"epoch": 1.575,
"grad_norm": 13.970429420471191,
"learning_rate": 4.787878787878788e-05,
"loss": 2.5927,
"step": 693
},
{
"epoch": 1.5772727272727272,
"grad_norm": 10.840149879455566,
"learning_rate": 4.7803030303030304e-05,
"loss": 1.8707,
"step": 694
},
{
"epoch": 1.5795454545454546,
"grad_norm": 11.14415168762207,
"learning_rate": 4.772727272727273e-05,
"loss": 1.6668,
"step": 695
},
{
"epoch": 1.5818181818181818,
"grad_norm": 14.185403823852539,
"learning_rate": 4.765151515151515e-05,
"loss": 1.6091,
"step": 696
},
{
"epoch": 1.584090909090909,
"grad_norm": 13.565306663513184,
"learning_rate": 4.7575757575757576e-05,
"loss": 1.8229,
"step": 697
},
{
"epoch": 1.5863636363636364,
"grad_norm": 14.329642295837402,
"learning_rate": 4.75e-05,
"loss": 1.9366,
"step": 698
},
{
"epoch": 1.5886363636363636,
"grad_norm": 12.332931518554688,
"learning_rate": 4.742424242424243e-05,
"loss": 1.683,
"step": 699
},
{
"epoch": 1.5909090909090908,
"grad_norm": 10.493454933166504,
"learning_rate": 4.7348484848484855e-05,
"loss": 1.8994,
"step": 700
},
{
"epoch": 1.5931818181818183,
"grad_norm": 11.809647560119629,
"learning_rate": 4.7272727272727275e-05,
"loss": 1.509,
"step": 701
},
{
"epoch": 1.5954545454545455,
"grad_norm": 12.72128963470459,
"learning_rate": 4.71969696969697e-05,
"loss": 2.1266,
"step": 702
},
{
"epoch": 1.5977272727272727,
"grad_norm": 13.074295043945312,
"learning_rate": 4.712121212121212e-05,
"loss": 1.6113,
"step": 703
},
{
"epoch": 1.6,
"grad_norm": 10.254904747009277,
"learning_rate": 4.704545454545455e-05,
"loss": 2.2737,
"step": 704
},
{
"epoch": 1.6022727272727273,
"grad_norm": 24.574390411376953,
"learning_rate": 4.696969696969697e-05,
"loss": 2.2779,
"step": 705
},
{
"epoch": 1.6045454545454545,
"grad_norm": 10.441598892211914,
"learning_rate": 4.689393939393939e-05,
"loss": 1.8209,
"step": 706
},
{
"epoch": 1.606818181818182,
"grad_norm": 12.4207763671875,
"learning_rate": 4.681818181818182e-05,
"loss": 1.5389,
"step": 707
},
{
"epoch": 1.6090909090909091,
"grad_norm": 15.072708129882812,
"learning_rate": 4.6742424242424245e-05,
"loss": 1.3703,
"step": 708
},
{
"epoch": 1.6113636363636363,
"grad_norm": 11.555070877075195,
"learning_rate": 4.666666666666667e-05,
"loss": 1.9363,
"step": 709
},
{
"epoch": 1.6136363636363638,
"grad_norm": 13.27509593963623,
"learning_rate": 4.659090909090909e-05,
"loss": 1.4334,
"step": 710
},
{
"epoch": 1.615909090909091,
"grad_norm": 12.357429504394531,
"learning_rate": 4.651515151515152e-05,
"loss": 2.3112,
"step": 711
},
{
"epoch": 1.6181818181818182,
"grad_norm": 19.84957504272461,
"learning_rate": 4.6439393939393944e-05,
"loss": 1.1851,
"step": 712
},
{
"epoch": 1.6204545454545456,
"grad_norm": 10.689920425415039,
"learning_rate": 4.636363636363636e-05,
"loss": 1.921,
"step": 713
},
{
"epoch": 1.6227272727272726,
"grad_norm": 10.688066482543945,
"learning_rate": 4.628787878787879e-05,
"loss": 1.2294,
"step": 714
},
{
"epoch": 1.625,
"grad_norm": 11.80333423614502,
"learning_rate": 4.621212121212121e-05,
"loss": 2.5255,
"step": 715
},
{
"epoch": 1.6272727272727274,
"grad_norm": 11.181013107299805,
"learning_rate": 4.6136363636363635e-05,
"loss": 1.2692,
"step": 716
},
{
"epoch": 1.6295454545454544,
"grad_norm": 11.557047843933105,
"learning_rate": 4.606060606060607e-05,
"loss": 1.4575,
"step": 717
},
{
"epoch": 1.6318181818181818,
"grad_norm": 13.798693656921387,
"learning_rate": 4.598484848484849e-05,
"loss": 2.3197,
"step": 718
},
{
"epoch": 1.634090909090909,
"grad_norm": 8.890710830688477,
"learning_rate": 4.5909090909090914e-05,
"loss": 1.5266,
"step": 719
},
{
"epoch": 1.6363636363636362,
"grad_norm": 10.293892860412598,
"learning_rate": 4.5833333333333334e-05,
"loss": 1.9222,
"step": 720
},
{
"epoch": 1.6386363636363637,
"grad_norm": 12.959512710571289,
"learning_rate": 4.575757575757576e-05,
"loss": 1.5771,
"step": 721
},
{
"epoch": 1.6409090909090909,
"grad_norm": 11.565927505493164,
"learning_rate": 4.5681818181818186e-05,
"loss": 1.5313,
"step": 722
},
{
"epoch": 1.643181818181818,
"grad_norm": 9.419241905212402,
"learning_rate": 4.5606060606060606e-05,
"loss": 1.4229,
"step": 723
},
{
"epoch": 1.6454545454545455,
"grad_norm": 15.411003112792969,
"learning_rate": 4.553030303030303e-05,
"loss": 1.8707,
"step": 724
},
{
"epoch": 1.6477272727272727,
"grad_norm": 7.6546711921691895,
"learning_rate": 4.545454545454546e-05,
"loss": 0.742,
"step": 725
},
{
"epoch": 1.65,
"grad_norm": 13.029730796813965,
"learning_rate": 4.5378787878787885e-05,
"loss": 1.5179,
"step": 726
},
{
"epoch": 1.6522727272727273,
"grad_norm": 12.853962898254395,
"learning_rate": 4.5303030303030304e-05,
"loss": 1.8908,
"step": 727
},
{
"epoch": 1.6545454545454545,
"grad_norm": 12.864992141723633,
"learning_rate": 4.522727272727273e-05,
"loss": 1.7175,
"step": 728
},
{
"epoch": 1.6568181818181817,
"grad_norm": 13.25144100189209,
"learning_rate": 4.515151515151516e-05,
"loss": 1.7681,
"step": 729
},
{
"epoch": 1.6590909090909092,
"grad_norm": 9.894201278686523,
"learning_rate": 4.5075757575757577e-05,
"loss": 1.5505,
"step": 730
},
{
"epoch": 1.6613636363636364,
"grad_norm": 16.501630783081055,
"learning_rate": 4.5e-05,
"loss": 1.4968,
"step": 731
},
{
"epoch": 1.6636363636363636,
"grad_norm": 10.3342924118042,
"learning_rate": 4.492424242424242e-05,
"loss": 1.4734,
"step": 732
},
{
"epoch": 1.665909090909091,
"grad_norm": 11.081184387207031,
"learning_rate": 4.484848484848485e-05,
"loss": 2.6513,
"step": 733
},
{
"epoch": 1.6681818181818182,
"grad_norm": 17.005704879760742,
"learning_rate": 4.4772727272727275e-05,
"loss": 2.4109,
"step": 734
},
{
"epoch": 1.6704545454545454,
"grad_norm": 11.718207359313965,
"learning_rate": 4.46969696969697e-05,
"loss": 1.6445,
"step": 735
},
{
"epoch": 1.6727272727272728,
"grad_norm": 12.14245319366455,
"learning_rate": 4.462121212121213e-05,
"loss": 2.335,
"step": 736
},
{
"epoch": 1.675,
"grad_norm": 10.971789360046387,
"learning_rate": 4.454545454545455e-05,
"loss": 1.6266,
"step": 737
},
{
"epoch": 1.6772727272727272,
"grad_norm": 17.435321807861328,
"learning_rate": 4.4469696969696973e-05,
"loss": 2.1164,
"step": 738
},
{
"epoch": 1.6795454545454547,
"grad_norm": 10.45814323425293,
"learning_rate": 4.43939393939394e-05,
"loss": 1.3992,
"step": 739
},
{
"epoch": 1.6818181818181817,
"grad_norm": 12.788302421569824,
"learning_rate": 4.431818181818182e-05,
"loss": 2.4001,
"step": 740
},
{
"epoch": 1.684090909090909,
"grad_norm": 14.425982475280762,
"learning_rate": 4.4242424242424246e-05,
"loss": 2.163,
"step": 741
},
{
"epoch": 1.6863636363636365,
"grad_norm": 9.09310531616211,
"learning_rate": 4.4166666666666665e-05,
"loss": 1.4595,
"step": 742
},
{
"epoch": 1.6886363636363635,
"grad_norm": 11.336987495422363,
"learning_rate": 4.409090909090909e-05,
"loss": 2.6262,
"step": 743
},
{
"epoch": 1.690909090909091,
"grad_norm": 11.697134017944336,
"learning_rate": 4.401515151515152e-05,
"loss": 1.3628,
"step": 744
},
{
"epoch": 1.6931818181818183,
"grad_norm": 8.620695114135742,
"learning_rate": 4.3939393939393944e-05,
"loss": 1.2893,
"step": 745
},
{
"epoch": 1.6954545454545453,
"grad_norm": 9.322046279907227,
"learning_rate": 4.386363636363637e-05,
"loss": 1.9579,
"step": 746
},
{
"epoch": 1.6977272727272728,
"grad_norm": 11.273119926452637,
"learning_rate": 4.378787878787879e-05,
"loss": 2.2207,
"step": 747
},
{
"epoch": 1.7,
"grad_norm": 11.111379623413086,
"learning_rate": 4.3712121212121216e-05,
"loss": 1.4021,
"step": 748
},
{
"epoch": 1.7022727272727272,
"grad_norm": 11.808859825134277,
"learning_rate": 4.3636363636363636e-05,
"loss": 1.4873,
"step": 749
},
{
"epoch": 1.7045454545454546,
"grad_norm": 14.41899585723877,
"learning_rate": 4.356060606060606e-05,
"loss": 1.9247,
"step": 750
},
{
"epoch": 1.7068181818181818,
"grad_norm": 9.383740425109863,
"learning_rate": 4.348484848484849e-05,
"loss": 1.6231,
"step": 751
},
{
"epoch": 1.709090909090909,
"grad_norm": 9.926271438598633,
"learning_rate": 4.340909090909091e-05,
"loss": 2.2661,
"step": 752
},
{
"epoch": 1.7113636363636364,
"grad_norm": 12.015188217163086,
"learning_rate": 4.3333333333333334e-05,
"loss": 1.4877,
"step": 753
},
{
"epoch": 1.7136363636363636,
"grad_norm": 12.057700157165527,
"learning_rate": 4.325757575757576e-05,
"loss": 1.6091,
"step": 754
},
{
"epoch": 1.7159090909090908,
"grad_norm": 8.392674446105957,
"learning_rate": 4.318181818181819e-05,
"loss": 1.4652,
"step": 755
},
{
"epoch": 1.7181818181818183,
"grad_norm": 7.7269287109375,
"learning_rate": 4.3106060606060606e-05,
"loss": 1.1991,
"step": 756
},
{
"epoch": 1.7204545454545455,
"grad_norm": 13.280454635620117,
"learning_rate": 4.303030303030303e-05,
"loss": 1.9597,
"step": 757
},
{
"epoch": 1.7227272727272727,
"grad_norm": 11.144329071044922,
"learning_rate": 4.295454545454546e-05,
"loss": 1.6052,
"step": 758
},
{
"epoch": 1.725,
"grad_norm": 12.23388385772705,
"learning_rate": 4.287878787878788e-05,
"loss": 1.5491,
"step": 759
},
{
"epoch": 1.7272727272727273,
"grad_norm": 11.918728828430176,
"learning_rate": 4.2803030303030305e-05,
"loss": 2.0586,
"step": 760
},
{
"epoch": 1.7295454545454545,
"grad_norm": 7.68416166305542,
"learning_rate": 4.2727272727272724e-05,
"loss": 1.0501,
"step": 761
},
{
"epoch": 1.731818181818182,
"grad_norm": 16.64651870727539,
"learning_rate": 4.265151515151515e-05,
"loss": 1.9819,
"step": 762
},
{
"epoch": 1.7340909090909091,
"grad_norm": 14.889754295349121,
"learning_rate": 4.257575757575758e-05,
"loss": 2.5418,
"step": 763
},
{
"epoch": 1.7363636363636363,
"grad_norm": 13.508451461791992,
"learning_rate": 4.25e-05,
"loss": 1.5028,
"step": 764
},
{
"epoch": 1.7386363636363638,
"grad_norm": 9.541330337524414,
"learning_rate": 4.242424242424243e-05,
"loss": 1.0183,
"step": 765
},
{
"epoch": 1.740909090909091,
"grad_norm": 13.14413833618164,
"learning_rate": 4.234848484848485e-05,
"loss": 2.0542,
"step": 766
},
{
"epoch": 1.7431818181818182,
"grad_norm": 12.490581512451172,
"learning_rate": 4.2272727272727275e-05,
"loss": 1.5971,
"step": 767
},
{
"epoch": 1.7454545454545456,
"grad_norm": 14.117782592773438,
"learning_rate": 4.21969696969697e-05,
"loss": 3.0207,
"step": 768
},
{
"epoch": 1.7477272727272726,
"grad_norm": 12.968109130859375,
"learning_rate": 4.212121212121212e-05,
"loss": 1.9058,
"step": 769
},
{
"epoch": 1.75,
"grad_norm": 10.889745712280273,
"learning_rate": 4.204545454545455e-05,
"loss": 1.535,
"step": 770
},
{
"epoch": 1.7522727272727274,
"grad_norm": 11.901477813720703,
"learning_rate": 4.196969696969697e-05,
"loss": 1.3743,
"step": 771
},
{
"epoch": 1.7545454545454544,
"grad_norm": 11.466394424438477,
"learning_rate": 4.189393939393939e-05,
"loss": 2.1364,
"step": 772
},
{
"epoch": 1.7568181818181818,
"grad_norm": 9.973612785339355,
"learning_rate": 4.181818181818182e-05,
"loss": 1.7472,
"step": 773
},
{
"epoch": 1.759090909090909,
"grad_norm": 11.81697940826416,
"learning_rate": 4.1742424242424246e-05,
"loss": 1.6475,
"step": 774
},
{
"epoch": 1.7613636363636362,
"grad_norm": 10.81869125366211,
"learning_rate": 4.166666666666667e-05,
"loss": 2.433,
"step": 775
},
{
"epoch": 1.7636363636363637,
"grad_norm": 15.867783546447754,
"learning_rate": 4.159090909090909e-05,
"loss": 3.0407,
"step": 776
},
{
"epoch": 1.7659090909090909,
"grad_norm": 12.047411918640137,
"learning_rate": 4.151515151515152e-05,
"loss": 1.7651,
"step": 777
},
{
"epoch": 1.768181818181818,
"grad_norm": 11.829177856445312,
"learning_rate": 4.143939393939394e-05,
"loss": 1.5285,
"step": 778
},
{
"epoch": 1.7704545454545455,
"grad_norm": 13.831562995910645,
"learning_rate": 4.1363636363636364e-05,
"loss": 2.6372,
"step": 779
},
{
"epoch": 1.7727272727272727,
"grad_norm": 10.6288480758667,
"learning_rate": 4.128787878787879e-05,
"loss": 1.8006,
"step": 780
},
{
"epoch": 1.775,
"grad_norm": 12.919150352478027,
"learning_rate": 4.1212121212121216e-05,
"loss": 1.8753,
"step": 781
},
{
"epoch": 1.7772727272727273,
"grad_norm": 14.138745307922363,
"learning_rate": 4.113636363636364e-05,
"loss": 2.1089,
"step": 782
},
{
"epoch": 1.7795454545454545,
"grad_norm": 8.130454063415527,
"learning_rate": 4.106060606060606e-05,
"loss": 0.9243,
"step": 783
},
{
"epoch": 1.7818181818181817,
"grad_norm": 13.32907485961914,
"learning_rate": 4.098484848484849e-05,
"loss": 2.599,
"step": 784
},
{
"epoch": 1.7840909090909092,
"grad_norm": 9.957046508789062,
"learning_rate": 4.0909090909090915e-05,
"loss": 1.1874,
"step": 785
},
{
"epoch": 1.7863636363636364,
"grad_norm": 10.413941383361816,
"learning_rate": 4.0833333333333334e-05,
"loss": 1.2206,
"step": 786
},
{
"epoch": 1.7886363636363636,
"grad_norm": 12.38062858581543,
"learning_rate": 4.075757575757576e-05,
"loss": 1.5484,
"step": 787
},
{
"epoch": 1.790909090909091,
"grad_norm": 10.63827896118164,
"learning_rate": 4.068181818181818e-05,
"loss": 1.4851,
"step": 788
},
{
"epoch": 1.7931818181818182,
"grad_norm": 10.755563735961914,
"learning_rate": 4.0606060606060606e-05,
"loss": 2.0725,
"step": 789
},
{
"epoch": 1.7954545454545454,
"grad_norm": 10.352532386779785,
"learning_rate": 4.053030303030303e-05,
"loss": 1.6825,
"step": 790
},
{
"epoch": 1.7977272727272728,
"grad_norm": 10.303858757019043,
"learning_rate": 4.045454545454546e-05,
"loss": 1.6771,
"step": 791
},
{
"epoch": 1.8,
"grad_norm": 12.914578437805176,
"learning_rate": 4.0378787878787885e-05,
"loss": 2.0149,
"step": 792
},
{
"epoch": 1.8022727272727272,
"grad_norm": 9.389689445495605,
"learning_rate": 4.0303030303030305e-05,
"loss": 1.9987,
"step": 793
},
{
"epoch": 1.8045454545454547,
"grad_norm": 13.615360260009766,
"learning_rate": 4.022727272727273e-05,
"loss": 1.7871,
"step": 794
},
{
"epoch": 1.8068181818181817,
"grad_norm": 12.188302040100098,
"learning_rate": 4.015151515151515e-05,
"loss": 2.1458,
"step": 795
},
{
"epoch": 1.809090909090909,
"grad_norm": 23.321977615356445,
"learning_rate": 4.007575757575758e-05,
"loss": 1.5815,
"step": 796
},
{
"epoch": 1.8113636363636365,
"grad_norm": 13.12856674194336,
"learning_rate": 4e-05,
"loss": 1.9065,
"step": 797
},
{
"epoch": 1.8136363636363635,
"grad_norm": 8.955425262451172,
"learning_rate": 3.992424242424242e-05,
"loss": 1.4415,
"step": 798
},
{
"epoch": 1.815909090909091,
"grad_norm": 14.052294731140137,
"learning_rate": 3.984848484848485e-05,
"loss": 2.6913,
"step": 799
},
{
"epoch": 1.8181818181818183,
"grad_norm": 8.688261032104492,
"learning_rate": 3.9772727272727275e-05,
"loss": 1.6981,
"step": 800
},
{
"epoch": 1.8204545454545453,
"grad_norm": 13.951496124267578,
"learning_rate": 3.96969696969697e-05,
"loss": 1.5787,
"step": 801
},
{
"epoch": 1.8227272727272728,
"grad_norm": 10.023541450500488,
"learning_rate": 3.962121212121213e-05,
"loss": 1.9886,
"step": 802
},
{
"epoch": 1.825,
"grad_norm": 8.397741317749023,
"learning_rate": 3.954545454545455e-05,
"loss": 1.7193,
"step": 803
},
{
"epoch": 1.8272727272727272,
"grad_norm": 10.017319679260254,
"learning_rate": 3.9469696969696974e-05,
"loss": 1.7097,
"step": 804
},
{
"epoch": 1.8295454545454546,
"grad_norm": 13.632206916809082,
"learning_rate": 3.939393939393939e-05,
"loss": 2.1469,
"step": 805
},
{
"epoch": 1.8318181818181818,
"grad_norm": 19.315832138061523,
"learning_rate": 3.931818181818182e-05,
"loss": 2.2873,
"step": 806
},
{
"epoch": 1.834090909090909,
"grad_norm": 11.273087501525879,
"learning_rate": 3.924242424242424e-05,
"loss": 1.352,
"step": 807
},
{
"epoch": 1.8363636363636364,
"grad_norm": 12.127049446105957,
"learning_rate": 3.9166666666666665e-05,
"loss": 1.8422,
"step": 808
},
{
"epoch": 1.8386363636363636,
"grad_norm": 9.968843460083008,
"learning_rate": 3.909090909090909e-05,
"loss": 1.2724,
"step": 809
},
{
"epoch": 1.8409090909090908,
"grad_norm": 13.883306503295898,
"learning_rate": 3.901515151515152e-05,
"loss": 2.6822,
"step": 810
},
{
"epoch": 1.8431818181818183,
"grad_norm": 10.443497657775879,
"learning_rate": 3.8939393939393944e-05,
"loss": 1.2037,
"step": 811
},
{
"epoch": 1.8454545454545455,
"grad_norm": 10.290310859680176,
"learning_rate": 3.8863636363636364e-05,
"loss": 1.5355,
"step": 812
},
{
"epoch": 1.8477272727272727,
"grad_norm": 9.970185279846191,
"learning_rate": 3.878787878787879e-05,
"loss": 1.957,
"step": 813
},
{
"epoch": 1.85,
"grad_norm": 10.905329704284668,
"learning_rate": 3.8712121212121217e-05,
"loss": 1.8562,
"step": 814
},
{
"epoch": 1.8522727272727273,
"grad_norm": 9.466534614562988,
"learning_rate": 3.8636363636363636e-05,
"loss": 1.4522,
"step": 815
},
{
"epoch": 1.8545454545454545,
"grad_norm": 13.48620891571045,
"learning_rate": 3.856060606060606e-05,
"loss": 2.1203,
"step": 816
},
{
"epoch": 1.856818181818182,
"grad_norm": 12.107563018798828,
"learning_rate": 3.848484848484848e-05,
"loss": 1.7011,
"step": 817
},
{
"epoch": 1.8590909090909091,
"grad_norm": 10.786709785461426,
"learning_rate": 3.840909090909091e-05,
"loss": 1.7418,
"step": 818
},
{
"epoch": 1.8613636363636363,
"grad_norm": 10.853336334228516,
"learning_rate": 3.8333333333333334e-05,
"loss": 1.4229,
"step": 819
},
{
"epoch": 1.8636363636363638,
"grad_norm": 11.42320442199707,
"learning_rate": 3.825757575757576e-05,
"loss": 1.6411,
"step": 820
},
{
"epoch": 1.865909090909091,
"grad_norm": 9.623292922973633,
"learning_rate": 3.818181818181819e-05,
"loss": 2.2372,
"step": 821
},
{
"epoch": 1.8681818181818182,
"grad_norm": 19.681766510009766,
"learning_rate": 3.810606060606061e-05,
"loss": 1.7814,
"step": 822
},
{
"epoch": 1.8704545454545456,
"grad_norm": 11.759204864501953,
"learning_rate": 3.803030303030303e-05,
"loss": 1.4783,
"step": 823
},
{
"epoch": 1.8727272727272726,
"grad_norm": 11.130982398986816,
"learning_rate": 3.795454545454545e-05,
"loss": 1.3937,
"step": 824
},
{
"epoch": 1.875,
"grad_norm": 10.193344116210938,
"learning_rate": 3.787878787878788e-05,
"loss": 1.3912,
"step": 825
},
{
"epoch": 1.8772727272727274,
"grad_norm": 8.412622451782227,
"learning_rate": 3.7803030303030305e-05,
"loss": 1.3978,
"step": 826
},
{
"epoch": 1.8795454545454544,
"grad_norm": 12.766166687011719,
"learning_rate": 3.7727272727272725e-05,
"loss": 1.9356,
"step": 827
},
{
"epoch": 1.8818181818181818,
"grad_norm": 11.161136627197266,
"learning_rate": 3.765151515151516e-05,
"loss": 1.8318,
"step": 828
},
{
"epoch": 1.884090909090909,
"grad_norm": 11.214709281921387,
"learning_rate": 3.757575757575758e-05,
"loss": 1.4253,
"step": 829
},
{
"epoch": 1.8863636363636362,
"grad_norm": 12.173728942871094,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.3093,
"step": 830
},
{
"epoch": 1.8886363636363637,
"grad_norm": 12.564881324768066,
"learning_rate": 3.742424242424243e-05,
"loss": 2.0086,
"step": 831
},
{
"epoch": 1.8909090909090909,
"grad_norm": 10.378774642944336,
"learning_rate": 3.734848484848485e-05,
"loss": 2.2117,
"step": 832
},
{
"epoch": 1.893181818181818,
"grad_norm": 13.659943580627441,
"learning_rate": 3.7272727272727276e-05,
"loss": 1.8717,
"step": 833
},
{
"epoch": 1.8954545454545455,
"grad_norm": 10.889350891113281,
"learning_rate": 3.7196969696969695e-05,
"loss": 2.524,
"step": 834
},
{
"epoch": 1.8977272727272727,
"grad_norm": 20.47830581665039,
"learning_rate": 3.712121212121212e-05,
"loss": 1.5575,
"step": 835
},
{
"epoch": 1.9,
"grad_norm": 8.377565383911133,
"learning_rate": 3.704545454545455e-05,
"loss": 1.4985,
"step": 836
},
{
"epoch": 1.9022727272727273,
"grad_norm": 14.420267105102539,
"learning_rate": 3.6969696969696974e-05,
"loss": 2.0562,
"step": 837
},
{
"epoch": 1.9045454545454545,
"grad_norm": 11.469067573547363,
"learning_rate": 3.68939393939394e-05,
"loss": 1.9261,
"step": 838
},
{
"epoch": 1.9068181818181817,
"grad_norm": 14.95913314819336,
"learning_rate": 3.681818181818182e-05,
"loss": 1.4905,
"step": 839
},
{
"epoch": 1.9090909090909092,
"grad_norm": 12.481145858764648,
"learning_rate": 3.6742424242424246e-05,
"loss": 1.3664,
"step": 840
},
{
"epoch": 1.9113636363636364,
"grad_norm": 11.715337753295898,
"learning_rate": 3.6666666666666666e-05,
"loss": 2.0561,
"step": 841
},
{
"epoch": 1.9136363636363636,
"grad_norm": 12.499181747436523,
"learning_rate": 3.659090909090909e-05,
"loss": 1.62,
"step": 842
},
{
"epoch": 1.915909090909091,
"grad_norm": 7.448797225952148,
"learning_rate": 3.651515151515152e-05,
"loss": 0.979,
"step": 843
},
{
"epoch": 1.9181818181818182,
"grad_norm": 11.219677925109863,
"learning_rate": 3.643939393939394e-05,
"loss": 1.8378,
"step": 844
},
{
"epoch": 1.9204545454545454,
"grad_norm": 11.738428115844727,
"learning_rate": 3.6363636363636364e-05,
"loss": 2.1477,
"step": 845
},
{
"epoch": 1.9227272727272728,
"grad_norm": 13.800374031066895,
"learning_rate": 3.628787878787879e-05,
"loss": 2.3644,
"step": 846
},
{
"epoch": 1.925,
"grad_norm": 11.240313529968262,
"learning_rate": 3.621212121212122e-05,
"loss": 1.6775,
"step": 847
},
{
"epoch": 1.9272727272727272,
"grad_norm": 13.477606773376465,
"learning_rate": 3.613636363636364e-05,
"loss": 1.3438,
"step": 848
},
{
"epoch": 1.9295454545454547,
"grad_norm": 12.788423538208008,
"learning_rate": 3.606060606060606e-05,
"loss": 1.7158,
"step": 849
},
{
"epoch": 1.9318181818181817,
"grad_norm": 8.893767356872559,
"learning_rate": 3.598484848484849e-05,
"loss": 1.4747,
"step": 850
},
{
"epoch": 1.934090909090909,
"grad_norm": 12.053075790405273,
"learning_rate": 3.590909090909091e-05,
"loss": 1.0121,
"step": 851
},
{
"epoch": 1.9363636363636365,
"grad_norm": 12.093589782714844,
"learning_rate": 3.5833333333333335e-05,
"loss": 2.1991,
"step": 852
},
{
"epoch": 1.9386363636363635,
"grad_norm": 9.356278419494629,
"learning_rate": 3.575757575757576e-05,
"loss": 1.4497,
"step": 853
},
{
"epoch": 1.940909090909091,
"grad_norm": 12.686812400817871,
"learning_rate": 3.568181818181818e-05,
"loss": 1.5038,
"step": 854
},
{
"epoch": 1.9431818181818183,
"grad_norm": 13.139368057250977,
"learning_rate": 3.560606060606061e-05,
"loss": 2.9399,
"step": 855
},
{
"epoch": 1.9454545454545453,
"grad_norm": 11.385064125061035,
"learning_rate": 3.553030303030303e-05,
"loss": 1.4202,
"step": 856
},
{
"epoch": 1.9477272727272728,
"grad_norm": 9.905313491821289,
"learning_rate": 3.545454545454546e-05,
"loss": 2.5033,
"step": 857
},
{
"epoch": 1.95,
"grad_norm": 9.99422836303711,
"learning_rate": 3.537878787878788e-05,
"loss": 1.631,
"step": 858
},
{
"epoch": 1.9522727272727272,
"grad_norm": 12.235610961914062,
"learning_rate": 3.5303030303030305e-05,
"loss": 1.7517,
"step": 859
},
{
"epoch": 1.9545454545454546,
"grad_norm": 13.225701332092285,
"learning_rate": 3.522727272727273e-05,
"loss": 1.545,
"step": 860
},
{
"epoch": 1.9568181818181818,
"grad_norm": 13.755146980285645,
"learning_rate": 3.515151515151515e-05,
"loss": 1.6548,
"step": 861
},
{
"epoch": 1.959090909090909,
"grad_norm": 14.235300064086914,
"learning_rate": 3.507575757575758e-05,
"loss": 2.2791,
"step": 862
},
{
"epoch": 1.9613636363636364,
"grad_norm": 12.734109878540039,
"learning_rate": 3.5e-05,
"loss": 1.4257,
"step": 863
},
{
"epoch": 1.9636363636363636,
"grad_norm": 12.51075267791748,
"learning_rate": 3.492424242424242e-05,
"loss": 2.1328,
"step": 864
},
{
"epoch": 1.9659090909090908,
"grad_norm": 12.090396881103516,
"learning_rate": 3.484848484848485e-05,
"loss": 2.4949,
"step": 865
},
{
"epoch": 1.9681818181818183,
"grad_norm": 9.898470878601074,
"learning_rate": 3.4772727272727276e-05,
"loss": 1.0122,
"step": 866
},
{
"epoch": 1.9704545454545455,
"grad_norm": 12.299036979675293,
"learning_rate": 3.46969696969697e-05,
"loss": 1.1734,
"step": 867
},
{
"epoch": 1.9727272727272727,
"grad_norm": 10.930243492126465,
"learning_rate": 3.462121212121212e-05,
"loss": 1.8219,
"step": 868
},
{
"epoch": 1.975,
"grad_norm": 11.0517578125,
"learning_rate": 3.454545454545455e-05,
"loss": 1.5023,
"step": 869
},
{
"epoch": 1.9772727272727273,
"grad_norm": 11.98909854888916,
"learning_rate": 3.4469696969696974e-05,
"loss": 1.298,
"step": 870
},
{
"epoch": 1.9795454545454545,
"grad_norm": 12.753129959106445,
"learning_rate": 3.4393939393939394e-05,
"loss": 1.7147,
"step": 871
},
{
"epoch": 1.981818181818182,
"grad_norm": 71.2451171875,
"learning_rate": 3.431818181818182e-05,
"loss": 1.3867,
"step": 872
},
{
"epoch": 1.9840909090909091,
"grad_norm": 9.198206901550293,
"learning_rate": 3.424242424242424e-05,
"loss": 1.2175,
"step": 873
},
{
"epoch": 1.9863636363636363,
"grad_norm": 10.864444732666016,
"learning_rate": 3.4166666666666666e-05,
"loss": 2.4479,
"step": 874
},
{
"epoch": 1.9886363636363638,
"grad_norm": 12.929604530334473,
"learning_rate": 3.409090909090909e-05,
"loss": 2.3538,
"step": 875
},
{
"epoch": 1.990909090909091,
"grad_norm": 15.190954208374023,
"learning_rate": 3.401515151515152e-05,
"loss": 2.7314,
"step": 876
},
{
"epoch": 1.9931818181818182,
"grad_norm": 12.220293045043945,
"learning_rate": 3.3939393939393945e-05,
"loss": 1.8087,
"step": 877
},
{
"epoch": 1.9954545454545456,
"grad_norm": 13.717775344848633,
"learning_rate": 3.3863636363636364e-05,
"loss": 2.2791,
"step": 878
},
{
"epoch": 1.9977272727272726,
"grad_norm": 13.53941822052002,
"learning_rate": 3.378787878787879e-05,
"loss": 1.9205,
"step": 879
},
{
"epoch": 2.0,
"grad_norm": 10.206825256347656,
"learning_rate": 3.371212121212121e-05,
"loss": 1.2968,
"step": 880
},
{
"epoch": 2.0,
"eval_f1": 0.8929,
"eval_gen_len": 41.9091,
"eval_loss": 1.7823115587234497,
"eval_precision": 0.8925,
"eval_recall": 0.8935,
"eval_rouge1": 0.447,
"eval_rouge2": 0.2102,
"eval_rougeL": 0.3795,
"eval_rougeLsum": 0.4136,
"eval_runtime": 29.0339,
"eval_samples_per_second": 3.789,
"eval_steps_per_second": 0.482,
"step": 880
},
{
"epoch": 2.0022727272727274,
"grad_norm": 9.781706809997559,
"learning_rate": 3.3636363636363636e-05,
"loss": 1.0468,
"step": 881
},
{
"epoch": 2.0045454545454544,
"grad_norm": 8.61344051361084,
"learning_rate": 3.356060606060606e-05,
"loss": 1.7286,
"step": 882
},
{
"epoch": 2.006818181818182,
"grad_norm": 11.291481971740723,
"learning_rate": 3.348484848484848e-05,
"loss": 1.1274,
"step": 883
},
{
"epoch": 2.0090909090909093,
"grad_norm": 11.33132553100586,
"learning_rate": 3.3409090909090915e-05,
"loss": 1.4992,
"step": 884
},
{
"epoch": 2.0113636363636362,
"grad_norm": 10.342754364013672,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.7733,
"step": 885
},
{
"epoch": 2.0136363636363637,
"grad_norm": 9.18486499786377,
"learning_rate": 3.325757575757576e-05,
"loss": 1.7391,
"step": 886
},
{
"epoch": 2.015909090909091,
"grad_norm": 35.923648834228516,
"learning_rate": 3.318181818181819e-05,
"loss": 1.8191,
"step": 887
},
{
"epoch": 2.018181818181818,
"grad_norm": 10.737150192260742,
"learning_rate": 3.310606060606061e-05,
"loss": 1.1656,
"step": 888
},
{
"epoch": 2.0204545454545455,
"grad_norm": 7.691224098205566,
"learning_rate": 3.303030303030303e-05,
"loss": 1.1787,
"step": 889
},
{
"epoch": 2.022727272727273,
"grad_norm": 14.402198791503906,
"learning_rate": 3.295454545454545e-05,
"loss": 2.1618,
"step": 890
},
{
"epoch": 2.025,
"grad_norm": 9.567869186401367,
"learning_rate": 3.287878787878788e-05,
"loss": 1.4921,
"step": 891
},
{
"epoch": 2.0272727272727273,
"grad_norm": 12.46391487121582,
"learning_rate": 3.2803030303030305e-05,
"loss": 2.0986,
"step": 892
},
{
"epoch": 2.0295454545454548,
"grad_norm": 12.333531379699707,
"learning_rate": 3.272727272727273e-05,
"loss": 1.5944,
"step": 893
},
{
"epoch": 2.0318181818181817,
"grad_norm": 12.140853881835938,
"learning_rate": 3.265151515151516e-05,
"loss": 1.7773,
"step": 894
},
{
"epoch": 2.034090909090909,
"grad_norm": 9.412683486938477,
"learning_rate": 3.257575757575758e-05,
"loss": 1.2663,
"step": 895
},
{
"epoch": 2.036363636363636,
"grad_norm": 10.711098670959473,
"learning_rate": 3.2500000000000004e-05,
"loss": 1.6462,
"step": 896
},
{
"epoch": 2.0386363636363636,
"grad_norm": 11.64570426940918,
"learning_rate": 3.2424242424242423e-05,
"loss": 1.8232,
"step": 897
},
{
"epoch": 2.040909090909091,
"grad_norm": 12.753011703491211,
"learning_rate": 3.234848484848485e-05,
"loss": 1.9761,
"step": 898
},
{
"epoch": 2.043181818181818,
"grad_norm": 15.42159366607666,
"learning_rate": 3.2272727272727276e-05,
"loss": 1.5225,
"step": 899
},
{
"epoch": 2.0454545454545454,
"grad_norm": 13.561200141906738,
"learning_rate": 3.2196969696969696e-05,
"loss": 2.2342,
"step": 900
},
{
"epoch": 2.047727272727273,
"grad_norm": 11.59468936920166,
"learning_rate": 3.212121212121212e-05,
"loss": 1.3996,
"step": 901
},
{
"epoch": 2.05,
"grad_norm": 12.330318450927734,
"learning_rate": 3.204545454545455e-05,
"loss": 2.3926,
"step": 902
},
{
"epoch": 2.0522727272727272,
"grad_norm": 15.305580139160156,
"learning_rate": 3.1969696969696974e-05,
"loss": 2.5056,
"step": 903
},
{
"epoch": 2.0545454545454547,
"grad_norm": 12.250936508178711,
"learning_rate": 3.18939393939394e-05,
"loss": 2.2595,
"step": 904
},
{
"epoch": 2.0568181818181817,
"grad_norm": 9.258564949035645,
"learning_rate": 3.181818181818182e-05,
"loss": 1.0952,
"step": 905
},
{
"epoch": 2.059090909090909,
"grad_norm": 10.1191987991333,
"learning_rate": 3.174242424242425e-05,
"loss": 2.2179,
"step": 906
},
{
"epoch": 2.0613636363636365,
"grad_norm": 12.793285369873047,
"learning_rate": 3.1666666666666666e-05,
"loss": 1.7858,
"step": 907
},
{
"epoch": 2.0636363636363635,
"grad_norm": 10.188157081604004,
"learning_rate": 3.159090909090909e-05,
"loss": 1.3631,
"step": 908
},
{
"epoch": 2.065909090909091,
"grad_norm": 13.256832122802734,
"learning_rate": 3.151515151515151e-05,
"loss": 2.2464,
"step": 909
},
{
"epoch": 2.0681818181818183,
"grad_norm": 10.160938262939453,
"learning_rate": 3.143939393939394e-05,
"loss": 1.5204,
"step": 910
},
{
"epoch": 2.0704545454545453,
"grad_norm": 10.945446014404297,
"learning_rate": 3.1363636363636365e-05,
"loss": 1.6125,
"step": 911
},
{
"epoch": 2.0727272727272728,
"grad_norm": 10.19439697265625,
"learning_rate": 3.128787878787879e-05,
"loss": 1.5317,
"step": 912
},
{
"epoch": 2.075,
"grad_norm": 9.242986679077148,
"learning_rate": 3.121212121212122e-05,
"loss": 1.7993,
"step": 913
},
{
"epoch": 2.077272727272727,
"grad_norm": 9.43307113647461,
"learning_rate": 3.113636363636364e-05,
"loss": 1.4297,
"step": 914
},
{
"epoch": 2.0795454545454546,
"grad_norm": 9.292837142944336,
"learning_rate": 3.106060606060606e-05,
"loss": 1.1428,
"step": 915
},
{
"epoch": 2.081818181818182,
"grad_norm": 10.290895462036133,
"learning_rate": 3.098484848484849e-05,
"loss": 1.3587,
"step": 916
},
{
"epoch": 2.084090909090909,
"grad_norm": 12.890341758728027,
"learning_rate": 3.090909090909091e-05,
"loss": 1.5721,
"step": 917
},
{
"epoch": 2.0863636363636364,
"grad_norm": 9.548102378845215,
"learning_rate": 3.0833333333333335e-05,
"loss": 1.5717,
"step": 918
},
{
"epoch": 2.088636363636364,
"grad_norm": 11.2235689163208,
"learning_rate": 3.0757575757575755e-05,
"loss": 1.818,
"step": 919
},
{
"epoch": 2.090909090909091,
"grad_norm": 14.528667449951172,
"learning_rate": 3.068181818181818e-05,
"loss": 1.6878,
"step": 920
},
{
"epoch": 2.0931818181818183,
"grad_norm": 13.295345306396484,
"learning_rate": 3.060606060606061e-05,
"loss": 1.8521,
"step": 921
},
{
"epoch": 2.0954545454545457,
"grad_norm": 13.902974128723145,
"learning_rate": 3.0530303030303034e-05,
"loss": 1.7186,
"step": 922
},
{
"epoch": 2.0977272727272727,
"grad_norm": 8.313849449157715,
"learning_rate": 3.0454545454545456e-05,
"loss": 0.8988,
"step": 923
},
{
"epoch": 2.1,
"grad_norm": 11.491289138793945,
"learning_rate": 3.037878787878788e-05,
"loss": 1.1394,
"step": 924
},
{
"epoch": 2.102272727272727,
"grad_norm": 13.124963760375977,
"learning_rate": 3.0303030303030306e-05,
"loss": 1.7424,
"step": 925
},
{
"epoch": 2.1045454545454545,
"grad_norm": 8.5538911819458,
"learning_rate": 3.0227272727272725e-05,
"loss": 1.3577,
"step": 926
},
{
"epoch": 2.106818181818182,
"grad_norm": 12.04502010345459,
"learning_rate": 3.015151515151515e-05,
"loss": 1.2389,
"step": 927
},
{
"epoch": 2.109090909090909,
"grad_norm": 8.608831405639648,
"learning_rate": 3.0075757575757578e-05,
"loss": 1.1577,
"step": 928
},
{
"epoch": 2.1113636363636363,
"grad_norm": 14.802834510803223,
"learning_rate": 3e-05,
"loss": 1.8636,
"step": 929
},
{
"epoch": 2.1136363636363638,
"grad_norm": 9.014802932739258,
"learning_rate": 2.9924242424242427e-05,
"loss": 0.7823,
"step": 930
},
{
"epoch": 2.1159090909090907,
"grad_norm": 10.007800102233887,
"learning_rate": 2.9848484848484847e-05,
"loss": 1.7205,
"step": 931
},
{
"epoch": 2.118181818181818,
"grad_norm": 16.067474365234375,
"learning_rate": 2.9772727272727273e-05,
"loss": 2.443,
"step": 932
},
{
"epoch": 2.1204545454545456,
"grad_norm": 12.624736785888672,
"learning_rate": 2.96969696969697e-05,
"loss": 1.5536,
"step": 933
},
{
"epoch": 2.1227272727272726,
"grad_norm": 10.400491714477539,
"learning_rate": 2.9621212121212122e-05,
"loss": 1.2871,
"step": 934
},
{
"epoch": 2.125,
"grad_norm": 11.056097984313965,
"learning_rate": 2.954545454545455e-05,
"loss": 1.4614,
"step": 935
},
{
"epoch": 2.1272727272727274,
"grad_norm": 9.163816452026367,
"learning_rate": 2.9469696969696968e-05,
"loss": 1.2918,
"step": 936
},
{
"epoch": 2.1295454545454544,
"grad_norm": 8.908564567565918,
"learning_rate": 2.9393939393939394e-05,
"loss": 1.2489,
"step": 937
},
{
"epoch": 2.131818181818182,
"grad_norm": 8.402863502502441,
"learning_rate": 2.9318181818181817e-05,
"loss": 1.4269,
"step": 938
},
{
"epoch": 2.1340909090909093,
"grad_norm": 10.939780235290527,
"learning_rate": 2.9242424242424243e-05,
"loss": 1.4199,
"step": 939
},
{
"epoch": 2.1363636363636362,
"grad_norm": 11.758381843566895,
"learning_rate": 2.916666666666667e-05,
"loss": 1.4597,
"step": 940
},
{
"epoch": 2.1386363636363637,
"grad_norm": 11.411653518676758,
"learning_rate": 2.909090909090909e-05,
"loss": 2.1611,
"step": 941
},
{
"epoch": 2.140909090909091,
"grad_norm": 11.838427543640137,
"learning_rate": 2.901515151515152e-05,
"loss": 1.2373,
"step": 942
},
{
"epoch": 2.143181818181818,
"grad_norm": 14.833626747131348,
"learning_rate": 2.893939393939394e-05,
"loss": 1.9202,
"step": 943
},
{
"epoch": 2.1454545454545455,
"grad_norm": 10.815326690673828,
"learning_rate": 2.8863636363636365e-05,
"loss": 1.5089,
"step": 944
},
{
"epoch": 2.147727272727273,
"grad_norm": 12.253664016723633,
"learning_rate": 2.878787878787879e-05,
"loss": 1.3787,
"step": 945
},
{
"epoch": 2.15,
"grad_norm": 13.154531478881836,
"learning_rate": 2.8712121212121214e-05,
"loss": 1.8925,
"step": 946
},
{
"epoch": 2.1522727272727273,
"grad_norm": 12.020703315734863,
"learning_rate": 2.863636363636364e-05,
"loss": 1.379,
"step": 947
},
{
"epoch": 2.1545454545454543,
"grad_norm": 10.430608749389648,
"learning_rate": 2.856060606060606e-05,
"loss": 1.4203,
"step": 948
},
{
"epoch": 2.1568181818181817,
"grad_norm": 8.769074440002441,
"learning_rate": 2.8484848484848486e-05,
"loss": 1.227,
"step": 949
},
{
"epoch": 2.159090909090909,
"grad_norm": 11.399450302124023,
"learning_rate": 2.8409090909090912e-05,
"loss": 1.3783,
"step": 950
},
{
"epoch": 2.161363636363636,
"grad_norm": 9.87228012084961,
"learning_rate": 2.8333333333333335e-05,
"loss": 1.6523,
"step": 951
},
{
"epoch": 2.1636363636363636,
"grad_norm": 15.94421100616455,
"learning_rate": 2.825757575757576e-05,
"loss": 2.4161,
"step": 952
},
{
"epoch": 2.165909090909091,
"grad_norm": 9.126893043518066,
"learning_rate": 2.818181818181818e-05,
"loss": 1.2675,
"step": 953
},
{
"epoch": 2.168181818181818,
"grad_norm": 15.760127067565918,
"learning_rate": 2.8106060606060607e-05,
"loss": 2.9231,
"step": 954
},
{
"epoch": 2.1704545454545454,
"grad_norm": 8.999767303466797,
"learning_rate": 2.803030303030303e-05,
"loss": 1.5147,
"step": 955
},
{
"epoch": 2.172727272727273,
"grad_norm": 12.179048538208008,
"learning_rate": 2.7954545454545457e-05,
"loss": 1.4017,
"step": 956
},
{
"epoch": 2.175,
"grad_norm": 11.52514934539795,
"learning_rate": 2.7878787878787883e-05,
"loss": 2.2158,
"step": 957
},
{
"epoch": 2.1772727272727272,
"grad_norm": 14.60074520111084,
"learning_rate": 2.7803030303030303e-05,
"loss": 1.6378,
"step": 958
},
{
"epoch": 2.1795454545454547,
"grad_norm": 11.505465507507324,
"learning_rate": 2.772727272727273e-05,
"loss": 1.6039,
"step": 959
},
{
"epoch": 2.1818181818181817,
"grad_norm": 12.141363143920898,
"learning_rate": 2.7651515151515152e-05,
"loss": 2.6782,
"step": 960
},
{
"epoch": 2.184090909090909,
"grad_norm": 10.89749813079834,
"learning_rate": 2.7575757575757578e-05,
"loss": 1.4787,
"step": 961
},
{
"epoch": 2.1863636363636365,
"grad_norm": 11.249963760375977,
"learning_rate": 2.7500000000000004e-05,
"loss": 1.9647,
"step": 962
},
{
"epoch": 2.1886363636363635,
"grad_norm": 9.608443260192871,
"learning_rate": 2.7424242424242424e-05,
"loss": 0.8747,
"step": 963
},
{
"epoch": 2.190909090909091,
"grad_norm": 9.517485618591309,
"learning_rate": 2.734848484848485e-05,
"loss": 1.2376,
"step": 964
},
{
"epoch": 2.1931818181818183,
"grad_norm": 9.044648170471191,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.8014,
"step": 965
},
{
"epoch": 2.1954545454545453,
"grad_norm": 9.988462448120117,
"learning_rate": 2.71969696969697e-05,
"loss": 1.652,
"step": 966
},
{
"epoch": 2.1977272727272728,
"grad_norm": 8.96922492980957,
"learning_rate": 2.7121212121212126e-05,
"loss": 0.9484,
"step": 967
},
{
"epoch": 2.2,
"grad_norm": 10.36929702758789,
"learning_rate": 2.7045454545454545e-05,
"loss": 1.2604,
"step": 968
},
{
"epoch": 2.202272727272727,
"grad_norm": 14.008241653442383,
"learning_rate": 2.696969696969697e-05,
"loss": 2.4898,
"step": 969
},
{
"epoch": 2.2045454545454546,
"grad_norm": 14.017687797546387,
"learning_rate": 2.6893939393939394e-05,
"loss": 1.8664,
"step": 970
},
{
"epoch": 2.206818181818182,
"grad_norm": 11.672577857971191,
"learning_rate": 2.681818181818182e-05,
"loss": 1.8917,
"step": 971
},
{
"epoch": 2.209090909090909,
"grad_norm": 11.760181427001953,
"learning_rate": 2.674242424242424e-05,
"loss": 2.0559,
"step": 972
},
{
"epoch": 2.2113636363636364,
"grad_norm": 13.333674430847168,
"learning_rate": 2.6666666666666667e-05,
"loss": 1.8072,
"step": 973
},
{
"epoch": 2.213636363636364,
"grad_norm": 9.448116302490234,
"learning_rate": 2.6590909090909093e-05,
"loss": 1.2764,
"step": 974
},
{
"epoch": 2.215909090909091,
"grad_norm": 11.52153491973877,
"learning_rate": 2.6515151515151516e-05,
"loss": 1.7083,
"step": 975
},
{
"epoch": 2.2181818181818183,
"grad_norm": 20.444080352783203,
"learning_rate": 2.6439393939393942e-05,
"loss": 2.2781,
"step": 976
},
{
"epoch": 2.2204545454545457,
"grad_norm": 15.952470779418945,
"learning_rate": 2.636363636363636e-05,
"loss": 2.0901,
"step": 977
},
{
"epoch": 2.2227272727272727,
"grad_norm": 10.751893997192383,
"learning_rate": 2.6287878787878788e-05,
"loss": 0.9779,
"step": 978
},
{
"epoch": 2.225,
"grad_norm": 11.89562702178955,
"learning_rate": 2.6212121212121214e-05,
"loss": 1.7043,
"step": 979
},
{
"epoch": 2.227272727272727,
"grad_norm": 12.013797760009766,
"learning_rate": 2.6136363636363637e-05,
"loss": 1.4427,
"step": 980
},
{
"epoch": 2.2295454545454545,
"grad_norm": 13.685124397277832,
"learning_rate": 2.6060606060606063e-05,
"loss": 1.9327,
"step": 981
},
{
"epoch": 2.231818181818182,
"grad_norm": 14.36984920501709,
"learning_rate": 2.5984848484848483e-05,
"loss": 2.4401,
"step": 982
},
{
"epoch": 2.234090909090909,
"grad_norm": 11.657794952392578,
"learning_rate": 2.590909090909091e-05,
"loss": 1.5776,
"step": 983
},
{
"epoch": 2.2363636363636363,
"grad_norm": 9.138626098632812,
"learning_rate": 2.5833333333333336e-05,
"loss": 1.5954,
"step": 984
},
{
"epoch": 2.2386363636363638,
"grad_norm": 11.275242805480957,
"learning_rate": 2.575757575757576e-05,
"loss": 1.5874,
"step": 985
},
{
"epoch": 2.2409090909090907,
"grad_norm": 11.694557189941406,
"learning_rate": 2.5681818181818185e-05,
"loss": 1.2839,
"step": 986
},
{
"epoch": 2.243181818181818,
"grad_norm": 14.328207015991211,
"learning_rate": 2.5606060606060604e-05,
"loss": 2.3689,
"step": 987
},
{
"epoch": 2.2454545454545456,
"grad_norm": 14.487227439880371,
"learning_rate": 2.553030303030303e-05,
"loss": 1.5858,
"step": 988
},
{
"epoch": 2.2477272727272726,
"grad_norm": 14.691239356994629,
"learning_rate": 2.5454545454545454e-05,
"loss": 1.8329,
"step": 989
},
{
"epoch": 2.25,
"grad_norm": 10.622157096862793,
"learning_rate": 2.537878787878788e-05,
"loss": 1.8422,
"step": 990
},
{
"epoch": 2.2522727272727274,
"grad_norm": 13.788392066955566,
"learning_rate": 2.5303030303030306e-05,
"loss": 2.0421,
"step": 991
},
{
"epoch": 2.2545454545454544,
"grad_norm": 8.527210235595703,
"learning_rate": 2.5227272727272726e-05,
"loss": 1.4462,
"step": 992
},
{
"epoch": 2.256818181818182,
"grad_norm": 11.221017837524414,
"learning_rate": 2.5151515151515155e-05,
"loss": 1.7809,
"step": 993
},
{
"epoch": 2.2590909090909093,
"grad_norm": 15.243719100952148,
"learning_rate": 2.5075757575757575e-05,
"loss": 1.7409,
"step": 994
},
{
"epoch": 2.2613636363636362,
"grad_norm": 16.965797424316406,
"learning_rate": 2.5e-05,
"loss": 3.2836,
"step": 995
},
{
"epoch": 2.2636363636363637,
"grad_norm": 10.187609672546387,
"learning_rate": 2.4924242424242424e-05,
"loss": 1.5489,
"step": 996
},
{
"epoch": 2.265909090909091,
"grad_norm": 9.865535736083984,
"learning_rate": 2.4848484848484847e-05,
"loss": 2.0742,
"step": 997
},
{
"epoch": 2.268181818181818,
"grad_norm": 11.739052772521973,
"learning_rate": 2.4772727272727277e-05,
"loss": 1.4237,
"step": 998
},
{
"epoch": 2.2704545454545455,
"grad_norm": 13.875876426696777,
"learning_rate": 2.46969696969697e-05,
"loss": 2.8714,
"step": 999
},
{
"epoch": 2.2727272727272725,
"grad_norm": 11.909977912902832,
"learning_rate": 2.4621212121212123e-05,
"loss": 1.9434,
"step": 1000
},
{
"epoch": 2.275,
"grad_norm": 13.642827033996582,
"learning_rate": 2.4545454545454545e-05,
"loss": 1.4233,
"step": 1001
},
{
"epoch": 2.2772727272727273,
"grad_norm": 10.349024772644043,
"learning_rate": 2.4469696969696972e-05,
"loss": 1.5193,
"step": 1002
},
{
"epoch": 2.2795454545454543,
"grad_norm": 8.302240371704102,
"learning_rate": 2.4393939393939395e-05,
"loss": 1.0769,
"step": 1003
},
{
"epoch": 2.2818181818181817,
"grad_norm": 9.903936386108398,
"learning_rate": 2.431818181818182e-05,
"loss": 1.4596,
"step": 1004
},
{
"epoch": 2.284090909090909,
"grad_norm": 7.976583957672119,
"learning_rate": 2.4242424242424244e-05,
"loss": 1.3187,
"step": 1005
},
{
"epoch": 2.286363636363636,
"grad_norm": 8.382739067077637,
"learning_rate": 2.4166666666666667e-05,
"loss": 1.1004,
"step": 1006
},
{
"epoch": 2.2886363636363636,
"grad_norm": 9.898600578308105,
"learning_rate": 2.4090909090909093e-05,
"loss": 1.3482,
"step": 1007
},
{
"epoch": 2.290909090909091,
"grad_norm": 9.736372947692871,
"learning_rate": 2.4015151515151516e-05,
"loss": 1.0737,
"step": 1008
},
{
"epoch": 2.293181818181818,
"grad_norm": 14.735883712768555,
"learning_rate": 2.393939393939394e-05,
"loss": 1.9045,
"step": 1009
},
{
"epoch": 2.2954545454545454,
"grad_norm": 16.780405044555664,
"learning_rate": 2.3863636363636365e-05,
"loss": 1.9355,
"step": 1010
},
{
"epoch": 2.297727272727273,
"grad_norm": 9.181320190429688,
"learning_rate": 2.3787878787878788e-05,
"loss": 1.4465,
"step": 1011
},
{
"epoch": 2.3,
"grad_norm": 11.207884788513184,
"learning_rate": 2.3712121212121214e-05,
"loss": 1.6341,
"step": 1012
},
{
"epoch": 2.3022727272727272,
"grad_norm": 12.287393569946289,
"learning_rate": 2.3636363636363637e-05,
"loss": 1.806,
"step": 1013
},
{
"epoch": 2.3045454545454547,
"grad_norm": 12.173286437988281,
"learning_rate": 2.356060606060606e-05,
"loss": 2.2166,
"step": 1014
},
{
"epoch": 2.3068181818181817,
"grad_norm": 13.528629302978516,
"learning_rate": 2.3484848484848487e-05,
"loss": 1.5679,
"step": 1015
},
{
"epoch": 2.309090909090909,
"grad_norm": 9.217406272888184,
"learning_rate": 2.340909090909091e-05,
"loss": 1.7179,
"step": 1016
},
{
"epoch": 2.3113636363636365,
"grad_norm": 13.768959999084473,
"learning_rate": 2.3333333333333336e-05,
"loss": 2.1235,
"step": 1017
},
{
"epoch": 2.3136363636363635,
"grad_norm": 9.60761833190918,
"learning_rate": 2.325757575757576e-05,
"loss": 1.3526,
"step": 1018
},
{
"epoch": 2.315909090909091,
"grad_norm": 10.336706161499023,
"learning_rate": 2.318181818181818e-05,
"loss": 1.3543,
"step": 1019
},
{
"epoch": 2.3181818181818183,
"grad_norm": 11.636757850646973,
"learning_rate": 2.3106060606060605e-05,
"loss": 1.8026,
"step": 1020
},
{
"epoch": 2.3204545454545453,
"grad_norm": 10.546634674072266,
"learning_rate": 2.3030303030303034e-05,
"loss": 1.9753,
"step": 1021
},
{
"epoch": 2.3227272727272728,
"grad_norm": 13.629782676696777,
"learning_rate": 2.2954545454545457e-05,
"loss": 1.6927,
"step": 1022
},
{
"epoch": 2.325,
"grad_norm": 13.1149263381958,
"learning_rate": 2.287878787878788e-05,
"loss": 1.4331,
"step": 1023
},
{
"epoch": 2.327272727272727,
"grad_norm": 10.624835968017578,
"learning_rate": 2.2803030303030303e-05,
"loss": 1.4769,
"step": 1024
},
{
"epoch": 2.3295454545454546,
"grad_norm": 13.692902565002441,
"learning_rate": 2.272727272727273e-05,
"loss": 2.7543,
"step": 1025
},
{
"epoch": 2.331818181818182,
"grad_norm": 10.054675102233887,
"learning_rate": 2.2651515151515152e-05,
"loss": 1.2323,
"step": 1026
},
{
"epoch": 2.334090909090909,
"grad_norm": 14.394067764282227,
"learning_rate": 2.257575757575758e-05,
"loss": 2.094,
"step": 1027
},
{
"epoch": 2.3363636363636364,
"grad_norm": 10.581347465515137,
"learning_rate": 2.25e-05,
"loss": 2.2432,
"step": 1028
},
{
"epoch": 2.338636363636364,
"grad_norm": 9.492446899414062,
"learning_rate": 2.2424242424242424e-05,
"loss": 1.3964,
"step": 1029
},
{
"epoch": 2.340909090909091,
"grad_norm": 10.887022972106934,
"learning_rate": 2.234848484848485e-05,
"loss": 2.0411,
"step": 1030
},
{
"epoch": 2.3431818181818183,
"grad_norm": 13.539667129516602,
"learning_rate": 2.2272727272727274e-05,
"loss": 1.3067,
"step": 1031
},
{
"epoch": 2.3454545454545457,
"grad_norm": 9.191630363464355,
"learning_rate": 2.21969696969697e-05,
"loss": 1.266,
"step": 1032
},
{
"epoch": 2.3477272727272727,
"grad_norm": 8.683979034423828,
"learning_rate": 2.2121212121212123e-05,
"loss": 0.8044,
"step": 1033
},
{
"epoch": 2.35,
"grad_norm": 13.170730590820312,
"learning_rate": 2.2045454545454546e-05,
"loss": 2.2811,
"step": 1034
},
{
"epoch": 2.3522727272727275,
"grad_norm": 11.17111873626709,
"learning_rate": 2.1969696969696972e-05,
"loss": 1.3998,
"step": 1035
},
{
"epoch": 2.3545454545454545,
"grad_norm": 11.230095863342285,
"learning_rate": 2.1893939393939395e-05,
"loss": 2.0224,
"step": 1036
},
{
"epoch": 2.356818181818182,
"grad_norm": 11.912615776062012,
"learning_rate": 2.1818181818181818e-05,
"loss": 1.5619,
"step": 1037
},
{
"epoch": 2.359090909090909,
"grad_norm": 10.748661994934082,
"learning_rate": 2.1742424242424244e-05,
"loss": 1.924,
"step": 1038
},
{
"epoch": 2.3613636363636363,
"grad_norm": 9.370635032653809,
"learning_rate": 2.1666666666666667e-05,
"loss": 1.1797,
"step": 1039
},
{
"epoch": 2.3636363636363638,
"grad_norm": 10.01646900177002,
"learning_rate": 2.1590909090909093e-05,
"loss": 2.1678,
"step": 1040
},
{
"epoch": 2.3659090909090907,
"grad_norm": 9.345016479492188,
"learning_rate": 2.1515151515151516e-05,
"loss": 1.4512,
"step": 1041
},
{
"epoch": 2.368181818181818,
"grad_norm": 11.185441970825195,
"learning_rate": 2.143939393939394e-05,
"loss": 1.5958,
"step": 1042
},
{
"epoch": 2.3704545454545456,
"grad_norm": 10.186037063598633,
"learning_rate": 2.1363636363636362e-05,
"loss": 0.8744,
"step": 1043
},
{
"epoch": 2.3727272727272726,
"grad_norm": 16.676177978515625,
"learning_rate": 2.128787878787879e-05,
"loss": 2.0851,
"step": 1044
},
{
"epoch": 2.375,
"grad_norm": 12.497913360595703,
"learning_rate": 2.1212121212121215e-05,
"loss": 1.4765,
"step": 1045
},
{
"epoch": 2.3772727272727274,
"grad_norm": 7.271422386169434,
"learning_rate": 2.1136363636363638e-05,
"loss": 1.0424,
"step": 1046
},
{
"epoch": 2.3795454545454544,
"grad_norm": 14.968780517578125,
"learning_rate": 2.106060606060606e-05,
"loss": 2.1247,
"step": 1047
},
{
"epoch": 2.381818181818182,
"grad_norm": 11.1759672164917,
"learning_rate": 2.0984848484848483e-05,
"loss": 1.5037,
"step": 1048
},
{
"epoch": 2.3840909090909093,
"grad_norm": 9.880687713623047,
"learning_rate": 2.090909090909091e-05,
"loss": 0.8131,
"step": 1049
},
{
"epoch": 2.3863636363636362,
"grad_norm": 7.559080123901367,
"learning_rate": 2.0833333333333336e-05,
"loss": 0.5826,
"step": 1050
},
{
"epoch": 2.3886363636363637,
"grad_norm": 14.357791900634766,
"learning_rate": 2.075757575757576e-05,
"loss": 2.0945,
"step": 1051
},
{
"epoch": 2.390909090909091,
"grad_norm": 11.396363258361816,
"learning_rate": 2.0681818181818182e-05,
"loss": 1.1564,
"step": 1052
},
{
"epoch": 2.393181818181818,
"grad_norm": 11.255867958068848,
"learning_rate": 2.0606060606060608e-05,
"loss": 2.2688,
"step": 1053
},
{
"epoch": 2.3954545454545455,
"grad_norm": 12.590128898620605,
"learning_rate": 2.053030303030303e-05,
"loss": 2.0123,
"step": 1054
},
{
"epoch": 2.3977272727272725,
"grad_norm": 8.069854736328125,
"learning_rate": 2.0454545454545457e-05,
"loss": 1.3967,
"step": 1055
},
{
"epoch": 2.4,
"grad_norm": 12.596185684204102,
"learning_rate": 2.037878787878788e-05,
"loss": 1.6038,
"step": 1056
},
{
"epoch": 2.4022727272727273,
"grad_norm": 10.432991981506348,
"learning_rate": 2.0303030303030303e-05,
"loss": 1.645,
"step": 1057
},
{
"epoch": 2.4045454545454543,
"grad_norm": 10.639815330505371,
"learning_rate": 2.022727272727273e-05,
"loss": 1.5334,
"step": 1058
},
{
"epoch": 2.4068181818181817,
"grad_norm": 8.867145538330078,
"learning_rate": 2.0151515151515152e-05,
"loss": 1.2041,
"step": 1059
},
{
"epoch": 2.409090909090909,
"grad_norm": 9.741902351379395,
"learning_rate": 2.0075757575757575e-05,
"loss": 1.4987,
"step": 1060
},
{
"epoch": 2.411363636363636,
"grad_norm": 9.907489776611328,
"learning_rate": 2e-05,
"loss": 1.299,
"step": 1061
},
{
"epoch": 2.4136363636363636,
"grad_norm": 8.68997859954834,
"learning_rate": 1.9924242424242425e-05,
"loss": 1.2559,
"step": 1062
},
{
"epoch": 2.415909090909091,
"grad_norm": 9.990528106689453,
"learning_rate": 1.984848484848485e-05,
"loss": 2.3812,
"step": 1063
},
{
"epoch": 2.418181818181818,
"grad_norm": 6.777112007141113,
"learning_rate": 1.9772727272727274e-05,
"loss": 1.0051,
"step": 1064
},
{
"epoch": 2.4204545454545454,
"grad_norm": 13.396077156066895,
"learning_rate": 1.9696969696969697e-05,
"loss": 2.4201,
"step": 1065
},
{
"epoch": 2.422727272727273,
"grad_norm": 13.596755981445312,
"learning_rate": 1.962121212121212e-05,
"loss": 2.0457,
"step": 1066
},
{
"epoch": 2.425,
"grad_norm": 10.351893424987793,
"learning_rate": 1.9545454545454546e-05,
"loss": 1.9791,
"step": 1067
},
{
"epoch": 2.4272727272727272,
"grad_norm": 7.505919933319092,
"learning_rate": 1.9469696969696972e-05,
"loss": 1.2944,
"step": 1068
},
{
"epoch": 2.4295454545454547,
"grad_norm": 10.136748313903809,
"learning_rate": 1.9393939393939395e-05,
"loss": 1.2477,
"step": 1069
},
{
"epoch": 2.4318181818181817,
"grad_norm": 8.979276657104492,
"learning_rate": 1.9318181818181818e-05,
"loss": 0.9829,
"step": 1070
},
{
"epoch": 2.434090909090909,
"grad_norm": 11.097721099853516,
"learning_rate": 1.924242424242424e-05,
"loss": 1.5509,
"step": 1071
},
{
"epoch": 2.4363636363636365,
"grad_norm": 10.789654731750488,
"learning_rate": 1.9166666666666667e-05,
"loss": 1.7344,
"step": 1072
},
{
"epoch": 2.4386363636363635,
"grad_norm": 12.25899887084961,
"learning_rate": 1.9090909090909094e-05,
"loss": 2.0121,
"step": 1073
},
{
"epoch": 2.440909090909091,
"grad_norm": 11.828030586242676,
"learning_rate": 1.9015151515151516e-05,
"loss": 1.7356,
"step": 1074
},
{
"epoch": 2.4431818181818183,
"grad_norm": 10.524036407470703,
"learning_rate": 1.893939393939394e-05,
"loss": 1.402,
"step": 1075
},
{
"epoch": 2.4454545454545453,
"grad_norm": 10.572868347167969,
"learning_rate": 1.8863636363636362e-05,
"loss": 1.6468,
"step": 1076
},
{
"epoch": 2.4477272727272728,
"grad_norm": 9.194175720214844,
"learning_rate": 1.878787878787879e-05,
"loss": 1.1557,
"step": 1077
},
{
"epoch": 2.45,
"grad_norm": 11.355244636535645,
"learning_rate": 1.8712121212121215e-05,
"loss": 1.7729,
"step": 1078
},
{
"epoch": 2.452272727272727,
"grad_norm": 10.380278587341309,
"learning_rate": 1.8636363636363638e-05,
"loss": 2.3491,
"step": 1079
},
{
"epoch": 2.4545454545454546,
"grad_norm": 9.57583236694336,
"learning_rate": 1.856060606060606e-05,
"loss": 1.6112,
"step": 1080
},
{
"epoch": 2.456818181818182,
"grad_norm": 12.973028182983398,
"learning_rate": 1.8484848484848487e-05,
"loss": 1.5272,
"step": 1081
},
{
"epoch": 2.459090909090909,
"grad_norm": 9.473404884338379,
"learning_rate": 1.840909090909091e-05,
"loss": 1.2366,
"step": 1082
},
{
"epoch": 2.4613636363636364,
"grad_norm": 9.843785285949707,
"learning_rate": 1.8333333333333333e-05,
"loss": 1.6283,
"step": 1083
},
{
"epoch": 2.463636363636364,
"grad_norm": 13.467684745788574,
"learning_rate": 1.825757575757576e-05,
"loss": 1.5219,
"step": 1084
},
{
"epoch": 2.465909090909091,
"grad_norm": 8.460468292236328,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.8931,
"step": 1085
},
{
"epoch": 2.4681818181818183,
"grad_norm": 8.956411361694336,
"learning_rate": 1.810606060606061e-05,
"loss": 1.1577,
"step": 1086
},
{
"epoch": 2.4704545454545457,
"grad_norm": 10.919206619262695,
"learning_rate": 1.803030303030303e-05,
"loss": 1.719,
"step": 1087
},
{
"epoch": 2.4727272727272727,
"grad_norm": 10.65345287322998,
"learning_rate": 1.7954545454545454e-05,
"loss": 1.5257,
"step": 1088
},
{
"epoch": 2.475,
"grad_norm": 9.616610527038574,
"learning_rate": 1.787878787878788e-05,
"loss": 1.4704,
"step": 1089
},
{
"epoch": 2.4772727272727275,
"grad_norm": 14.458331108093262,
"learning_rate": 1.7803030303030303e-05,
"loss": 1.4181,
"step": 1090
},
{
"epoch": 2.4795454545454545,
"grad_norm": 8.37006664276123,
"learning_rate": 1.772727272727273e-05,
"loss": 1.191,
"step": 1091
},
{
"epoch": 2.481818181818182,
"grad_norm": 13.129170417785645,
"learning_rate": 1.7651515151515153e-05,
"loss": 1.9966,
"step": 1092
},
{
"epoch": 2.484090909090909,
"grad_norm": 12.65162181854248,
"learning_rate": 1.7575757575757576e-05,
"loss": 1.7372,
"step": 1093
},
{
"epoch": 2.4863636363636363,
"grad_norm": 12.132272720336914,
"learning_rate": 1.75e-05,
"loss": 1.9386,
"step": 1094
},
{
"epoch": 2.4886363636363638,
"grad_norm": 11.549707412719727,
"learning_rate": 1.7424242424242425e-05,
"loss": 1.2838,
"step": 1095
},
{
"epoch": 2.4909090909090907,
"grad_norm": 10.115202903747559,
"learning_rate": 1.734848484848485e-05,
"loss": 1.7778,
"step": 1096
},
{
"epoch": 2.493181818181818,
"grad_norm": 14.97376823425293,
"learning_rate": 1.7272727272727274e-05,
"loss": 2.5436,
"step": 1097
},
{
"epoch": 2.4954545454545456,
"grad_norm": 10.270051956176758,
"learning_rate": 1.7196969696969697e-05,
"loss": 1.3943,
"step": 1098
},
{
"epoch": 2.4977272727272726,
"grad_norm": 11.584896087646484,
"learning_rate": 1.712121212121212e-05,
"loss": 1.8023,
"step": 1099
},
{
"epoch": 2.5,
"grad_norm": 11.003795623779297,
"learning_rate": 1.7045454545454546e-05,
"loss": 1.2057,
"step": 1100
},
{
"epoch": 2.5022727272727274,
"grad_norm": 10.495930671691895,
"learning_rate": 1.6969696969696972e-05,
"loss": 1.7265,
"step": 1101
},
{
"epoch": 2.5045454545454544,
"grad_norm": 10.6824951171875,
"learning_rate": 1.6893939393939395e-05,
"loss": 1.4241,
"step": 1102
},
{
"epoch": 2.506818181818182,
"grad_norm": 10.532041549682617,
"learning_rate": 1.6818181818181818e-05,
"loss": 1.4532,
"step": 1103
},
{
"epoch": 2.509090909090909,
"grad_norm": 8.671700477600098,
"learning_rate": 1.674242424242424e-05,
"loss": 1.2539,
"step": 1104
},
{
"epoch": 2.5113636363636362,
"grad_norm": 14.828866004943848,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.4732,
"step": 1105
},
{
"epoch": 2.5136363636363637,
"grad_norm": 11.871790885925293,
"learning_rate": 1.6590909090909094e-05,
"loss": 1.7559,
"step": 1106
},
{
"epoch": 2.5159090909090907,
"grad_norm": 9.144551277160645,
"learning_rate": 1.6515151515151517e-05,
"loss": 1.3562,
"step": 1107
},
{
"epoch": 2.518181818181818,
"grad_norm": 9.856282234191895,
"learning_rate": 1.643939393939394e-05,
"loss": 1.4721,
"step": 1108
},
{
"epoch": 2.5204545454545455,
"grad_norm": 8.48530101776123,
"learning_rate": 1.6363636363636366e-05,
"loss": 1.0045,
"step": 1109
},
{
"epoch": 2.5227272727272725,
"grad_norm": 16.73642349243164,
"learning_rate": 1.628787878787879e-05,
"loss": 2.4458,
"step": 1110
},
{
"epoch": 2.525,
"grad_norm": 10.180378913879395,
"learning_rate": 1.6212121212121212e-05,
"loss": 1.3323,
"step": 1111
},
{
"epoch": 2.5272727272727273,
"grad_norm": 11.56425666809082,
"learning_rate": 1.6136363636363638e-05,
"loss": 2.0303,
"step": 1112
},
{
"epoch": 2.5295454545454543,
"grad_norm": 14.644630432128906,
"learning_rate": 1.606060606060606e-05,
"loss": 1.9247,
"step": 1113
},
{
"epoch": 2.5318181818181817,
"grad_norm": 11.767682075500488,
"learning_rate": 1.5984848484848487e-05,
"loss": 1.7903,
"step": 1114
},
{
"epoch": 2.534090909090909,
"grad_norm": 11.074971199035645,
"learning_rate": 1.590909090909091e-05,
"loss": 2.0781,
"step": 1115
},
{
"epoch": 2.536363636363636,
"grad_norm": 13.846643447875977,
"learning_rate": 1.5833333333333333e-05,
"loss": 1.2449,
"step": 1116
},
{
"epoch": 2.5386363636363636,
"grad_norm": 12.496777534484863,
"learning_rate": 1.5757575757575756e-05,
"loss": 1.287,
"step": 1117
},
{
"epoch": 2.540909090909091,
"grad_norm": 8.406025886535645,
"learning_rate": 1.5681818181818182e-05,
"loss": 1.4133,
"step": 1118
},
{
"epoch": 2.543181818181818,
"grad_norm": 9.715517044067383,
"learning_rate": 1.560606060606061e-05,
"loss": 1.6738,
"step": 1119
},
{
"epoch": 2.5454545454545454,
"grad_norm": 14.14928913116455,
"learning_rate": 1.553030303030303e-05,
"loss": 1.9505,
"step": 1120
},
{
"epoch": 2.547727272727273,
"grad_norm": 10.110836029052734,
"learning_rate": 1.5454545454545454e-05,
"loss": 1.4759,
"step": 1121
},
{
"epoch": 2.55,
"grad_norm": 15.94524097442627,
"learning_rate": 1.5378787878787877e-05,
"loss": 1.7516,
"step": 1122
},
{
"epoch": 2.5522727272727272,
"grad_norm": 16.20330047607422,
"learning_rate": 1.5303030303030304e-05,
"loss": 2.1093,
"step": 1123
},
{
"epoch": 2.5545454545454547,
"grad_norm": 8.647255897521973,
"learning_rate": 1.5227272727272728e-05,
"loss": 1.0308,
"step": 1124
},
{
"epoch": 2.5568181818181817,
"grad_norm": 8.955947875976562,
"learning_rate": 1.5151515151515153e-05,
"loss": 1.0129,
"step": 1125
},
{
"epoch": 2.559090909090909,
"grad_norm": 12.877582550048828,
"learning_rate": 1.5075757575757576e-05,
"loss": 1.4853,
"step": 1126
},
{
"epoch": 2.5613636363636365,
"grad_norm": 14.299208641052246,
"learning_rate": 1.5e-05,
"loss": 2.0464,
"step": 1127
},
{
"epoch": 2.5636363636363635,
"grad_norm": 14.365765571594238,
"learning_rate": 1.4924242424242423e-05,
"loss": 1.9381,
"step": 1128
},
{
"epoch": 2.565909090909091,
"grad_norm": 10.231593132019043,
"learning_rate": 1.484848484848485e-05,
"loss": 1.6777,
"step": 1129
},
{
"epoch": 2.5681818181818183,
"grad_norm": 14.259530067443848,
"learning_rate": 1.4772727272727274e-05,
"loss": 1.6438,
"step": 1130
},
{
"epoch": 2.5704545454545453,
"grad_norm": 13.114981651306152,
"learning_rate": 1.4696969696969697e-05,
"loss": 1.3336,
"step": 1131
},
{
"epoch": 2.5727272727272728,
"grad_norm": 9.463297843933105,
"learning_rate": 1.4621212121212122e-05,
"loss": 1.203,
"step": 1132
},
{
"epoch": 2.575,
"grad_norm": 9.805520057678223,
"learning_rate": 1.4545454545454545e-05,
"loss": 1.2487,
"step": 1133
},
{
"epoch": 2.577272727272727,
"grad_norm": 14.853455543518066,
"learning_rate": 1.446969696969697e-05,
"loss": 1.5734,
"step": 1134
},
{
"epoch": 2.5795454545454546,
"grad_norm": 11.86341381072998,
"learning_rate": 1.4393939393939396e-05,
"loss": 1.4835,
"step": 1135
},
{
"epoch": 2.581818181818182,
"grad_norm": 11.581096649169922,
"learning_rate": 1.431818181818182e-05,
"loss": 2.0558,
"step": 1136
},
{
"epoch": 2.584090909090909,
"grad_norm": 12.040521621704102,
"learning_rate": 1.4242424242424243e-05,
"loss": 1.4117,
"step": 1137
},
{
"epoch": 2.5863636363636364,
"grad_norm": 13.00901985168457,
"learning_rate": 1.4166666666666668e-05,
"loss": 2.9511,
"step": 1138
},
{
"epoch": 2.588636363636364,
"grad_norm": 9.332910537719727,
"learning_rate": 1.409090909090909e-05,
"loss": 1.1121,
"step": 1139
},
{
"epoch": 2.590909090909091,
"grad_norm": 10.607443809509277,
"learning_rate": 1.4015151515151515e-05,
"loss": 1.4706,
"step": 1140
},
{
"epoch": 2.5931818181818183,
"grad_norm": 9.47099494934082,
"learning_rate": 1.3939393939393942e-05,
"loss": 1.6907,
"step": 1141
},
{
"epoch": 2.5954545454545457,
"grad_norm": 12.868734359741211,
"learning_rate": 1.3863636363636364e-05,
"loss": 1.334,
"step": 1142
},
{
"epoch": 2.5977272727272727,
"grad_norm": 7.338480472564697,
"learning_rate": 1.3787878787878789e-05,
"loss": 0.6364,
"step": 1143
},
{
"epoch": 2.6,
"grad_norm": 10.434823989868164,
"learning_rate": 1.3712121212121212e-05,
"loss": 1.7292,
"step": 1144
},
{
"epoch": 2.6022727272727275,
"grad_norm": 10.510713577270508,
"learning_rate": 1.3636363636363637e-05,
"loss": 1.555,
"step": 1145
},
{
"epoch": 2.6045454545454545,
"grad_norm": 11.927501678466797,
"learning_rate": 1.3560606060606063e-05,
"loss": 1.7373,
"step": 1146
},
{
"epoch": 2.606818181818182,
"grad_norm": 8.673569679260254,
"learning_rate": 1.3484848484848486e-05,
"loss": 1.3046,
"step": 1147
},
{
"epoch": 2.6090909090909093,
"grad_norm": 9.680171012878418,
"learning_rate": 1.340909090909091e-05,
"loss": 1.2691,
"step": 1148
},
{
"epoch": 2.6113636363636363,
"grad_norm": 20.66661834716797,
"learning_rate": 1.3333333333333333e-05,
"loss": 3.1138,
"step": 1149
},
{
"epoch": 2.6136363636363638,
"grad_norm": 59.59333801269531,
"learning_rate": 1.3257575757575758e-05,
"loss": 1.8486,
"step": 1150
},
{
"epoch": 2.615909090909091,
"grad_norm": 9.416550636291504,
"learning_rate": 1.318181818181818e-05,
"loss": 1.198,
"step": 1151
},
{
"epoch": 2.618181818181818,
"grad_norm": 11.847350120544434,
"learning_rate": 1.3106060606060607e-05,
"loss": 1.494,
"step": 1152
},
{
"epoch": 2.6204545454545456,
"grad_norm": 8.2369966506958,
"learning_rate": 1.3030303030303032e-05,
"loss": 0.8885,
"step": 1153
},
{
"epoch": 2.6227272727272726,
"grad_norm": 13.204099655151367,
"learning_rate": 1.2954545454545455e-05,
"loss": 1.9838,
"step": 1154
},
{
"epoch": 2.625,
"grad_norm": 11.384471893310547,
"learning_rate": 1.287878787878788e-05,
"loss": 1.5648,
"step": 1155
},
{
"epoch": 2.6272727272727274,
"grad_norm": 43.95447540283203,
"learning_rate": 1.2803030303030302e-05,
"loss": 1.6246,
"step": 1156
},
{
"epoch": 2.6295454545454544,
"grad_norm": 12.041752815246582,
"learning_rate": 1.2727272727272727e-05,
"loss": 1.6404,
"step": 1157
},
{
"epoch": 2.631818181818182,
"grad_norm": 13.470951080322266,
"learning_rate": 1.2651515151515153e-05,
"loss": 2.1278,
"step": 1158
},
{
"epoch": 2.634090909090909,
"grad_norm": 12.769510269165039,
"learning_rate": 1.2575757575757578e-05,
"loss": 1.6486,
"step": 1159
},
{
"epoch": 2.6363636363636362,
"grad_norm": 9.455702781677246,
"learning_rate": 1.25e-05,
"loss": 1.5211,
"step": 1160
},
{
"epoch": 2.6386363636363637,
"grad_norm": 13.590509414672852,
"learning_rate": 1.2424242424242424e-05,
"loss": 2.081,
"step": 1161
},
{
"epoch": 2.6409090909090907,
"grad_norm": 12.029936790466309,
"learning_rate": 1.234848484848485e-05,
"loss": 1.6036,
"step": 1162
},
{
"epoch": 2.643181818181818,
"grad_norm": 65.75121307373047,
"learning_rate": 1.2272727272727273e-05,
"loss": 1.5853,
"step": 1163
},
{
"epoch": 2.6454545454545455,
"grad_norm": 13.093693733215332,
"learning_rate": 1.2196969696969697e-05,
"loss": 1.4623,
"step": 1164
},
{
"epoch": 2.6477272727272725,
"grad_norm": 14.704643249511719,
"learning_rate": 1.2121212121212122e-05,
"loss": 1.7431,
"step": 1165
},
{
"epoch": 2.65,
"grad_norm": 10.710149765014648,
"learning_rate": 1.2045454545454547e-05,
"loss": 1.6442,
"step": 1166
},
{
"epoch": 2.6522727272727273,
"grad_norm": 12.05364990234375,
"learning_rate": 1.196969696969697e-05,
"loss": 2.0733,
"step": 1167
},
{
"epoch": 2.6545454545454543,
"grad_norm": 12.834985733032227,
"learning_rate": 1.1893939393939394e-05,
"loss": 2.8648,
"step": 1168
},
{
"epoch": 2.6568181818181817,
"grad_norm": 9.302035331726074,
"learning_rate": 1.1818181818181819e-05,
"loss": 1.1539,
"step": 1169
},
{
"epoch": 2.659090909090909,
"grad_norm": 9.240340232849121,
"learning_rate": 1.1742424242424243e-05,
"loss": 1.5434,
"step": 1170
},
{
"epoch": 2.661363636363636,
"grad_norm": 14.066667556762695,
"learning_rate": 1.1666666666666668e-05,
"loss": 1.7866,
"step": 1171
},
{
"epoch": 2.6636363636363636,
"grad_norm": 10.935914039611816,
"learning_rate": 1.159090909090909e-05,
"loss": 1.4766,
"step": 1172
},
{
"epoch": 2.665909090909091,
"grad_norm": 8.409308433532715,
"learning_rate": 1.1515151515151517e-05,
"loss": 1.3846,
"step": 1173
},
{
"epoch": 2.668181818181818,
"grad_norm": 10.203055381774902,
"learning_rate": 1.143939393939394e-05,
"loss": 1.1693,
"step": 1174
},
{
"epoch": 2.6704545454545454,
"grad_norm": 11.417679786682129,
"learning_rate": 1.1363636363636365e-05,
"loss": 1.9941,
"step": 1175
},
{
"epoch": 2.672727272727273,
"grad_norm": 13.196696281433105,
"learning_rate": 1.128787878787879e-05,
"loss": 1.8474,
"step": 1176
},
{
"epoch": 2.675,
"grad_norm": 11.088204383850098,
"learning_rate": 1.1212121212121212e-05,
"loss": 1.7153,
"step": 1177
},
{
"epoch": 2.6772727272727272,
"grad_norm": 12.048771858215332,
"learning_rate": 1.1136363636363637e-05,
"loss": 2.5212,
"step": 1178
},
{
"epoch": 2.6795454545454547,
"grad_norm": 13.929719924926758,
"learning_rate": 1.1060606060606061e-05,
"loss": 2.3728,
"step": 1179
},
{
"epoch": 2.6818181818181817,
"grad_norm": 10.445011138916016,
"learning_rate": 1.0984848484848486e-05,
"loss": 0.9737,
"step": 1180
},
{
"epoch": 2.684090909090909,
"grad_norm": 14.0521821975708,
"learning_rate": 1.0909090909090909e-05,
"loss": 1.6476,
"step": 1181
},
{
"epoch": 2.6863636363636365,
"grad_norm": 10.526323318481445,
"learning_rate": 1.0833333333333334e-05,
"loss": 1.4206,
"step": 1182
},
{
"epoch": 2.6886363636363635,
"grad_norm": 11.84065055847168,
"learning_rate": 1.0757575757575758e-05,
"loss": 2.5504,
"step": 1183
},
{
"epoch": 2.690909090909091,
"grad_norm": 13.432804107666016,
"learning_rate": 1.0681818181818181e-05,
"loss": 1.1723,
"step": 1184
},
{
"epoch": 2.6931818181818183,
"grad_norm": 10.570472717285156,
"learning_rate": 1.0606060606060607e-05,
"loss": 1.3094,
"step": 1185
},
{
"epoch": 2.6954545454545453,
"grad_norm": 9.313067436218262,
"learning_rate": 1.053030303030303e-05,
"loss": 1.3848,
"step": 1186
},
{
"epoch": 2.6977272727272728,
"grad_norm": 12.77459716796875,
"learning_rate": 1.0454545454545455e-05,
"loss": 1.9546,
"step": 1187
},
{
"epoch": 2.7,
"grad_norm": 12.23890495300293,
"learning_rate": 1.037878787878788e-05,
"loss": 1.858,
"step": 1188
},
{
"epoch": 2.702272727272727,
"grad_norm": 10.90783977508545,
"learning_rate": 1.0303030303030304e-05,
"loss": 1.7215,
"step": 1189
},
{
"epoch": 2.7045454545454546,
"grad_norm": 11.610969543457031,
"learning_rate": 1.0227272727272729e-05,
"loss": 1.3744,
"step": 1190
},
{
"epoch": 2.706818181818182,
"grad_norm": 13.296714782714844,
"learning_rate": 1.0151515151515152e-05,
"loss": 1.3959,
"step": 1191
},
{
"epoch": 2.709090909090909,
"grad_norm": 11.602737426757812,
"learning_rate": 1.0075757575757576e-05,
"loss": 0.9706,
"step": 1192
},
{
"epoch": 2.7113636363636364,
"grad_norm": 8.904767036437988,
"learning_rate": 1e-05,
"loss": 1.1206,
"step": 1193
},
{
"epoch": 2.713636363636364,
"grad_norm": 9.719966888427734,
"learning_rate": 9.924242424242425e-06,
"loss": 1.326,
"step": 1194
},
{
"epoch": 2.715909090909091,
"grad_norm": 11.37736701965332,
"learning_rate": 9.848484848484848e-06,
"loss": 1.2423,
"step": 1195
},
{
"epoch": 2.7181818181818183,
"grad_norm": 8.89704418182373,
"learning_rate": 9.772727272727273e-06,
"loss": 1.5434,
"step": 1196
},
{
"epoch": 2.7204545454545457,
"grad_norm": 11.980868339538574,
"learning_rate": 9.696969696969698e-06,
"loss": 1.9285,
"step": 1197
},
{
"epoch": 2.7227272727272727,
"grad_norm": 20.147335052490234,
"learning_rate": 9.62121212121212e-06,
"loss": 1.9032,
"step": 1198
},
{
"epoch": 2.725,
"grad_norm": 12.508543014526367,
"learning_rate": 9.545454545454547e-06,
"loss": 2.549,
"step": 1199
},
{
"epoch": 2.7272727272727275,
"grad_norm": 15.286222457885742,
"learning_rate": 9.46969696969697e-06,
"loss": 1.7541,
"step": 1200
},
{
"epoch": 2.7295454545454545,
"grad_norm": 9.950079917907715,
"learning_rate": 9.393939393939394e-06,
"loss": 1.0859,
"step": 1201
},
{
"epoch": 2.731818181818182,
"grad_norm": 9.034377098083496,
"learning_rate": 9.318181818181819e-06,
"loss": 1.6942,
"step": 1202
},
{
"epoch": 2.7340909090909093,
"grad_norm": 10.347823143005371,
"learning_rate": 9.242424242424244e-06,
"loss": 0.7853,
"step": 1203
},
{
"epoch": 2.7363636363636363,
"grad_norm": 13.554040908813477,
"learning_rate": 9.166666666666666e-06,
"loss": 1.6867,
"step": 1204
},
{
"epoch": 2.7386363636363638,
"grad_norm": 12.764242172241211,
"learning_rate": 9.090909090909091e-06,
"loss": 1.7983,
"step": 1205
},
{
"epoch": 2.740909090909091,
"grad_norm": 13.305977821350098,
"learning_rate": 9.015151515151516e-06,
"loss": 1.5904,
"step": 1206
},
{
"epoch": 2.743181818181818,
"grad_norm": 16.118629455566406,
"learning_rate": 8.93939393939394e-06,
"loss": 1.593,
"step": 1207
},
{
"epoch": 2.7454545454545456,
"grad_norm": 9.158020973205566,
"learning_rate": 8.863636363636365e-06,
"loss": 1.2809,
"step": 1208
},
{
"epoch": 2.7477272727272726,
"grad_norm": 12.490316390991211,
"learning_rate": 8.787878787878788e-06,
"loss": 1.5405,
"step": 1209
},
{
"epoch": 2.75,
"grad_norm": 12.778218269348145,
"learning_rate": 8.712121212121212e-06,
"loss": 1.4892,
"step": 1210
},
{
"epoch": 2.7522727272727274,
"grad_norm": 11.4492826461792,
"learning_rate": 8.636363636363637e-06,
"loss": 1.2019,
"step": 1211
},
{
"epoch": 2.7545454545454544,
"grad_norm": 13.168742179870605,
"learning_rate": 8.56060606060606e-06,
"loss": 1.6647,
"step": 1212
},
{
"epoch": 2.756818181818182,
"grad_norm": 10.593256950378418,
"learning_rate": 8.484848484848486e-06,
"loss": 1.3455,
"step": 1213
},
{
"epoch": 2.759090909090909,
"grad_norm": 12.997807502746582,
"learning_rate": 8.409090909090909e-06,
"loss": 1.6967,
"step": 1214
},
{
"epoch": 2.7613636363636362,
"grad_norm": 16.37111473083496,
"learning_rate": 8.333333333333334e-06,
"loss": 1.7001,
"step": 1215
},
{
"epoch": 2.7636363636363637,
"grad_norm": 11.749297142028809,
"learning_rate": 8.257575757575758e-06,
"loss": 0.9918,
"step": 1216
},
{
"epoch": 2.7659090909090907,
"grad_norm": 9.196391105651855,
"learning_rate": 8.181818181818183e-06,
"loss": 1.3952,
"step": 1217
},
{
"epoch": 2.768181818181818,
"grad_norm": 7.304767608642578,
"learning_rate": 8.106060606060606e-06,
"loss": 0.9309,
"step": 1218
},
{
"epoch": 2.7704545454545455,
"grad_norm": 11.371389389038086,
"learning_rate": 8.03030303030303e-06,
"loss": 2.2034,
"step": 1219
},
{
"epoch": 2.7727272727272725,
"grad_norm": 10.503549575805664,
"learning_rate": 7.954545454545455e-06,
"loss": 1.0822,
"step": 1220
},
{
"epoch": 2.775,
"grad_norm": 11.071968078613281,
"learning_rate": 7.878787878787878e-06,
"loss": 1.7071,
"step": 1221
},
{
"epoch": 2.7772727272727273,
"grad_norm": 11.416297912597656,
"learning_rate": 7.803030303030304e-06,
"loss": 2.0261,
"step": 1222
},
{
"epoch": 2.7795454545454543,
"grad_norm": 15.829241752624512,
"learning_rate": 7.727272727272727e-06,
"loss": 2.0085,
"step": 1223
},
{
"epoch": 2.7818181818181817,
"grad_norm": 8.403531074523926,
"learning_rate": 7.651515151515152e-06,
"loss": 1.2764,
"step": 1224
},
{
"epoch": 2.784090909090909,
"grad_norm": 11.730886459350586,
"learning_rate": 7.5757575757575764e-06,
"loss": 1.6733,
"step": 1225
},
{
"epoch": 2.786363636363636,
"grad_norm": 13.102418899536133,
"learning_rate": 7.5e-06,
"loss": 2.139,
"step": 1226
},
{
"epoch": 2.7886363636363636,
"grad_norm": 14.804220199584961,
"learning_rate": 7.424242424242425e-06,
"loss": 2.1015,
"step": 1227
},
{
"epoch": 2.790909090909091,
"grad_norm": 11.839103698730469,
"learning_rate": 7.3484848484848486e-06,
"loss": 1.6026,
"step": 1228
},
{
"epoch": 2.793181818181818,
"grad_norm": 17.421327590942383,
"learning_rate": 7.272727272727272e-06,
"loss": 2.7038,
"step": 1229
},
{
"epoch": 2.7954545454545454,
"grad_norm": 14.81433391571045,
"learning_rate": 7.196969696969698e-06,
"loss": 1.702,
"step": 1230
},
{
"epoch": 2.797727272727273,
"grad_norm": 7.195108413696289,
"learning_rate": 7.1212121212121215e-06,
"loss": 0.9022,
"step": 1231
},
{
"epoch": 2.8,
"grad_norm": 9.045830726623535,
"learning_rate": 7.045454545454545e-06,
"loss": 1.0748,
"step": 1232
},
{
"epoch": 2.8022727272727272,
"grad_norm": 11.995684623718262,
"learning_rate": 6.969696969696971e-06,
"loss": 2.5776,
"step": 1233
},
{
"epoch": 2.8045454545454547,
"grad_norm": 10.528661727905273,
"learning_rate": 6.8939393939393945e-06,
"loss": 1.8155,
"step": 1234
},
{
"epoch": 2.8068181818181817,
"grad_norm": 34.72589111328125,
"learning_rate": 6.818181818181818e-06,
"loss": 2.5481,
"step": 1235
},
{
"epoch": 2.809090909090909,
"grad_norm": 8.032730102539062,
"learning_rate": 6.742424242424243e-06,
"loss": 0.736,
"step": 1236
},
{
"epoch": 2.8113636363636365,
"grad_norm": 9.088884353637695,
"learning_rate": 6.666666666666667e-06,
"loss": 1.6364,
"step": 1237
},
{
"epoch": 2.8136363636363635,
"grad_norm": 9.277338027954102,
"learning_rate": 6.59090909090909e-06,
"loss": 1.4521,
"step": 1238
},
{
"epoch": 2.815909090909091,
"grad_norm": 12.458305358886719,
"learning_rate": 6.515151515151516e-06,
"loss": 1.2296,
"step": 1239
},
{
"epoch": 2.8181818181818183,
"grad_norm": 10.594490051269531,
"learning_rate": 6.43939393939394e-06,
"loss": 1.414,
"step": 1240
},
{
"epoch": 2.8204545454545453,
"grad_norm": 10.604024887084961,
"learning_rate": 6.363636363636363e-06,
"loss": 1.7017,
"step": 1241
},
{
"epoch": 2.8227272727272728,
"grad_norm": 10.347737312316895,
"learning_rate": 6.287878787878789e-06,
"loss": 1.2462,
"step": 1242
},
{
"epoch": 2.825,
"grad_norm": 11.151006698608398,
"learning_rate": 6.212121212121212e-06,
"loss": 1.7713,
"step": 1243
},
{
"epoch": 2.827272727272727,
"grad_norm": 12.432381629943848,
"learning_rate": 6.136363636363636e-06,
"loss": 2.7927,
"step": 1244
},
{
"epoch": 2.8295454545454546,
"grad_norm": 12.030777931213379,
"learning_rate": 6.060606060606061e-06,
"loss": 2.1842,
"step": 1245
},
{
"epoch": 2.831818181818182,
"grad_norm": 14.940272331237793,
"learning_rate": 5.984848484848485e-06,
"loss": 1.6475,
"step": 1246
},
{
"epoch": 2.834090909090909,
"grad_norm": 8.027610778808594,
"learning_rate": 5.909090909090909e-06,
"loss": 0.948,
"step": 1247
},
{
"epoch": 2.8363636363636364,
"grad_norm": 12.356363296508789,
"learning_rate": 5.833333333333334e-06,
"loss": 1.6191,
"step": 1248
},
{
"epoch": 2.838636363636364,
"grad_norm": 12.225868225097656,
"learning_rate": 5.7575757575757586e-06,
"loss": 1.2056,
"step": 1249
},
{
"epoch": 2.840909090909091,
"grad_norm": 11.615985870361328,
"learning_rate": 5.681818181818182e-06,
"loss": 1.5477,
"step": 1250
},
{
"epoch": 2.8431818181818183,
"grad_norm": 13.92235279083252,
"learning_rate": 5.606060606060606e-06,
"loss": 2.401,
"step": 1251
},
{
"epoch": 2.8454545454545457,
"grad_norm": 19.311002731323242,
"learning_rate": 5.530303030303031e-06,
"loss": 2.2211,
"step": 1252
},
{
"epoch": 2.8477272727272727,
"grad_norm": 9.447689056396484,
"learning_rate": 5.4545454545454545e-06,
"loss": 1.2734,
"step": 1253
},
{
"epoch": 2.85,
"grad_norm": 10.197713851928711,
"learning_rate": 5.378787878787879e-06,
"loss": 0.878,
"step": 1254
},
{
"epoch": 2.8522727272727275,
"grad_norm": 14.826508522033691,
"learning_rate": 5.303030303030304e-06,
"loss": 1.6759,
"step": 1255
},
{
"epoch": 2.8545454545454545,
"grad_norm": 10.666242599487305,
"learning_rate": 5.2272727272727274e-06,
"loss": 2.1974,
"step": 1256
},
{
"epoch": 2.856818181818182,
"grad_norm": 13.020369529724121,
"learning_rate": 5.151515151515152e-06,
"loss": 1.4073,
"step": 1257
},
{
"epoch": 2.8590909090909093,
"grad_norm": 14.27531623840332,
"learning_rate": 5.075757575757576e-06,
"loss": 2.1165,
"step": 1258
},
{
"epoch": 2.8613636363636363,
"grad_norm": 11.82662296295166,
"learning_rate": 5e-06,
"loss": 0.7765,
"step": 1259
},
{
"epoch": 2.8636363636363638,
"grad_norm": 12.107914924621582,
"learning_rate": 4.924242424242424e-06,
"loss": 1.2762,
"step": 1260
},
{
"epoch": 2.865909090909091,
"grad_norm": 10.041885375976562,
"learning_rate": 4.848484848484849e-06,
"loss": 2.1775,
"step": 1261
},
{
"epoch": 2.868181818181818,
"grad_norm": 11.078441619873047,
"learning_rate": 4.772727272727273e-06,
"loss": 1.6073,
"step": 1262
},
{
"epoch": 2.8704545454545456,
"grad_norm": 9.000492095947266,
"learning_rate": 4.696969696969697e-06,
"loss": 1.4636,
"step": 1263
},
{
"epoch": 2.8727272727272726,
"grad_norm": 11.069653511047363,
"learning_rate": 4.621212121212122e-06,
"loss": 1.4654,
"step": 1264
},
{
"epoch": 2.875,
"grad_norm": 9.110404968261719,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.8338,
"step": 1265
},
{
"epoch": 2.8772727272727274,
"grad_norm": 16.761194229125977,
"learning_rate": 4.46969696969697e-06,
"loss": 1.0709,
"step": 1266
},
{
"epoch": 2.8795454545454544,
"grad_norm": 13.67717170715332,
"learning_rate": 4.393939393939394e-06,
"loss": 2.0994,
"step": 1267
},
{
"epoch": 2.881818181818182,
"grad_norm": 8.258940696716309,
"learning_rate": 4.3181818181818185e-06,
"loss": 1.2818,
"step": 1268
},
{
"epoch": 2.884090909090909,
"grad_norm": 12.960264205932617,
"learning_rate": 4.242424242424243e-06,
"loss": 1.9218,
"step": 1269
},
{
"epoch": 2.8863636363636362,
"grad_norm": 10.886972427368164,
"learning_rate": 4.166666666666667e-06,
"loss": 1.4611,
"step": 1270
},
{
"epoch": 2.8886363636363637,
"grad_norm": 10.516489028930664,
"learning_rate": 4.0909090909090915e-06,
"loss": 2.3418,
"step": 1271
},
{
"epoch": 2.8909090909090907,
"grad_norm": 12.977254867553711,
"learning_rate": 4.015151515151515e-06,
"loss": 1.5361,
"step": 1272
},
{
"epoch": 2.893181818181818,
"grad_norm": 14.605803489685059,
"learning_rate": 3.939393939393939e-06,
"loss": 1.6679,
"step": 1273
},
{
"epoch": 2.8954545454545455,
"grad_norm": 17.729450225830078,
"learning_rate": 3.863636363636364e-06,
"loss": 1.468,
"step": 1274
},
{
"epoch": 2.8977272727272725,
"grad_norm": 10.65392780303955,
"learning_rate": 3.7878787878787882e-06,
"loss": 1.8606,
"step": 1275
},
{
"epoch": 2.9,
"grad_norm": 18.738691329956055,
"learning_rate": 3.7121212121212124e-06,
"loss": 2.7391,
"step": 1276
},
{
"epoch": 2.9022727272727273,
"grad_norm": 11.129204750061035,
"learning_rate": 3.636363636363636e-06,
"loss": 1.4911,
"step": 1277
},
{
"epoch": 2.9045454545454543,
"grad_norm": 10.117977142333984,
"learning_rate": 3.5606060606060608e-06,
"loss": 1.0915,
"step": 1278
},
{
"epoch": 2.9068181818181817,
"grad_norm": 9.391002655029297,
"learning_rate": 3.4848484848484854e-06,
"loss": 1.1659,
"step": 1279
},
{
"epoch": 2.909090909090909,
"grad_norm": 10.86440372467041,
"learning_rate": 3.409090909090909e-06,
"loss": 1.4967,
"step": 1280
},
{
"epoch": 2.911363636363636,
"grad_norm": 11.438384056091309,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.6597,
"step": 1281
},
{
"epoch": 2.9136363636363636,
"grad_norm": 13.486211776733398,
"learning_rate": 3.257575757575758e-06,
"loss": 1.947,
"step": 1282
},
{
"epoch": 2.915909090909091,
"grad_norm": 13.491000175476074,
"learning_rate": 3.1818181818181817e-06,
"loss": 2.4163,
"step": 1283
},
{
"epoch": 2.918181818181818,
"grad_norm": 10.710677146911621,
"learning_rate": 3.106060606060606e-06,
"loss": 1.8073,
"step": 1284
},
{
"epoch": 2.9204545454545454,
"grad_norm": 12.062322616577148,
"learning_rate": 3.0303030303030305e-06,
"loss": 1.9969,
"step": 1285
},
{
"epoch": 2.922727272727273,
"grad_norm": 70.31402587890625,
"learning_rate": 2.9545454545454547e-06,
"loss": 1.3767,
"step": 1286
},
{
"epoch": 2.925,
"grad_norm": 9.519462585449219,
"learning_rate": 2.8787878787878793e-06,
"loss": 1.4795,
"step": 1287
},
{
"epoch": 2.9272727272727272,
"grad_norm": 13.316557884216309,
"learning_rate": 2.803030303030303e-06,
"loss": 0.858,
"step": 1288
},
{
"epoch": 2.9295454545454547,
"grad_norm": 11.898123741149902,
"learning_rate": 2.7272727272727272e-06,
"loss": 1.7807,
"step": 1289
},
{
"epoch": 2.9318181818181817,
"grad_norm": 13.429510116577148,
"learning_rate": 2.651515151515152e-06,
"loss": 1.7467,
"step": 1290
},
{
"epoch": 2.934090909090909,
"grad_norm": 34.4333381652832,
"learning_rate": 2.575757575757576e-06,
"loss": 1.6774,
"step": 1291
},
{
"epoch": 2.9363636363636365,
"grad_norm": 8.44999885559082,
"learning_rate": 2.5e-06,
"loss": 0.8595,
"step": 1292
},
{
"epoch": 2.9386363636363635,
"grad_norm": 9.824548721313477,
"learning_rate": 2.4242424242424244e-06,
"loss": 1.551,
"step": 1293
},
{
"epoch": 2.940909090909091,
"grad_norm": 10.713866233825684,
"learning_rate": 2.3484848484848486e-06,
"loss": 1.4604,
"step": 1294
},
{
"epoch": 2.9431818181818183,
"grad_norm": 18.695775985717773,
"learning_rate": 2.2727272727272728e-06,
"loss": 2.8512,
"step": 1295
},
{
"epoch": 2.9454545454545453,
"grad_norm": 9.289727210998535,
"learning_rate": 2.196969696969697e-06,
"loss": 1.3539,
"step": 1296
},
{
"epoch": 2.9477272727272728,
"grad_norm": 7.917882442474365,
"learning_rate": 2.1212121212121216e-06,
"loss": 1.2179,
"step": 1297
},
{
"epoch": 2.95,
"grad_norm": 16.269927978515625,
"learning_rate": 2.0454545454545457e-06,
"loss": 1.8904,
"step": 1298
},
{
"epoch": 2.952272727272727,
"grad_norm": 11.293408393859863,
"learning_rate": 1.9696969696969695e-06,
"loss": 1.4438,
"step": 1299
},
{
"epoch": 2.9545454545454546,
"grad_norm": 14.2405424118042,
"learning_rate": 1.8939393939393941e-06,
"loss": 2.2578,
"step": 1300
},
{
"epoch": 2.956818181818182,
"grad_norm": 9.712430953979492,
"learning_rate": 1.818181818181818e-06,
"loss": 1.1685,
"step": 1301
},
{
"epoch": 2.959090909090909,
"grad_norm": 14.34041690826416,
"learning_rate": 1.7424242424242427e-06,
"loss": 1.9741,
"step": 1302
},
{
"epoch": 2.9613636363636364,
"grad_norm": 12.20971965789795,
"learning_rate": 1.6666666666666667e-06,
"loss": 2.283,
"step": 1303
},
{
"epoch": 2.963636363636364,
"grad_norm": 13.051138877868652,
"learning_rate": 1.5909090909090908e-06,
"loss": 2.3128,
"step": 1304
},
{
"epoch": 2.965909090909091,
"grad_norm": 11.069129943847656,
"learning_rate": 1.5151515151515152e-06,
"loss": 1.4379,
"step": 1305
},
{
"epoch": 2.9681818181818183,
"grad_norm": 10.655563354492188,
"learning_rate": 1.4393939393939396e-06,
"loss": 1.4726,
"step": 1306
},
{
"epoch": 2.9704545454545457,
"grad_norm": 9.674460411071777,
"learning_rate": 1.3636363636363636e-06,
"loss": 1.2689,
"step": 1307
},
{
"epoch": 2.9727272727272727,
"grad_norm": 10.24626636505127,
"learning_rate": 1.287878787878788e-06,
"loss": 1.2585,
"step": 1308
},
{
"epoch": 2.975,
"grad_norm": 13.117413520812988,
"learning_rate": 1.2121212121212122e-06,
"loss": 1.8019,
"step": 1309
},
{
"epoch": 2.9772727272727275,
"grad_norm": 11.649164199829102,
"learning_rate": 1.1363636363636364e-06,
"loss": 1.375,
"step": 1310
},
{
"epoch": 2.9795454545454545,
"grad_norm": 11.054950714111328,
"learning_rate": 1.0606060606060608e-06,
"loss": 1.7139,
"step": 1311
},
{
"epoch": 2.981818181818182,
"grad_norm": 9.476350784301758,
"learning_rate": 9.848484848484847e-07,
"loss": 1.1851,
"step": 1312
},
{
"epoch": 2.9840909090909093,
"grad_norm": 9.467584609985352,
"learning_rate": 9.09090909090909e-07,
"loss": 1.0272,
"step": 1313
},
{
"epoch": 2.9863636363636363,
"grad_norm": 11.783283233642578,
"learning_rate": 8.333333333333333e-07,
"loss": 1.886,
"step": 1314
},
{
"epoch": 2.9886363636363638,
"grad_norm": 11.245438575744629,
"learning_rate": 7.575757575757576e-07,
"loss": 1.2872,
"step": 1315
},
{
"epoch": 2.990909090909091,
"grad_norm": 12.71106243133545,
"learning_rate": 6.818181818181818e-07,
"loss": 1.3681,
"step": 1316
},
{
"epoch": 2.993181818181818,
"grad_norm": 11.738058090209961,
"learning_rate": 6.060606060606061e-07,
"loss": 1.9274,
"step": 1317
},
{
"epoch": 2.9954545454545456,
"grad_norm": 12.179485321044922,
"learning_rate": 5.303030303030304e-07,
"loss": 1.6056,
"step": 1318
},
{
"epoch": 2.9977272727272726,
"grad_norm": 9.123523712158203,
"learning_rate": 4.545454545454545e-07,
"loss": 1.2402,
"step": 1319
},
{
"epoch": 3.0,
"grad_norm": 17.10702133178711,
"learning_rate": 3.787878787878788e-07,
"loss": 1.7438,
"step": 1320
},
{
"epoch": 3.0,
"eval_f1": 0.8924,
"eval_gen_len": 41.8818,
"eval_loss": 1.7954092025756836,
"eval_precision": 0.8906,
"eval_recall": 0.8943,
"eval_rouge1": 0.4651,
"eval_rouge2": 0.218,
"eval_rougeL": 0.3904,
"eval_rougeLsum": 0.4291,
"eval_runtime": 28.6293,
"eval_samples_per_second": 3.842,
"eval_steps_per_second": 0.489,
"step": 1320
},
{
"epoch": 3.0,
"step": 1320,
"total_flos": 2659801069854720.0,
"train_loss": 1.8849294849868976,
"train_runtime": 574.0732,
"train_samples_per_second": 4.593,
"train_steps_per_second": 2.299
}
],
"logging_steps": 1,
"max_steps": 1320,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2659801069854720.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}