bloomz-1b1-vn-chat / last-checkpoint /trainer_state.json
Femboyuwu2000's picture
Training in progress, step 18580, checkpoint
6fe79b7 verified
raw
history blame
149 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.26293817132021,
"eval_steps": 500,
"global_step": 18580,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 58.19491958618164,
"learning_rate": 1.6666666666666667e-06,
"loss": 4.5462,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 51.19196319580078,
"learning_rate": 3.3333333333333333e-06,
"loss": 4.6693,
"step": 40
},
{
"epoch": 0.0,
"grad_norm": 45.4248161315918,
"learning_rate": 5e-06,
"loss": 4.6065,
"step": 60
},
{
"epoch": 0.0,
"grad_norm": 57.08290100097656,
"learning_rate": 6.666666666666667e-06,
"loss": 4.4395,
"step": 80
},
{
"epoch": 0.0,
"grad_norm": 40.65673828125,
"learning_rate": 8.333333333333334e-06,
"loss": 4.4641,
"step": 100
},
{
"epoch": 0.0,
"grad_norm": 40.7547492980957,
"learning_rate": 1e-05,
"loss": 4.4638,
"step": 120
},
{
"epoch": 0.0,
"grad_norm": 40.71052169799805,
"learning_rate": 1.1666666666666668e-05,
"loss": 4.3721,
"step": 140
},
{
"epoch": 0.0,
"grad_norm": 32.69596862792969,
"learning_rate": 1.3333333333333333e-05,
"loss": 4.3784,
"step": 160
},
{
"epoch": 0.0,
"grad_norm": 27.53285026550293,
"learning_rate": 1.5e-05,
"loss": 4.3627,
"step": 180
},
{
"epoch": 0.0,
"grad_norm": 39.0136833190918,
"learning_rate": 1.6666666666666667e-05,
"loss": 4.2018,
"step": 200
},
{
"epoch": 0.0,
"grad_norm": 39.9036750793457,
"learning_rate": 1.8333333333333333e-05,
"loss": 4.1214,
"step": 220
},
{
"epoch": 0.0,
"grad_norm": 26.16208267211914,
"learning_rate": 2e-05,
"loss": 4.0551,
"step": 240
},
{
"epoch": 0.0,
"grad_norm": 35.66220474243164,
"learning_rate": 2.1666666666666667e-05,
"loss": 4.0599,
"step": 260
},
{
"epoch": 0.0,
"grad_norm": 22.310619354248047,
"learning_rate": 2.3333333333333336e-05,
"loss": 4.181,
"step": 280
},
{
"epoch": 0.0,
"grad_norm": 31.29083824157715,
"learning_rate": 2.5e-05,
"loss": 4.0389,
"step": 300
},
{
"epoch": 0.0,
"grad_norm": 18.66942596435547,
"learning_rate": 2.6666666666666667e-05,
"loss": 4.0888,
"step": 320
},
{
"epoch": 0.0,
"grad_norm": 47.483428955078125,
"learning_rate": 2.8333333333333335e-05,
"loss": 4.0918,
"step": 340
},
{
"epoch": 0.01,
"grad_norm": 51.05717468261719,
"learning_rate": 3e-05,
"loss": 3.9807,
"step": 360
},
{
"epoch": 0.01,
"grad_norm": 67.01704406738281,
"learning_rate": 3.1666666666666666e-05,
"loss": 4.0331,
"step": 380
},
{
"epoch": 0.01,
"grad_norm": 40.98155975341797,
"learning_rate": 3.3333333333333335e-05,
"loss": 4.039,
"step": 400
},
{
"epoch": 0.01,
"grad_norm": 29.619321823120117,
"learning_rate": 3.5e-05,
"loss": 4.077,
"step": 420
},
{
"epoch": 0.01,
"grad_norm": 41.605018615722656,
"learning_rate": 3.6666666666666666e-05,
"loss": 4.044,
"step": 440
},
{
"epoch": 0.01,
"grad_norm": 34.36818313598633,
"learning_rate": 3.8333333333333334e-05,
"loss": 3.974,
"step": 460
},
{
"epoch": 0.01,
"grad_norm": 26.917036056518555,
"learning_rate": 4e-05,
"loss": 4.0088,
"step": 480
},
{
"epoch": 0.01,
"grad_norm": 25.219558715820312,
"learning_rate": 4.166666666666667e-05,
"loss": 3.8768,
"step": 500
},
{
"epoch": 0.01,
"grad_norm": 24.45106315612793,
"learning_rate": 4.3333333333333334e-05,
"loss": 3.8979,
"step": 520
},
{
"epoch": 0.01,
"grad_norm": 39.479461669921875,
"learning_rate": 4.5e-05,
"loss": 3.9241,
"step": 540
},
{
"epoch": 0.01,
"grad_norm": 46.96614456176758,
"learning_rate": 4.666666666666667e-05,
"loss": 3.8796,
"step": 560
},
{
"epoch": 0.01,
"grad_norm": 31.622241973876953,
"learning_rate": 4.8333333333333334e-05,
"loss": 3.9045,
"step": 580
},
{
"epoch": 0.01,
"grad_norm": 146.8946990966797,
"learning_rate": 5e-05,
"loss": 3.941,
"step": 600
},
{
"epoch": 0.01,
"grad_norm": 29.78015899658203,
"learning_rate": 4.9999868880914903e-05,
"loss": 3.9279,
"step": 620
},
{
"epoch": 0.01,
"grad_norm": 44.591156005859375,
"learning_rate": 4.999947552503497e-05,
"loss": 3.8695,
"step": 640
},
{
"epoch": 0.01,
"grad_norm": 35.80597686767578,
"learning_rate": 4.9998819936486327e-05,
"loss": 3.9277,
"step": 660
},
{
"epoch": 0.01,
"grad_norm": 35.00313186645508,
"learning_rate": 4.99979021221458e-05,
"loss": 3.881,
"step": 680
},
{
"epoch": 0.01,
"grad_norm": 28.8647403717041,
"learning_rate": 4.999672209164081e-05,
"loss": 3.8286,
"step": 700
},
{
"epoch": 0.01,
"grad_norm": 33.56174087524414,
"learning_rate": 4.999527985734932e-05,
"loss": 3.8631,
"step": 720
},
{
"epoch": 0.01,
"grad_norm": 63.59539794921875,
"learning_rate": 4.999357543439969e-05,
"loss": 3.8931,
"step": 740
},
{
"epoch": 0.01,
"grad_norm": 54.89167785644531,
"learning_rate": 4.999160884067051e-05,
"loss": 3.8953,
"step": 760
},
{
"epoch": 0.01,
"grad_norm": 33.9933967590332,
"learning_rate": 4.998938009679042e-05,
"loss": 3.9113,
"step": 780
},
{
"epoch": 0.01,
"grad_norm": 56.342620849609375,
"learning_rate": 4.998688922613788e-05,
"loss": 3.8079,
"step": 800
},
{
"epoch": 0.01,
"grad_norm": 35.17020797729492,
"learning_rate": 4.998413625484095e-05,
"loss": 3.8289,
"step": 820
},
{
"epoch": 0.01,
"grad_norm": 36.69993209838867,
"learning_rate": 4.998112121177699e-05,
"loss": 3.9726,
"step": 840
},
{
"epoch": 0.01,
"grad_norm": 41.2137565612793,
"learning_rate": 4.997784412857239e-05,
"loss": 3.8602,
"step": 860
},
{
"epoch": 0.01,
"grad_norm": 49.4541130065918,
"learning_rate": 4.99743050396022e-05,
"loss": 3.8549,
"step": 880
},
{
"epoch": 0.01,
"grad_norm": 40.87107849121094,
"learning_rate": 4.997050398198977e-05,
"loss": 3.7832,
"step": 900
},
{
"epoch": 0.01,
"grad_norm": 31.820924758911133,
"learning_rate": 4.9966440995606415e-05,
"loss": 3.8991,
"step": 920
},
{
"epoch": 0.01,
"grad_norm": 37.09877395629883,
"learning_rate": 4.9962116123070924e-05,
"loss": 3.9486,
"step": 940
},
{
"epoch": 0.01,
"grad_norm": 40.25444412231445,
"learning_rate": 4.995752940974918e-05,
"loss": 3.848,
"step": 960
},
{
"epoch": 0.01,
"grad_norm": 38.95152282714844,
"learning_rate": 4.9952680903753627e-05,
"loss": 3.723,
"step": 980
},
{
"epoch": 0.01,
"grad_norm": 52.44506072998047,
"learning_rate": 4.9947570655942796e-05,
"loss": 3.864,
"step": 1000
},
{
"epoch": 0.01,
"grad_norm": 59.793373107910156,
"learning_rate": 4.994219871992077e-05,
"loss": 3.794,
"step": 1020
},
{
"epoch": 0.01,
"grad_norm": 40.9141960144043,
"learning_rate": 4.993656515203662e-05,
"loss": 3.8384,
"step": 1040
},
{
"epoch": 0.02,
"grad_norm": 33.75545883178711,
"learning_rate": 4.99306700113838e-05,
"loss": 3.8811,
"step": 1060
},
{
"epoch": 0.02,
"grad_norm": 30.463613510131836,
"learning_rate": 4.9924513359799554e-05,
"loss": 3.7411,
"step": 1080
},
{
"epoch": 0.02,
"grad_norm": 36.24667739868164,
"learning_rate": 4.991809526186424e-05,
"loss": 3.8915,
"step": 1100
},
{
"epoch": 0.02,
"grad_norm": 35.77268600463867,
"learning_rate": 4.991141578490066e-05,
"loss": 3.7547,
"step": 1120
},
{
"epoch": 0.02,
"grad_norm": 43.09757995605469,
"learning_rate": 4.990447499897339e-05,
"loss": 3.8161,
"step": 1140
},
{
"epoch": 0.02,
"grad_norm": 67.45648956298828,
"learning_rate": 4.989727297688797e-05,
"loss": 3.9635,
"step": 1160
},
{
"epoch": 0.02,
"grad_norm": 31.597640991210938,
"learning_rate": 4.98898097941902e-05,
"loss": 3.8912,
"step": 1180
},
{
"epoch": 0.02,
"grad_norm": 41.68192672729492,
"learning_rate": 4.988208552916535e-05,
"loss": 3.8112,
"step": 1200
},
{
"epoch": 0.02,
"grad_norm": 36.489810943603516,
"learning_rate": 4.9874100262837296e-05,
"loss": 3.7838,
"step": 1220
},
{
"epoch": 0.02,
"grad_norm": 31.755823135375977,
"learning_rate": 4.986585407896772e-05,
"loss": 3.8385,
"step": 1240
},
{
"epoch": 0.02,
"grad_norm": 84.64984130859375,
"learning_rate": 4.985734706405516e-05,
"loss": 3.8727,
"step": 1260
},
{
"epoch": 0.02,
"grad_norm": 32.23849868774414,
"learning_rate": 4.98485793073342e-05,
"loss": 3.8013,
"step": 1280
},
{
"epoch": 0.02,
"grad_norm": 25.90882110595703,
"learning_rate": 4.983955090077444e-05,
"loss": 3.7387,
"step": 1300
},
{
"epoch": 0.02,
"grad_norm": 43.255313873291016,
"learning_rate": 4.9830261939079614e-05,
"loss": 3.8756,
"step": 1320
},
{
"epoch": 0.02,
"grad_norm": 35.833404541015625,
"learning_rate": 4.982071251968652e-05,
"loss": 3.7124,
"step": 1340
},
{
"epoch": 0.02,
"grad_norm": 29.098703384399414,
"learning_rate": 4.981090274276406e-05,
"loss": 3.8525,
"step": 1360
},
{
"epoch": 0.02,
"grad_norm": 35.16478729248047,
"learning_rate": 4.980083271121214e-05,
"loss": 3.8262,
"step": 1380
},
{
"epoch": 0.02,
"grad_norm": 32.62320327758789,
"learning_rate": 4.9790502530660635e-05,
"loss": 3.8903,
"step": 1400
},
{
"epoch": 0.02,
"grad_norm": 48.55181884765625,
"learning_rate": 4.977991230946824e-05,
"loss": 3.7363,
"step": 1420
},
{
"epoch": 0.02,
"grad_norm": 46.640403747558594,
"learning_rate": 4.976906215872138e-05,
"loss": 3.9682,
"step": 1440
},
{
"epoch": 0.02,
"grad_norm": 32.13254928588867,
"learning_rate": 4.9757952192232985e-05,
"loss": 3.6851,
"step": 1460
},
{
"epoch": 0.02,
"grad_norm": 34.074649810791016,
"learning_rate": 4.9746582526541355e-05,
"loss": 3.7781,
"step": 1480
},
{
"epoch": 0.02,
"grad_norm": 37.383548736572266,
"learning_rate": 4.9734953280908904e-05,
"loss": 3.7182,
"step": 1500
},
{
"epoch": 0.02,
"grad_norm": 45.83818435668945,
"learning_rate": 4.972306457732091e-05,
"loss": 3.7685,
"step": 1520
},
{
"epoch": 0.02,
"grad_norm": 35.88654327392578,
"learning_rate": 4.9710916540484265e-05,
"loss": 3.7627,
"step": 1540
},
{
"epoch": 0.02,
"grad_norm": 29.5416202545166,
"learning_rate": 4.96985092978261e-05,
"loss": 3.8022,
"step": 1560
},
{
"epoch": 0.02,
"grad_norm": 31.974184036254883,
"learning_rate": 4.968584297949255e-05,
"loss": 3.792,
"step": 1580
},
{
"epoch": 0.02,
"grad_norm": 32.32705307006836,
"learning_rate": 4.967291771834727e-05,
"loss": 3.7238,
"step": 1600
},
{
"epoch": 0.02,
"grad_norm": 29.011735916137695,
"learning_rate": 4.9659733649970155e-05,
"loss": 3.7215,
"step": 1620
},
{
"epoch": 0.02,
"grad_norm": 33.73636245727539,
"learning_rate": 4.9646290912655834e-05,
"loss": 3.8132,
"step": 1640
},
{
"epoch": 0.02,
"grad_norm": 38.57840347290039,
"learning_rate": 4.9632589647412265e-05,
"loss": 3.8606,
"step": 1660
},
{
"epoch": 0.02,
"grad_norm": 33.149078369140625,
"learning_rate": 4.9618629997959235e-05,
"loss": 3.7518,
"step": 1680
},
{
"epoch": 0.02,
"grad_norm": 58.5382194519043,
"learning_rate": 4.960441211072686e-05,
"loss": 3.7482,
"step": 1700
},
{
"epoch": 0.02,
"grad_norm": 31.86609649658203,
"learning_rate": 4.958993613485405e-05,
"loss": 3.7683,
"step": 1720
},
{
"epoch": 0.02,
"grad_norm": 28.98000144958496,
"learning_rate": 4.9575202222186945e-05,
"loss": 3.8361,
"step": 1740
},
{
"epoch": 0.02,
"grad_norm": 37.06975555419922,
"learning_rate": 4.956021052727731e-05,
"loss": 3.7297,
"step": 1760
},
{
"epoch": 0.03,
"grad_norm": 44.01863479614258,
"learning_rate": 4.954496120738094e-05,
"loss": 3.8244,
"step": 1780
},
{
"epoch": 0.03,
"grad_norm": 31.08086585998535,
"learning_rate": 4.9529454422455976e-05,
"loss": 3.8144,
"step": 1800
},
{
"epoch": 0.03,
"grad_norm": 36.80121994018555,
"learning_rate": 4.951369033516127e-05,
"loss": 3.7668,
"step": 1820
},
{
"epoch": 0.03,
"grad_norm": 24.225065231323242,
"learning_rate": 4.949766911085461e-05,
"loss": 3.7929,
"step": 1840
},
{
"epoch": 0.03,
"grad_norm": 33.50989532470703,
"learning_rate": 4.948139091759108e-05,
"loss": 3.7897,
"step": 1860
},
{
"epoch": 0.03,
"grad_norm": 26.35730743408203,
"learning_rate": 4.9464855926121225e-05,
"loss": 3.8618,
"step": 1880
},
{
"epoch": 0.03,
"grad_norm": 36.487464904785156,
"learning_rate": 4.944806430988927e-05,
"loss": 3.7205,
"step": 1900
},
{
"epoch": 0.03,
"grad_norm": 35.87200164794922,
"learning_rate": 4.943101624503132e-05,
"loss": 3.8324,
"step": 1920
},
{
"epoch": 0.03,
"grad_norm": 26.013994216918945,
"learning_rate": 4.941371191037354e-05,
"loss": 3.6997,
"step": 1940
},
{
"epoch": 0.03,
"grad_norm": 42.59685134887695,
"learning_rate": 4.939615148743017e-05,
"loss": 3.7085,
"step": 1960
},
{
"epoch": 0.03,
"grad_norm": 65.71659851074219,
"learning_rate": 4.9378335160401766e-05,
"loss": 3.8939,
"step": 1980
},
{
"epoch": 0.03,
"grad_norm": 25.612024307250977,
"learning_rate": 4.936026311617316e-05,
"loss": 3.7231,
"step": 2000
},
{
"epoch": 0.03,
"grad_norm": 28.377412796020508,
"learning_rate": 4.9341935544311536e-05,
"loss": 3.7476,
"step": 2020
},
{
"epoch": 0.03,
"grad_norm": 29.760807037353516,
"learning_rate": 4.9323352637064455e-05,
"loss": 3.8374,
"step": 2040
},
{
"epoch": 0.03,
"grad_norm": 35.875770568847656,
"learning_rate": 4.9304514589357834e-05,
"loss": 3.7073,
"step": 2060
},
{
"epoch": 0.03,
"grad_norm": 26.299306869506836,
"learning_rate": 4.928542159879386e-05,
"loss": 3.736,
"step": 2080
},
{
"epoch": 0.03,
"grad_norm": 40.1691780090332,
"learning_rate": 4.926607386564898e-05,
"loss": 3.7416,
"step": 2100
},
{
"epoch": 0.03,
"grad_norm": 35.2581901550293,
"learning_rate": 4.924647159287176e-05,
"loss": 3.7917,
"step": 2120
},
{
"epoch": 0.03,
"grad_norm": 24.038591384887695,
"learning_rate": 4.9226614986080763e-05,
"loss": 3.7164,
"step": 2140
},
{
"epoch": 0.03,
"grad_norm": 41.96257019042969,
"learning_rate": 4.92065042535624e-05,
"loss": 3.8562,
"step": 2160
},
{
"epoch": 0.03,
"grad_norm": 37.07769775390625,
"learning_rate": 4.918613960626873e-05,
"loss": 3.845,
"step": 2180
},
{
"epoch": 0.03,
"grad_norm": 35.35500717163086,
"learning_rate": 4.916552125781528e-05,
"loss": 3.679,
"step": 2200
},
{
"epoch": 0.03,
"grad_norm": 28.356767654418945,
"learning_rate": 4.914464942447876e-05,
"loss": 3.6217,
"step": 2220
},
{
"epoch": 0.03,
"grad_norm": 32.50172805786133,
"learning_rate": 4.912352432519484e-05,
"loss": 3.8185,
"step": 2240
},
{
"epoch": 0.03,
"grad_norm": 36.33710861206055,
"learning_rate": 4.910214618155579e-05,
"loss": 3.7401,
"step": 2260
},
{
"epoch": 0.03,
"grad_norm": 42.05067443847656,
"learning_rate": 4.908051521780824e-05,
"loss": 3.6782,
"step": 2280
},
{
"epoch": 0.03,
"grad_norm": 37.84385299682617,
"learning_rate": 4.9058631660850765e-05,
"loss": 3.7863,
"step": 2300
},
{
"epoch": 0.03,
"grad_norm": 28.022615432739258,
"learning_rate": 4.90364957402315e-05,
"loss": 3.7804,
"step": 2320
},
{
"epoch": 0.03,
"grad_norm": 38.274173736572266,
"learning_rate": 4.9014107688145804e-05,
"loss": 3.6898,
"step": 2340
},
{
"epoch": 0.03,
"grad_norm": 29.532123565673828,
"learning_rate": 4.899146773943374e-05,
"loss": 3.7521,
"step": 2360
},
{
"epoch": 0.03,
"grad_norm": 48.601417541503906,
"learning_rate": 4.896857613157765e-05,
"loss": 3.646,
"step": 2380
},
{
"epoch": 0.03,
"grad_norm": 31.142457962036133,
"learning_rate": 4.894543310469968e-05,
"loss": 3.7694,
"step": 2400
},
{
"epoch": 0.03,
"grad_norm": 39.75430679321289,
"learning_rate": 4.8922038901559224e-05,
"loss": 3.7673,
"step": 2420
},
{
"epoch": 0.03,
"grad_norm": 46.01137924194336,
"learning_rate": 4.8898393767550405e-05,
"loss": 3.7022,
"step": 2440
},
{
"epoch": 0.03,
"grad_norm": 26.171249389648438,
"learning_rate": 4.887449795069948e-05,
"loss": 3.7917,
"step": 2460
},
{
"epoch": 0.04,
"grad_norm": 46.24589538574219,
"learning_rate": 4.885035170166228e-05,
"loss": 3.7352,
"step": 2480
},
{
"epoch": 0.04,
"grad_norm": 31.69544219970703,
"learning_rate": 4.882595527372152e-05,
"loss": 3.694,
"step": 2500
},
{
"epoch": 0.04,
"grad_norm": 35.99808883666992,
"learning_rate": 4.880130892278419e-05,
"loss": 3.7636,
"step": 2520
},
{
"epoch": 0.04,
"grad_norm": 31.871978759765625,
"learning_rate": 4.877641290737884e-05,
"loss": 3.7472,
"step": 2540
},
{
"epoch": 0.04,
"grad_norm": 35.04158401489258,
"learning_rate": 4.87512674886529e-05,
"loss": 3.7445,
"step": 2560
},
{
"epoch": 0.04,
"grad_norm": 46.71685791015625,
"learning_rate": 4.872587293036991e-05,
"loss": 3.7141,
"step": 2580
},
{
"epoch": 0.04,
"grad_norm": 26.907012939453125,
"learning_rate": 4.870022949890676e-05,
"loss": 3.748,
"step": 2600
},
{
"epoch": 0.04,
"grad_norm": 26.9509334564209,
"learning_rate": 4.867433746325093e-05,
"loss": 3.7635,
"step": 2620
},
{
"epoch": 0.04,
"grad_norm": 26.85176658630371,
"learning_rate": 4.8648197094997616e-05,
"loss": 3.824,
"step": 2640
},
{
"epoch": 0.04,
"grad_norm": 22.88348960876465,
"learning_rate": 4.8621808668346906e-05,
"loss": 3.7504,
"step": 2660
},
{
"epoch": 0.04,
"grad_norm": 27.76841163635254,
"learning_rate": 4.859517246010091e-05,
"loss": 3.8228,
"step": 2680
},
{
"epoch": 0.04,
"grad_norm": 41.46321487426758,
"learning_rate": 4.856828874966086e-05,
"loss": 3.6509,
"step": 2700
},
{
"epoch": 0.04,
"grad_norm": 28.96099090576172,
"learning_rate": 4.854115781902414e-05,
"loss": 3.7377,
"step": 2720
},
{
"epoch": 0.04,
"grad_norm": 38.632015228271484,
"learning_rate": 4.851377995278138e-05,
"loss": 3.8471,
"step": 2740
},
{
"epoch": 0.04,
"grad_norm": 32.76665496826172,
"learning_rate": 4.8486155438113454e-05,
"loss": 3.731,
"step": 2760
},
{
"epoch": 0.04,
"grad_norm": 30.798906326293945,
"learning_rate": 4.845828456478842e-05,
"loss": 3.6953,
"step": 2780
},
{
"epoch": 0.04,
"grad_norm": 35.173606872558594,
"learning_rate": 4.8430167625158595e-05,
"loss": 3.6521,
"step": 2800
},
{
"epoch": 0.04,
"grad_norm": 50.02262496948242,
"learning_rate": 4.840180491415733e-05,
"loss": 3.6999,
"step": 2820
},
{
"epoch": 0.04,
"grad_norm": 33.76813507080078,
"learning_rate": 4.837319672929607e-05,
"loss": 3.7118,
"step": 2840
},
{
"epoch": 0.04,
"grad_norm": 24.56015396118164,
"learning_rate": 4.834434337066112e-05,
"loss": 3.7094,
"step": 2860
},
{
"epoch": 0.04,
"grad_norm": 39.17055892944336,
"learning_rate": 4.8315245140910556e-05,
"loss": 3.799,
"step": 2880
},
{
"epoch": 0.04,
"grad_norm": 29.631614685058594,
"learning_rate": 4.828590234527106e-05,
"loss": 3.7785,
"step": 2900
},
{
"epoch": 0.04,
"grad_norm": 46.83203125,
"learning_rate": 4.825631529153466e-05,
"loss": 3.6311,
"step": 2920
},
{
"epoch": 0.04,
"grad_norm": 34.5321044921875,
"learning_rate": 4.822648429005554e-05,
"loss": 3.7288,
"step": 2940
},
{
"epoch": 0.04,
"grad_norm": 19.74892234802246,
"learning_rate": 4.819640965374681e-05,
"loss": 3.6749,
"step": 2960
},
{
"epoch": 0.04,
"grad_norm": 51.736480712890625,
"learning_rate": 4.8166091698077164e-05,
"loss": 3.8733,
"step": 2980
},
{
"epoch": 0.04,
"grad_norm": 24.50010871887207,
"learning_rate": 4.813553074106761e-05,
"loss": 3.7634,
"step": 3000
},
{
"epoch": 0.04,
"grad_norm": 29.08304214477539,
"learning_rate": 4.810472710328812e-05,
"loss": 3.7277,
"step": 3020
},
{
"epoch": 0.04,
"grad_norm": 55.230377197265625,
"learning_rate": 4.80736811078543e-05,
"loss": 3.7238,
"step": 3040
},
{
"epoch": 0.04,
"grad_norm": 19.770660400390625,
"learning_rate": 4.804239308042392e-05,
"loss": 3.7202,
"step": 3060
},
{
"epoch": 0.04,
"grad_norm": 28.955581665039062,
"learning_rate": 4.8010863349193605e-05,
"loss": 3.7079,
"step": 3080
},
{
"epoch": 0.04,
"grad_norm": 32.6827278137207,
"learning_rate": 4.7979092244895305e-05,
"loss": 3.7488,
"step": 3100
},
{
"epoch": 0.04,
"grad_norm": 28.665210723876953,
"learning_rate": 4.794708010079289e-05,
"loss": 3.6798,
"step": 3120
},
{
"epoch": 0.04,
"grad_norm": 31.36636734008789,
"learning_rate": 4.791482725267857e-05,
"loss": 3.7233,
"step": 3140
},
{
"epoch": 0.04,
"grad_norm": 28.98109245300293,
"learning_rate": 4.7882334038869495e-05,
"loss": 3.8137,
"step": 3160
},
{
"epoch": 0.05,
"grad_norm": 25.13091278076172,
"learning_rate": 4.784960080020408e-05,
"loss": 3.756,
"step": 3180
},
{
"epoch": 0.05,
"grad_norm": 43.819313049316406,
"learning_rate": 4.781662788003851e-05,
"loss": 3.7371,
"step": 3200
},
{
"epoch": 0.05,
"grad_norm": 25.864599227905273,
"learning_rate": 4.7783415624243124e-05,
"loss": 3.604,
"step": 3220
},
{
"epoch": 0.05,
"grad_norm": 38.96342468261719,
"learning_rate": 4.7749964381198765e-05,
"loss": 3.7482,
"step": 3240
},
{
"epoch": 0.05,
"grad_norm": 28.412094116210938,
"learning_rate": 4.7716274501793144e-05,
"loss": 3.6766,
"step": 3260
},
{
"epoch": 0.05,
"grad_norm": 35.93290328979492,
"learning_rate": 4.768234633941716e-05,
"loss": 3.6659,
"step": 3280
},
{
"epoch": 0.05,
"grad_norm": 34.64625930786133,
"learning_rate": 4.764818024996117e-05,
"loss": 3.6739,
"step": 3300
},
{
"epoch": 0.05,
"grad_norm": 32.466495513916016,
"learning_rate": 4.76137765918113e-05,
"loss": 3.7524,
"step": 3320
},
{
"epoch": 0.05,
"grad_norm": 33.156776428222656,
"learning_rate": 4.7579135725845635e-05,
"loss": 3.7571,
"step": 3340
},
{
"epoch": 0.05,
"grad_norm": 48.48731994628906,
"learning_rate": 4.7544258015430463e-05,
"loss": 3.6783,
"step": 3360
},
{
"epoch": 0.05,
"grad_norm": 30.641870498657227,
"learning_rate": 4.750914382641648e-05,
"loss": 3.7549,
"step": 3380
},
{
"epoch": 0.05,
"grad_norm": 31.7097110748291,
"learning_rate": 4.747379352713489e-05,
"loss": 3.6388,
"step": 3400
},
{
"epoch": 0.05,
"grad_norm": 45.476951599121094,
"learning_rate": 4.7438207488393616e-05,
"loss": 3.7421,
"step": 3420
},
{
"epoch": 0.05,
"grad_norm": 42.40350341796875,
"learning_rate": 4.740238608347336e-05,
"loss": 3.771,
"step": 3440
},
{
"epoch": 0.05,
"grad_norm": 26.54286003112793,
"learning_rate": 4.736632968812373e-05,
"loss": 3.6409,
"step": 3460
},
{
"epoch": 0.05,
"grad_norm": 33.44880676269531,
"learning_rate": 4.733003868055923e-05,
"loss": 3.6977,
"step": 3480
},
{
"epoch": 0.05,
"grad_norm": 30.746978759765625,
"learning_rate": 4.7293513441455364e-05,
"loss": 3.6403,
"step": 3500
},
{
"epoch": 0.05,
"grad_norm": 30.616453170776367,
"learning_rate": 4.72567543539446e-05,
"loss": 3.7039,
"step": 3520
},
{
"epoch": 0.05,
"grad_norm": 28.486270904541016,
"learning_rate": 4.721976180361238e-05,
"loss": 3.6331,
"step": 3540
},
{
"epoch": 0.05,
"grad_norm": 31.4039363861084,
"learning_rate": 4.718253617849306e-05,
"loss": 3.6498,
"step": 3560
},
{
"epoch": 0.05,
"grad_norm": 22.35509490966797,
"learning_rate": 4.714507786906581e-05,
"loss": 3.709,
"step": 3580
},
{
"epoch": 0.05,
"grad_norm": 25.957500457763672,
"learning_rate": 4.710738726825059e-05,
"loss": 3.7159,
"step": 3600
},
{
"epoch": 0.05,
"grad_norm": 27.019580841064453,
"learning_rate": 4.706946477140396e-05,
"loss": 3.6971,
"step": 3620
},
{
"epoch": 0.05,
"grad_norm": 32.743896484375,
"learning_rate": 4.703131077631497e-05,
"loss": 3.5543,
"step": 3640
},
{
"epoch": 0.05,
"grad_norm": 30.018753051757812,
"learning_rate": 4.699292568320097e-05,
"loss": 3.6811,
"step": 3660
},
{
"epoch": 0.05,
"grad_norm": 27.54176139831543,
"learning_rate": 4.695430989470343e-05,
"loss": 3.6593,
"step": 3680
},
{
"epoch": 0.05,
"grad_norm": 30.283519744873047,
"learning_rate": 4.69154638158837e-05,
"loss": 3.551,
"step": 3700
},
{
"epoch": 0.05,
"grad_norm": 26.505075454711914,
"learning_rate": 4.687638785421875e-05,
"loss": 3.7794,
"step": 3720
},
{
"epoch": 0.05,
"grad_norm": 26.94403839111328,
"learning_rate": 4.683708241959694e-05,
"loss": 3.6415,
"step": 3740
},
{
"epoch": 0.05,
"grad_norm": 31.006845474243164,
"learning_rate": 4.679754792431368e-05,
"loss": 3.6741,
"step": 3760
},
{
"epoch": 0.05,
"grad_norm": 60.343318939208984,
"learning_rate": 4.675778478306712e-05,
"loss": 3.6502,
"step": 3780
},
{
"epoch": 0.05,
"grad_norm": 52.47261047363281,
"learning_rate": 4.671779341295378e-05,
"loss": 3.6878,
"step": 3800
},
{
"epoch": 0.05,
"grad_norm": 34.15403747558594,
"learning_rate": 4.6677574233464226e-05,
"loss": 3.7464,
"step": 3820
},
{
"epoch": 0.05,
"grad_norm": 21.71308135986328,
"learning_rate": 4.663712766647862e-05,
"loss": 3.6239,
"step": 3840
},
{
"epoch": 0.05,
"grad_norm": 25.242189407348633,
"learning_rate": 4.65964541362623e-05,
"loss": 3.8114,
"step": 3860
},
{
"epoch": 0.05,
"grad_norm": 35.03647232055664,
"learning_rate": 4.655555406946135e-05,
"loss": 3.654,
"step": 3880
},
{
"epoch": 0.06,
"grad_norm": 54.89191818237305,
"learning_rate": 4.6514427895098134e-05,
"loss": 3.6936,
"step": 3900
},
{
"epoch": 0.06,
"grad_norm": 24.903459548950195,
"learning_rate": 4.647307604456674e-05,
"loss": 3.8267,
"step": 3920
},
{
"epoch": 0.06,
"grad_norm": 33.852054595947266,
"learning_rate": 4.643149895162854e-05,
"loss": 3.661,
"step": 3940
},
{
"epoch": 0.06,
"grad_norm": 35.687713623046875,
"learning_rate": 4.6389697052407534e-05,
"loss": 3.67,
"step": 3960
},
{
"epoch": 0.06,
"grad_norm": 29.4704647064209,
"learning_rate": 4.6347670785385884e-05,
"loss": 3.7182,
"step": 3980
},
{
"epoch": 0.06,
"grad_norm": 24.089828491210938,
"learning_rate": 4.630542059139924e-05,
"loss": 3.5781,
"step": 4000
},
{
"epoch": 0.06,
"grad_norm": 34.60494613647461,
"learning_rate": 4.626294691363213e-05,
"loss": 3.7001,
"step": 4020
},
{
"epoch": 0.06,
"grad_norm": 53.43947219848633,
"learning_rate": 4.622025019761336e-05,
"loss": 3.6048,
"step": 4040
},
{
"epoch": 0.06,
"grad_norm": 35.322486877441406,
"learning_rate": 4.617733089121127e-05,
"loss": 3.6201,
"step": 4060
},
{
"epoch": 0.06,
"grad_norm": 47.170005798339844,
"learning_rate": 4.613418944462907e-05,
"loss": 3.7443,
"step": 4080
},
{
"epoch": 0.06,
"grad_norm": 30.616161346435547,
"learning_rate": 4.6090826310400116e-05,
"loss": 3.7685,
"step": 4100
},
{
"epoch": 0.06,
"grad_norm": 24.628185272216797,
"learning_rate": 4.6047241943383176e-05,
"loss": 3.6677,
"step": 4120
},
{
"epoch": 0.06,
"grad_norm": 38.79618453979492,
"learning_rate": 4.600343680075764e-05,
"loss": 3.744,
"step": 4140
},
{
"epoch": 0.06,
"grad_norm": 37.38518524169922,
"learning_rate": 4.595941134201871e-05,
"loss": 3.7101,
"step": 4160
},
{
"epoch": 0.06,
"grad_norm": 29.248828887939453,
"learning_rate": 4.5915166028972624e-05,
"loss": 3.7209,
"step": 4180
},
{
"epoch": 0.06,
"grad_norm": 45.65785217285156,
"learning_rate": 4.587070132573178e-05,
"loss": 3.7605,
"step": 4200
},
{
"epoch": 0.06,
"grad_norm": 24.220314025878906,
"learning_rate": 4.582601769870988e-05,
"loss": 3.6609,
"step": 4220
},
{
"epoch": 0.06,
"grad_norm": 27.00070571899414,
"learning_rate": 4.578111561661702e-05,
"loss": 3.6754,
"step": 4240
},
{
"epoch": 0.06,
"grad_norm": 75.85283660888672,
"learning_rate": 4.573599555045479e-05,
"loss": 3.6605,
"step": 4260
},
{
"epoch": 0.06,
"grad_norm": 29.803096771240234,
"learning_rate": 4.569065797351135e-05,
"loss": 3.6287,
"step": 4280
},
{
"epoch": 0.06,
"grad_norm": 26.4781494140625,
"learning_rate": 4.5645103361356415e-05,
"loss": 3.6301,
"step": 4300
},
{
"epoch": 0.06,
"grad_norm": 47.245357513427734,
"learning_rate": 4.5599332191836316e-05,
"loss": 3.6776,
"step": 4320
},
{
"epoch": 0.06,
"grad_norm": 26.005104064941406,
"learning_rate": 4.555334494506896e-05,
"loss": 3.6756,
"step": 4340
},
{
"epoch": 0.06,
"grad_norm": 35.15077590942383,
"learning_rate": 4.5507142103438794e-05,
"loss": 3.7022,
"step": 4360
},
{
"epoch": 0.06,
"grad_norm": 29.038782119750977,
"learning_rate": 4.546072415159179e-05,
"loss": 3.6325,
"step": 4380
},
{
"epoch": 0.06,
"grad_norm": 30.944393157958984,
"learning_rate": 4.541409157643027e-05,
"loss": 3.6343,
"step": 4400
},
{
"epoch": 0.06,
"grad_norm": 31.153432846069336,
"learning_rate": 4.536724486710791e-05,
"loss": 3.7739,
"step": 4420
},
{
"epoch": 0.06,
"grad_norm": 40.95075225830078,
"learning_rate": 4.53201845150245e-05,
"loss": 3.6558,
"step": 4440
},
{
"epoch": 0.06,
"grad_norm": 30.37499237060547,
"learning_rate": 4.5272911013820876e-05,
"loss": 3.6093,
"step": 4460
},
{
"epoch": 0.06,
"grad_norm": 23.894237518310547,
"learning_rate": 4.522542485937369e-05,
"loss": 3.6415,
"step": 4480
},
{
"epoch": 0.06,
"grad_norm": 69.29508209228516,
"learning_rate": 4.517772654979023e-05,
"loss": 3.696,
"step": 4500
},
{
"epoch": 0.06,
"grad_norm": 31.464527130126953,
"learning_rate": 4.5129816585403206e-05,
"loss": 3.7147,
"step": 4520
},
{
"epoch": 0.06,
"grad_norm": 30.76380729675293,
"learning_rate": 4.508169546876547e-05,
"loss": 3.6428,
"step": 4540
},
{
"epoch": 0.06,
"grad_norm": 27.94367027282715,
"learning_rate": 4.503336370464476e-05,
"loss": 3.7018,
"step": 4560
},
{
"epoch": 0.06,
"grad_norm": 22.166793823242188,
"learning_rate": 4.49848218000184e-05,
"loss": 3.7018,
"step": 4580
},
{
"epoch": 0.07,
"grad_norm": 32.058921813964844,
"learning_rate": 4.493607026406802e-05,
"loss": 3.7035,
"step": 4600
},
{
"epoch": 0.07,
"grad_norm": 28.55988311767578,
"learning_rate": 4.488710960817416e-05,
"loss": 3.7725,
"step": 4620
},
{
"epoch": 0.07,
"grad_norm": 23.51280403137207,
"learning_rate": 4.4837940345910925e-05,
"loss": 3.7238,
"step": 4640
},
{
"epoch": 0.07,
"grad_norm": 37.3757209777832,
"learning_rate": 4.4788562993040614e-05,
"loss": 3.701,
"step": 4660
},
{
"epoch": 0.07,
"grad_norm": 38.56554412841797,
"learning_rate": 4.473897806750829e-05,
"loss": 3.7174,
"step": 4680
},
{
"epoch": 0.07,
"grad_norm": 29.553325653076172,
"learning_rate": 4.4689186089436366e-05,
"loss": 3.627,
"step": 4700
},
{
"epoch": 0.07,
"grad_norm": 33.66290283203125,
"learning_rate": 4.463918758111912e-05,
"loss": 3.6307,
"step": 4720
},
{
"epoch": 0.07,
"grad_norm": 29.957775115966797,
"learning_rate": 4.4588983067017257e-05,
"loss": 3.6157,
"step": 4740
},
{
"epoch": 0.07,
"grad_norm": 35.32748794555664,
"learning_rate": 4.4538573073752365e-05,
"loss": 3.5961,
"step": 4760
},
{
"epoch": 0.07,
"grad_norm": 24.597824096679688,
"learning_rate": 4.448795813010142e-05,
"loss": 3.5881,
"step": 4780
},
{
"epoch": 0.07,
"grad_norm": 26.248044967651367,
"learning_rate": 4.443713876699124e-05,
"loss": 3.6057,
"step": 4800
},
{
"epoch": 0.07,
"grad_norm": 25.942325592041016,
"learning_rate": 4.4386115517492874e-05,
"loss": 3.6286,
"step": 4820
},
{
"epoch": 0.07,
"grad_norm": 42.028316497802734,
"learning_rate": 4.43348889168161e-05,
"loss": 3.6306,
"step": 4840
},
{
"epoch": 0.07,
"grad_norm": 24.317644119262695,
"learning_rate": 4.4283459502303695e-05,
"loss": 3.5992,
"step": 4860
},
{
"epoch": 0.07,
"grad_norm": 43.174903869628906,
"learning_rate": 4.4231827813425885e-05,
"loss": 3.6493,
"step": 4880
},
{
"epoch": 0.07,
"grad_norm": 33.58101272583008,
"learning_rate": 4.417999439177466e-05,
"loss": 3.6843,
"step": 4900
},
{
"epoch": 0.07,
"grad_norm": 34.096824645996094,
"learning_rate": 4.412795978105807e-05,
"loss": 3.6134,
"step": 4920
},
{
"epoch": 0.07,
"grad_norm": 35.04353713989258,
"learning_rate": 4.4075724527094584e-05,
"loss": 3.5916,
"step": 4940
},
{
"epoch": 0.07,
"grad_norm": 28.97658920288086,
"learning_rate": 4.402328917780728e-05,
"loss": 3.6362,
"step": 4960
},
{
"epoch": 0.07,
"grad_norm": 35.05881118774414,
"learning_rate": 4.397065428321817e-05,
"loss": 3.7566,
"step": 4980
},
{
"epoch": 0.07,
"grad_norm": 27.057044982910156,
"learning_rate": 4.391782039544238e-05,
"loss": 3.4967,
"step": 5000
},
{
"epoch": 0.07,
"grad_norm": 22.590089797973633,
"learning_rate": 4.386478806868241e-05,
"loss": 3.6759,
"step": 5020
},
{
"epoch": 0.07,
"grad_norm": 34.77460479736328,
"learning_rate": 4.3811557859222254e-05,
"loss": 3.6893,
"step": 5040
},
{
"epoch": 0.07,
"grad_norm": 24.440248489379883,
"learning_rate": 4.375813032542164e-05,
"loss": 3.7167,
"step": 5060
},
{
"epoch": 0.07,
"grad_norm": 42.91717529296875,
"learning_rate": 4.3704506027710105e-05,
"loss": 3.5893,
"step": 5080
},
{
"epoch": 0.07,
"grad_norm": 34.991634368896484,
"learning_rate": 4.365068552858115e-05,
"loss": 3.5482,
"step": 5100
},
{
"epoch": 0.07,
"grad_norm": 37.62036895751953,
"learning_rate": 4.3596669392586365e-05,
"loss": 3.5972,
"step": 5120
},
{
"epoch": 0.07,
"grad_norm": 29.56283950805664,
"learning_rate": 4.354245818632944e-05,
"loss": 3.6804,
"step": 5140
},
{
"epoch": 0.07,
"grad_norm": 35.37843322753906,
"learning_rate": 4.348805247846027e-05,
"loss": 3.6491,
"step": 5160
},
{
"epoch": 0.07,
"grad_norm": 39.210906982421875,
"learning_rate": 4.343345283966901e-05,
"loss": 3.6268,
"step": 5180
},
{
"epoch": 0.07,
"grad_norm": 26.60144805908203,
"learning_rate": 4.337865984268001e-05,
"loss": 3.6277,
"step": 5200
},
{
"epoch": 0.07,
"grad_norm": 32.668052673339844,
"learning_rate": 4.33236740622459e-05,
"loss": 3.6159,
"step": 5220
},
{
"epoch": 0.07,
"grad_norm": 43.837833404541016,
"learning_rate": 4.326849607514148e-05,
"loss": 3.5939,
"step": 5240
},
{
"epoch": 0.07,
"grad_norm": 20.860111236572266,
"learning_rate": 4.321312646015775e-05,
"loss": 3.624,
"step": 5260
},
{
"epoch": 0.07,
"grad_norm": 24.005277633666992,
"learning_rate": 4.3157565798095753e-05,
"loss": 3.6098,
"step": 5280
},
{
"epoch": 0.08,
"grad_norm": 23.65524673461914,
"learning_rate": 4.3101814671760546e-05,
"loss": 3.6969,
"step": 5300
},
{
"epoch": 0.08,
"grad_norm": 40.98033905029297,
"learning_rate": 4.304587366595506e-05,
"loss": 3.8225,
"step": 5320
},
{
"epoch": 0.08,
"grad_norm": 28.647207260131836,
"learning_rate": 4.298974336747397e-05,
"loss": 3.6742,
"step": 5340
},
{
"epoch": 0.08,
"grad_norm": 20.806941986083984,
"learning_rate": 4.2933424365097564e-05,
"loss": 3.5679,
"step": 5360
},
{
"epoch": 0.08,
"grad_norm": 22.459196090698242,
"learning_rate": 4.287691724958551e-05,
"loss": 3.6389,
"step": 5380
},
{
"epoch": 0.08,
"grad_norm": 23.558490753173828,
"learning_rate": 4.2820222613670736e-05,
"loss": 3.6654,
"step": 5400
},
{
"epoch": 0.08,
"grad_norm": 20.315793991088867,
"learning_rate": 4.276334105205312e-05,
"loss": 3.5976,
"step": 5420
},
{
"epoch": 0.08,
"grad_norm": 21.125396728515625,
"learning_rate": 4.2706273161393327e-05,
"loss": 3.5712,
"step": 5440
},
{
"epoch": 0.08,
"grad_norm": 25.103483200073242,
"learning_rate": 4.2649019540306545e-05,
"loss": 3.616,
"step": 5460
},
{
"epoch": 0.08,
"grad_norm": 23.65394401550293,
"learning_rate": 4.2591580789356156e-05,
"loss": 3.6587,
"step": 5480
},
{
"epoch": 0.08,
"grad_norm": 31.216896057128906,
"learning_rate": 4.253395751104748e-05,
"loss": 3.7161,
"step": 5500
},
{
"epoch": 0.08,
"grad_norm": 28.144855499267578,
"learning_rate": 4.247615030982144e-05,
"loss": 3.6847,
"step": 5520
},
{
"epoch": 0.08,
"grad_norm": 23.597564697265625,
"learning_rate": 4.241815979204822e-05,
"loss": 3.6556,
"step": 5540
},
{
"epoch": 0.08,
"grad_norm": 41.00291061401367,
"learning_rate": 4.2359986566020906e-05,
"loss": 3.7665,
"step": 5560
},
{
"epoch": 0.08,
"grad_norm": 37.05702209472656,
"learning_rate": 4.230163124194913e-05,
"loss": 3.5916,
"step": 5580
},
{
"epoch": 0.08,
"grad_norm": 28.161930084228516,
"learning_rate": 4.224309443195261e-05,
"loss": 3.6887,
"step": 5600
},
{
"epoch": 0.08,
"grad_norm": 31.685361862182617,
"learning_rate": 4.2184376750054786e-05,
"loss": 3.5724,
"step": 5620
},
{
"epoch": 0.08,
"grad_norm": 38.13533020019531,
"learning_rate": 4.2125478812176364e-05,
"loss": 3.664,
"step": 5640
},
{
"epoch": 0.08,
"grad_norm": 20.385272979736328,
"learning_rate": 4.206640123612884e-05,
"loss": 3.73,
"step": 5660
},
{
"epoch": 0.08,
"grad_norm": 30.926259994506836,
"learning_rate": 4.200714464160804e-05,
"loss": 3.6472,
"step": 5680
},
{
"epoch": 0.08,
"grad_norm": 19.820131301879883,
"learning_rate": 4.194770965018758e-05,
"loss": 3.6226,
"step": 5700
},
{
"epoch": 0.08,
"grad_norm": 21.318801879882812,
"learning_rate": 4.188809688531241e-05,
"loss": 3.635,
"step": 5720
},
{
"epoch": 0.08,
"grad_norm": 18.304567337036133,
"learning_rate": 4.182830697229223e-05,
"loss": 3.625,
"step": 5740
},
{
"epoch": 0.08,
"grad_norm": 24.25802230834961,
"learning_rate": 4.176834053829492e-05,
"loss": 3.5844,
"step": 5760
},
{
"epoch": 0.08,
"grad_norm": 53.09843444824219,
"learning_rate": 4.170819821234001e-05,
"loss": 3.7058,
"step": 5780
},
{
"epoch": 0.08,
"grad_norm": 39.87876510620117,
"learning_rate": 4.164788062529203e-05,
"loss": 3.725,
"step": 5800
},
{
"epoch": 0.08,
"grad_norm": 32.36482620239258,
"learning_rate": 4.1587388409853935e-05,
"loss": 3.5355,
"step": 5820
},
{
"epoch": 0.08,
"grad_norm": 28.59760284423828,
"learning_rate": 4.1526722200560445e-05,
"loss": 3.6528,
"step": 5840
},
{
"epoch": 0.08,
"grad_norm": 21.3729305267334,
"learning_rate": 4.146588263377137e-05,
"loss": 3.6428,
"step": 5860
},
{
"epoch": 0.08,
"grad_norm": 20.160661697387695,
"learning_rate": 4.140487034766499e-05,
"loss": 3.6116,
"step": 5880
},
{
"epoch": 0.08,
"grad_norm": 31.58021354675293,
"learning_rate": 4.134368598223132e-05,
"loss": 3.6302,
"step": 5900
},
{
"epoch": 0.08,
"grad_norm": 30.793672561645508,
"learning_rate": 4.128233017926538e-05,
"loss": 3.5663,
"step": 5920
},
{
"epoch": 0.08,
"grad_norm": 22.589147567749023,
"learning_rate": 4.122080358236055e-05,
"loss": 3.6292,
"step": 5940
},
{
"epoch": 0.08,
"grad_norm": 32.27565383911133,
"learning_rate": 4.1159106836901674e-05,
"loss": 3.5806,
"step": 5960
},
{
"epoch": 0.08,
"grad_norm": 37.15829849243164,
"learning_rate": 4.109724059005844e-05,
"loss": 3.5662,
"step": 5980
},
{
"epoch": 0.08,
"grad_norm": 38.23238754272461,
"learning_rate": 4.10352054907785e-05,
"loss": 3.6842,
"step": 6000
},
{
"epoch": 0.09,
"grad_norm": 24.37531089782715,
"learning_rate": 4.0973002189780694e-05,
"loss": 3.6153,
"step": 6020
},
{
"epoch": 0.09,
"grad_norm": 24.309982299804688,
"learning_rate": 4.0910631339548206e-05,
"loss": 3.6502,
"step": 6040
},
{
"epoch": 0.09,
"grad_norm": 24.007654190063477,
"learning_rate": 4.084809359432175e-05,
"loss": 3.7203,
"step": 6060
},
{
"epoch": 0.09,
"grad_norm": 24.977094650268555,
"learning_rate": 4.0785389610092686e-05,
"loss": 3.5413,
"step": 6080
},
{
"epoch": 0.09,
"grad_norm": 27.397930145263672,
"learning_rate": 4.072252004459611e-05,
"loss": 3.5612,
"step": 6100
},
{
"epoch": 0.09,
"grad_norm": 26.012800216674805,
"learning_rate": 4.065948555730405e-05,
"loss": 3.6385,
"step": 6120
},
{
"epoch": 0.09,
"grad_norm": 29.745574951171875,
"learning_rate": 4.0596286809418435e-05,
"loss": 3.6646,
"step": 6140
},
{
"epoch": 0.09,
"grad_norm": 30.76190185546875,
"learning_rate": 4.053292446386422e-05,
"loss": 3.6622,
"step": 6160
},
{
"epoch": 0.09,
"grad_norm": 27.577564239501953,
"learning_rate": 4.046939918528243e-05,
"loss": 3.701,
"step": 6180
},
{
"epoch": 0.09,
"grad_norm": 31.610410690307617,
"learning_rate": 4.0405711640023186e-05,
"loss": 3.5977,
"step": 6200
},
{
"epoch": 0.09,
"grad_norm": 28.61423110961914,
"learning_rate": 4.034186249613869e-05,
"loss": 3.7307,
"step": 6220
},
{
"epoch": 0.09,
"grad_norm": 44.62327575683594,
"learning_rate": 4.027785242337626e-05,
"loss": 3.7055,
"step": 6240
},
{
"epoch": 0.09,
"grad_norm": 32.20371627807617,
"learning_rate": 4.0213682093171254e-05,
"loss": 3.6186,
"step": 6260
},
{
"epoch": 0.09,
"grad_norm": 32.36015701293945,
"learning_rate": 4.014935217864009e-05,
"loss": 3.5798,
"step": 6280
},
{
"epoch": 0.09,
"grad_norm": 36.1356201171875,
"learning_rate": 4.008486335457312e-05,
"loss": 3.6395,
"step": 6300
},
{
"epoch": 0.09,
"grad_norm": 20.485820770263672,
"learning_rate": 4.0020216297427594e-05,
"loss": 3.6075,
"step": 6320
},
{
"epoch": 0.09,
"grad_norm": 21.503564834594727,
"learning_rate": 3.995541168532055e-05,
"loss": 3.6099,
"step": 6340
},
{
"epoch": 0.09,
"grad_norm": 29.125812530517578,
"learning_rate": 3.9890450198021704e-05,
"loss": 3.6665,
"step": 6360
},
{
"epoch": 0.09,
"grad_norm": 24.479976654052734,
"learning_rate": 3.982533251694632e-05,
"loss": 3.7168,
"step": 6380
},
{
"epoch": 0.09,
"grad_norm": 36.184410095214844,
"learning_rate": 3.976005932514807e-05,
"loss": 3.5771,
"step": 6400
},
{
"epoch": 0.09,
"grad_norm": 28.156030654907227,
"learning_rate": 3.969463130731183e-05,
"loss": 3.6353,
"step": 6420
},
{
"epoch": 0.09,
"grad_norm": 25.22379493713379,
"learning_rate": 3.962904914974656e-05,
"loss": 3.5015,
"step": 6440
},
{
"epoch": 0.09,
"grad_norm": 31.427339553833008,
"learning_rate": 3.9563313540378055e-05,
"loss": 3.5712,
"step": 6460
},
{
"epoch": 0.09,
"grad_norm": 19.2696590423584,
"learning_rate": 3.949742516874175e-05,
"loss": 3.5929,
"step": 6480
},
{
"epoch": 0.09,
"grad_norm": 23.234111785888672,
"learning_rate": 3.943138472597549e-05,
"loss": 3.6166,
"step": 6500
},
{
"epoch": 0.09,
"grad_norm": 26.726085662841797,
"learning_rate": 3.936519290481226e-05,
"loss": 3.6748,
"step": 6520
},
{
"epoch": 0.09,
"grad_norm": 34.712257385253906,
"learning_rate": 3.929885039957296e-05,
"loss": 3.64,
"step": 6540
},
{
"epoch": 0.09,
"grad_norm": 25.96158218383789,
"learning_rate": 3.923235790615907e-05,
"loss": 3.6119,
"step": 6560
},
{
"epoch": 0.09,
"grad_norm": 34.04408264160156,
"learning_rate": 3.916571612204537e-05,
"loss": 3.6881,
"step": 6580
},
{
"epoch": 0.09,
"grad_norm": 20.656030654907227,
"learning_rate": 3.909892574627266e-05,
"loss": 3.6462,
"step": 6600
},
{
"epoch": 0.09,
"grad_norm": 23.8648738861084,
"learning_rate": 3.9031987479440367e-05,
"loss": 3.597,
"step": 6620
},
{
"epoch": 0.09,
"grad_norm": 34.48773193359375,
"learning_rate": 3.896490202369924e-05,
"loss": 3.5781,
"step": 6640
},
{
"epoch": 0.09,
"grad_norm": 26.09375762939453,
"learning_rate": 3.8897670082743955e-05,
"loss": 3.5463,
"step": 6660
},
{
"epoch": 0.09,
"grad_norm": 25.49962615966797,
"learning_rate": 3.883029236180577e-05,
"loss": 3.637,
"step": 6680
},
{
"epoch": 0.09,
"grad_norm": 37.70731735229492,
"learning_rate": 3.876276956764509e-05,
"loss": 3.6515,
"step": 6700
},
{
"epoch": 0.1,
"grad_norm": 53.345558166503906,
"learning_rate": 3.8695102408544076e-05,
"loss": 3.521,
"step": 6720
},
{
"epoch": 0.1,
"grad_norm": 34.147884368896484,
"learning_rate": 3.862729159429921e-05,
"loss": 3.6443,
"step": 6740
},
{
"epoch": 0.1,
"grad_norm": 29.45001220703125,
"learning_rate": 3.855933783621384e-05,
"loss": 3.5976,
"step": 6760
},
{
"epoch": 0.1,
"grad_norm": 29.933969497680664,
"learning_rate": 3.849124184709073e-05,
"loss": 3.6396,
"step": 6780
},
{
"epoch": 0.1,
"grad_norm": 38.85334014892578,
"learning_rate": 3.84230043412246e-05,
"loss": 3.6518,
"step": 6800
},
{
"epoch": 0.1,
"grad_norm": 34.85492706298828,
"learning_rate": 3.835462603439458e-05,
"loss": 3.6577,
"step": 6820
},
{
"epoch": 0.1,
"grad_norm": 29.77360725402832,
"learning_rate": 3.828610764385676e-05,
"loss": 3.6026,
"step": 6840
},
{
"epoch": 0.1,
"grad_norm": 38.89609909057617,
"learning_rate": 3.821744988833663e-05,
"loss": 3.6144,
"step": 6860
},
{
"epoch": 0.1,
"grad_norm": 25.664960861206055,
"learning_rate": 3.814865348802157e-05,
"loss": 3.5826,
"step": 6880
},
{
"epoch": 0.1,
"grad_norm": 31.955894470214844,
"learning_rate": 3.807971916455325e-05,
"loss": 3.6973,
"step": 6900
},
{
"epoch": 0.1,
"grad_norm": 23.378131866455078,
"learning_rate": 3.8010647641020115e-05,
"loss": 3.6875,
"step": 6920
},
{
"epoch": 0.1,
"grad_norm": 45.89334487915039,
"learning_rate": 3.794143964194976e-05,
"loss": 3.5457,
"step": 6940
},
{
"epoch": 0.1,
"grad_norm": 32.45075988769531,
"learning_rate": 3.787209589330134e-05,
"loss": 3.5719,
"step": 6960
},
{
"epoch": 0.1,
"grad_norm": 32.06966018676758,
"learning_rate": 3.7802617122457975e-05,
"loss": 3.6324,
"step": 6980
},
{
"epoch": 0.1,
"grad_norm": 27.93364715576172,
"learning_rate": 3.773300405821908e-05,
"loss": 3.6093,
"step": 7000
},
{
"epoch": 0.1,
"grad_norm": 23.111515045166016,
"learning_rate": 3.766325743079277e-05,
"loss": 3.5292,
"step": 7020
},
{
"epoch": 0.1,
"grad_norm": 24.405742645263672,
"learning_rate": 3.759337797178816e-05,
"loss": 3.5969,
"step": 7040
},
{
"epoch": 0.1,
"grad_norm": 37.218467712402344,
"learning_rate": 3.752336641420772e-05,
"loss": 3.653,
"step": 7060
},
{
"epoch": 0.1,
"grad_norm": 32.396522521972656,
"learning_rate": 3.745322349243954e-05,
"loss": 3.6483,
"step": 7080
},
{
"epoch": 0.1,
"grad_norm": 35.53373336791992,
"learning_rate": 3.7382949942249694e-05,
"loss": 3.6356,
"step": 7100
},
{
"epoch": 0.1,
"grad_norm": 33.19758987426758,
"learning_rate": 3.731254650077446e-05,
"loss": 3.6017,
"step": 7120
},
{
"epoch": 0.1,
"grad_norm": 36.60466003417969,
"learning_rate": 3.7242013906512626e-05,
"loss": 3.6246,
"step": 7140
},
{
"epoch": 0.1,
"grad_norm": 21.257328033447266,
"learning_rate": 3.717135289931774e-05,
"loss": 3.6046,
"step": 7160
},
{
"epoch": 0.1,
"grad_norm": 25.697444915771484,
"learning_rate": 3.7100564220390326e-05,
"loss": 3.6154,
"step": 7180
},
{
"epoch": 0.1,
"grad_norm": 28.491622924804688,
"learning_rate": 3.702964861227013e-05,
"loss": 3.6983,
"step": 7200
},
{
"epoch": 0.1,
"grad_norm": 26.819791793823242,
"learning_rate": 3.695860681882832e-05,
"loss": 3.5722,
"step": 7220
},
{
"epoch": 0.1,
"grad_norm": 25.864788055419922,
"learning_rate": 3.6887439585259694e-05,
"loss": 3.6825,
"step": 7240
},
{
"epoch": 0.1,
"grad_norm": 22.492717742919922,
"learning_rate": 3.681614765807486e-05,
"loss": 3.6377,
"step": 7260
},
{
"epoch": 0.1,
"grad_norm": 31.227336883544922,
"learning_rate": 3.6744731785092395e-05,
"loss": 3.5476,
"step": 7280
},
{
"epoch": 0.1,
"grad_norm": 29.010467529296875,
"learning_rate": 3.6673192715431015e-05,
"loss": 3.6285,
"step": 7300
},
{
"epoch": 0.1,
"grad_norm": 38.40274429321289,
"learning_rate": 3.6601531199501714e-05,
"loss": 3.6941,
"step": 7320
},
{
"epoch": 0.1,
"grad_norm": 26.361167907714844,
"learning_rate": 3.652974798899988e-05,
"loss": 3.5772,
"step": 7340
},
{
"epoch": 0.1,
"grad_norm": 30.241390228271484,
"learning_rate": 3.645784383689742e-05,
"loss": 3.5177,
"step": 7360
},
{
"epoch": 0.1,
"grad_norm": 43.579349517822266,
"learning_rate": 3.6385819497434876e-05,
"loss": 3.7467,
"step": 7380
},
{
"epoch": 0.1,
"grad_norm": 47.42546081542969,
"learning_rate": 3.631367572611348e-05,
"loss": 3.6651,
"step": 7400
},
{
"epoch": 0.11,
"grad_norm": 29.63494110107422,
"learning_rate": 3.6241413279687254e-05,
"loss": 3.6368,
"step": 7420
},
{
"epoch": 0.11,
"grad_norm": 23.63068389892578,
"learning_rate": 3.616903291615506e-05,
"loss": 3.4684,
"step": 7440
},
{
"epoch": 0.11,
"grad_norm": 24.25609588623047,
"learning_rate": 3.6096535394752676e-05,
"loss": 3.6177,
"step": 7460
},
{
"epoch": 0.11,
"grad_norm": 23.829919815063477,
"learning_rate": 3.6023921475944794e-05,
"loss": 3.6008,
"step": 7480
},
{
"epoch": 0.11,
"grad_norm": 23.879764556884766,
"learning_rate": 3.595119192141706e-05,
"loss": 3.6926,
"step": 7500
},
{
"epoch": 0.11,
"grad_norm": 22.195941925048828,
"learning_rate": 3.5878347494068084e-05,
"loss": 3.6049,
"step": 7520
},
{
"epoch": 0.11,
"grad_norm": 48.3228645324707,
"learning_rate": 3.580538895800144e-05,
"loss": 3.64,
"step": 7540
},
{
"epoch": 0.11,
"grad_norm": 33.77362823486328,
"learning_rate": 3.5732317078517654e-05,
"loss": 3.573,
"step": 7560
},
{
"epoch": 0.11,
"grad_norm": 29.266658782958984,
"learning_rate": 3.565913262210615e-05,
"loss": 3.6385,
"step": 7580
},
{
"epoch": 0.11,
"grad_norm": 42.985694885253906,
"learning_rate": 3.5585836356437264e-05,
"loss": 3.5987,
"step": 7600
},
{
"epoch": 0.11,
"grad_norm": 28.579496383666992,
"learning_rate": 3.551242905035412e-05,
"loss": 3.6161,
"step": 7620
},
{
"epoch": 0.11,
"grad_norm": 27.196502685546875,
"learning_rate": 3.5438911473864634e-05,
"loss": 3.5763,
"step": 7640
},
{
"epoch": 0.11,
"grad_norm": 28.27582359313965,
"learning_rate": 3.5365284398133405e-05,
"loss": 3.6452,
"step": 7660
},
{
"epoch": 0.11,
"grad_norm": 27.310009002685547,
"learning_rate": 3.52915485954736e-05,
"loss": 3.6718,
"step": 7680
},
{
"epoch": 0.11,
"grad_norm": 18.603565216064453,
"learning_rate": 3.521770483933891e-05,
"loss": 3.7397,
"step": 7700
},
{
"epoch": 0.11,
"grad_norm": 26.25426483154297,
"learning_rate": 3.514375390431539e-05,
"loss": 3.6665,
"step": 7720
},
{
"epoch": 0.11,
"grad_norm": 29.20294952392578,
"learning_rate": 3.506969656611335e-05,
"loss": 3.551,
"step": 7740
},
{
"epoch": 0.11,
"grad_norm": 37.7564697265625,
"learning_rate": 3.4995533601559226e-05,
"loss": 3.58,
"step": 7760
},
{
"epoch": 0.11,
"grad_norm": 25.87001609802246,
"learning_rate": 3.4921265788587435e-05,
"loss": 3.5855,
"step": 7780
},
{
"epoch": 0.11,
"grad_norm": 26.17401123046875,
"learning_rate": 3.484689390623218e-05,
"loss": 3.5951,
"step": 7800
},
{
"epoch": 0.11,
"grad_norm": 29.20701026916504,
"learning_rate": 3.4772418734619324e-05,
"loss": 3.6267,
"step": 7820
},
{
"epoch": 0.11,
"grad_norm": 60.92488098144531,
"learning_rate": 3.4697841054958165e-05,
"loss": 3.5733,
"step": 7840
},
{
"epoch": 0.11,
"grad_norm": 23.196178436279297,
"learning_rate": 3.462316164953328e-05,
"loss": 3.6283,
"step": 7860
},
{
"epoch": 0.11,
"grad_norm": 23.13970184326172,
"learning_rate": 3.45483813016963e-05,
"loss": 3.6558,
"step": 7880
},
{
"epoch": 0.11,
"grad_norm": 36.5677375793457,
"learning_rate": 3.447350079585767e-05,
"loss": 3.8141,
"step": 7900
},
{
"epoch": 0.11,
"grad_norm": 24.820940017700195,
"learning_rate": 3.4398520917478476e-05,
"loss": 3.6439,
"step": 7920
},
{
"epoch": 0.11,
"grad_norm": 19.9990291595459,
"learning_rate": 3.4323442453062174e-05,
"loss": 3.601,
"step": 7940
},
{
"epoch": 0.11,
"grad_norm": 20.419004440307617,
"learning_rate": 3.42482661901463e-05,
"loss": 3.4856,
"step": 7960
},
{
"epoch": 0.11,
"grad_norm": 24.06426429748535,
"learning_rate": 3.417299291729431e-05,
"loss": 3.679,
"step": 7980
},
{
"epoch": 0.11,
"grad_norm": 23.68332862854004,
"learning_rate": 3.409762342408719e-05,
"loss": 3.6538,
"step": 8000
},
{
"epoch": 0.11,
"grad_norm": 22.80304527282715,
"learning_rate": 3.402215850111528e-05,
"loss": 3.6685,
"step": 8020
},
{
"epoch": 0.11,
"grad_norm": 30.03902244567871,
"learning_rate": 3.3946598939969896e-05,
"loss": 3.633,
"step": 8040
},
{
"epoch": 0.11,
"grad_norm": 31.799922943115234,
"learning_rate": 3.38709455332351e-05,
"loss": 3.5756,
"step": 8060
},
{
"epoch": 0.11,
"grad_norm": 29.18169403076172,
"learning_rate": 3.379519907447931e-05,
"loss": 3.5886,
"step": 8080
},
{
"epoch": 0.11,
"grad_norm": 34.412113189697266,
"learning_rate": 3.3719360358247054e-05,
"loss": 3.5254,
"step": 8100
},
{
"epoch": 0.11,
"grad_norm": 38.046695709228516,
"learning_rate": 3.3643430180050574e-05,
"loss": 3.6677,
"step": 8120
},
{
"epoch": 0.12,
"grad_norm": 23.16988182067871,
"learning_rate": 3.35674093363615e-05,
"loss": 3.5864,
"step": 8140
},
{
"epoch": 0.12,
"grad_norm": 59.21152114868164,
"learning_rate": 3.349129862460251e-05,
"loss": 3.4903,
"step": 8160
},
{
"epoch": 0.12,
"grad_norm": 21.080909729003906,
"learning_rate": 3.341509884313897e-05,
"loss": 3.5803,
"step": 8180
},
{
"epoch": 0.12,
"grad_norm": 26.221805572509766,
"learning_rate": 3.333881079127052e-05,
"loss": 3.5238,
"step": 8200
},
{
"epoch": 0.12,
"grad_norm": 21.8948917388916,
"learning_rate": 3.326243526922272e-05,
"loss": 3.5498,
"step": 8220
},
{
"epoch": 0.12,
"grad_norm": 29.98341178894043,
"learning_rate": 3.3185973078138664e-05,
"loss": 3.6218,
"step": 8240
},
{
"epoch": 0.12,
"grad_norm": 21.86969757080078,
"learning_rate": 3.310942502007056e-05,
"loss": 3.5104,
"step": 8260
},
{
"epoch": 0.12,
"grad_norm": 29.3415584564209,
"learning_rate": 3.303279189797131e-05,
"loss": 3.5253,
"step": 8280
},
{
"epoch": 0.12,
"grad_norm": 30.171510696411133,
"learning_rate": 3.29560745156861e-05,
"loss": 3.6886,
"step": 8300
},
{
"epoch": 0.12,
"grad_norm": 24.074813842773438,
"learning_rate": 3.287927367794397e-05,
"loss": 3.6401,
"step": 8320
},
{
"epoch": 0.12,
"grad_norm": 25.059324264526367,
"learning_rate": 3.2802390190349366e-05,
"loss": 3.5847,
"step": 8340
},
{
"epoch": 0.12,
"grad_norm": 19.766672134399414,
"learning_rate": 3.272542485937369e-05,
"loss": 3.5812,
"step": 8360
},
{
"epoch": 0.12,
"grad_norm": 25.08376693725586,
"learning_rate": 3.264837849234685e-05,
"loss": 3.55,
"step": 8380
},
{
"epoch": 0.12,
"grad_norm": 27.044347763061523,
"learning_rate": 3.2571251897448765e-05,
"loss": 3.5347,
"step": 8400
},
{
"epoch": 0.12,
"grad_norm": 23.3479061126709,
"learning_rate": 3.249404588370094e-05,
"loss": 3.5016,
"step": 8420
},
{
"epoch": 0.12,
"grad_norm": 25.586896896362305,
"learning_rate": 3.241676126095792e-05,
"loss": 3.537,
"step": 8440
},
{
"epoch": 0.12,
"grad_norm": 31.54664421081543,
"learning_rate": 3.233939883989882e-05,
"loss": 3.6093,
"step": 8460
},
{
"epoch": 0.12,
"grad_norm": 44.6853141784668,
"learning_rate": 3.226195943201883e-05,
"loss": 3.6135,
"step": 8480
},
{
"epoch": 0.12,
"grad_norm": 43.322757720947266,
"learning_rate": 3.218444384962071e-05,
"loss": 3.6048,
"step": 8500
},
{
"epoch": 0.12,
"grad_norm": 19.633960723876953,
"learning_rate": 3.210685290580622e-05,
"loss": 3.5767,
"step": 8520
},
{
"epoch": 0.12,
"grad_norm": 23.640382766723633,
"learning_rate": 3.202918741446764e-05,
"loss": 3.5961,
"step": 8540
},
{
"epoch": 0.12,
"grad_norm": 46.06730270385742,
"learning_rate": 3.1951448190279255e-05,
"loss": 3.5757,
"step": 8560
},
{
"epoch": 0.12,
"grad_norm": 24.966190338134766,
"learning_rate": 3.187363604868872e-05,
"loss": 3.5488,
"step": 8580
},
{
"epoch": 0.12,
"grad_norm": 61.91409683227539,
"learning_rate": 3.1795751805908573e-05,
"loss": 3.6554,
"step": 8600
},
{
"epoch": 0.12,
"grad_norm": 80.80062103271484,
"learning_rate": 3.171779627890769e-05,
"loss": 3.6226,
"step": 8620
},
{
"epoch": 0.12,
"grad_norm": 20.951128005981445,
"learning_rate": 3.163977028540263e-05,
"loss": 3.6122,
"step": 8640
},
{
"epoch": 0.12,
"grad_norm": 19.875343322753906,
"learning_rate": 3.156167464384917e-05,
"loss": 3.5637,
"step": 8660
},
{
"epoch": 0.12,
"grad_norm": 21.2697811126709,
"learning_rate": 3.1483510173433626e-05,
"loss": 3.537,
"step": 8680
},
{
"epoch": 0.12,
"grad_norm": 22.24051856994629,
"learning_rate": 3.1405277694064305e-05,
"loss": 3.5661,
"step": 8700
},
{
"epoch": 0.12,
"grad_norm": 21.55095863342285,
"learning_rate": 3.1326978026362904e-05,
"loss": 3.5573,
"step": 8720
},
{
"epoch": 0.12,
"grad_norm": 32.11522674560547,
"learning_rate": 3.124861199165588e-05,
"loss": 3.5995,
"step": 8740
},
{
"epoch": 0.12,
"grad_norm": 22.775867462158203,
"learning_rate": 3.117018041196585e-05,
"loss": 3.6436,
"step": 8760
},
{
"epoch": 0.12,
"grad_norm": 23.462509155273438,
"learning_rate": 3.109168411000299e-05,
"loss": 3.601,
"step": 8780
},
{
"epoch": 0.12,
"grad_norm": 23.43865203857422,
"learning_rate": 3.101312390915634e-05,
"loss": 3.6081,
"step": 8800
},
{
"epoch": 0.12,
"grad_norm": 27.17888832092285,
"learning_rate": 3.0934500633485255e-05,
"loss": 3.6257,
"step": 8820
},
{
"epoch": 0.13,
"grad_norm": 31.697662353515625,
"learning_rate": 3.0855815107710666e-05,
"loss": 3.5902,
"step": 8840
},
{
"epoch": 0.13,
"grad_norm": 37.07548904418945,
"learning_rate": 3.0777068157206536e-05,
"loss": 3.6514,
"step": 8860
},
{
"epoch": 0.13,
"grad_norm": 20.554109573364258,
"learning_rate": 3.069826060799109e-05,
"loss": 3.5068,
"step": 8880
},
{
"epoch": 0.13,
"grad_norm": 22.490015029907227,
"learning_rate": 3.061939328671824e-05,
"loss": 3.6488,
"step": 8900
},
{
"epoch": 0.13,
"grad_norm": 25.30253791809082,
"learning_rate": 3.0540467020668864e-05,
"loss": 3.5931,
"step": 8920
},
{
"epoch": 0.13,
"grad_norm": 22.97053337097168,
"learning_rate": 3.0461482637742135e-05,
"loss": 3.5475,
"step": 8940
},
{
"epoch": 0.13,
"grad_norm": 22.68851661682129,
"learning_rate": 3.0382440966446875e-05,
"loss": 3.619,
"step": 8960
},
{
"epoch": 0.13,
"grad_norm": 28.575305938720703,
"learning_rate": 3.03033428358928e-05,
"loss": 3.5188,
"step": 8980
},
{
"epoch": 0.13,
"grad_norm": 21.909072875976562,
"learning_rate": 3.0224189075781884e-05,
"loss": 3.5988,
"step": 9000
},
{
"epoch": 0.13,
"grad_norm": 36.04661560058594,
"learning_rate": 3.014498051639959e-05,
"loss": 3.569,
"step": 9020
},
{
"epoch": 0.13,
"grad_norm": 157.5665740966797,
"learning_rate": 3.0065717988606257e-05,
"loss": 3.6474,
"step": 9040
},
{
"epoch": 0.13,
"grad_norm": 37.57924270629883,
"learning_rate": 2.9986402323828272e-05,
"loss": 3.5874,
"step": 9060
},
{
"epoch": 0.13,
"grad_norm": 26.661418914794922,
"learning_rate": 2.990703435404944e-05,
"loss": 3.5982,
"step": 9080
},
{
"epoch": 0.13,
"grad_norm": 38.95368957519531,
"learning_rate": 2.9827614911802203e-05,
"loss": 3.4998,
"step": 9100
},
{
"epoch": 0.13,
"grad_norm": 34.97966003417969,
"learning_rate": 2.9748144830158924e-05,
"loss": 3.5486,
"step": 9120
},
{
"epoch": 0.13,
"grad_norm": 27.682832717895508,
"learning_rate": 2.9668624942723162e-05,
"loss": 3.6144,
"step": 9140
},
{
"epoch": 0.13,
"grad_norm": 29.238054275512695,
"learning_rate": 2.9589056083620902e-05,
"loss": 3.6442,
"step": 9160
},
{
"epoch": 0.13,
"grad_norm": 31.821439743041992,
"learning_rate": 2.9509439087491835e-05,
"loss": 3.6221,
"step": 9180
},
{
"epoch": 0.13,
"grad_norm": 21.238704681396484,
"learning_rate": 2.9429774789480575e-05,
"loss": 3.6278,
"step": 9200
},
{
"epoch": 0.13,
"grad_norm": 28.50370216369629,
"learning_rate": 2.9350064025227897e-05,
"loss": 3.6592,
"step": 9220
},
{
"epoch": 0.13,
"grad_norm": 22.938095092773438,
"learning_rate": 2.927030763086201e-05,
"loss": 3.519,
"step": 9240
},
{
"epoch": 0.13,
"grad_norm": 27.523639678955078,
"learning_rate": 2.9190506442989752e-05,
"loss": 3.5285,
"step": 9260
},
{
"epoch": 0.13,
"grad_norm": 27.63553237915039,
"learning_rate": 2.9110661298687824e-05,
"loss": 3.5641,
"step": 9280
},
{
"epoch": 0.13,
"grad_norm": 43.626129150390625,
"learning_rate": 2.9030773035493997e-05,
"loss": 3.5764,
"step": 9300
},
{
"epoch": 0.13,
"grad_norm": 20.081253051757812,
"learning_rate": 2.8950842491398357e-05,
"loss": 3.6518,
"step": 9320
},
{
"epoch": 0.13,
"grad_norm": 20.68466567993164,
"learning_rate": 2.8870870504834496e-05,
"loss": 3.6206,
"step": 9340
},
{
"epoch": 0.13,
"grad_norm": 72.61478424072266,
"learning_rate": 2.8790857914670698e-05,
"loss": 3.5108,
"step": 9360
},
{
"epoch": 0.13,
"grad_norm": 23.662805557250977,
"learning_rate": 2.871080556020118e-05,
"loss": 3.6223,
"step": 9380
},
{
"epoch": 0.13,
"grad_norm": 25.221176147460938,
"learning_rate": 2.863071428113726e-05,
"loss": 3.644,
"step": 9400
},
{
"epoch": 0.13,
"grad_norm": 42.87479782104492,
"learning_rate": 2.8550584917598554e-05,
"loss": 3.7027,
"step": 9420
},
{
"epoch": 0.13,
"grad_norm": 30.936325073242188,
"learning_rate": 2.8470418310104173e-05,
"loss": 3.5493,
"step": 9440
},
{
"epoch": 0.13,
"grad_norm": 24.074983596801758,
"learning_rate": 2.8390215299563884e-05,
"loss": 3.4781,
"step": 9460
},
{
"epoch": 0.13,
"grad_norm": 22.818313598632812,
"learning_rate": 2.8309976727269332e-05,
"loss": 3.5558,
"step": 9480
},
{
"epoch": 0.13,
"grad_norm": 33.634605407714844,
"learning_rate": 2.8229703434885163e-05,
"loss": 3.5958,
"step": 9500
},
{
"epoch": 0.13,
"grad_norm": 29.69095802307129,
"learning_rate": 2.814939626444023e-05,
"loss": 3.5682,
"step": 9520
},
{
"epoch": 0.14,
"grad_norm": 29.696638107299805,
"learning_rate": 2.8069056058318755e-05,
"loss": 3.5676,
"step": 9540
},
{
"epoch": 0.14,
"grad_norm": 34.828269958496094,
"learning_rate": 2.7988683659251474e-05,
"loss": 3.482,
"step": 9560
},
{
"epoch": 0.14,
"grad_norm": 21.408599853515625,
"learning_rate": 2.7908279910306835e-05,
"loss": 3.5189,
"step": 9580
},
{
"epoch": 0.14,
"grad_norm": 21.67983627319336,
"learning_rate": 2.782784565488211e-05,
"loss": 3.5703,
"step": 9600
},
{
"epoch": 0.14,
"grad_norm": 24.80797576904297,
"learning_rate": 2.7747381736694572e-05,
"loss": 3.512,
"step": 9620
},
{
"epoch": 0.14,
"grad_norm": 35.9987678527832,
"learning_rate": 2.766688899977266e-05,
"loss": 3.5937,
"step": 9640
},
{
"epoch": 0.14,
"grad_norm": 24.59494400024414,
"learning_rate": 2.7586368288447095e-05,
"loss": 3.5829,
"step": 9660
},
{
"epoch": 0.14,
"grad_norm": 26.161178588867188,
"learning_rate": 2.7505820447342028e-05,
"loss": 3.5978,
"step": 9680
},
{
"epoch": 0.14,
"grad_norm": 17.551490783691406,
"learning_rate": 2.7425246321366203e-05,
"loss": 3.527,
"step": 9700
},
{
"epoch": 0.14,
"grad_norm": 31.15974998474121,
"learning_rate": 2.7344646755704078e-05,
"loss": 3.6422,
"step": 9720
},
{
"epoch": 0.14,
"grad_norm": 24.26822280883789,
"learning_rate": 2.7264022595806948e-05,
"loss": 3.5971,
"step": 9740
},
{
"epoch": 0.14,
"grad_norm": 24.76336669921875,
"learning_rate": 2.71833746873841e-05,
"loss": 3.5972,
"step": 9760
},
{
"epoch": 0.14,
"grad_norm": 18.041610717773438,
"learning_rate": 2.7102703876393944e-05,
"loss": 3.5832,
"step": 9780
},
{
"epoch": 0.14,
"grad_norm": 45.17101287841797,
"learning_rate": 2.7022011009035107e-05,
"loss": 3.5754,
"step": 9800
},
{
"epoch": 0.14,
"grad_norm": 30.87299156188965,
"learning_rate": 2.6941296931737585e-05,
"loss": 3.6022,
"step": 9820
},
{
"epoch": 0.14,
"grad_norm": 32.659786224365234,
"learning_rate": 2.686056249115385e-05,
"loss": 3.5947,
"step": 9840
},
{
"epoch": 0.14,
"grad_norm": 25.66715431213379,
"learning_rate": 2.6779808534149987e-05,
"loss": 3.5997,
"step": 9860
},
{
"epoch": 0.14,
"grad_norm": 24.735023498535156,
"learning_rate": 2.6699035907796792e-05,
"loss": 3.5619,
"step": 9880
},
{
"epoch": 0.14,
"grad_norm": 24.357643127441406,
"learning_rate": 2.6618245459360897e-05,
"loss": 3.6028,
"step": 9900
},
{
"epoch": 0.14,
"grad_norm": 29.255617141723633,
"learning_rate": 2.6537438036295875e-05,
"loss": 3.5231,
"step": 9920
},
{
"epoch": 0.14,
"grad_norm": 19.508926391601562,
"learning_rate": 2.6456614486233343e-05,
"loss": 3.5555,
"step": 9940
},
{
"epoch": 0.14,
"grad_norm": 29.48908042907715,
"learning_rate": 2.6375775656974123e-05,
"loss": 3.5376,
"step": 9960
},
{
"epoch": 0.14,
"grad_norm": 23.857908248901367,
"learning_rate": 2.629492239647926e-05,
"loss": 3.5641,
"step": 9980
},
{
"epoch": 0.14,
"grad_norm": 39.08363342285156,
"learning_rate": 2.621405555286121e-05,
"loss": 3.5957,
"step": 10000
},
{
"epoch": 0.14,
"grad_norm": 30.3124942779541,
"learning_rate": 2.6133175974374892e-05,
"loss": 3.5933,
"step": 10020
},
{
"epoch": 0.14,
"grad_norm": 23.010902404785156,
"learning_rate": 2.6052284509408804e-05,
"loss": 3.573,
"step": 10040
},
{
"epoch": 0.14,
"grad_norm": 34.93478775024414,
"learning_rate": 2.5971382006476154e-05,
"loss": 3.5641,
"step": 10060
},
{
"epoch": 0.14,
"grad_norm": 28.346033096313477,
"learning_rate": 2.5890469314205897e-05,
"loss": 3.5833,
"step": 10080
},
{
"epoch": 0.14,
"grad_norm": 41.69817352294922,
"learning_rate": 2.5809547281333902e-05,
"loss": 3.5718,
"step": 10100
},
{
"epoch": 0.14,
"grad_norm": 36.409114837646484,
"learning_rate": 2.5728616756693997e-05,
"loss": 3.5675,
"step": 10120
},
{
"epoch": 0.14,
"grad_norm": 23.812952041625977,
"learning_rate": 2.564767858920909e-05,
"loss": 3.6445,
"step": 10140
},
{
"epoch": 0.14,
"grad_norm": 31.33305549621582,
"learning_rate": 2.556673362788225e-05,
"loss": 3.5669,
"step": 10160
},
{
"epoch": 0.14,
"grad_norm": 34.65876770019531,
"learning_rate": 2.5485782721787837e-05,
"loss": 3.53,
"step": 10180
},
{
"epoch": 0.14,
"grad_norm": 21.920583724975586,
"learning_rate": 2.540482672006254e-05,
"loss": 3.5825,
"step": 10200
},
{
"epoch": 0.14,
"grad_norm": 48.019004821777344,
"learning_rate": 2.5323866471896512e-05,
"loss": 3.5733,
"step": 10220
},
{
"epoch": 0.14,
"grad_norm": 52.00071334838867,
"learning_rate": 2.5242902826524434e-05,
"loss": 3.5487,
"step": 10240
},
{
"epoch": 0.15,
"grad_norm": 47.33055877685547,
"learning_rate": 2.5161936633216653e-05,
"loss": 3.5076,
"step": 10260
},
{
"epoch": 0.15,
"grad_norm": 20.109745025634766,
"learning_rate": 2.5080968741270223e-05,
"loss": 3.5991,
"step": 10280
},
{
"epoch": 0.15,
"grad_norm": 23.458494186401367,
"learning_rate": 2.5e-05,
"loss": 3.6357,
"step": 10300
},
{
"epoch": 0.15,
"grad_norm": 23.68842315673828,
"learning_rate": 2.4919031258729786e-05,
"loss": 3.5449,
"step": 10320
},
{
"epoch": 0.15,
"grad_norm": 21.289154052734375,
"learning_rate": 2.4838063366783353e-05,
"loss": 3.6704,
"step": 10340
},
{
"epoch": 0.15,
"grad_norm": 23.132051467895508,
"learning_rate": 2.4757097173475572e-05,
"loss": 3.6327,
"step": 10360
},
{
"epoch": 0.15,
"grad_norm": 29.875104904174805,
"learning_rate": 2.4676133528103497e-05,
"loss": 3.5294,
"step": 10380
},
{
"epoch": 0.15,
"grad_norm": 23.41105079650879,
"learning_rate": 2.4595173279937464e-05,
"loss": 3.5995,
"step": 10400
},
{
"epoch": 0.15,
"grad_norm": 22.15860939025879,
"learning_rate": 2.451421727821217e-05,
"loss": 3.6109,
"step": 10420
},
{
"epoch": 0.15,
"grad_norm": 28.534278869628906,
"learning_rate": 2.443326637211775e-05,
"loss": 3.6389,
"step": 10440
},
{
"epoch": 0.15,
"grad_norm": 26.33950424194336,
"learning_rate": 2.435232141079092e-05,
"loss": 3.6083,
"step": 10460
},
{
"epoch": 0.15,
"grad_norm": 19.027633666992188,
"learning_rate": 2.4271383243306016e-05,
"loss": 3.5256,
"step": 10480
},
{
"epoch": 0.15,
"grad_norm": 28.898550033569336,
"learning_rate": 2.419045271866611e-05,
"loss": 3.61,
"step": 10500
},
{
"epoch": 0.15,
"grad_norm": 35.347347259521484,
"learning_rate": 2.410953068579411e-05,
"loss": 3.616,
"step": 10520
},
{
"epoch": 0.15,
"grad_norm": 23.184894561767578,
"learning_rate": 2.402861799352386e-05,
"loss": 3.6263,
"step": 10540
},
{
"epoch": 0.15,
"grad_norm": 32.66107177734375,
"learning_rate": 2.3947715490591206e-05,
"loss": 3.5446,
"step": 10560
},
{
"epoch": 0.15,
"grad_norm": 20.614028930664062,
"learning_rate": 2.3866824025625124e-05,
"loss": 3.5989,
"step": 10580
},
{
"epoch": 0.15,
"grad_norm": 25.750699996948242,
"learning_rate": 2.3785944447138802e-05,
"loss": 3.5197,
"step": 10600
},
{
"epoch": 0.15,
"grad_norm": 23.97648048400879,
"learning_rate": 2.370507760352074e-05,
"loss": 3.6399,
"step": 10620
},
{
"epoch": 0.15,
"grad_norm": 19.600095748901367,
"learning_rate": 2.362422434302588e-05,
"loss": 3.5295,
"step": 10640
},
{
"epoch": 0.15,
"grad_norm": 27.21882438659668,
"learning_rate": 2.3543385513766656e-05,
"loss": 3.512,
"step": 10660
},
{
"epoch": 0.15,
"grad_norm": 27.75621795654297,
"learning_rate": 2.3462561963704134e-05,
"loss": 3.5351,
"step": 10680
},
{
"epoch": 0.15,
"grad_norm": 27.200828552246094,
"learning_rate": 2.338175454063911e-05,
"loss": 3.5038,
"step": 10700
},
{
"epoch": 0.15,
"grad_norm": 27.96784782409668,
"learning_rate": 2.3300964092203207e-05,
"loss": 3.6097,
"step": 10720
},
{
"epoch": 0.15,
"grad_norm": 28.206979751586914,
"learning_rate": 2.3220191465850015e-05,
"loss": 3.5254,
"step": 10740
},
{
"epoch": 0.15,
"grad_norm": 22.781152725219727,
"learning_rate": 2.3139437508846155e-05,
"loss": 3.5857,
"step": 10760
},
{
"epoch": 0.15,
"grad_norm": 23.07236099243164,
"learning_rate": 2.305870306826242e-05,
"loss": 3.4872,
"step": 10780
},
{
"epoch": 0.15,
"grad_norm": 22.408714294433594,
"learning_rate": 2.29779889909649e-05,
"loss": 3.5115,
"step": 10800
},
{
"epoch": 0.15,
"grad_norm": 23.98442268371582,
"learning_rate": 2.289729612360606e-05,
"loss": 3.6297,
"step": 10820
},
{
"epoch": 0.15,
"grad_norm": 29.503135681152344,
"learning_rate": 2.2816625312615903e-05,
"loss": 3.6209,
"step": 10840
},
{
"epoch": 0.15,
"grad_norm": 30.787010192871094,
"learning_rate": 2.2735977404193058e-05,
"loss": 3.4921,
"step": 10860
},
{
"epoch": 0.15,
"grad_norm": 24.088376998901367,
"learning_rate": 2.2655353244295928e-05,
"loss": 3.5582,
"step": 10880
},
{
"epoch": 0.15,
"grad_norm": 25.253761291503906,
"learning_rate": 2.25747536786338e-05,
"loss": 3.5297,
"step": 10900
},
{
"epoch": 0.15,
"grad_norm": 24.333845138549805,
"learning_rate": 2.2494179552657978e-05,
"loss": 3.6105,
"step": 10920
},
{
"epoch": 0.15,
"grad_norm": 32.39179229736328,
"learning_rate": 2.241363171155291e-05,
"loss": 3.6122,
"step": 10940
},
{
"epoch": 0.16,
"grad_norm": 31.885894775390625,
"learning_rate": 2.2333111000227342e-05,
"loss": 3.6358,
"step": 10960
},
{
"epoch": 0.16,
"grad_norm": 22.056386947631836,
"learning_rate": 2.225261826330543e-05,
"loss": 3.5181,
"step": 10980
},
{
"epoch": 0.16,
"grad_norm": 23.47673225402832,
"learning_rate": 2.2172154345117894e-05,
"loss": 3.4853,
"step": 11000
},
{
"epoch": 0.16,
"grad_norm": 23.548656463623047,
"learning_rate": 2.2091720089693168e-05,
"loss": 3.5468,
"step": 11020
},
{
"epoch": 0.16,
"grad_norm": 16.66544532775879,
"learning_rate": 2.201131634074853e-05,
"loss": 3.626,
"step": 11040
},
{
"epoch": 0.16,
"grad_norm": 30.556697845458984,
"learning_rate": 2.1930943941681254e-05,
"loss": 3.5565,
"step": 11060
},
{
"epoch": 0.16,
"grad_norm": 62.914642333984375,
"learning_rate": 2.1850603735559778e-05,
"loss": 3.554,
"step": 11080
},
{
"epoch": 0.16,
"grad_norm": 25.617481231689453,
"learning_rate": 2.177029656511485e-05,
"loss": 3.5449,
"step": 11100
},
{
"epoch": 0.16,
"grad_norm": 33.26127243041992,
"learning_rate": 2.169002327273068e-05,
"loss": 3.6071,
"step": 11120
},
{
"epoch": 0.16,
"grad_norm": 21.895418167114258,
"learning_rate": 2.160978470043612e-05,
"loss": 3.4622,
"step": 11140
},
{
"epoch": 0.16,
"grad_norm": 25.30924415588379,
"learning_rate": 2.152958168989584e-05,
"loss": 3.5169,
"step": 11160
},
{
"epoch": 0.16,
"grad_norm": 28.7779541015625,
"learning_rate": 2.1449415082401455e-05,
"loss": 3.5817,
"step": 11180
},
{
"epoch": 0.16,
"grad_norm": 24.38544273376465,
"learning_rate": 2.136928571886275e-05,
"loss": 3.5433,
"step": 11200
},
{
"epoch": 0.16,
"grad_norm": 36.38949966430664,
"learning_rate": 2.1289194439798818e-05,
"loss": 3.5653,
"step": 11220
},
{
"epoch": 0.16,
"grad_norm": 36.11268615722656,
"learning_rate": 2.12091420853293e-05,
"loss": 3.4839,
"step": 11240
},
{
"epoch": 0.16,
"grad_norm": 18.36191749572754,
"learning_rate": 2.1129129495165507e-05,
"loss": 3.5532,
"step": 11260
},
{
"epoch": 0.16,
"grad_norm": 27.239763259887695,
"learning_rate": 2.1049157508601642e-05,
"loss": 3.5536,
"step": 11280
},
{
"epoch": 0.16,
"grad_norm": 25.459758758544922,
"learning_rate": 2.0969226964506006e-05,
"loss": 3.4878,
"step": 11300
},
{
"epoch": 0.16,
"grad_norm": 28.359439849853516,
"learning_rate": 2.0889338701312185e-05,
"loss": 3.563,
"step": 11320
},
{
"epoch": 0.16,
"grad_norm": 25.177392959594727,
"learning_rate": 2.0809493557010247e-05,
"loss": 3.6313,
"step": 11340
},
{
"epoch": 0.16,
"grad_norm": 26.633609771728516,
"learning_rate": 2.072969236913799e-05,
"loss": 3.6034,
"step": 11360
},
{
"epoch": 0.16,
"grad_norm": 19.589900970458984,
"learning_rate": 2.0649935974772105e-05,
"loss": 3.6429,
"step": 11380
},
{
"epoch": 0.16,
"grad_norm": 35.16368865966797,
"learning_rate": 2.0570225210519434e-05,
"loss": 3.5154,
"step": 11400
},
{
"epoch": 0.16,
"grad_norm": 37.59727478027344,
"learning_rate": 2.0490560912508168e-05,
"loss": 3.5652,
"step": 11420
},
{
"epoch": 0.16,
"grad_norm": 37.76837158203125,
"learning_rate": 2.04109439163791e-05,
"loss": 3.6911,
"step": 11440
},
{
"epoch": 0.16,
"grad_norm": 27.86673355102539,
"learning_rate": 2.0331375057276844e-05,
"loss": 3.4824,
"step": 11460
},
{
"epoch": 0.16,
"grad_norm": 51.12165832519531,
"learning_rate": 2.025185516984108e-05,
"loss": 3.558,
"step": 11480
},
{
"epoch": 0.16,
"grad_norm": 22.489160537719727,
"learning_rate": 2.0172385088197803e-05,
"loss": 3.5595,
"step": 11500
},
{
"epoch": 0.16,
"grad_norm": 19.6495304107666,
"learning_rate": 2.0092965645950564e-05,
"loss": 3.5679,
"step": 11520
},
{
"epoch": 0.16,
"grad_norm": 19.997142791748047,
"learning_rate": 2.001359767617173e-05,
"loss": 3.5332,
"step": 11540
},
{
"epoch": 0.16,
"grad_norm": 34.29532241821289,
"learning_rate": 1.9934282011393753e-05,
"loss": 3.4848,
"step": 11560
},
{
"epoch": 0.16,
"grad_norm": 20.737041473388672,
"learning_rate": 1.985501948360041e-05,
"loss": 3.4874,
"step": 11580
},
{
"epoch": 0.16,
"grad_norm": 30.11549186706543,
"learning_rate": 1.9775810924218125e-05,
"loss": 3.5166,
"step": 11600
},
{
"epoch": 0.16,
"grad_norm": 23.56212615966797,
"learning_rate": 1.9696657164107202e-05,
"loss": 3.652,
"step": 11620
},
{
"epoch": 0.16,
"grad_norm": 20.44150733947754,
"learning_rate": 1.9617559033553128e-05,
"loss": 3.5137,
"step": 11640
},
{
"epoch": 0.17,
"grad_norm": 33.37120819091797,
"learning_rate": 1.9538517362257868e-05,
"loss": 3.5163,
"step": 11660
},
{
"epoch": 0.17,
"grad_norm": 29.839820861816406,
"learning_rate": 1.945953297933115e-05,
"loss": 3.5979,
"step": 11680
},
{
"epoch": 0.17,
"grad_norm": 25.600812911987305,
"learning_rate": 1.9380606713281775e-05,
"loss": 3.6111,
"step": 11700
},
{
"epoch": 0.17,
"grad_norm": 40.76740264892578,
"learning_rate": 1.9301739392008923e-05,
"loss": 3.6727,
"step": 11720
},
{
"epoch": 0.17,
"grad_norm": 25.436763763427734,
"learning_rate": 1.9222931842793473e-05,
"loss": 3.6145,
"step": 11740
},
{
"epoch": 0.17,
"grad_norm": 19.53345489501953,
"learning_rate": 1.9144184892289337e-05,
"loss": 3.5486,
"step": 11760
},
{
"epoch": 0.17,
"grad_norm": 21.103118896484375,
"learning_rate": 1.9065499366514757e-05,
"loss": 3.5796,
"step": 11780
},
{
"epoch": 0.17,
"grad_norm": 26.135894775390625,
"learning_rate": 1.8986876090843667e-05,
"loss": 3.5905,
"step": 11800
},
{
"epoch": 0.17,
"grad_norm": 32.71371841430664,
"learning_rate": 1.8908315889997007e-05,
"loss": 3.531,
"step": 11820
},
{
"epoch": 0.17,
"grad_norm": 23.510149002075195,
"learning_rate": 1.882981958803414e-05,
"loss": 3.5597,
"step": 11840
},
{
"epoch": 0.17,
"grad_norm": 23.804306030273438,
"learning_rate": 1.8751388008344117e-05,
"loss": 3.5755,
"step": 11860
},
{
"epoch": 0.17,
"grad_norm": 33.7330436706543,
"learning_rate": 1.8673021973637095e-05,
"loss": 3.5092,
"step": 11880
},
{
"epoch": 0.17,
"grad_norm": 20.88456153869629,
"learning_rate": 1.859472230593569e-05,
"loss": 3.6001,
"step": 11900
},
{
"epoch": 0.17,
"grad_norm": 28.640546798706055,
"learning_rate": 1.8516489826566376e-05,
"loss": 3.5419,
"step": 11920
},
{
"epoch": 0.17,
"grad_norm": 25.2142391204834,
"learning_rate": 1.8438325356150826e-05,
"loss": 3.465,
"step": 11940
},
{
"epoch": 0.17,
"grad_norm": 27.663267135620117,
"learning_rate": 1.836022971459737e-05,
"loss": 3.5017,
"step": 11960
},
{
"epoch": 0.17,
"grad_norm": 31.913984298706055,
"learning_rate": 1.828220372109232e-05,
"loss": 3.5187,
"step": 11980
},
{
"epoch": 0.17,
"grad_norm": 29.825590133666992,
"learning_rate": 1.820424819409143e-05,
"loss": 3.5469,
"step": 12000
},
{
"epoch": 0.17,
"grad_norm": 18.72800636291504,
"learning_rate": 1.8126363951311287e-05,
"loss": 3.5486,
"step": 12020
},
{
"epoch": 0.17,
"grad_norm": 30.366409301757812,
"learning_rate": 1.804855180972075e-05,
"loss": 3.5487,
"step": 12040
},
{
"epoch": 0.17,
"grad_norm": 25.00339698791504,
"learning_rate": 1.797081258553236e-05,
"loss": 3.4778,
"step": 12060
},
{
"epoch": 0.17,
"grad_norm": 29.204017639160156,
"learning_rate": 1.7893147094193786e-05,
"loss": 3.446,
"step": 12080
},
{
"epoch": 0.17,
"grad_norm": 28.64485740661621,
"learning_rate": 1.7815556150379298e-05,
"loss": 3.5421,
"step": 12100
},
{
"epoch": 0.17,
"grad_norm": 31.4785213470459,
"learning_rate": 1.7738040567981166e-05,
"loss": 3.5075,
"step": 12120
},
{
"epoch": 0.17,
"grad_norm": 28.798315048217773,
"learning_rate": 1.766060116010118e-05,
"loss": 3.5049,
"step": 12140
},
{
"epoch": 0.17,
"grad_norm": 27.112850189208984,
"learning_rate": 1.7583238739042086e-05,
"loss": 3.5939,
"step": 12160
},
{
"epoch": 0.17,
"grad_norm": 24.396697998046875,
"learning_rate": 1.7505954116299063e-05,
"loss": 3.4596,
"step": 12180
},
{
"epoch": 0.17,
"grad_norm": 18.46675682067871,
"learning_rate": 1.7428748102551237e-05,
"loss": 3.4861,
"step": 12200
},
{
"epoch": 0.17,
"grad_norm": 25.615234375,
"learning_rate": 1.7351621507653157e-05,
"loss": 3.5211,
"step": 12220
},
{
"epoch": 0.17,
"grad_norm": 23.890607833862305,
"learning_rate": 1.7274575140626318e-05,
"loss": 3.5255,
"step": 12240
},
{
"epoch": 0.17,
"grad_norm": 19.983030319213867,
"learning_rate": 1.7197609809650643e-05,
"loss": 3.5567,
"step": 12260
},
{
"epoch": 0.17,
"grad_norm": 24.144041061401367,
"learning_rate": 1.712072632205604e-05,
"loss": 3.5586,
"step": 12280
},
{
"epoch": 0.17,
"grad_norm": 35.812835693359375,
"learning_rate": 1.704392548431391e-05,
"loss": 3.5274,
"step": 12300
},
{
"epoch": 0.17,
"grad_norm": 18.759809494018555,
"learning_rate": 1.6967208102028697e-05,
"loss": 3.5823,
"step": 12320
},
{
"epoch": 0.17,
"grad_norm": 19.85857391357422,
"learning_rate": 1.6890574979929448e-05,
"loss": 3.5583,
"step": 12340
},
{
"epoch": 0.17,
"grad_norm": 21.088096618652344,
"learning_rate": 1.6814026921861335e-05,
"loss": 3.5084,
"step": 12360
},
{
"epoch": 0.18,
"grad_norm": 21.760805130004883,
"learning_rate": 1.6737564730777284e-05,
"loss": 3.4753,
"step": 12380
},
{
"epoch": 0.18,
"grad_norm": 25.510221481323242,
"learning_rate": 1.666118920872949e-05,
"loss": 3.6024,
"step": 12400
},
{
"epoch": 0.18,
"grad_norm": 29.43688201904297,
"learning_rate": 1.658490115686104e-05,
"loss": 3.647,
"step": 12420
},
{
"epoch": 0.18,
"grad_norm": 19.17232322692871,
"learning_rate": 1.6508701375397487e-05,
"loss": 3.5505,
"step": 12440
},
{
"epoch": 0.18,
"grad_norm": 27.04405975341797,
"learning_rate": 1.64325906636385e-05,
"loss": 3.5158,
"step": 12460
},
{
"epoch": 0.18,
"grad_norm": 34.61522674560547,
"learning_rate": 1.635656981994943e-05,
"loss": 3.5723,
"step": 12480
},
{
"epoch": 0.18,
"grad_norm": 22.05956268310547,
"learning_rate": 1.6280639641752942e-05,
"loss": 3.5133,
"step": 12500
},
{
"epoch": 0.18,
"grad_norm": 26.21821403503418,
"learning_rate": 1.6204800925520685e-05,
"loss": 3.4956,
"step": 12520
},
{
"epoch": 0.18,
"grad_norm": 16.636159896850586,
"learning_rate": 1.6129054466764904e-05,
"loss": 3.5843,
"step": 12540
},
{
"epoch": 0.18,
"grad_norm": 27.356168746948242,
"learning_rate": 1.60534010600301e-05,
"loss": 3.5189,
"step": 12560
},
{
"epoch": 0.18,
"grad_norm": 19.394620895385742,
"learning_rate": 1.5977841498884723e-05,
"loss": 3.5838,
"step": 12580
},
{
"epoch": 0.18,
"grad_norm": 31.54738426208496,
"learning_rate": 1.5902376575912815e-05,
"loss": 3.6633,
"step": 12600
},
{
"epoch": 0.18,
"grad_norm": 23.533172607421875,
"learning_rate": 1.5827007082705698e-05,
"loss": 3.5234,
"step": 12620
},
{
"epoch": 0.18,
"grad_norm": 25.580156326293945,
"learning_rate": 1.5751733809853704e-05,
"loss": 3.5478,
"step": 12640
},
{
"epoch": 0.18,
"grad_norm": 28.20244789123535,
"learning_rate": 1.5676557546937838e-05,
"loss": 3.49,
"step": 12660
},
{
"epoch": 0.18,
"grad_norm": 26.89322280883789,
"learning_rate": 1.5601479082521526e-05,
"loss": 3.5238,
"step": 12680
},
{
"epoch": 0.18,
"grad_norm": 25.817209243774414,
"learning_rate": 1.552649920414233e-05,
"loss": 3.5417,
"step": 12700
},
{
"epoch": 0.18,
"grad_norm": 30.55599594116211,
"learning_rate": 1.545161869830371e-05,
"loss": 3.5908,
"step": 12720
},
{
"epoch": 0.18,
"grad_norm": 49.497894287109375,
"learning_rate": 1.5376838350466725e-05,
"loss": 3.6647,
"step": 12740
},
{
"epoch": 0.18,
"grad_norm": 17.040536880493164,
"learning_rate": 1.5302158945041838e-05,
"loss": 3.5271,
"step": 12760
},
{
"epoch": 0.18,
"grad_norm": 31.21510124206543,
"learning_rate": 1.5227581265380685e-05,
"loss": 3.4708,
"step": 12780
},
{
"epoch": 0.18,
"grad_norm": 26.194982528686523,
"learning_rate": 1.5153106093767827e-05,
"loss": 3.5831,
"step": 12800
},
{
"epoch": 0.18,
"grad_norm": 21.868547439575195,
"learning_rate": 1.5078734211412573e-05,
"loss": 3.532,
"step": 12820
},
{
"epoch": 0.18,
"grad_norm": 27.728635787963867,
"learning_rate": 1.5004466398440775e-05,
"loss": 3.5432,
"step": 12840
},
{
"epoch": 0.18,
"grad_norm": 22.69495964050293,
"learning_rate": 1.493030343388666e-05,
"loss": 3.5464,
"step": 12860
},
{
"epoch": 0.18,
"grad_norm": 22.85190773010254,
"learning_rate": 1.4856246095684622e-05,
"loss": 3.5686,
"step": 12880
},
{
"epoch": 0.18,
"grad_norm": 18.40359115600586,
"learning_rate": 1.4782295160661103e-05,
"loss": 3.4922,
"step": 12900
},
{
"epoch": 0.18,
"grad_norm": 27.058635711669922,
"learning_rate": 1.4708451404526407e-05,
"loss": 3.5231,
"step": 12920
},
{
"epoch": 0.18,
"grad_norm": 28.491573333740234,
"learning_rate": 1.4634715601866606e-05,
"loss": 3.502,
"step": 12940
},
{
"epoch": 0.18,
"grad_norm": 26.94320297241211,
"learning_rate": 1.4561088526135375e-05,
"loss": 3.5746,
"step": 12960
},
{
"epoch": 0.18,
"grad_norm": 26.503097534179688,
"learning_rate": 1.4487570949645888e-05,
"loss": 3.5195,
"step": 12980
},
{
"epoch": 0.18,
"grad_norm": 32.38998794555664,
"learning_rate": 1.4414163643562755e-05,
"loss": 3.5195,
"step": 13000
},
{
"epoch": 0.18,
"grad_norm": 29.240285873413086,
"learning_rate": 1.434086737789386e-05,
"loss": 3.6301,
"step": 13020
},
{
"epoch": 0.18,
"grad_norm": 19.996788024902344,
"learning_rate": 1.4267682921482356e-05,
"loss": 3.5252,
"step": 13040
},
{
"epoch": 0.18,
"grad_norm": 26.969528198242188,
"learning_rate": 1.419461104199856e-05,
"loss": 3.546,
"step": 13060
},
{
"epoch": 0.19,
"grad_norm": 39.526832580566406,
"learning_rate": 1.412165250593192e-05,
"loss": 3.5464,
"step": 13080
},
{
"epoch": 0.19,
"grad_norm": 22.60038948059082,
"learning_rate": 1.4048808078582942e-05,
"loss": 3.475,
"step": 13100
},
{
"epoch": 0.19,
"grad_norm": 20.97502899169922,
"learning_rate": 1.3976078524055203e-05,
"loss": 3.5398,
"step": 13120
},
{
"epoch": 0.19,
"grad_norm": 16.191055297851562,
"learning_rate": 1.3903464605247325e-05,
"loss": 3.4869,
"step": 13140
},
{
"epoch": 0.19,
"grad_norm": 22.308320999145508,
"learning_rate": 1.3830967083844942e-05,
"loss": 3.4316,
"step": 13160
},
{
"epoch": 0.19,
"grad_norm": 23.294443130493164,
"learning_rate": 1.375858672031276e-05,
"loss": 3.6033,
"step": 13180
},
{
"epoch": 0.19,
"grad_norm": 24.432270050048828,
"learning_rate": 1.368632427388653e-05,
"loss": 3.4829,
"step": 13200
},
{
"epoch": 0.19,
"grad_norm": 24.131166458129883,
"learning_rate": 1.3614180502565135e-05,
"loss": 3.5721,
"step": 13220
},
{
"epoch": 0.19,
"grad_norm": 36.706668853759766,
"learning_rate": 1.3542156163102582e-05,
"loss": 3.4877,
"step": 13240
},
{
"epoch": 0.19,
"grad_norm": 22.797359466552734,
"learning_rate": 1.3470252011000123e-05,
"loss": 3.539,
"step": 13260
},
{
"epoch": 0.19,
"grad_norm": 26.030719757080078,
"learning_rate": 1.3398468800498293e-05,
"loss": 3.5415,
"step": 13280
},
{
"epoch": 0.19,
"grad_norm": 24.123130798339844,
"learning_rate": 1.3326807284568984e-05,
"loss": 3.5354,
"step": 13300
},
{
"epoch": 0.19,
"grad_norm": 24.965229034423828,
"learning_rate": 1.3255268214907613e-05,
"loss": 3.387,
"step": 13320
},
{
"epoch": 0.19,
"grad_norm": 27.477920532226562,
"learning_rate": 1.3183852341925145e-05,
"loss": 3.5484,
"step": 13340
},
{
"epoch": 0.19,
"grad_norm": 27.259984970092773,
"learning_rate": 1.3112560414740315e-05,
"loss": 3.5104,
"step": 13360
},
{
"epoch": 0.19,
"grad_norm": 34.4488525390625,
"learning_rate": 1.3041393181171688e-05,
"loss": 3.5881,
"step": 13380
},
{
"epoch": 0.19,
"grad_norm": 30.281177520751953,
"learning_rate": 1.2970351387729873e-05,
"loss": 3.5851,
"step": 13400
},
{
"epoch": 0.19,
"grad_norm": 21.918861389160156,
"learning_rate": 1.2899435779609682e-05,
"loss": 3.5427,
"step": 13420
},
{
"epoch": 0.19,
"grad_norm": 30.275283813476562,
"learning_rate": 1.2828647100682261e-05,
"loss": 3.6322,
"step": 13440
},
{
"epoch": 0.19,
"grad_norm": 57.30770492553711,
"learning_rate": 1.275798609348738e-05,
"loss": 3.5871,
"step": 13460
},
{
"epoch": 0.19,
"grad_norm": 21.865861892700195,
"learning_rate": 1.2687453499225545e-05,
"loss": 3.5117,
"step": 13480
},
{
"epoch": 0.19,
"grad_norm": 19.6708927154541,
"learning_rate": 1.2617050057750322e-05,
"loss": 3.5015,
"step": 13500
},
{
"epoch": 0.19,
"grad_norm": 27.376646041870117,
"learning_rate": 1.2546776507560468e-05,
"loss": 3.5206,
"step": 13520
},
{
"epoch": 0.19,
"grad_norm": 44.89965057373047,
"learning_rate": 1.2476633585792286e-05,
"loss": 3.5766,
"step": 13540
},
{
"epoch": 0.19,
"grad_norm": 37.562286376953125,
"learning_rate": 1.2406622028211844e-05,
"loss": 3.5488,
"step": 13560
},
{
"epoch": 0.19,
"grad_norm": 41.833892822265625,
"learning_rate": 1.2336742569207235e-05,
"loss": 3.6429,
"step": 13580
},
{
"epoch": 0.19,
"grad_norm": 44.58812713623047,
"learning_rate": 1.2266995941780934e-05,
"loss": 3.5362,
"step": 13600
},
{
"epoch": 0.19,
"grad_norm": 26.543933868408203,
"learning_rate": 1.2197382877542041e-05,
"loss": 3.5761,
"step": 13620
},
{
"epoch": 0.19,
"grad_norm": 25.108768463134766,
"learning_rate": 1.2127904106698666e-05,
"loss": 3.4656,
"step": 13640
},
{
"epoch": 0.19,
"grad_norm": 22.213363647460938,
"learning_rate": 1.2058560358050241e-05,
"loss": 3.5438,
"step": 13660
},
{
"epoch": 0.19,
"grad_norm": 25.67693519592285,
"learning_rate": 1.1989352358979888e-05,
"loss": 3.5508,
"step": 13680
},
{
"epoch": 0.19,
"grad_norm": 23.043434143066406,
"learning_rate": 1.1920280835446748e-05,
"loss": 3.5901,
"step": 13700
},
{
"epoch": 0.19,
"grad_norm": 25.011388778686523,
"learning_rate": 1.1851346511978425e-05,
"loss": 3.5773,
"step": 13720
},
{
"epoch": 0.19,
"grad_norm": 29.659713745117188,
"learning_rate": 1.1782550111663369e-05,
"loss": 3.5795,
"step": 13740
},
{
"epoch": 0.19,
"grad_norm": 28.714496612548828,
"learning_rate": 1.1713892356143239e-05,
"loss": 3.5942,
"step": 13760
},
{
"epoch": 0.2,
"grad_norm": 29.76102066040039,
"learning_rate": 1.1645373965605425e-05,
"loss": 3.5008,
"step": 13780
},
{
"epoch": 0.2,
"grad_norm": 36.479854583740234,
"learning_rate": 1.1576995658775405e-05,
"loss": 3.4347,
"step": 13800
},
{
"epoch": 0.2,
"grad_norm": 22.23845100402832,
"learning_rate": 1.1508758152909273e-05,
"loss": 3.559,
"step": 13820
},
{
"epoch": 0.2,
"grad_norm": 52.79865646362305,
"learning_rate": 1.1440662163786167e-05,
"loss": 3.5128,
"step": 13840
},
{
"epoch": 0.2,
"grad_norm": 23.489912033081055,
"learning_rate": 1.1372708405700793e-05,
"loss": 3.6525,
"step": 13860
},
{
"epoch": 0.2,
"grad_norm": 22.640663146972656,
"learning_rate": 1.1304897591455928e-05,
"loss": 3.5387,
"step": 13880
},
{
"epoch": 0.2,
"grad_norm": 23.30765724182129,
"learning_rate": 1.1237230432354912e-05,
"loss": 3.5714,
"step": 13900
},
{
"epoch": 0.2,
"grad_norm": 27.42571258544922,
"learning_rate": 1.1169707638194238e-05,
"loss": 3.6333,
"step": 13920
},
{
"epoch": 0.2,
"grad_norm": 17.366491317749023,
"learning_rate": 1.1102329917256046e-05,
"loss": 3.5651,
"step": 13940
},
{
"epoch": 0.2,
"grad_norm": 17.73781967163086,
"learning_rate": 1.103509797630077e-05,
"loss": 3.6944,
"step": 13960
},
{
"epoch": 0.2,
"grad_norm": 23.699871063232422,
"learning_rate": 1.0968012520559634e-05,
"loss": 3.5914,
"step": 13980
},
{
"epoch": 0.2,
"grad_norm": 21.75518798828125,
"learning_rate": 1.0901074253727336e-05,
"loss": 3.592,
"step": 14000
},
{
"epoch": 0.2,
"grad_norm": 26.731660842895508,
"learning_rate": 1.083428387795463e-05,
"loss": 3.5461,
"step": 14020
},
{
"epoch": 0.2,
"grad_norm": 20.746625900268555,
"learning_rate": 1.0767642093840932e-05,
"loss": 3.5951,
"step": 14040
},
{
"epoch": 0.2,
"grad_norm": 29.204326629638672,
"learning_rate": 1.0701149600427044e-05,
"loss": 3.591,
"step": 14060
},
{
"epoch": 0.2,
"grad_norm": 23.31056022644043,
"learning_rate": 1.0634807095187737e-05,
"loss": 3.4382,
"step": 14080
},
{
"epoch": 0.2,
"grad_norm": 20.306123733520508,
"learning_rate": 1.0568615274024522e-05,
"loss": 3.5539,
"step": 14100
},
{
"epoch": 0.2,
"grad_norm": 19.775318145751953,
"learning_rate": 1.0502574831258259e-05,
"loss": 3.5532,
"step": 14120
},
{
"epoch": 0.2,
"grad_norm": 23.407949447631836,
"learning_rate": 1.043668645962195e-05,
"loss": 3.4811,
"step": 14140
},
{
"epoch": 0.2,
"grad_norm": 27.759410858154297,
"learning_rate": 1.0370950850253449e-05,
"loss": 3.6196,
"step": 14160
},
{
"epoch": 0.2,
"grad_norm": 35.55162048339844,
"learning_rate": 1.0305368692688174e-05,
"loss": 3.4774,
"step": 14180
},
{
"epoch": 0.2,
"grad_norm": 23.29683494567871,
"learning_rate": 1.0239940674851941e-05,
"loss": 3.5437,
"step": 14200
},
{
"epoch": 0.2,
"grad_norm": 23.486143112182617,
"learning_rate": 1.0174667483053682e-05,
"loss": 3.671,
"step": 14220
},
{
"epoch": 0.2,
"grad_norm": 19.907581329345703,
"learning_rate": 1.0109549801978305e-05,
"loss": 3.4272,
"step": 14240
},
{
"epoch": 0.2,
"grad_norm": 35.4050407409668,
"learning_rate": 1.0044588314679451e-05,
"loss": 3.5397,
"step": 14260
},
{
"epoch": 0.2,
"grad_norm": 28.344934463500977,
"learning_rate": 9.979783702572412e-06,
"loss": 3.5157,
"step": 14280
},
{
"epoch": 0.2,
"grad_norm": 27.65180015563965,
"learning_rate": 9.915136645426884e-06,
"loss": 3.5073,
"step": 14300
},
{
"epoch": 0.2,
"grad_norm": 21.25212860107422,
"learning_rate": 9.850647821359918e-06,
"loss": 3.5119,
"step": 14320
},
{
"epoch": 0.2,
"grad_norm": 25.691951751708984,
"learning_rate": 9.786317906828747e-06,
"loss": 3.6237,
"step": 14340
},
{
"epoch": 0.2,
"grad_norm": 32.24767303466797,
"learning_rate": 9.722147576623743e-06,
"loss": 3.5211,
"step": 14360
},
{
"epoch": 0.2,
"grad_norm": 20.08036231994629,
"learning_rate": 9.658137503861314e-06,
"loss": 3.4558,
"step": 14380
},
{
"epoch": 0.2,
"grad_norm": 22.380619049072266,
"learning_rate": 9.594288359976817e-06,
"loss": 3.4814,
"step": 14400
},
{
"epoch": 0.2,
"grad_norm": 28.68309211730957,
"learning_rate": 9.530600814717575e-06,
"loss": 3.5701,
"step": 14420
},
{
"epoch": 0.2,
"grad_norm": 22.603858947753906,
"learning_rate": 9.467075536135787e-06,
"loss": 3.5527,
"step": 14440
},
{
"epoch": 0.2,
"grad_norm": 20.50914192199707,
"learning_rate": 9.403713190581576e-06,
"loss": 3.4903,
"step": 14460
},
{
"epoch": 0.2,
"grad_norm": 30.767824172973633,
"learning_rate": 9.340514442695952e-06,
"loss": 3.5184,
"step": 14480
},
{
"epoch": 0.21,
"grad_norm": 23.36814308166504,
"learning_rate": 9.277479955403887e-06,
"loss": 3.4903,
"step": 14500
},
{
"epoch": 0.21,
"grad_norm": 34.33296203613281,
"learning_rate": 9.214610389907327e-06,
"loss": 3.5716,
"step": 14520
},
{
"epoch": 0.21,
"grad_norm": 21.11824607849121,
"learning_rate": 9.15190640567825e-06,
"loss": 3.6187,
"step": 14540
},
{
"epoch": 0.21,
"grad_norm": 38.17007064819336,
"learning_rate": 9.0893686604518e-06,
"loss": 3.5029,
"step": 14560
},
{
"epoch": 0.21,
"grad_norm": 22.7441349029541,
"learning_rate": 9.026997810219312e-06,
"loss": 3.552,
"step": 14580
},
{
"epoch": 0.21,
"grad_norm": 27.61566734313965,
"learning_rate": 8.964794509221508e-06,
"loss": 3.5794,
"step": 14600
},
{
"epoch": 0.21,
"grad_norm": 28.213449478149414,
"learning_rate": 8.902759409941566e-06,
"loss": 3.6239,
"step": 14620
},
{
"epoch": 0.21,
"grad_norm": 99.79093933105469,
"learning_rate": 8.840893163098331e-06,
"loss": 3.5571,
"step": 14640
},
{
"epoch": 0.21,
"grad_norm": 22.567502975463867,
"learning_rate": 8.779196417639466e-06,
"loss": 3.6038,
"step": 14660
},
{
"epoch": 0.21,
"grad_norm": 16.769285202026367,
"learning_rate": 8.71766982073462e-06,
"loss": 3.5192,
"step": 14680
},
{
"epoch": 0.21,
"grad_norm": 21.841182708740234,
"learning_rate": 8.656314017768693e-06,
"loss": 3.4728,
"step": 14700
},
{
"epoch": 0.21,
"grad_norm": 29.540746688842773,
"learning_rate": 8.595129652335019e-06,
"loss": 3.4656,
"step": 14720
},
{
"epoch": 0.21,
"grad_norm": 31.932985305786133,
"learning_rate": 8.534117366228644e-06,
"loss": 3.5597,
"step": 14740
},
{
"epoch": 0.21,
"grad_norm": 22.49396324157715,
"learning_rate": 8.47327779943957e-06,
"loss": 3.5653,
"step": 14760
},
{
"epoch": 0.21,
"grad_norm": 26.135406494140625,
"learning_rate": 8.412611590146069e-06,
"loss": 3.5669,
"step": 14780
},
{
"epoch": 0.21,
"grad_norm": 26.18607521057129,
"learning_rate": 8.352119374707978e-06,
"loss": 3.4971,
"step": 14800
},
{
"epoch": 0.21,
"grad_norm": 32.66505813598633,
"learning_rate": 8.29180178766e-06,
"loss": 3.6041,
"step": 14820
},
{
"epoch": 0.21,
"grad_norm": 22.13516616821289,
"learning_rate": 8.23165946170509e-06,
"loss": 3.4845,
"step": 14840
},
{
"epoch": 0.21,
"grad_norm": 19.47613525390625,
"learning_rate": 8.171693027707772e-06,
"loss": 3.582,
"step": 14860
},
{
"epoch": 0.21,
"grad_norm": 28.822246551513672,
"learning_rate": 8.111903114687591e-06,
"loss": 3.5498,
"step": 14880
},
{
"epoch": 0.21,
"grad_norm": 27.130552291870117,
"learning_rate": 8.052290349812419e-06,
"loss": 3.5724,
"step": 14900
},
{
"epoch": 0.21,
"grad_norm": 20.185958862304688,
"learning_rate": 7.992855358391967e-06,
"loss": 3.5115,
"step": 14920
},
{
"epoch": 0.21,
"grad_norm": 19.644073486328125,
"learning_rate": 7.933598763871155e-06,
"loss": 3.5116,
"step": 14940
},
{
"epoch": 0.21,
"grad_norm": 28.123699188232422,
"learning_rate": 7.87452118782363e-06,
"loss": 3.5057,
"step": 14960
},
{
"epoch": 0.21,
"grad_norm": 18.76272964477539,
"learning_rate": 7.815623249945214e-06,
"loss": 3.579,
"step": 14980
},
{
"epoch": 0.21,
"grad_norm": 24.90170669555664,
"learning_rate": 7.756905568047393e-06,
"loss": 3.4875,
"step": 15000
},
{
"epoch": 0.21,
"grad_norm": 28.07253074645996,
"learning_rate": 7.698368758050877e-06,
"loss": 3.4413,
"step": 15020
},
{
"epoch": 0.21,
"grad_norm": 25.81130027770996,
"learning_rate": 7.640013433979093e-06,
"loss": 3.5166,
"step": 15040
},
{
"epoch": 0.21,
"grad_norm": 19.91429901123047,
"learning_rate": 7.58184020795179e-06,
"loss": 3.6086,
"step": 15060
},
{
"epoch": 0.21,
"grad_norm": 22.920394897460938,
"learning_rate": 7.523849690178567e-06,
"loss": 3.4341,
"step": 15080
},
{
"epoch": 0.21,
"grad_norm": 32.82981872558594,
"learning_rate": 7.466042488952521e-06,
"loss": 3.5264,
"step": 15100
},
{
"epoch": 0.21,
"grad_norm": 23.146589279174805,
"learning_rate": 7.408419210643847e-06,
"loss": 3.4571,
"step": 15120
},
{
"epoch": 0.21,
"grad_norm": 16.78236198425293,
"learning_rate": 7.350980459693455e-06,
"loss": 3.5377,
"step": 15140
},
{
"epoch": 0.21,
"grad_norm": 19.41143035888672,
"learning_rate": 7.293726838606674e-06,
"loss": 3.5262,
"step": 15160
},
{
"epoch": 0.21,
"grad_norm": 35.83265686035156,
"learning_rate": 7.236658947946886e-06,
"loss": 3.5389,
"step": 15180
},
{
"epoch": 0.22,
"grad_norm": 18.50286102294922,
"learning_rate": 7.179777386329276e-06,
"loss": 3.4822,
"step": 15200
},
{
"epoch": 0.22,
"grad_norm": 22.09296226501465,
"learning_rate": 7.123082750414486e-06,
"loss": 3.6018,
"step": 15220
},
{
"epoch": 0.22,
"grad_norm": 20.790420532226562,
"learning_rate": 7.066575634902436e-06,
"loss": 3.5642,
"step": 15240
},
{
"epoch": 0.22,
"grad_norm": 29.84691047668457,
"learning_rate": 7.010256632526035e-06,
"loss": 3.6224,
"step": 15260
},
{
"epoch": 0.22,
"grad_norm": 45.16112518310547,
"learning_rate": 6.9541263340449496e-06,
"loss": 3.5078,
"step": 15280
},
{
"epoch": 0.22,
"grad_norm": 24.578792572021484,
"learning_rate": 6.898185328239468e-06,
"loss": 3.571,
"step": 15300
},
{
"epoch": 0.22,
"grad_norm": 28.100582122802734,
"learning_rate": 6.842434201904255e-06,
"loss": 3.4775,
"step": 15320
},
{
"epoch": 0.22,
"grad_norm": 31.31978988647461,
"learning_rate": 6.786873539842259e-06,
"loss": 3.586,
"step": 15340
},
{
"epoch": 0.22,
"grad_norm": 25.426467895507812,
"learning_rate": 6.731503924858518e-06,
"loss": 3.6732,
"step": 15360
},
{
"epoch": 0.22,
"grad_norm": 29.726364135742188,
"learning_rate": 6.676325937754102e-06,
"loss": 3.4458,
"step": 15380
},
{
"epoch": 0.22,
"grad_norm": 35.88907241821289,
"learning_rate": 6.621340157319997e-06,
"loss": 3.5081,
"step": 15400
},
{
"epoch": 0.22,
"grad_norm": 18.98563003540039,
"learning_rate": 6.566547160330999e-06,
"loss": 3.4117,
"step": 15420
},
{
"epoch": 0.22,
"grad_norm": 23.43938446044922,
"learning_rate": 6.511947521539738e-06,
"loss": 3.5529,
"step": 15440
},
{
"epoch": 0.22,
"grad_norm": 27.628211975097656,
"learning_rate": 6.457541813670564e-06,
"loss": 3.6043,
"step": 15460
},
{
"epoch": 0.22,
"grad_norm": 32.65241241455078,
"learning_rate": 6.403330607413643e-06,
"loss": 3.5273,
"step": 15480
},
{
"epoch": 0.22,
"grad_norm": 23.956153869628906,
"learning_rate": 6.349314471418849e-06,
"loss": 3.62,
"step": 15500
},
{
"epoch": 0.22,
"grad_norm": 26.069808959960938,
"learning_rate": 6.295493972289904e-06,
"loss": 3.5688,
"step": 15520
},
{
"epoch": 0.22,
"grad_norm": 39.316566467285156,
"learning_rate": 6.241869674578363e-06,
"loss": 3.5178,
"step": 15540
},
{
"epoch": 0.22,
"grad_norm": 27.55044174194336,
"learning_rate": 6.188442140777742e-06,
"loss": 3.4732,
"step": 15560
},
{
"epoch": 0.22,
"grad_norm": 20.663110733032227,
"learning_rate": 6.1352119313175945e-06,
"loss": 3.471,
"step": 15580
},
{
"epoch": 0.22,
"grad_norm": 24.35353660583496,
"learning_rate": 6.082179604557617e-06,
"loss": 3.503,
"step": 15600
},
{
"epoch": 0.22,
"grad_norm": 21.658618927001953,
"learning_rate": 6.029345716781837e-06,
"loss": 3.5414,
"step": 15620
},
{
"epoch": 0.22,
"grad_norm": 29.32859230041504,
"learning_rate": 5.9767108221927216e-06,
"loss": 3.4492,
"step": 15640
},
{
"epoch": 0.22,
"grad_norm": 29.497888565063477,
"learning_rate": 5.924275472905424e-06,
"loss": 3.6211,
"step": 15660
},
{
"epoch": 0.22,
"grad_norm": 22.027616500854492,
"learning_rate": 5.872040218941929e-06,
"loss": 3.6381,
"step": 15680
},
{
"epoch": 0.22,
"grad_norm": 25.56600570678711,
"learning_rate": 5.820005608225346e-06,
"loss": 3.6468,
"step": 15700
},
{
"epoch": 0.22,
"grad_norm": 20.85995864868164,
"learning_rate": 5.768172186574122e-06,
"loss": 3.5111,
"step": 15720
},
{
"epoch": 0.22,
"grad_norm": 27.828828811645508,
"learning_rate": 5.716540497696307e-06,
"loss": 3.4975,
"step": 15740
},
{
"epoch": 0.22,
"grad_norm": 28.20557403564453,
"learning_rate": 5.665111083183905e-06,
"loss": 3.5542,
"step": 15760
},
{
"epoch": 0.22,
"grad_norm": 27.977331161499023,
"learning_rate": 5.613884482507123e-06,
"loss": 3.5096,
"step": 15780
},
{
"epoch": 0.22,
"grad_norm": 15.945253372192383,
"learning_rate": 5.562861233008774e-06,
"loss": 3.4329,
"step": 15800
},
{
"epoch": 0.22,
"grad_norm": 19.057100296020508,
"learning_rate": 5.512041869898585e-06,
"loss": 3.5043,
"step": 15820
},
{
"epoch": 0.22,
"grad_norm": 19.960477828979492,
"learning_rate": 5.46142692624764e-06,
"loss": 3.4124,
"step": 15840
},
{
"epoch": 0.22,
"grad_norm": 20.818775177001953,
"learning_rate": 5.411016932982752e-06,
"loss": 3.4409,
"step": 15860
},
{
"epoch": 0.22,
"grad_norm": 19.86461639404297,
"learning_rate": 5.360812418880884e-06,
"loss": 3.6115,
"step": 15880
},
{
"epoch": 0.23,
"grad_norm": 26.129798889160156,
"learning_rate": 5.310813910563644e-06,
"loss": 3.5875,
"step": 15900
},
{
"epoch": 0.23,
"grad_norm": 17.876537322998047,
"learning_rate": 5.261021932491714e-06,
"loss": 3.5214,
"step": 15920
},
{
"epoch": 0.23,
"grad_norm": 37.67085266113281,
"learning_rate": 5.2114370069593965e-06,
"loss": 3.6228,
"step": 15940
},
{
"epoch": 0.23,
"grad_norm": 21.973346710205078,
"learning_rate": 5.162059654089083e-06,
"loss": 3.457,
"step": 15960
},
{
"epoch": 0.23,
"grad_norm": 30.133270263671875,
"learning_rate": 5.112890391825845e-06,
"loss": 3.4729,
"step": 15980
},
{
"epoch": 0.23,
"grad_norm": 24.22100830078125,
"learning_rate": 5.063929735931985e-06,
"loss": 3.5727,
"step": 16000
},
{
"epoch": 0.23,
"grad_norm": 25.7775821685791,
"learning_rate": 5.015178199981602e-06,
"loss": 3.5195,
"step": 16020
},
{
"epoch": 0.23,
"grad_norm": 21.86203384399414,
"learning_rate": 4.966636295355253e-06,
"loss": 3.5248,
"step": 16040
},
{
"epoch": 0.23,
"grad_norm": 31.954734802246094,
"learning_rate": 4.918304531234533e-06,
"loss": 3.5392,
"step": 16060
},
{
"epoch": 0.23,
"grad_norm": 19.251066207885742,
"learning_rate": 4.870183414596794e-06,
"loss": 3.5204,
"step": 16080
},
{
"epoch": 0.23,
"grad_norm": 30.7813777923584,
"learning_rate": 4.8222734502097665e-06,
"loss": 3.5081,
"step": 16100
},
{
"epoch": 0.23,
"grad_norm": 24.208667755126953,
"learning_rate": 4.7745751406263165e-06,
"loss": 3.5487,
"step": 16120
},
{
"epoch": 0.23,
"grad_norm": 24.639707565307617,
"learning_rate": 4.727088986179129e-06,
"loss": 3.5998,
"step": 16140
},
{
"epoch": 0.23,
"grad_norm": 23.774940490722656,
"learning_rate": 4.679815484975505e-06,
"loss": 3.4195,
"step": 16160
},
{
"epoch": 0.23,
"grad_norm": 20.040874481201172,
"learning_rate": 4.6327551328920945e-06,
"loss": 3.5555,
"step": 16180
},
{
"epoch": 0.23,
"grad_norm": 32.899959564208984,
"learning_rate": 4.585908423569724e-06,
"loss": 3.5204,
"step": 16200
},
{
"epoch": 0.23,
"grad_norm": 29.705387115478516,
"learning_rate": 4.539275848408217e-06,
"loss": 3.5667,
"step": 16220
},
{
"epoch": 0.23,
"grad_norm": 19.238882064819336,
"learning_rate": 4.492857896561204e-06,
"loss": 3.4192,
"step": 16240
},
{
"epoch": 0.23,
"grad_norm": 18.87932777404785,
"learning_rate": 4.446655054931051e-06,
"loss": 3.4987,
"step": 16260
},
{
"epoch": 0.23,
"grad_norm": 25.21925163269043,
"learning_rate": 4.4006678081636884e-06,
"loss": 3.6039,
"step": 16280
},
{
"epoch": 0.23,
"grad_norm": 42.43282699584961,
"learning_rate": 4.35489663864359e-06,
"loss": 3.5736,
"step": 16300
},
{
"epoch": 0.23,
"grad_norm": 34.624874114990234,
"learning_rate": 4.309342026488653e-06,
"loss": 3.4077,
"step": 16320
},
{
"epoch": 0.23,
"grad_norm": 28.791912078857422,
"learning_rate": 4.264004449545206e-06,
"loss": 3.511,
"step": 16340
},
{
"epoch": 0.23,
"grad_norm": 21.95167350769043,
"learning_rate": 4.218884383382987e-06,
"loss": 3.4688,
"step": 16360
},
{
"epoch": 0.23,
"grad_norm": 29.78521156311035,
"learning_rate": 4.173982301290122e-06,
"loss": 3.4808,
"step": 16380
},
{
"epoch": 0.23,
"grad_norm": 24.426288604736328,
"learning_rate": 4.129298674268225e-06,
"loss": 3.5356,
"step": 16400
},
{
"epoch": 0.23,
"grad_norm": 23.966203689575195,
"learning_rate": 4.084833971027379e-06,
"loss": 3.5471,
"step": 16420
},
{
"epoch": 0.23,
"grad_norm": 26.148141860961914,
"learning_rate": 4.040588657981301e-06,
"loss": 3.4811,
"step": 16440
},
{
"epoch": 0.23,
"grad_norm": 29.300769805908203,
"learning_rate": 3.99656319924237e-06,
"loss": 3.5584,
"step": 16460
},
{
"epoch": 0.23,
"grad_norm": 27.95845603942871,
"learning_rate": 3.952758056616826e-06,
"loss": 3.5451,
"step": 16480
},
{
"epoch": 0.23,
"grad_norm": 21.04642677307129,
"learning_rate": 3.90917368959989e-06,
"loss": 3.5002,
"step": 16500
},
{
"epoch": 0.23,
"grad_norm": 38.936370849609375,
"learning_rate": 3.865810555370936e-06,
"loss": 3.4524,
"step": 16520
},
{
"epoch": 0.23,
"grad_norm": 24.49747085571289,
"learning_rate": 3.822669108788738e-06,
"loss": 3.4887,
"step": 16540
},
{
"epoch": 0.23,
"grad_norm": 36.478424072265625,
"learning_rate": 3.7797498023866396e-06,
"loss": 3.5807,
"step": 16560
},
{
"epoch": 0.23,
"grad_norm": 19.034469604492188,
"learning_rate": 3.737053086367873e-06,
"loss": 3.5806,
"step": 16580
},
{
"epoch": 0.23,
"grad_norm": 26.125469207763672,
"learning_rate": 3.694579408600771e-06,
"loss": 3.4561,
"step": 16600
},
{
"epoch": 0.24,
"grad_norm": 30.372791290283203,
"learning_rate": 3.6523292146141227e-06,
"loss": 3.5875,
"step": 16620
},
{
"epoch": 0.24,
"grad_norm": 25.58843421936035,
"learning_rate": 3.6103029475924726e-06,
"loss": 3.498,
"step": 16640
},
{
"epoch": 0.24,
"grad_norm": 27.708513259887695,
"learning_rate": 3.56850104837147e-06,
"loss": 3.5339,
"step": 16660
},
{
"epoch": 0.24,
"grad_norm": 28.574857711791992,
"learning_rate": 3.5269239554332563e-06,
"loss": 3.5488,
"step": 16680
},
{
"epoch": 0.24,
"grad_norm": 37.658775329589844,
"learning_rate": 3.4855721049018688e-06,
"loss": 3.5008,
"step": 16700
},
{
"epoch": 0.24,
"grad_norm": 31.66521644592285,
"learning_rate": 3.4444459305386507e-06,
"loss": 3.4864,
"step": 16720
},
{
"epoch": 0.24,
"grad_norm": 18.813400268554688,
"learning_rate": 3.403545863737706e-06,
"loss": 3.5685,
"step": 16740
},
{
"epoch": 0.24,
"grad_norm": 24.066808700561523,
"learning_rate": 3.3628723335213885e-06,
"loss": 3.5549,
"step": 16760
},
{
"epoch": 0.24,
"grad_norm": 23.029767990112305,
"learning_rate": 3.322425766535778e-06,
"loss": 3.4389,
"step": 16780
},
{
"epoch": 0.24,
"grad_norm": 25.6854190826416,
"learning_rate": 3.2822065870462217e-06,
"loss": 3.4405,
"step": 16800
},
{
"epoch": 0.24,
"grad_norm": 26.14007568359375,
"learning_rate": 3.2422152169328922e-06,
"loss": 3.5291,
"step": 16820
},
{
"epoch": 0.24,
"grad_norm": 22.264413833618164,
"learning_rate": 3.2024520756863243e-06,
"loss": 3.613,
"step": 16840
},
{
"epoch": 0.24,
"grad_norm": 32.82612991333008,
"learning_rate": 3.1629175804030658e-06,
"loss": 3.4603,
"step": 16860
},
{
"epoch": 0.24,
"grad_norm": 32.21546173095703,
"learning_rate": 3.1236121457812544e-06,
"loss": 3.5886,
"step": 16880
},
{
"epoch": 0.24,
"grad_norm": 18.181438446044922,
"learning_rate": 3.08453618411631e-06,
"loss": 3.4568,
"step": 16900
},
{
"epoch": 0.24,
"grad_norm": 23.75358772277832,
"learning_rate": 3.0456901052965724e-06,
"loss": 3.5491,
"step": 16920
},
{
"epoch": 0.24,
"grad_norm": 17.43839454650879,
"learning_rate": 3.0070743167990273e-06,
"loss": 3.5776,
"step": 16940
},
{
"epoch": 0.24,
"grad_norm": 20.617218017578125,
"learning_rate": 2.9686892236850337e-06,
"loss": 3.539,
"step": 16960
},
{
"epoch": 0.24,
"grad_norm": 19.83597755432129,
"learning_rate": 2.93053522859604e-06,
"loss": 3.3575,
"step": 16980
},
{
"epoch": 0.24,
"grad_norm": 18.4063720703125,
"learning_rate": 2.892612731749414e-06,
"loss": 3.3658,
"step": 17000
},
{
"epoch": 0.24,
"grad_norm": 23.77143096923828,
"learning_rate": 2.85492213093419e-06,
"loss": 3.4393,
"step": 17020
},
{
"epoch": 0.24,
"grad_norm": 22.04786491394043,
"learning_rate": 2.8174638215069493e-06,
"loss": 3.5262,
"step": 17040
},
{
"epoch": 0.24,
"grad_norm": 24.277013778686523,
"learning_rate": 2.780238196387619e-06,
"loss": 3.4419,
"step": 17060
},
{
"epoch": 0.24,
"grad_norm": 30.128318786621094,
"learning_rate": 2.743245646055398e-06,
"loss": 3.5387,
"step": 17080
},
{
"epoch": 0.24,
"grad_norm": 26.737049102783203,
"learning_rate": 2.7064865585446434e-06,
"loss": 3.4134,
"step": 17100
},
{
"epoch": 0.24,
"grad_norm": 26.093942642211914,
"learning_rate": 2.6699613194407725e-06,
"loss": 3.5691,
"step": 17120
},
{
"epoch": 0.24,
"grad_norm": 30.150657653808594,
"learning_rate": 2.6336703118762766e-06,
"loss": 3.4658,
"step": 17140
},
{
"epoch": 0.24,
"grad_norm": 21.17641258239746,
"learning_rate": 2.597613916526637e-06,
"loss": 3.4942,
"step": 17160
},
{
"epoch": 0.24,
"grad_norm": 28.02484130859375,
"learning_rate": 2.5617925116063924e-06,
"loss": 3.4448,
"step": 17180
},
{
"epoch": 0.24,
"grad_norm": 24.14384651184082,
"learning_rate": 2.52620647286512e-06,
"loss": 3.5448,
"step": 17200
},
{
"epoch": 0.24,
"grad_norm": 25.69285774230957,
"learning_rate": 2.4908561735835306e-06,
"loss": 3.5668,
"step": 17220
},
{
"epoch": 0.24,
"grad_norm": 19.125316619873047,
"learning_rate": 2.4557419845695427e-06,
"loss": 3.5204,
"step": 17240
},
{
"epoch": 0.24,
"grad_norm": 23.64023208618164,
"learning_rate": 2.420864274154372e-06,
"loss": 3.4345,
"step": 17260
},
{
"epoch": 0.24,
"grad_norm": 24.39943504333496,
"learning_rate": 2.3862234081887036e-06,
"loss": 3.5515,
"step": 17280
},
{
"epoch": 0.24,
"grad_norm": 24.969629287719727,
"learning_rate": 2.351819750038828e-06,
"loss": 3.4973,
"step": 17300
},
{
"epoch": 0.25,
"grad_norm": 18.182634353637695,
"learning_rate": 2.317653660582844e-06,
"loss": 3.6065,
"step": 17320
},
{
"epoch": 0.25,
"grad_norm": 27.141944885253906,
"learning_rate": 2.2837254982068567e-06,
"loss": 3.5106,
"step": 17340
},
{
"epoch": 0.25,
"grad_norm": 20.061452865600586,
"learning_rate": 2.250035618801241e-06,
"loss": 3.4274,
"step": 17360
},
{
"epoch": 0.25,
"grad_norm": 39.53497314453125,
"learning_rate": 2.2165843757568805e-06,
"loss": 3.4597,
"step": 17380
},
{
"epoch": 0.25,
"grad_norm": 14.67487907409668,
"learning_rate": 2.183372119961499e-06,
"loss": 3.5732,
"step": 17400
},
{
"epoch": 0.25,
"grad_norm": 14.984709739685059,
"learning_rate": 2.15039919979593e-06,
"loss": 3.4735,
"step": 17420
},
{
"epoch": 0.25,
"grad_norm": 30.988039016723633,
"learning_rate": 2.117665961130513e-06,
"loss": 3.4269,
"step": 17440
},
{
"epoch": 0.25,
"grad_norm": 23.8664493560791,
"learning_rate": 2.0851727473214315e-06,
"loss": 3.4997,
"step": 17460
},
{
"epoch": 0.25,
"grad_norm": 24.271230697631836,
"learning_rate": 2.05291989920712e-06,
"loss": 3.5919,
"step": 17480
},
{
"epoch": 0.25,
"grad_norm": 32.17240524291992,
"learning_rate": 2.020907755104698e-06,
"loss": 3.4734,
"step": 17500
},
{
"epoch": 0.25,
"grad_norm": 24.72242546081543,
"learning_rate": 1.9891366508064003e-06,
"loss": 3.5043,
"step": 17520
},
{
"epoch": 0.25,
"grad_norm": 29.70708656311035,
"learning_rate": 1.957606919576088e-06,
"loss": 3.4543,
"step": 17540
},
{
"epoch": 0.25,
"grad_norm": 29.549745559692383,
"learning_rate": 1.926318892145712e-06,
"loss": 3.4355,
"step": 17560
},
{
"epoch": 0.25,
"grad_norm": 25.912363052368164,
"learning_rate": 1.8952728967118804e-06,
"loss": 3.4614,
"step": 17580
},
{
"epoch": 0.25,
"grad_norm": 22.835115432739258,
"learning_rate": 1.864469258932397e-06,
"loss": 3.5498,
"step": 17600
},
{
"epoch": 0.25,
"grad_norm": 20.103981018066406,
"learning_rate": 1.8339083019228404e-06,
"loss": 3.5791,
"step": 17620
},
{
"epoch": 0.25,
"grad_norm": 24.153532028198242,
"learning_rate": 1.803590346253195e-06,
"loss": 3.495,
"step": 17640
},
{
"epoch": 0.25,
"grad_norm": 19.70048713684082,
"learning_rate": 1.7735157099444593e-06,
"loss": 3.5439,
"step": 17660
},
{
"epoch": 0.25,
"grad_norm": 23.056358337402344,
"learning_rate": 1.7436847084653456e-06,
"loss": 3.4222,
"step": 17680
},
{
"epoch": 0.25,
"grad_norm": 25.633689880371094,
"learning_rate": 1.7140976547289438e-06,
"loss": 3.5387,
"step": 17700
},
{
"epoch": 0.25,
"grad_norm": 30.34889030456543,
"learning_rate": 1.6847548590894435e-06,
"loss": 3.5579,
"step": 17720
},
{
"epoch": 0.25,
"grad_norm": 19.06514549255371,
"learning_rate": 1.6556566293388892e-06,
"loss": 3.4082,
"step": 17740
},
{
"epoch": 0.25,
"grad_norm": 16.91566276550293,
"learning_rate": 1.626803270703936e-06,
"loss": 3.5513,
"step": 17760
},
{
"epoch": 0.25,
"grad_norm": 26.17884635925293,
"learning_rate": 1.5981950858426714e-06,
"loss": 3.5068,
"step": 17780
},
{
"epoch": 0.25,
"grad_norm": 35.77287292480469,
"learning_rate": 1.5698323748414124e-06,
"loss": 3.4825,
"step": 17800
},
{
"epoch": 0.25,
"grad_norm": 25.684925079345703,
"learning_rate": 1.5417154352115742e-06,
"loss": 3.5529,
"step": 17820
},
{
"epoch": 0.25,
"grad_norm": 23.964488983154297,
"learning_rate": 1.5138445618865544e-06,
"loss": 3.549,
"step": 17840
},
{
"epoch": 0.25,
"grad_norm": 20.69983673095703,
"learning_rate": 1.4862200472186199e-06,
"loss": 3.5607,
"step": 17860
},
{
"epoch": 0.25,
"grad_norm": 24.382530212402344,
"learning_rate": 1.458842180975864e-06,
"loss": 3.4468,
"step": 17880
},
{
"epoch": 0.25,
"grad_norm": 20.305166244506836,
"learning_rate": 1.4317112503391432e-06,
"loss": 3.5468,
"step": 17900
},
{
"epoch": 0.25,
"grad_norm": 20.76270294189453,
"learning_rate": 1.4048275398990896e-06,
"loss": 3.5828,
"step": 17920
},
{
"epoch": 0.25,
"grad_norm": 31.468564987182617,
"learning_rate": 1.3781913316530948e-06,
"loss": 3.6117,
"step": 17940
},
{
"epoch": 0.25,
"grad_norm": 22.185617446899414,
"learning_rate": 1.351802905002386e-06,
"loss": 3.4663,
"step": 17960
},
{
"epoch": 0.25,
"grad_norm": 27.454687118530273,
"learning_rate": 1.32566253674907e-06,
"loss": 3.4419,
"step": 17980
},
{
"epoch": 0.25,
"grad_norm": 20.76512336730957,
"learning_rate": 1.2997705010932393e-06,
"loss": 3.5315,
"step": 18000
},
{
"epoch": 0.26,
"grad_norm": 27.795419692993164,
"learning_rate": 1.274127069630096e-06,
"loss": 3.5435,
"step": 18020
},
{
"epoch": 0.26,
"grad_norm": 45.871864318847656,
"learning_rate": 1.2487325113471032e-06,
"loss": 3.3871,
"step": 18040
},
{
"epoch": 0.26,
"grad_norm": 15.510208129882812,
"learning_rate": 1.2235870926211619e-06,
"loss": 3.5862,
"step": 18060
},
{
"epoch": 0.26,
"grad_norm": 26.943269729614258,
"learning_rate": 1.1986910772158104e-06,
"loss": 3.5032,
"step": 18080
},
{
"epoch": 0.26,
"grad_norm": 28.423053741455078,
"learning_rate": 1.1740447262784781e-06,
"loss": 3.4936,
"step": 18100
},
{
"epoch": 0.26,
"grad_norm": 25.210853576660156,
"learning_rate": 1.1496482983377189e-06,
"loss": 3.4515,
"step": 18120
},
{
"epoch": 0.26,
"grad_norm": 32.88740921020508,
"learning_rate": 1.125502049300517e-06,
"loss": 3.5196,
"step": 18140
},
{
"epoch": 0.26,
"grad_norm": 20.562488555908203,
"learning_rate": 1.1016062324496008e-06,
"loss": 3.4467,
"step": 18160
},
{
"epoch": 0.26,
"grad_norm": 21.112634658813477,
"learning_rate": 1.0779610984407773e-06,
"loss": 3.5286,
"step": 18180
},
{
"epoch": 0.26,
"grad_norm": 29.323238372802734,
"learning_rate": 1.0545668953003241e-06,
"loss": 3.4971,
"step": 18200
},
{
"epoch": 0.26,
"grad_norm": 24.024930953979492,
"learning_rate": 1.0314238684223515e-06,
"loss": 3.5919,
"step": 18220
},
{
"epoch": 0.26,
"grad_norm": 29.396581649780273,
"learning_rate": 1.0085322605662666e-06,
"loss": 3.4255,
"step": 18240
},
{
"epoch": 0.26,
"grad_norm": 19.502662658691406,
"learning_rate": 9.858923118542002e-07,
"loss": 3.464,
"step": 18260
},
{
"epoch": 0.26,
"grad_norm": 20.03078269958496,
"learning_rate": 9.635042597685023e-07,
"loss": 3.4305,
"step": 18280
},
{
"epoch": 0.26,
"grad_norm": 18.905967712402344,
"learning_rate": 9.413683391492456e-07,
"loss": 3.6401,
"step": 18300
},
{
"epoch": 0.26,
"grad_norm": 23.61101531982422,
"learning_rate": 9.194847821917623e-07,
"loss": 3.5543,
"step": 18320
},
{
"epoch": 0.26,
"grad_norm": 18.563806533813477,
"learning_rate": 8.978538184442137e-07,
"loss": 3.4395,
"step": 18340
},
{
"epoch": 0.26,
"grad_norm": 21.695003509521484,
"learning_rate": 8.764756748051662e-07,
"loss": 3.4193,
"step": 18360
},
{
"epoch": 0.26,
"grad_norm": 21.57720947265625,
"learning_rate": 8.553505755212382e-07,
"loss": 3.5357,
"step": 18380
},
{
"epoch": 0.26,
"grad_norm": 30.37428855895996,
"learning_rate": 8.344787421847217e-07,
"loss": 3.5414,
"step": 18400
},
{
"epoch": 0.26,
"grad_norm": 42.314064025878906,
"learning_rate": 8.138603937312722e-07,
"loss": 3.5528,
"step": 18420
},
{
"epoch": 0.26,
"grad_norm": 22.21116065979004,
"learning_rate": 7.934957464376058e-07,
"loss": 3.6419,
"step": 18440
},
{
"epoch": 0.26,
"grad_norm": 26.877450942993164,
"learning_rate": 7.733850139192395e-07,
"loss": 3.5869,
"step": 18460
},
{
"epoch": 0.26,
"grad_norm": 21.281030654907227,
"learning_rate": 7.535284071282455e-07,
"loss": 3.6047,
"step": 18480
},
{
"epoch": 0.26,
"grad_norm": 20.147789001464844,
"learning_rate": 7.339261343510206e-07,
"loss": 3.4247,
"step": 18500
},
{
"epoch": 0.26,
"grad_norm": 19.394601821899414,
"learning_rate": 7.145784012061424e-07,
"loss": 3.5844,
"step": 18520
},
{
"epoch": 0.26,
"grad_norm": 22.156579971313477,
"learning_rate": 6.954854106421715e-07,
"loss": 3.5348,
"step": 18540
},
{
"epoch": 0.26,
"grad_norm": 28.641721725463867,
"learning_rate": 6.766473629355452e-07,
"loss": 3.5451,
"step": 18560
},
{
"epoch": 0.26,
"grad_norm": 19.47591209411621,
"learning_rate": 6.580644556884702e-07,
"loss": 3.5458,
"step": 18580
}
],
"logging_steps": 20,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"total_flos": 4.037882943504384e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}