LongLLaVAMed-9B / trainer_state.json
Xidong: Upload trainer_state.json with huggingface_hub (commit 2b88543, verified)
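The `log_history` below holds one record per logged optimizer step (`epoch`, `grad_norm`, `learning_rate`, `loss`, `step`). A minimal sketch of how one might load this file and plot the training-loss curve, assuming the JSON has been downloaded locally as `trainer_state.json` and that matplotlib is available:

```python
# Minimal sketch: read a Transformers trainer_state.json and plot its loss curve.
# Assumes the file sits in the current directory and matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (skips eval/summary records).
records = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in records]
losses = [e["loss"] for e in records]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("LongLLaVAMed-9B training loss")
plt.show()
```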
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1109,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009017132551848512,
"grad_norm": 10.577064169641238,
"learning_rate": 2.9411764705882356e-07,
"loss": 1.9986,
"step": 1
},
{
"epoch": 0.0018034265103697023,
"grad_norm": 10.36517170209171,
"learning_rate": 5.882352941176471e-07,
"loss": 1.9534,
"step": 2
},
{
"epoch": 0.002705139765554554,
"grad_norm": 10.44504678164916,
"learning_rate": 8.823529411764707e-07,
"loss": 1.9749,
"step": 3
},
{
"epoch": 0.0036068530207394047,
"grad_norm": 10.331516165460968,
"learning_rate": 1.1764705882352942e-06,
"loss": 1.9862,
"step": 4
},
{
"epoch": 0.004508566275924256,
"grad_norm": 8.65995855008459,
"learning_rate": 1.4705882352941177e-06,
"loss": 1.9184,
"step": 5
},
{
"epoch": 0.005410279531109108,
"grad_norm": 7.008907681884329,
"learning_rate": 1.7647058823529414e-06,
"loss": 1.8906,
"step": 6
},
{
"epoch": 0.0063119927862939585,
"grad_norm": 4.835169025945767,
"learning_rate": 2.058823529411765e-06,
"loss": 1.8871,
"step": 7
},
{
"epoch": 0.007213706041478809,
"grad_norm": 3.723351018265124,
"learning_rate": 2.3529411764705885e-06,
"loss": 1.8672,
"step": 8
},
{
"epoch": 0.008115419296663661,
"grad_norm": 3.2896828301664245,
"learning_rate": 2.647058823529412e-06,
"loss": 1.8221,
"step": 9
},
{
"epoch": 0.009017132551848512,
"grad_norm": 3.997777920263939,
"learning_rate": 2.9411764705882355e-06,
"loss": 1.7994,
"step": 10
},
{
"epoch": 0.009918845807033363,
"grad_norm": 4.007279147531519,
"learning_rate": 3.2352941176470594e-06,
"loss": 1.8035,
"step": 11
},
{
"epoch": 0.010820559062218215,
"grad_norm": 3.660990251525157,
"learning_rate": 3.529411764705883e-06,
"loss": 1.767,
"step": 12
},
{
"epoch": 0.011722272317403066,
"grad_norm": 3.3885848000748386,
"learning_rate": 3.8235294117647055e-06,
"loss": 1.7675,
"step": 13
},
{
"epoch": 0.012623985572587917,
"grad_norm": 3.291223708706854,
"learning_rate": 4.11764705882353e-06,
"loss": 1.7637,
"step": 14
},
{
"epoch": 0.013525698827772768,
"grad_norm": 3.1417473603402217,
"learning_rate": 4.411764705882353e-06,
"loss": 1.7701,
"step": 15
},
{
"epoch": 0.014427412082957619,
"grad_norm": 2.763025617498974,
"learning_rate": 4.705882352941177e-06,
"loss": 1.7316,
"step": 16
},
{
"epoch": 0.015329125338142471,
"grad_norm": 2.3886750440304483,
"learning_rate": 5e-06,
"loss": 1.7236,
"step": 17
},
{
"epoch": 0.016230838593327322,
"grad_norm": 2.34515652847235,
"learning_rate": 5.294117647058824e-06,
"loss": 1.7138,
"step": 18
},
{
"epoch": 0.017132551848512173,
"grad_norm": 2.3271436396781393,
"learning_rate": 5.588235294117647e-06,
"loss": 1.7127,
"step": 19
},
{
"epoch": 0.018034265103697024,
"grad_norm": 2.295163831092699,
"learning_rate": 5.882352941176471e-06,
"loss": 1.6975,
"step": 20
},
{
"epoch": 0.018935978358881875,
"grad_norm": 2.114400590233431,
"learning_rate": 6.176470588235295e-06,
"loss": 1.6654,
"step": 21
},
{
"epoch": 0.019837691614066726,
"grad_norm": 1.9750708257462024,
"learning_rate": 6.470588235294119e-06,
"loss": 1.6869,
"step": 22
},
{
"epoch": 0.020739404869251576,
"grad_norm": 1.6817036959999778,
"learning_rate": 6.764705882352942e-06,
"loss": 1.6651,
"step": 23
},
{
"epoch": 0.02164111812443643,
"grad_norm": 1.6450355892304562,
"learning_rate": 7.058823529411766e-06,
"loss": 1.664,
"step": 24
},
{
"epoch": 0.02254283137962128,
"grad_norm": 1.7120938879923542,
"learning_rate": 7.352941176470589e-06,
"loss": 1.6574,
"step": 25
},
{
"epoch": 0.023444544634806132,
"grad_norm": 1.8550225833518525,
"learning_rate": 7.647058823529411e-06,
"loss": 1.6281,
"step": 26
},
{
"epoch": 0.024346257889990983,
"grad_norm": 1.9221800970494332,
"learning_rate": 7.941176470588236e-06,
"loss": 1.6377,
"step": 27
},
{
"epoch": 0.025247971145175834,
"grad_norm": 1.6995797824544692,
"learning_rate": 8.23529411764706e-06,
"loss": 1.602,
"step": 28
},
{
"epoch": 0.026149684400360685,
"grad_norm": 1.6358357311194014,
"learning_rate": 8.529411764705883e-06,
"loss": 1.6061,
"step": 29
},
{
"epoch": 0.027051397655545536,
"grad_norm": 1.580651479451095,
"learning_rate": 8.823529411764707e-06,
"loss": 1.6169,
"step": 30
},
{
"epoch": 0.027953110910730387,
"grad_norm": 1.5756107806972501,
"learning_rate": 9.11764705882353e-06,
"loss": 1.5969,
"step": 31
},
{
"epoch": 0.028854824165915238,
"grad_norm": 1.633779158193918,
"learning_rate": 9.411764705882354e-06,
"loss": 1.6143,
"step": 32
},
{
"epoch": 0.029756537421100092,
"grad_norm": 1.719962101136898,
"learning_rate": 9.705882352941177e-06,
"loss": 1.5881,
"step": 33
},
{
"epoch": 0.030658250676284943,
"grad_norm": 1.6017616780387625,
"learning_rate": 1e-05,
"loss": 1.5739,
"step": 34
},
{
"epoch": 0.031559963931469794,
"grad_norm": 1.5450519445519328,
"learning_rate": 9.999978648788802e-06,
"loss": 1.5829,
"step": 35
},
{
"epoch": 0.032461677186654644,
"grad_norm": 1.4834472866238042,
"learning_rate": 9.999914595337555e-06,
"loss": 1.569,
"step": 36
},
{
"epoch": 0.033363390441839495,
"grad_norm": 1.498305342680757,
"learning_rate": 9.999807840193305e-06,
"loss": 1.5653,
"step": 37
},
{
"epoch": 0.034265103697024346,
"grad_norm": 1.5463338086484935,
"learning_rate": 9.999658384267795e-06,
"loss": 1.562,
"step": 38
},
{
"epoch": 0.0351668169522092,
"grad_norm": 1.459365986133512,
"learning_rate": 9.999466228837452e-06,
"loss": 1.5585,
"step": 39
},
{
"epoch": 0.03606853020739405,
"grad_norm": 1.4693046149427762,
"learning_rate": 9.999231375543374e-06,
"loss": 1.5211,
"step": 40
},
{
"epoch": 0.0369702434625789,
"grad_norm": 1.4984282166549738,
"learning_rate": 9.998953826391322e-06,
"loss": 1.5367,
"step": 41
},
{
"epoch": 0.03787195671776375,
"grad_norm": 1.4384393247143472,
"learning_rate": 9.998633583751702e-06,
"loss": 1.5337,
"step": 42
},
{
"epoch": 0.0387736699729486,
"grad_norm": 1.3711548840775858,
"learning_rate": 9.99827065035954e-06,
"loss": 1.5185,
"step": 43
},
{
"epoch": 0.03967538322813345,
"grad_norm": 1.4277933526776405,
"learning_rate": 9.997865029314464e-06,
"loss": 1.5269,
"step": 44
},
{
"epoch": 0.0405770964833183,
"grad_norm": 1.396702454558118,
"learning_rate": 9.997416724080673e-06,
"loss": 1.485,
"step": 45
},
{
"epoch": 0.04147880973850315,
"grad_norm": 1.47668068877586,
"learning_rate": 9.996925738486913e-06,
"loss": 1.5259,
"step": 46
},
{
"epoch": 0.04238052299368801,
"grad_norm": 1.4557526770144735,
"learning_rate": 9.996392076726436e-06,
"loss": 1.5188,
"step": 47
},
{
"epoch": 0.04328223624887286,
"grad_norm": 1.4234416876774554,
"learning_rate": 9.995815743356973e-06,
"loss": 1.5014,
"step": 48
},
{
"epoch": 0.04418394950405771,
"grad_norm": 1.3986193326213034,
"learning_rate": 9.995196743300693e-06,
"loss": 1.4924,
"step": 49
},
{
"epoch": 0.04508566275924256,
"grad_norm": 1.45785760531205,
"learning_rate": 9.994535081844152e-06,
"loss": 1.5302,
"step": 50
},
{
"epoch": 0.045987376014427414,
"grad_norm": 1.4363542918984882,
"learning_rate": 9.993830764638262e-06,
"loss": 1.4886,
"step": 51
},
{
"epoch": 0.046889089269612265,
"grad_norm": 1.4273053464550627,
"learning_rate": 9.993083797698231e-06,
"loss": 1.4899,
"step": 52
},
{
"epoch": 0.047790802524797116,
"grad_norm": 1.348045620568428,
"learning_rate": 9.992294187403522e-06,
"loss": 1.496,
"step": 53
},
{
"epoch": 0.04869251577998197,
"grad_norm": 1.4233792162905572,
"learning_rate": 9.991461940497786e-06,
"loss": 1.4764,
"step": 54
},
{
"epoch": 0.04959422903516682,
"grad_norm": 1.446283496903928,
"learning_rate": 9.990587064088817e-06,
"loss": 1.5144,
"step": 55
},
{
"epoch": 0.05049594229035167,
"grad_norm": 1.4413266921077565,
"learning_rate": 9.989669565648484e-06,
"loss": 1.4634,
"step": 56
},
{
"epoch": 0.05139765554553652,
"grad_norm": 1.3732602138003445,
"learning_rate": 9.988709453012664e-06,
"loss": 1.5038,
"step": 57
},
{
"epoch": 0.05229936880072137,
"grad_norm": 1.4130255699072003,
"learning_rate": 9.987706734381188e-06,
"loss": 1.4809,
"step": 58
},
{
"epoch": 0.05320108205590622,
"grad_norm": 1.5087152986620616,
"learning_rate": 9.986661418317759e-06,
"loss": 1.4618,
"step": 59
},
{
"epoch": 0.05410279531109107,
"grad_norm": 1.4798629422697944,
"learning_rate": 9.985573513749881e-06,
"loss": 1.477,
"step": 60
},
{
"epoch": 0.05500450856627592,
"grad_norm": 1.3606687996874915,
"learning_rate": 9.984443029968786e-06,
"loss": 1.4612,
"step": 61
},
{
"epoch": 0.05590622182146077,
"grad_norm": 1.4899050929180402,
"learning_rate": 9.983269976629356e-06,
"loss": 1.4826,
"step": 62
},
{
"epoch": 0.056807935076645624,
"grad_norm": 1.432893510375255,
"learning_rate": 9.982054363750028e-06,
"loss": 1.4537,
"step": 63
},
{
"epoch": 0.057709648331830475,
"grad_norm": 1.3576314165902383,
"learning_rate": 9.980796201712734e-06,
"loss": 1.4661,
"step": 64
},
{
"epoch": 0.058611361587015326,
"grad_norm": 1.3587575875659574,
"learning_rate": 9.979495501262781e-06,
"loss": 1.4677,
"step": 65
},
{
"epoch": 0.059513074842200184,
"grad_norm": 1.346258372969497,
"learning_rate": 9.978152273508783e-06,
"loss": 1.4545,
"step": 66
},
{
"epoch": 0.060414788097385035,
"grad_norm": 1.4228289723340597,
"learning_rate": 9.976766529922556e-06,
"loss": 1.4624,
"step": 67
},
{
"epoch": 0.061316501352569885,
"grad_norm": 1.3706888516420923,
"learning_rate": 9.97533828233902e-06,
"loss": 1.4849,
"step": 68
},
{
"epoch": 0.062218214607754736,
"grad_norm": 1.3370859842228668,
"learning_rate": 9.973867542956104e-06,
"loss": 1.4578,
"step": 69
},
{
"epoch": 0.06311992786293959,
"grad_norm": 1.4029903761261626,
"learning_rate": 9.972354324334633e-06,
"loss": 1.4526,
"step": 70
},
{
"epoch": 0.06402164111812443,
"grad_norm": 1.356224319416608,
"learning_rate": 9.970798639398228e-06,
"loss": 1.4337,
"step": 71
},
{
"epoch": 0.06492335437330929,
"grad_norm": 1.4780697718410634,
"learning_rate": 9.969200501433192e-06,
"loss": 1.4494,
"step": 72
},
{
"epoch": 0.06582506762849413,
"grad_norm": 1.3463132650913565,
"learning_rate": 9.967559924088395e-06,
"loss": 1.4364,
"step": 73
},
{
"epoch": 0.06672678088367899,
"grad_norm": 1.3743615711298545,
"learning_rate": 9.965876921375165e-06,
"loss": 1.4429,
"step": 74
},
{
"epoch": 0.06762849413886383,
"grad_norm": 1.38909407325101,
"learning_rate": 9.964151507667162e-06,
"loss": 1.46,
"step": 75
},
{
"epoch": 0.06853020739404869,
"grad_norm": 1.3866587453202093,
"learning_rate": 9.962383697700252e-06,
"loss": 1.4517,
"step": 76
},
{
"epoch": 0.06943192064923355,
"grad_norm": 1.3624396855314103,
"learning_rate": 9.960573506572391e-06,
"loss": 1.4366,
"step": 77
},
{
"epoch": 0.0703336339044184,
"grad_norm": 1.4528468294670056,
"learning_rate": 9.958720949743485e-06,
"loss": 1.4529,
"step": 78
},
{
"epoch": 0.07123534715960325,
"grad_norm": 1.3751194277907128,
"learning_rate": 9.956826043035268e-06,
"loss": 1.4159,
"step": 79
},
{
"epoch": 0.0721370604147881,
"grad_norm": 1.4929981371791885,
"learning_rate": 9.954888802631164e-06,
"loss": 1.431,
"step": 80
},
{
"epoch": 0.07303877366997295,
"grad_norm": 1.44904285843575,
"learning_rate": 9.952909245076141e-06,
"loss": 1.4405,
"step": 81
},
{
"epoch": 0.0739404869251578,
"grad_norm": 1.389639089486905,
"learning_rate": 9.950887387276582e-06,
"loss": 1.4687,
"step": 82
},
{
"epoch": 0.07484220018034266,
"grad_norm": 1.4392438942511672,
"learning_rate": 9.948823246500132e-06,
"loss": 1.454,
"step": 83
},
{
"epoch": 0.0757439134355275,
"grad_norm": 1.3544399783536702,
"learning_rate": 9.946716840375552e-06,
"loss": 1.4374,
"step": 84
},
{
"epoch": 0.07664562669071236,
"grad_norm": 1.3558114162311536,
"learning_rate": 9.944568186892572e-06,
"loss": 1.4245,
"step": 85
},
{
"epoch": 0.0775473399458972,
"grad_norm": 1.3558935909615983,
"learning_rate": 9.94237730440173e-06,
"loss": 1.4482,
"step": 86
},
{
"epoch": 0.07844905320108206,
"grad_norm": 1.4002193410756965,
"learning_rate": 9.940144211614231e-06,
"loss": 1.4092,
"step": 87
},
{
"epoch": 0.0793507664562669,
"grad_norm": 1.4084436792953672,
"learning_rate": 9.937868927601765e-06,
"loss": 1.455,
"step": 88
},
{
"epoch": 0.08025247971145176,
"grad_norm": 1.507546442338142,
"learning_rate": 9.935551471796358e-06,
"loss": 1.4109,
"step": 89
},
{
"epoch": 0.0811541929666366,
"grad_norm": 1.3621915852437079,
"learning_rate": 9.93319186399021e-06,
"loss": 1.4119,
"step": 90
},
{
"epoch": 0.08205590622182146,
"grad_norm": 1.4680244963224889,
"learning_rate": 9.930790124335511e-06,
"loss": 1.4164,
"step": 91
},
{
"epoch": 0.0829576194770063,
"grad_norm": 1.5004514234560973,
"learning_rate": 9.928346273344283e-06,
"loss": 1.4236,
"step": 92
},
{
"epoch": 0.08385933273219116,
"grad_norm": 1.3932901143805643,
"learning_rate": 9.925860331888197e-06,
"loss": 1.4134,
"step": 93
},
{
"epoch": 0.08476104598737602,
"grad_norm": 1.4024872552349996,
"learning_rate": 9.923332321198396e-06,
"loss": 1.4297,
"step": 94
},
{
"epoch": 0.08566275924256087,
"grad_norm": 1.4468222192078053,
"learning_rate": 9.92076226286532e-06,
"loss": 1.4238,
"step": 95
},
{
"epoch": 0.08656447249774572,
"grad_norm": 1.434206641991036,
"learning_rate": 9.918150178838509e-06,
"loss": 1.4353,
"step": 96
},
{
"epoch": 0.08746618575293057,
"grad_norm": 1.4452250943034404,
"learning_rate": 9.915496091426425e-06,
"loss": 1.4128,
"step": 97
},
{
"epoch": 0.08836789900811542,
"grad_norm": 1.4487965090694912,
"learning_rate": 9.912800023296263e-06,
"loss": 1.3926,
"step": 98
},
{
"epoch": 0.08926961226330027,
"grad_norm": 1.465430632187407,
"learning_rate": 9.910061997473753e-06,
"loss": 1.3985,
"step": 99
},
{
"epoch": 0.09017132551848513,
"grad_norm": 1.450793459097913,
"learning_rate": 9.907282037342957e-06,
"loss": 1.405,
"step": 100
},
{
"epoch": 0.09107303877366997,
"grad_norm": 1.4155012864644325,
"learning_rate": 9.904460166646084e-06,
"loss": 1.4281,
"step": 101
},
{
"epoch": 0.09197475202885483,
"grad_norm": 1.4146435969702247,
"learning_rate": 9.901596409483277e-06,
"loss": 1.4252,
"step": 102
},
{
"epoch": 0.09287646528403967,
"grad_norm": 1.4129494426948097,
"learning_rate": 9.898690790312409e-06,
"loss": 1.3925,
"step": 103
},
{
"epoch": 0.09377817853922453,
"grad_norm": 1.391884326471153,
"learning_rate": 9.895743333948875e-06,
"loss": 1.374,
"step": 104
},
{
"epoch": 0.09467989179440937,
"grad_norm": 1.4266801593025809,
"learning_rate": 9.892754065565382e-06,
"loss": 1.3885,
"step": 105
},
{
"epoch": 0.09558160504959423,
"grad_norm": 1.4783691046465195,
"learning_rate": 9.88972301069173e-06,
"loss": 1.43,
"step": 106
},
{
"epoch": 0.09648331830477908,
"grad_norm": 1.4381943632103706,
"learning_rate": 9.886650195214594e-06,
"loss": 1.407,
"step": 107
},
{
"epoch": 0.09738503155996393,
"grad_norm": 1.4774136648375966,
"learning_rate": 9.883535645377307e-06,
"loss": 1.4126,
"step": 108
},
{
"epoch": 0.09828674481514878,
"grad_norm": 1.4848585593390986,
"learning_rate": 9.880379387779637e-06,
"loss": 1.4301,
"step": 109
},
{
"epoch": 0.09918845807033363,
"grad_norm": 1.3447046383888597,
"learning_rate": 9.877181449377549e-06,
"loss": 1.4095,
"step": 110
},
{
"epoch": 0.10009017132551848,
"grad_norm": 1.485125665692246,
"learning_rate": 9.873941857482988e-06,
"loss": 1.3941,
"step": 111
},
{
"epoch": 0.10099188458070334,
"grad_norm": 1.4386601202642741,
"learning_rate": 9.87066063976364e-06,
"loss": 1.3867,
"step": 112
},
{
"epoch": 0.1018935978358882,
"grad_norm": 1.4124995951533683,
"learning_rate": 9.867337824242691e-06,
"loss": 1.3913,
"step": 113
},
{
"epoch": 0.10279531109107304,
"grad_norm": 1.3851353292060657,
"learning_rate": 9.863973439298597e-06,
"loss": 1.4185,
"step": 114
},
{
"epoch": 0.1036970243462579,
"grad_norm": 1.3931533260228668,
"learning_rate": 9.860567513664836e-06,
"loss": 1.4086,
"step": 115
},
{
"epoch": 0.10459873760144274,
"grad_norm": 1.4179784405117548,
"learning_rate": 9.857120076429662e-06,
"loss": 1.4144,
"step": 116
},
{
"epoch": 0.1055004508566276,
"grad_norm": 1.428702853117983,
"learning_rate": 9.85363115703586e-06,
"loss": 1.3668,
"step": 117
},
{
"epoch": 0.10640216411181244,
"grad_norm": 1.3577671717067978,
"learning_rate": 9.85010078528049e-06,
"loss": 1.4005,
"step": 118
},
{
"epoch": 0.1073038773669973,
"grad_norm": 1.429587957434509,
"learning_rate": 9.846528991314638e-06,
"loss": 1.4016,
"step": 119
},
{
"epoch": 0.10820559062218214,
"grad_norm": 1.4148608462310461,
"learning_rate": 9.842915805643156e-06,
"loss": 1.3833,
"step": 120
},
{
"epoch": 0.109107303877367,
"grad_norm": 1.4243012408251199,
"learning_rate": 9.8392612591244e-06,
"loss": 1.398,
"step": 121
},
{
"epoch": 0.11000901713255185,
"grad_norm": 1.3830341629731753,
"learning_rate": 9.835565382969967e-06,
"loss": 1.3933,
"step": 122
},
{
"epoch": 0.1109107303877367,
"grad_norm": 1.3566631053070333,
"learning_rate": 9.83182820874443e-06,
"loss": 1.356,
"step": 123
},
{
"epoch": 0.11181244364292155,
"grad_norm": 1.4976837799309841,
"learning_rate": 9.82804976836507e-06,
"loss": 1.3716,
"step": 124
},
{
"epoch": 0.1127141568981064,
"grad_norm": 1.5201556480768976,
"learning_rate": 9.824230094101591e-06,
"loss": 1.4088,
"step": 125
},
{
"epoch": 0.11361587015329125,
"grad_norm": 1.4654780557555434,
"learning_rate": 9.820369218575871e-06,
"loss": 1.3733,
"step": 126
},
{
"epoch": 0.1145175834084761,
"grad_norm": 1.421025453696537,
"learning_rate": 9.816467174761655e-06,
"loss": 1.3962,
"step": 127
},
{
"epoch": 0.11541929666366095,
"grad_norm": 1.4262157083025124,
"learning_rate": 9.812523995984281e-06,
"loss": 1.3729,
"step": 128
},
{
"epoch": 0.11632100991884581,
"grad_norm": 1.495933174346428,
"learning_rate": 9.808539715920415e-06,
"loss": 1.4102,
"step": 129
},
{
"epoch": 0.11722272317403065,
"grad_norm": 1.4162668123468176,
"learning_rate": 9.804514368597735e-06,
"loss": 1.3732,
"step": 130
},
{
"epoch": 0.11812443642921551,
"grad_norm": 1.4056884823900608,
"learning_rate": 9.800447988394657e-06,
"loss": 1.4001,
"step": 131
},
{
"epoch": 0.11902614968440037,
"grad_norm": 1.379636688570927,
"learning_rate": 9.79634061004004e-06,
"loss": 1.3874,
"step": 132
},
{
"epoch": 0.11992786293958521,
"grad_norm": 1.3822580890806864,
"learning_rate": 9.792192268612881e-06,
"loss": 1.3586,
"step": 133
},
{
"epoch": 0.12082957619477007,
"grad_norm": 1.382134945197591,
"learning_rate": 9.78800299954203e-06,
"loss": 1.4071,
"step": 134
},
{
"epoch": 0.12173128944995491,
"grad_norm": 1.4059077728114613,
"learning_rate": 9.783772838605874e-06,
"loss": 1.3829,
"step": 135
},
{
"epoch": 0.12263300270513977,
"grad_norm": 1.4279808755935588,
"learning_rate": 9.779501821932033e-06,
"loss": 1.4187,
"step": 136
},
{
"epoch": 0.12353471596032461,
"grad_norm": 1.3435404866724177,
"learning_rate": 9.775189985997062e-06,
"loss": 1.391,
"step": 137
},
{
"epoch": 0.12443642921550947,
"grad_norm": 1.3484226031400397,
"learning_rate": 9.770837367626129e-06,
"loss": 1.3655,
"step": 138
},
{
"epoch": 0.12533814247069433,
"grad_norm": 1.4154141732809218,
"learning_rate": 9.766444003992704e-06,
"loss": 1.3935,
"step": 139
},
{
"epoch": 0.12623985572587917,
"grad_norm": 1.3582775740218958,
"learning_rate": 9.762009932618237e-06,
"loss": 1.3836,
"step": 140
},
{
"epoch": 0.12714156898106402,
"grad_norm": 1.4019326999739066,
"learning_rate": 9.75753519137185e-06,
"loss": 1.3656,
"step": 141
},
{
"epoch": 0.12804328223624886,
"grad_norm": 1.3873034739629564,
"learning_rate": 9.753019818469998e-06,
"loss": 1.3783,
"step": 142
},
{
"epoch": 0.12894499549143373,
"grad_norm": 1.3402006066598218,
"learning_rate": 9.748463852476156e-06,
"loss": 1.3687,
"step": 143
},
{
"epoch": 0.12984670874661858,
"grad_norm": 1.468790905251283,
"learning_rate": 9.743867332300478e-06,
"loss": 1.3896,
"step": 144
},
{
"epoch": 0.13074842200180342,
"grad_norm": 1.3625578204301965,
"learning_rate": 9.739230297199477e-06,
"loss": 1.3888,
"step": 145
},
{
"epoch": 0.13165013525698827,
"grad_norm": 1.4208683043924826,
"learning_rate": 9.734552786775678e-06,
"loss": 1.3664,
"step": 146
},
{
"epoch": 0.13255184851217314,
"grad_norm": 1.4500142087444388,
"learning_rate": 9.729834840977284e-06,
"loss": 1.3982,
"step": 147
},
{
"epoch": 0.13345356176735798,
"grad_norm": 1.436842860684569,
"learning_rate": 9.72507650009784e-06,
"loss": 1.3604,
"step": 148
},
{
"epoch": 0.13435527502254282,
"grad_norm": 1.3701784757913484,
"learning_rate": 9.720277804775879e-06,
"loss": 1.3466,
"step": 149
},
{
"epoch": 0.13525698827772767,
"grad_norm": 1.4124491632817213,
"learning_rate": 9.715438795994587e-06,
"loss": 1.3636,
"step": 150
},
{
"epoch": 0.13615870153291254,
"grad_norm": 1.456826333942723,
"learning_rate": 9.710559515081446e-06,
"loss": 1.3634,
"step": 151
},
{
"epoch": 0.13706041478809738,
"grad_norm": 1.412896803942778,
"learning_rate": 9.705640003707873e-06,
"loss": 1.382,
"step": 152
},
{
"epoch": 0.13796212804328223,
"grad_norm": 1.484485406644004,
"learning_rate": 9.700680303888883e-06,
"loss": 1.3983,
"step": 153
},
{
"epoch": 0.1388638412984671,
"grad_norm": 1.4513023024553309,
"learning_rate": 9.695680457982713e-06,
"loss": 1.3747,
"step": 154
},
{
"epoch": 0.13976555455365194,
"grad_norm": 1.4425274167979576,
"learning_rate": 9.69064050869047e-06,
"loss": 1.3836,
"step": 155
},
{
"epoch": 0.1406672678088368,
"grad_norm": 1.4223525469811833,
"learning_rate": 9.685560499055764e-06,
"loss": 1.3659,
"step": 156
},
{
"epoch": 0.14156898106402163,
"grad_norm": 1.385031691152652,
"learning_rate": 9.680440472464337e-06,
"loss": 1.3549,
"step": 157
},
{
"epoch": 0.1424706943192065,
"grad_norm": 1.4266749431487284,
"learning_rate": 9.675280472643696e-06,
"loss": 1.3661,
"step": 158
},
{
"epoch": 0.14337240757439135,
"grad_norm": 1.5012666041389382,
"learning_rate": 9.670080543662742e-06,
"loss": 1.3752,
"step": 159
},
{
"epoch": 0.1442741208295762,
"grad_norm": 1.415739936413478,
"learning_rate": 9.664840729931385e-06,
"loss": 1.3805,
"step": 160
},
{
"epoch": 0.14517583408476104,
"grad_norm": 1.4181819218823457,
"learning_rate": 9.659561076200173e-06,
"loss": 1.3884,
"step": 161
},
{
"epoch": 0.1460775473399459,
"grad_norm": 1.3719905980017162,
"learning_rate": 9.654241627559908e-06,
"loss": 1.3512,
"step": 162
},
{
"epoch": 0.14697926059513075,
"grad_norm": 1.4212733014049073,
"learning_rate": 9.648882429441258e-06,
"loss": 1.3587,
"step": 163
},
{
"epoch": 0.1478809738503156,
"grad_norm": 1.4127423687960647,
"learning_rate": 9.643483527614372e-06,
"loss": 1.3593,
"step": 164
},
{
"epoch": 0.14878268710550044,
"grad_norm": 1.4008058963023071,
"learning_rate": 9.638044968188486e-06,
"loss": 1.375,
"step": 165
},
{
"epoch": 0.1496844003606853,
"grad_norm": 1.382227169874824,
"learning_rate": 9.632566797611535e-06,
"loss": 1.3601,
"step": 166
},
{
"epoch": 0.15058611361587015,
"grad_norm": 1.3915418749349733,
"learning_rate": 9.627049062669747e-06,
"loss": 1.3595,
"step": 167
},
{
"epoch": 0.151487826871055,
"grad_norm": 1.408864080362016,
"learning_rate": 9.621491810487251e-06,
"loss": 1.367,
"step": 168
},
{
"epoch": 0.15238954012623984,
"grad_norm": 1.4146808141780156,
"learning_rate": 9.615895088525677e-06,
"loss": 1.3566,
"step": 169
},
{
"epoch": 0.1532912533814247,
"grad_norm": 1.3902356321346545,
"learning_rate": 9.61025894458374e-06,
"loss": 1.3764,
"step": 170
},
{
"epoch": 0.15419296663660956,
"grad_norm": 1.3597088612356067,
"learning_rate": 9.604583426796837e-06,
"loss": 1.351,
"step": 171
},
{
"epoch": 0.1550946798917944,
"grad_norm": 1.3720474596763996,
"learning_rate": 9.598868583636644e-06,
"loss": 1.3824,
"step": 172
},
{
"epoch": 0.15599639314697927,
"grad_norm": 1.3707229743231295,
"learning_rate": 9.593114463910687e-06,
"loss": 1.367,
"step": 173
},
{
"epoch": 0.15689810640216412,
"grad_norm": 1.3725553676605047,
"learning_rate": 9.587321116761938e-06,
"loss": 1.3599,
"step": 174
},
{
"epoch": 0.15779981965734896,
"grad_norm": 1.4142122379930755,
"learning_rate": 9.581488591668389e-06,
"loss": 1.3453,
"step": 175
},
{
"epoch": 0.1587015329125338,
"grad_norm": 1.3536864485589797,
"learning_rate": 9.57561693844263e-06,
"loss": 1.3353,
"step": 176
},
{
"epoch": 0.15960324616771868,
"grad_norm": 1.4132800800716323,
"learning_rate": 9.56970620723142e-06,
"loss": 1.3537,
"step": 177
},
{
"epoch": 0.16050495942290352,
"grad_norm": 1.3587637930643957,
"learning_rate": 9.563756448515273e-06,
"loss": 1.3526,
"step": 178
},
{
"epoch": 0.16140667267808836,
"grad_norm": 1.3765918418070524,
"learning_rate": 9.557767713108009e-06,
"loss": 1.3452,
"step": 179
},
{
"epoch": 0.1623083859332732,
"grad_norm": 1.3475505521784306,
"learning_rate": 9.551740052156326e-06,
"loss": 1.3572,
"step": 180
},
{
"epoch": 0.16321009918845808,
"grad_norm": 1.4357564962345402,
"learning_rate": 9.545673517139376e-06,
"loss": 1.3636,
"step": 181
},
{
"epoch": 0.16411181244364292,
"grad_norm": 1.4697976472825107,
"learning_rate": 9.5395681598683e-06,
"loss": 1.3441,
"step": 182
},
{
"epoch": 0.16501352569882777,
"grad_norm": 1.4148432779008302,
"learning_rate": 9.533424032485812e-06,
"loss": 1.3691,
"step": 183
},
{
"epoch": 0.1659152389540126,
"grad_norm": 1.459319873185255,
"learning_rate": 9.527241187465735e-06,
"loss": 1.3249,
"step": 184
},
{
"epoch": 0.16681695220919748,
"grad_norm": 1.3570335357652492,
"learning_rate": 9.521019677612559e-06,
"loss": 1.3674,
"step": 185
},
{
"epoch": 0.16771866546438233,
"grad_norm": 1.3486828991979471,
"learning_rate": 9.514759556060996e-06,
"loss": 1.3375,
"step": 186
},
{
"epoch": 0.16862037871956717,
"grad_norm": 1.373345392575501,
"learning_rate": 9.508460876275514e-06,
"loss": 1.3231,
"step": 187
},
{
"epoch": 0.16952209197475204,
"grad_norm": 1.3929600168838754,
"learning_rate": 9.502123692049889e-06,
"loss": 1.3471,
"step": 188
},
{
"epoch": 0.1704238052299369,
"grad_norm": 1.4193281036609189,
"learning_rate": 9.49574805750675e-06,
"loss": 1.3619,
"step": 189
},
{
"epoch": 0.17132551848512173,
"grad_norm": 1.3910185797527803,
"learning_rate": 9.4893340270971e-06,
"loss": 1.3498,
"step": 190
},
{
"epoch": 0.17222723174030657,
"grad_norm": 1.3411529365941561,
"learning_rate": 9.482881655599867e-06,
"loss": 1.363,
"step": 191
},
{
"epoch": 0.17312894499549145,
"grad_norm": 1.3530788239084923,
"learning_rate": 9.47639099812143e-06,
"loss": 1.3447,
"step": 192
},
{
"epoch": 0.1740306582506763,
"grad_norm": 1.4060276890862744,
"learning_rate": 9.46986211009515e-06,
"loss": 1.3603,
"step": 193
},
{
"epoch": 0.17493237150586113,
"grad_norm": 1.4002742616983794,
"learning_rate": 9.463295047280892e-06,
"loss": 1.325,
"step": 194
},
{
"epoch": 0.17583408476104598,
"grad_norm": 1.4079001802402094,
"learning_rate": 9.456689865764554e-06,
"loss": 1.3732,
"step": 195
},
{
"epoch": 0.17673579801623085,
"grad_norm": 1.366324818080461,
"learning_rate": 9.450046621957587e-06,
"loss": 1.3497,
"step": 196
},
{
"epoch": 0.1776375112714157,
"grad_norm": 1.366857559507007,
"learning_rate": 9.443365372596511e-06,
"loss": 1.3287,
"step": 197
},
{
"epoch": 0.17853922452660054,
"grad_norm": 1.3873422124784134,
"learning_rate": 9.436646174742432e-06,
"loss": 1.341,
"step": 198
},
{
"epoch": 0.17944093778178538,
"grad_norm": 1.3742935051526575,
"learning_rate": 9.429889085780559e-06,
"loss": 1.3247,
"step": 199
},
{
"epoch": 0.18034265103697025,
"grad_norm": 1.4007870712786872,
"learning_rate": 9.4230941634197e-06,
"loss": 1.3604,
"step": 200
},
{
"epoch": 0.1812443642921551,
"grad_norm": 1.340061281395059,
"learning_rate": 9.416261465691786e-06,
"loss": 1.3594,
"step": 201
},
{
"epoch": 0.18214607754733994,
"grad_norm": 1.4279648538396195,
"learning_rate": 9.409391050951367e-06,
"loss": 1.3556,
"step": 202
},
{
"epoch": 0.18304779080252478,
"grad_norm": 1.3474825489077324,
"learning_rate": 9.402482977875112e-06,
"loss": 1.3348,
"step": 203
},
{
"epoch": 0.18394950405770966,
"grad_norm": 1.3021713820720349,
"learning_rate": 9.395537305461312e-06,
"loss": 1.3372,
"step": 204
},
{
"epoch": 0.1848512173128945,
"grad_norm": 1.3439292199743982,
"learning_rate": 9.388554093029376e-06,
"loss": 1.3539,
"step": 205
},
{
"epoch": 0.18575293056807934,
"grad_norm": 1.3572209464576004,
"learning_rate": 9.381533400219319e-06,
"loss": 1.3227,
"step": 206
},
{
"epoch": 0.18665464382326422,
"grad_norm": 1.3727277388728627,
"learning_rate": 9.37447528699126e-06,
"loss": 1.3767,
"step": 207
},
{
"epoch": 0.18755635707844906,
"grad_norm": 1.3897163176087035,
"learning_rate": 9.367379813624908e-06,
"loss": 1.3304,
"step": 208
},
{
"epoch": 0.1884580703336339,
"grad_norm": 1.3659826029945907,
"learning_rate": 9.36024704071904e-06,
"loss": 1.3495,
"step": 209
},
{
"epoch": 0.18935978358881875,
"grad_norm": 1.4281647829676292,
"learning_rate": 9.35307702919099e-06,
"loss": 1.3315,
"step": 210
},
{
"epoch": 0.19026149684400362,
"grad_norm": 1.425082817286243,
"learning_rate": 9.345869840276138e-06,
"loss": 1.3374,
"step": 211
},
{
"epoch": 0.19116321009918846,
"grad_norm": 1.3669686996346657,
"learning_rate": 9.338625535527363e-06,
"loss": 1.329,
"step": 212
},
{
"epoch": 0.1920649233543733,
"grad_norm": 1.4493780187902503,
"learning_rate": 9.331344176814537e-06,
"loss": 1.3309,
"step": 213
},
{
"epoch": 0.19296663660955815,
"grad_norm": 1.3984652860472455,
"learning_rate": 9.324025826323995e-06,
"loss": 1.3447,
"step": 214
},
{
"epoch": 0.19386834986474302,
"grad_norm": 1.3758656367160043,
"learning_rate": 9.316670546557994e-06,
"loss": 1.3339,
"step": 215
},
{
"epoch": 0.19477006311992787,
"grad_norm": 1.3483825039725506,
"learning_rate": 9.309278400334184e-06,
"loss": 1.329,
"step": 216
},
{
"epoch": 0.1956717763751127,
"grad_norm": 1.4046844670196472,
"learning_rate": 9.301849450785077e-06,
"loss": 1.3239,
"step": 217
},
{
"epoch": 0.19657348963029755,
"grad_norm": 1.378369632031669,
"learning_rate": 9.294383761357503e-06,
"loss": 1.3293,
"step": 218
},
{
"epoch": 0.19747520288548243,
"grad_norm": 1.390583412942,
"learning_rate": 9.286881395812066e-06,
"loss": 1.3583,
"step": 219
},
{
"epoch": 0.19837691614066727,
"grad_norm": 1.3569691678927214,
"learning_rate": 9.279342418222602e-06,
"loss": 1.3416,
"step": 220
},
{
"epoch": 0.1992786293958521,
"grad_norm": 1.5011309111070126,
"learning_rate": 9.271766892975632e-06,
"loss": 1.3408,
"step": 221
},
{
"epoch": 0.20018034265103696,
"grad_norm": 1.3022805869624663,
"learning_rate": 9.264154884769811e-06,
"loss": 1.3236,
"step": 222
},
{
"epoch": 0.20108205590622183,
"grad_norm": 1.387897289165249,
"learning_rate": 9.256506458615378e-06,
"loss": 1.3469,
"step": 223
},
{
"epoch": 0.20198376916140667,
"grad_norm": 1.4397245147743074,
"learning_rate": 9.248821679833596e-06,
"loss": 1.3522,
"step": 224
},
{
"epoch": 0.20288548241659152,
"grad_norm": 1.3137706884917066,
"learning_rate": 9.241100614056202e-06,
"loss": 1.3244,
"step": 225
},
{
"epoch": 0.2037871956717764,
"grad_norm": 1.3663543578550792,
"learning_rate": 9.233343327224836e-06,
"loss": 1.3152,
"step": 226
},
{
"epoch": 0.20468890892696123,
"grad_norm": 1.349090231463568,
"learning_rate": 9.225549885590487e-06,
"loss": 1.3465,
"step": 227
},
{
"epoch": 0.20559062218214608,
"grad_norm": 1.4177971106430631,
"learning_rate": 9.217720355712924e-06,
"loss": 1.3592,
"step": 228
},
{
"epoch": 0.20649233543733092,
"grad_norm": 1.4430064774802602,
"learning_rate": 9.209854804460121e-06,
"loss": 1.3283,
"step": 229
},
{
"epoch": 0.2073940486925158,
"grad_norm": 1.380627101897418,
"learning_rate": 9.2019532990077e-06,
"loss": 1.3315,
"step": 230
},
{
"epoch": 0.20829576194770064,
"grad_norm": 1.3293715152695407,
"learning_rate": 9.194015906838345e-06,
"loss": 1.3191,
"step": 231
},
{
"epoch": 0.20919747520288548,
"grad_norm": 1.408506235413438,
"learning_rate": 9.186042695741228e-06,
"loss": 1.3445,
"step": 232
},
{
"epoch": 0.21009918845807032,
"grad_norm": 1.4125157387882301,
"learning_rate": 9.17803373381144e-06,
"loss": 1.3189,
"step": 233
},
{
"epoch": 0.2110009017132552,
"grad_norm": 1.3525250458202043,
"learning_rate": 9.16998908944939e-06,
"loss": 1.3423,
"step": 234
},
{
"epoch": 0.21190261496844004,
"grad_norm": 1.3865242623211698,
"learning_rate": 9.161908831360242e-06,
"loss": 1.3126,
"step": 235
},
{
"epoch": 0.21280432822362488,
"grad_norm": 1.377272880444935,
"learning_rate": 9.153793028553314e-06,
"loss": 1.3309,
"step": 236
},
{
"epoch": 0.21370604147880973,
"grad_norm": 1.341817175736238,
"learning_rate": 9.145641750341495e-06,
"loss": 1.3212,
"step": 237
},
{
"epoch": 0.2146077547339946,
"grad_norm": 1.3248285979696608,
"learning_rate": 9.137455066340647e-06,
"loss": 1.3317,
"step": 238
},
{
"epoch": 0.21550946798917944,
"grad_norm": 1.395068585478875,
"learning_rate": 9.129233046469021e-06,
"loss": 1.337,
"step": 239
},
{
"epoch": 0.2164111812443643,
"grad_norm": 1.326627012011638,
"learning_rate": 9.120975760946649e-06,
"loss": 1.3243,
"step": 240
},
{
"epoch": 0.21731289449954913,
"grad_norm": 1.3411117908902908,
"learning_rate": 9.11268328029475e-06,
"loss": 1.3478,
"step": 241
},
{
"epoch": 0.218214607754734,
"grad_norm": 1.354756713038773,
"learning_rate": 9.104355675335124e-06,
"loss": 1.3342,
"step": 242
},
{
"epoch": 0.21911632100991885,
"grad_norm": 1.4335828362826124,
"learning_rate": 9.095993017189554e-06,
"loss": 1.3222,
"step": 243
},
{
"epoch": 0.2200180342651037,
"grad_norm": 1.368829614316604,
"learning_rate": 9.087595377279192e-06,
"loss": 1.3337,
"step": 244
},
{
"epoch": 0.22091974752028856,
"grad_norm": 1.345046468626478,
"learning_rate": 9.079162827323951e-06,
"loss": 1.3293,
"step": 245
},
{
"epoch": 0.2218214607754734,
"grad_norm": 1.4050717986225727,
"learning_rate": 9.070695439341894e-06,
"loss": 1.319,
"step": 246
},
{
"epoch": 0.22272317403065825,
"grad_norm": 1.334690550660624,
"learning_rate": 9.062193285648616e-06,
"loss": 1.3142,
"step": 247
},
{
"epoch": 0.2236248872858431,
"grad_norm": 1.336435641504262,
"learning_rate": 9.053656438856629e-06,
"loss": 1.3453,
"step": 248
},
{
"epoch": 0.22452660054102797,
"grad_norm": 1.4419914394186921,
"learning_rate": 9.045084971874738e-06,
"loss": 1.3324,
"step": 249
},
{
"epoch": 0.2254283137962128,
"grad_norm": 1.333464805154606,
"learning_rate": 9.036478957907426e-06,
"loss": 1.3299,
"step": 250
},
{
"epoch": 0.22633002705139765,
"grad_norm": 1.4001946831119945,
"learning_rate": 9.027838470454222e-06,
"loss": 1.3152,
"step": 251
},
{
"epoch": 0.2272317403065825,
"grad_norm": 1.3082432976301495,
"learning_rate": 9.019163583309077e-06,
"loss": 1.3188,
"step": 252
},
{
"epoch": 0.22813345356176737,
"grad_norm": 1.3085213764833319,
"learning_rate": 9.010454370559723e-06,
"loss": 1.3324,
"step": 253
},
{
"epoch": 0.2290351668169522,
"grad_norm": 1.4543733572461475,
"learning_rate": 9.001710906587064e-06,
"loss": 1.3465,
"step": 254
},
{
"epoch": 0.22993688007213706,
"grad_norm": 1.41721500158758,
"learning_rate": 8.992933266064514e-06,
"loss": 1.3262,
"step": 255
},
{
"epoch": 0.2308385933273219,
"grad_norm": 1.2985535487758335,
"learning_rate": 8.984121523957376e-06,
"loss": 1.3244,
"step": 256
},
{
"epoch": 0.23174030658250677,
"grad_norm": 1.3463578437506631,
"learning_rate": 8.9752757555222e-06,
"loss": 1.3237,
"step": 257
},
{
"epoch": 0.23264201983769162,
"grad_norm": 1.3789337432141782,
"learning_rate": 8.96639603630613e-06,
"loss": 1.3008,
"step": 258
},
{
"epoch": 0.23354373309287646,
"grad_norm": 1.305138757492257,
"learning_rate": 8.957482442146271e-06,
"loss": 1.3433,
"step": 259
},
{
"epoch": 0.2344454463480613,
"grad_norm": 1.3237926877005564,
"learning_rate": 8.948535049169038e-06,
"loss": 1.3605,
"step": 260
},
{
"epoch": 0.23534715960324618,
"grad_norm": 1.3324147166992832,
"learning_rate": 8.939553933789499e-06,
"loss": 1.3266,
"step": 261
},
{
"epoch": 0.23624887285843102,
"grad_norm": 1.3695811772880973,
"learning_rate": 8.93053917271073e-06,
"loss": 1.3074,
"step": 262
},
{
"epoch": 0.23715058611361586,
"grad_norm": 1.3544201012293267,
"learning_rate": 8.921490842923164e-06,
"loss": 1.3187,
"step": 263
},
{
"epoch": 0.23805229936880073,
"grad_norm": 1.3484671838622388,
"learning_rate": 8.912409021703914e-06,
"loss": 1.3293,
"step": 264
},
{
"epoch": 0.23895401262398558,
"grad_norm": 1.4310584062261378,
"learning_rate": 8.903293786616136e-06,
"loss": 1.3142,
"step": 265
},
{
"epoch": 0.23985572587917042,
"grad_norm": 1.3744205816678494,
"learning_rate": 8.894145215508355e-06,
"loss": 1.3398,
"step": 266
},
{
"epoch": 0.24075743913435527,
"grad_norm": 1.3790009084369972,
"learning_rate": 8.884963386513798e-06,
"loss": 1.3037,
"step": 267
},
{
"epoch": 0.24165915238954014,
"grad_norm": 1.450743624351617,
"learning_rate": 8.875748378049734e-06,
"loss": 1.3258,
"step": 268
},
{
"epoch": 0.24256086564472498,
"grad_norm": 1.374563750527912,
"learning_rate": 8.866500268816803e-06,
"loss": 1.2894,
"step": 269
},
{
"epoch": 0.24346257889990983,
"grad_norm": 1.3361505315525928,
"learning_rate": 8.857219137798331e-06,
"loss": 1.3078,
"step": 270
},
{
"epoch": 0.24436429215509467,
"grad_norm": 1.4356181066392604,
"learning_rate": 8.847905064259683e-06,
"loss": 1.3074,
"step": 271
},
{
"epoch": 0.24526600541027954,
"grad_norm": 1.429244400428148,
"learning_rate": 8.838558127747551e-06,
"loss": 1.3456,
"step": 272
},
{
"epoch": 0.24616771866546439,
"grad_norm": 1.315895913876898,
"learning_rate": 8.829178408089305e-06,
"loss": 1.3021,
"step": 273
},
{
"epoch": 0.24706943192064923,
"grad_norm": 1.3578720410840832,
"learning_rate": 8.819765985392297e-06,
"loss": 1.3145,
"step": 274
},
{
"epoch": 0.24797114517583407,
"grad_norm": 1.3227358335583927,
"learning_rate": 8.810320940043173e-06,
"loss": 1.2991,
"step": 275
},
{
"epoch": 0.24887285843101895,
"grad_norm": 1.3473711974386464,
"learning_rate": 8.800843352707197e-06,
"loss": 1.3305,
"step": 276
},
{
"epoch": 0.2497745716862038,
"grad_norm": 1.3837401955745958,
"learning_rate": 8.79133330432756e-06,
"loss": 1.3239,
"step": 277
},
{
"epoch": 0.25067628494138866,
"grad_norm": 1.3473227503086935,
"learning_rate": 8.781790876124679e-06,
"loss": 1.3422,
"step": 278
},
{
"epoch": 0.2515779981965735,
"grad_norm": 1.3098795045608111,
"learning_rate": 8.772216149595515e-06,
"loss": 1.3196,
"step": 279
},
{
"epoch": 0.25247971145175835,
"grad_norm": 1.3488357463698006,
"learning_rate": 8.762609206512871e-06,
"loss": 1.3021,
"step": 280
},
{
"epoch": 0.2533814247069432,
"grad_norm": 1.3906118010589408,
"learning_rate": 8.752970128924696e-06,
"loss": 1.2946,
"step": 281
},
{
"epoch": 0.25428313796212804,
"grad_norm": 1.314415592417016,
"learning_rate": 8.743298999153382e-06,
"loss": 1.2997,
"step": 282
},
{
"epoch": 0.2551848512173129,
"grad_norm": 1.3428549757574573,
"learning_rate": 8.733595899795065e-06,
"loss": 1.3446,
"step": 283
},
{
"epoch": 0.2560865644724977,
"grad_norm": 1.4193472151599897,
"learning_rate": 8.72386091371891e-06,
"loss": 1.3319,
"step": 284
},
{
"epoch": 0.2569882777276826,
"grad_norm": 1.407796205568918,
"learning_rate": 8.714094124066417e-06,
"loss": 1.3153,
"step": 285
},
{
"epoch": 0.25788999098286747,
"grad_norm": 1.368919703126466,
"learning_rate": 8.704295614250702e-06,
"loss": 1.3227,
"step": 286
},
{
"epoch": 0.2587917042380523,
"grad_norm": 1.4588445043175615,
"learning_rate": 8.694465467955787e-06,
"loss": 1.3217,
"step": 287
},
{
"epoch": 0.25969341749323716,
"grad_norm": 1.375947403721078,
"learning_rate": 8.68460376913588e-06,
"loss": 1.3237,
"step": 288
},
{
"epoch": 0.260595130748422,
"grad_norm": 1.4003625630261938,
"learning_rate": 8.674710602014672e-06,
"loss": 1.3122,
"step": 289
},
{
"epoch": 0.26149684400360684,
"grad_norm": 1.3382545085519817,
"learning_rate": 8.664786051084597e-06,
"loss": 1.3101,
"step": 290
},
{
"epoch": 0.2623985572587917,
"grad_norm": 1.4028482237388922,
"learning_rate": 8.654830201106133e-06,
"loss": 1.3257,
"step": 291
},
{
"epoch": 0.26330027051397653,
"grad_norm": 1.3153731433952243,
"learning_rate": 8.644843137107058e-06,
"loss": 1.3028,
"step": 292
},
{
"epoch": 0.26420198376916143,
"grad_norm": 1.3764170057217833,
"learning_rate": 8.634824944381742e-06,
"loss": 1.3147,
"step": 293
},
{
"epoch": 0.2651036970243463,
"grad_norm": 1.359807928213873,
"learning_rate": 8.624775708490403e-06,
"loss": 1.2961,
"step": 294
},
{
"epoch": 0.2660054102795311,
"grad_norm": 1.3720659056268978,
"learning_rate": 8.61469551525838e-06,
"loss": 1.2905,
"step": 295
},
{
"epoch": 0.26690712353471596,
"grad_norm": 1.3558845329560982,
"learning_rate": 8.604584450775414e-06,
"loss": 1.3164,
"step": 296
},
{
"epoch": 0.2678088367899008,
"grad_norm": 1.369224365175921,
"learning_rate": 8.594442601394889e-06,
"loss": 1.3027,
"step": 297
},
{
"epoch": 0.26871055004508565,
"grad_norm": 1.3395926586651208,
"learning_rate": 8.584270053733112e-06,
"loss": 1.2874,
"step": 298
},
{
"epoch": 0.2696122633002705,
"grad_norm": 1.3410001780528837,
"learning_rate": 8.574066894668573e-06,
"loss": 1.3137,
"step": 299
},
{
"epoch": 0.27051397655545534,
"grad_norm": 1.354326879069816,
"learning_rate": 8.56383321134119e-06,
"loss": 1.3243,
"step": 300
},
{
"epoch": 0.27141568981064024,
"grad_norm": 1.3373692508440478,
"learning_rate": 8.553569091151576e-06,
"loss": 1.3162,
"step": 301
},
{
"epoch": 0.2723174030658251,
"grad_norm": 1.308680304455333,
"learning_rate": 8.543274621760294e-06,
"loss": 1.3215,
"step": 302
},
{
"epoch": 0.2732191163210099,
"grad_norm": 1.3423833776970107,
"learning_rate": 8.532949891087095e-06,
"loss": 1.3025,
"step": 303
},
{
"epoch": 0.27412082957619477,
"grad_norm": 1.3888393861713075,
"learning_rate": 8.522594987310184e-06,
"loss": 1.3124,
"step": 304
},
{
"epoch": 0.2750225428313796,
"grad_norm": 1.3464218824388667,
"learning_rate": 8.512209998865457e-06,
"loss": 1.292,
"step": 305
},
{
"epoch": 0.27592425608656446,
"grad_norm": 1.3555408304024268,
"learning_rate": 8.501795014445746e-06,
"loss": 1.3027,
"step": 306
},
{
"epoch": 0.2768259693417493,
"grad_norm": 1.3546921352993,
"learning_rate": 8.491350123000061e-06,
"loss": 1.3414,
"step": 307
},
{
"epoch": 0.2777276825969342,
"grad_norm": 1.3141979508125459,
"learning_rate": 8.48087541373284e-06,
"loss": 1.2961,
"step": 308
},
{
"epoch": 0.27862939585211904,
"grad_norm": 1.2874735002984588,
"learning_rate": 8.470370976103171e-06,
"loss": 1.3051,
"step": 309
},
{
"epoch": 0.2795311091073039,
"grad_norm": 1.3614748416247762,
"learning_rate": 8.45983689982404e-06,
"loss": 1.2838,
"step": 310
},
{
"epoch": 0.28043282236248873,
"grad_norm": 1.3225207951086084,
"learning_rate": 8.449273274861566e-06,
"loss": 1.3064,
"step": 311
},
{
"epoch": 0.2813345356176736,
"grad_norm": 1.2914181710971653,
"learning_rate": 8.438680191434221e-06,
"loss": 1.293,
"step": 312
},
{
"epoch": 0.2822362488728584,
"grad_norm": 1.3066035639024423,
"learning_rate": 8.428057740012073e-06,
"loss": 1.2807,
"step": 313
},
{
"epoch": 0.28313796212804326,
"grad_norm": 1.3473408996502214,
"learning_rate": 8.417406011316e-06,
"loss": 1.3109,
"step": 314
},
{
"epoch": 0.2840396753832281,
"grad_norm": 1.3630749762023038,
"learning_rate": 8.406725096316923e-06,
"loss": 1.2907,
"step": 315
},
{
"epoch": 0.284941388638413,
"grad_norm": 1.3055465275152396,
"learning_rate": 8.396015086235037e-06,
"loss": 1.2946,
"step": 316
},
{
"epoch": 0.28584310189359785,
"grad_norm": 1.3941479121765923,
"learning_rate": 8.385276072539014e-06,
"loss": 1.3111,
"step": 317
},
{
"epoch": 0.2867448151487827,
"grad_norm": 1.3493271368855428,
"learning_rate": 8.374508146945235e-06,
"loss": 1.3202,
"step": 318
},
{
"epoch": 0.28764652840396754,
"grad_norm": 1.2962490184385833,
"learning_rate": 8.363711401417e-06,
"loss": 1.3176,
"step": 319
},
{
"epoch": 0.2885482416591524,
"grad_norm": 1.4044951956948102,
"learning_rate": 8.352885928163748e-06,
"loss": 1.3084,
"step": 320
},
{
"epoch": 0.2894499549143372,
"grad_norm": 1.387641068650636,
"learning_rate": 8.342031819640263e-06,
"loss": 1.2983,
"step": 321
},
{
"epoch": 0.29035166816952207,
"grad_norm": 1.2797748641028517,
"learning_rate": 8.331149168545892e-06,
"loss": 1.2838,
"step": 322
},
{
"epoch": 0.29125338142470697,
"grad_norm": 1.3045921019734228,
"learning_rate": 8.320238067823749e-06,
"loss": 1.292,
"step": 323
},
{
"epoch": 0.2921550946798918,
"grad_norm": 1.3694421776920578,
"learning_rate": 8.309298610659917e-06,
"loss": 1.3046,
"step": 324
},
{
"epoch": 0.29305680793507666,
"grad_norm": 1.348666455986645,
"learning_rate": 8.298330890482661e-06,
"loss": 1.2992,
"step": 325
},
{
"epoch": 0.2939585211902615,
"grad_norm": 1.343752165915506,
"learning_rate": 8.28733500096163e-06,
"loss": 1.3167,
"step": 326
},
{
"epoch": 0.29486023444544635,
"grad_norm": 1.3977838323584155,
"learning_rate": 8.276311036007041e-06,
"loss": 1.2958,
"step": 327
},
{
"epoch": 0.2957619477006312,
"grad_norm": 1.3705652789984946,
"learning_rate": 8.2652590897689e-06,
"loss": 1.3303,
"step": 328
},
{
"epoch": 0.29666366095581603,
"grad_norm": 1.3378543071202886,
"learning_rate": 8.25417925663618e-06,
"loss": 1.3004,
"step": 329
},
{
"epoch": 0.2975653742110009,
"grad_norm": 1.3746197234875142,
"learning_rate": 8.243071631236023e-06,
"loss": 1.2947,
"step": 330
},
{
"epoch": 0.2984670874661858,
"grad_norm": 1.3322172718318712,
"learning_rate": 8.231936308432935e-06,
"loss": 1.3004,
"step": 331
},
{
"epoch": 0.2993688007213706,
"grad_norm": 1.3224285481826337,
"learning_rate": 8.220773383327964e-06,
"loss": 1.3201,
"step": 332
},
{
"epoch": 0.30027051397655546,
"grad_norm": 1.3659756588727383,
"learning_rate": 8.209582951257901e-06,
"loss": 1.293,
"step": 333
},
{
"epoch": 0.3011722272317403,
"grad_norm": 1.3750276505406167,
"learning_rate": 8.198365107794457e-06,
"loss": 1.2945,
"step": 334
},
{
"epoch": 0.30207394048692515,
"grad_norm": 1.3626396570368906,
"learning_rate": 8.18711994874345e-06,
"loss": 1.2826,
"step": 335
},
{
"epoch": 0.30297565374211,
"grad_norm": 1.349389254932144,
"learning_rate": 8.175847570143985e-06,
"loss": 1.3043,
"step": 336
},
{
"epoch": 0.30387736699729484,
"grad_norm": 1.4016282244858203,
"learning_rate": 8.164548068267638e-06,
"loss": 1.3022,
"step": 337
},
{
"epoch": 0.3047790802524797,
"grad_norm": 1.3460811412408489,
"learning_rate": 8.153221539617627e-06,
"loss": 1.3046,
"step": 338
},
{
"epoch": 0.3056807935076646,
"grad_norm": 1.299653550889351,
"learning_rate": 8.141868080927998e-06,
"loss": 1.2623,
"step": 339
},
{
"epoch": 0.3065825067628494,
"grad_norm": 1.3804264420920427,
"learning_rate": 8.130487789162784e-06,
"loss": 1.2922,
"step": 340
},
{
"epoch": 0.30748422001803427,
"grad_norm": 1.3738872809619862,
"learning_rate": 8.119080761515197e-06,
"loss": 1.3044,
"step": 341
},
{
"epoch": 0.3083859332732191,
"grad_norm": 1.3211682882089721,
"learning_rate": 8.107647095406773e-06,
"loss": 1.2938,
"step": 342
},
{
"epoch": 0.30928764652840396,
"grad_norm": 1.3569560470529722,
"learning_rate": 8.09618688848656e-06,
"loss": 1.2996,
"step": 343
},
{
"epoch": 0.3101893597835888,
"grad_norm": 1.3729719633915038,
"learning_rate": 8.084700238630283e-06,
"loss": 1.3086,
"step": 344
},
{
"epoch": 0.31109107303877365,
"grad_norm": 1.406994450093395,
"learning_rate": 8.073187243939494e-06,
"loss": 1.3043,
"step": 345
},
{
"epoch": 0.31199278629395855,
"grad_norm": 1.3654434645311497,
"learning_rate": 8.061648002740743e-06,
"loss": 1.3023,
"step": 346
},
{
"epoch": 0.3128944995491434,
"grad_norm": 1.3240616315684701,
"learning_rate": 8.050082613584745e-06,
"loss": 1.2766,
"step": 347
},
{
"epoch": 0.31379621280432823,
"grad_norm": 1.407262370116193,
"learning_rate": 8.038491175245523e-06,
"loss": 1.3004,
"step": 348
},
{
"epoch": 0.3146979260595131,
"grad_norm": 1.372462705333482,
"learning_rate": 8.026873786719574e-06,
"loss": 1.2837,
"step": 349
},
{
"epoch": 0.3155996393146979,
"grad_norm": 1.274181826236207,
"learning_rate": 8.01523054722503e-06,
"loss": 1.2945,
"step": 350
},
{
"epoch": 0.31650135256988277,
"grad_norm": 1.4141977100447898,
"learning_rate": 8.003561556200796e-06,
"loss": 1.2876,
"step": 351
},
{
"epoch": 0.3174030658250676,
"grad_norm": 1.3649136441566765,
"learning_rate": 7.991866913305705e-06,
"loss": 1.3149,
"step": 352
},
{
"epoch": 0.31830477908025245,
"grad_norm": 1.345609642702963,
"learning_rate": 7.980146718417677e-06,
"loss": 1.2899,
"step": 353
},
{
"epoch": 0.31920649233543735,
"grad_norm": 1.3482430455703702,
"learning_rate": 7.968401071632854e-06,
"loss": 1.2998,
"step": 354
},
{
"epoch": 0.3201082055906222,
"grad_norm": 1.4862722731895457,
"learning_rate": 7.956630073264746e-06,
"loss": 1.287,
"step": 355
},
{
"epoch": 0.32100991884580704,
"grad_norm": 1.3099568378155075,
"learning_rate": 7.94483382384339e-06,
"loss": 1.2857,
"step": 356
},
{
"epoch": 0.3219116321009919,
"grad_norm": 1.389663476713791,
"learning_rate": 7.933012424114463e-06,
"loss": 1.2643,
"step": 357
},
{
"epoch": 0.32281334535617673,
"grad_norm": 1.358115897466939,
"learning_rate": 7.92116597503845e-06,
"loss": 1.2963,
"step": 358
},
{
"epoch": 0.3237150586113616,
"grad_norm": 1.407271812376636,
"learning_rate": 7.909294577789765e-06,
"loss": 1.3218,
"step": 359
},
{
"epoch": 0.3246167718665464,
"grad_norm": 1.3215185181274458,
"learning_rate": 7.897398333755892e-06,
"loss": 1.2808,
"step": 360
},
{
"epoch": 0.3255184851217313,
"grad_norm": 1.3448206583595448,
"learning_rate": 7.885477344536516e-06,
"loss": 1.262,
"step": 361
},
{
"epoch": 0.32642019837691616,
"grad_norm": 1.3369298028154637,
"learning_rate": 7.873531711942664e-06,
"loss": 1.2948,
"step": 362
},
{
"epoch": 0.327321911632101,
"grad_norm": 1.3476691147339084,
"learning_rate": 7.861561537995825e-06,
"loss": 1.2867,
"step": 363
},
{
"epoch": 0.32822362488728585,
"grad_norm": 1.277983015349736,
"learning_rate": 7.849566924927082e-06,
"loss": 1.2919,
"step": 364
},
{
"epoch": 0.3291253381424707,
"grad_norm": 1.3747711906676852,
"learning_rate": 7.837547975176243e-06,
"loss": 1.2826,
"step": 365
},
{
"epoch": 0.33002705139765554,
"grad_norm": 1.4226836538925995,
"learning_rate": 7.825504791390962e-06,
"loss": 1.2753,
"step": 366
},
{
"epoch": 0.3309287646528404,
"grad_norm": 1.3591430506296809,
"learning_rate": 7.813437476425863e-06,
"loss": 1.315,
"step": 367
},
{
"epoch": 0.3318304779080252,
"grad_norm": 1.3810536824360335,
"learning_rate": 7.801346133341663e-06,
"loss": 1.2983,
"step": 368
},
{
"epoch": 0.3327321911632101,
"grad_norm": 1.3918849098123023,
"learning_rate": 7.789230865404287e-06,
"loss": 1.2789,
"step": 369
},
{
"epoch": 0.33363390441839497,
"grad_norm": 1.3944386013086512,
"learning_rate": 7.777091776083996e-06,
"loss": 1.3068,
"step": 370
},
{
"epoch": 0.3345356176735798,
"grad_norm": 1.3315482446866465,
"learning_rate": 7.764928969054493e-06,
"loss": 1.3001,
"step": 371
},
{
"epoch": 0.33543733092876465,
"grad_norm": 1.334078947941813,
"learning_rate": 7.752742548192042e-06,
"loss": 1.2957,
"step": 372
},
{
"epoch": 0.3363390441839495,
"grad_norm": 1.349358556672528,
"learning_rate": 7.74053261757458e-06,
"loss": 1.281,
"step": 373
},
{
"epoch": 0.33724075743913434,
"grad_norm": 1.3764708203915843,
"learning_rate": 7.728299281480833e-06,
"loss": 1.2959,
"step": 374
},
{
"epoch": 0.3381424706943192,
"grad_norm": 1.3835774406343864,
"learning_rate": 7.716042644389417e-06,
"loss": 1.2834,
"step": 375
},
{
"epoch": 0.3390441839495041,
"grad_norm": 1.407266558876184,
"learning_rate": 7.70376281097795e-06,
"loss": 1.2942,
"step": 376
},
{
"epoch": 0.33994589720468893,
"grad_norm": 1.3515850606540596,
"learning_rate": 7.69145988612216e-06,
"loss": 1.2577,
"step": 377
},
{
"epoch": 0.3408476104598738,
"grad_norm": 1.307205353895994,
"learning_rate": 7.679133974894984e-06,
"loss": 1.2955,
"step": 378
},
{
"epoch": 0.3417493237150586,
"grad_norm": 1.4701520498603482,
"learning_rate": 7.666785182565676e-06,
"loss": 1.2532,
"step": 379
},
{
"epoch": 0.34265103697024346,
"grad_norm": 1.3260869562172477,
"learning_rate": 7.654413614598905e-06,
"loss": 1.3014,
"step": 380
},
{
"epoch": 0.3435527502254283,
"grad_norm": 1.3383055059934015,
"learning_rate": 7.642019376653858e-06,
"loss": 1.2616,
"step": 381
},
{
"epoch": 0.34445446348061315,
"grad_norm": 1.3342827241300619,
"learning_rate": 7.62960257458333e-06,
"loss": 1.2798,
"step": 382
},
{
"epoch": 0.345356176735798,
"grad_norm": 1.3650978733267973,
"learning_rate": 7.617163314432825e-06,
"loss": 1.2619,
"step": 383
},
{
"epoch": 0.3462578899909829,
"grad_norm": 1.2878440106478128,
"learning_rate": 7.604701702439652e-06,
"loss": 1.2949,
"step": 384
},
{
"epoch": 0.34715960324616774,
"grad_norm": 1.3114645587549885,
"learning_rate": 7.592217845032016e-06,
"loss": 1.2857,
"step": 385
},
{
"epoch": 0.3480613165013526,
"grad_norm": 1.312097101465185,
"learning_rate": 7.579711848828106e-06,
"loss": 1.2875,
"step": 386
},
{
"epoch": 0.3489630297565374,
"grad_norm": 1.351670846135159,
"learning_rate": 7.567183820635189e-06,
"loss": 1.2838,
"step": 387
},
{
"epoch": 0.34986474301172227,
"grad_norm": 1.3153701472924362,
"learning_rate": 7.554633867448695e-06,
"loss": 1.2935,
"step": 388
},
{
"epoch": 0.3507664562669071,
"grad_norm": 1.3124645024087132,
"learning_rate": 7.542062096451306e-06,
"loss": 1.2747,
"step": 389
},
{
"epoch": 0.35166816952209196,
"grad_norm": 1.2839138356985629,
"learning_rate": 7.5294686150120345e-06,
"loss": 1.2661,
"step": 390
},
{
"epoch": 0.3525698827772768,
"grad_norm": 1.3058425890142953,
"learning_rate": 7.5168535306853155e-06,
"loss": 1.2878,
"step": 391
},
{
"epoch": 0.3534715960324617,
"grad_norm": 1.3249207369867737,
"learning_rate": 7.50421695121008e-06,
"loss": 1.2868,
"step": 392
},
{
"epoch": 0.35437330928764654,
"grad_norm": 1.2942765461903978,
"learning_rate": 7.491558984508838e-06,
"loss": 1.2862,
"step": 393
},
{
"epoch": 0.3552750225428314,
"grad_norm": 1.3224112637420926,
"learning_rate": 7.4788797386867596e-06,
"loss": 1.2769,
"step": 394
},
{
"epoch": 0.35617673579801623,
"grad_norm": 1.3206566542639389,
"learning_rate": 7.466179322030746e-06,
"loss": 1.2846,
"step": 395
},
{
"epoch": 0.3570784490532011,
"grad_norm": 1.3631450867826957,
"learning_rate": 7.453457843008509e-06,
"loss": 1.284,
"step": 396
},
{
"epoch": 0.3579801623083859,
"grad_norm": 1.3218571416387632,
"learning_rate": 7.4407154102676425e-06,
"loss": 1.3038,
"step": 397
},
{
"epoch": 0.35888187556357076,
"grad_norm": 1.317177282255559,
"learning_rate": 7.427952132634694e-06,
"loss": 1.2509,
"step": 398
},
{
"epoch": 0.35978358881875566,
"grad_norm": 1.3276673394491625,
"learning_rate": 7.41516811911424e-06,
"loss": 1.2644,
"step": 399
},
{
"epoch": 0.3606853020739405,
"grad_norm": 1.280809217458966,
"learning_rate": 7.402363478887948e-06,
"loss": 1.285,
"step": 400
},
{
"epoch": 0.36158701532912535,
"grad_norm": 1.3571731498903,
"learning_rate": 7.389538321313652e-06,
"loss": 1.2977,
"step": 401
},
{
"epoch": 0.3624887285843102,
"grad_norm": 1.4009686853014174,
"learning_rate": 7.376692755924407e-06,
"loss": 1.2784,
"step": 402
},
{
"epoch": 0.36339044183949504,
"grad_norm": 1.2677194762164836,
"learning_rate": 7.363826892427568e-06,
"loss": 1.2985,
"step": 403
},
{
"epoch": 0.3642921550946799,
"grad_norm": 1.3137009718811887,
"learning_rate": 7.350940840703842e-06,
"loss": 1.2726,
"step": 404
},
{
"epoch": 0.3651938683498647,
"grad_norm": 1.2806871619916333,
"learning_rate": 7.338034710806353e-06,
"loss": 1.2854,
"step": 405
},
{
"epoch": 0.36609558160504957,
"grad_norm": 1.34164695933686,
"learning_rate": 7.3251086129597034e-06,
"loss": 1.2927,
"step": 406
},
{
"epoch": 0.36699729486023447,
"grad_norm": 1.3014689973098728,
"learning_rate": 7.312162657559031e-06,
"loss": 1.2824,
"step": 407
},
{
"epoch": 0.3678990081154193,
"grad_norm": 1.2963420961664436,
"learning_rate": 7.299196955169068e-06,
"loss": 1.2833,
"step": 408
},
{
"epoch": 0.36880072137060416,
"grad_norm": 1.2885380885948925,
"learning_rate": 7.286211616523193e-06,
"loss": 1.2802,
"step": 409
},
{
"epoch": 0.369702434625789,
"grad_norm": 1.2629464462465954,
"learning_rate": 7.2732067525224914e-06,
"loss": 1.2885,
"step": 410
},
{
"epoch": 0.37060414788097384,
"grad_norm": 1.2729298983223787,
"learning_rate": 7.2601824742347985e-06,
"loss": 1.2759,
"step": 411
},
{
"epoch": 0.3715058611361587,
"grad_norm": 1.3560121385795936,
"learning_rate": 7.247138892893765e-06,
"loss": 1.2683,
"step": 412
},
{
"epoch": 0.37240757439134353,
"grad_norm": 1.3408137997088863,
"learning_rate": 7.2340761198978916e-06,
"loss": 1.2827,
"step": 413
},
{
"epoch": 0.37330928764652843,
"grad_norm": 1.3745114451521934,
"learning_rate": 7.220994266809591e-06,
"loss": 1.2957,
"step": 414
},
{
"epoch": 0.3742110009017133,
"grad_norm": 1.346575107900885,
"learning_rate": 7.207893445354224e-06,
"loss": 1.2978,
"step": 415
},
{
"epoch": 0.3751127141568981,
"grad_norm": 1.2830969629139972,
"learning_rate": 7.1947737674191555e-06,
"loss": 1.2925,
"step": 416
},
{
"epoch": 0.37601442741208296,
"grad_norm": 1.3694758238273899,
"learning_rate": 7.1816353450527886e-06,
"loss": 1.2821,
"step": 417
},
{
"epoch": 0.3769161406672678,
"grad_norm": 1.3231983523784938,
"learning_rate": 7.1684782904636174e-06,
"loss": 1.2968,
"step": 418
},
{
"epoch": 0.37781785392245265,
"grad_norm": 1.2669291717660884,
"learning_rate": 7.155302716019263e-06,
"loss": 1.2601,
"step": 419
},
{
"epoch": 0.3787195671776375,
"grad_norm": 1.3454544044505505,
"learning_rate": 7.142108734245512e-06,
"loss": 1.3008,
"step": 420
},
{
"epoch": 0.37962128043282234,
"grad_norm": 1.3216303173172852,
"learning_rate": 7.128896457825364e-06,
"loss": 1.2821,
"step": 421
},
{
"epoch": 0.38052299368800724,
"grad_norm": 1.3829956233217842,
"learning_rate": 7.115665999598058e-06,
"loss": 1.2677,
"step": 422
},
{
"epoch": 0.3814247069431921,
"grad_norm": 1.312479941373894,
"learning_rate": 7.10241747255812e-06,
"loss": 1.2753,
"step": 423
},
{
"epoch": 0.3823264201983769,
"grad_norm": 1.3644599578334198,
"learning_rate": 7.089150989854385e-06,
"loss": 1.2736,
"step": 424
},
{
"epoch": 0.38322813345356177,
"grad_norm": 1.3270302655112538,
"learning_rate": 7.075866664789047e-06,
"loss": 1.2996,
"step": 425
},
{
"epoch": 0.3841298467087466,
"grad_norm": 1.3217555259246643,
"learning_rate": 7.062564610816678e-06,
"loss": 1.2545,
"step": 426
},
{
"epoch": 0.38503155996393146,
"grad_norm": 1.3491461800964386,
"learning_rate": 7.049244941543259e-06,
"loss": 1.291,
"step": 427
},
{
"epoch": 0.3859332732191163,
"grad_norm": 1.3556856304743925,
"learning_rate": 7.0359077707252235e-06,
"loss": 1.2747,
"step": 428
},
{
"epoch": 0.38683498647430115,
"grad_norm": 1.3561707578414417,
"learning_rate": 7.022553212268469e-06,
"loss": 1.2791,
"step": 429
},
{
"epoch": 0.38773669972948605,
"grad_norm": 1.3184506441485386,
"learning_rate": 7.0091813802273965e-06,
"loss": 1.2883,
"step": 430
},
{
"epoch": 0.3886384129846709,
"grad_norm": 1.263280337390235,
"learning_rate": 6.995792388803929e-06,
"loss": 1.2777,
"step": 431
},
{
"epoch": 0.38954012623985573,
"grad_norm": 1.297689514662243,
"learning_rate": 6.9823863523465405e-06,
"loss": 1.2461,
"step": 432
},
{
"epoch": 0.3904418394950406,
"grad_norm": 1.342033341696052,
"learning_rate": 6.968963385349277e-06,
"loss": 1.2509,
"step": 433
},
{
"epoch": 0.3913435527502254,
"grad_norm": 1.360711918633311,
"learning_rate": 6.95552360245078e-06,
"loss": 1.2967,
"step": 434
},
{
"epoch": 0.39224526600541026,
"grad_norm": 1.3324380530143383,
"learning_rate": 6.942067118433308e-06,
"loss": 1.2773,
"step": 435
},
{
"epoch": 0.3931469792605951,
"grad_norm": 1.3761059794482413,
"learning_rate": 6.92859404822175e-06,
"loss": 1.2832,
"step": 436
},
{
"epoch": 0.39404869251578,
"grad_norm": 1.3702237680815197,
"learning_rate": 6.9151045068826584e-06,
"loss": 1.2687,
"step": 437
},
{
"epoch": 0.39495040577096485,
"grad_norm": 1.3487692751034914,
"learning_rate": 6.9015986096232465e-06,
"loss": 1.291,
"step": 438
},
{
"epoch": 0.3958521190261497,
"grad_norm": 1.3424423254670161,
"learning_rate": 6.888076471790423e-06,
"loss": 1.2621,
"step": 439
},
{
"epoch": 0.39675383228133454,
"grad_norm": 1.3843979031440812,
"learning_rate": 6.874538208869797e-06,
"loss": 1.2767,
"step": 440
},
{
"epoch": 0.3976555455365194,
"grad_norm": 1.3069934768452458,
"learning_rate": 6.860983936484689e-06,
"loss": 1.2866,
"step": 441
},
{
"epoch": 0.3985572587917042,
"grad_norm": 1.3106394157833179,
"learning_rate": 6.8474137703951574e-06,
"loss": 1.2749,
"step": 442
},
{
"epoch": 0.39945897204688907,
"grad_norm": 1.2783459290470887,
"learning_rate": 6.83382782649699e-06,
"loss": 1.2763,
"step": 443
},
{
"epoch": 0.4003606853020739,
"grad_norm": 1.300911163405327,
"learning_rate": 6.820226220820733e-06,
"loss": 1.2837,
"step": 444
},
{
"epoch": 0.4012623985572588,
"grad_norm": 1.3533201412174218,
"learning_rate": 6.806609069530687e-06,
"loss": 1.2334,
"step": 445
},
{
"epoch": 0.40216411181244366,
"grad_norm": 1.3410157731632268,
"learning_rate": 6.7929764889239235e-06,
"loss": 1.2695,
"step": 446
},
{
"epoch": 0.4030658250676285,
"grad_norm": 1.3159135620461133,
"learning_rate": 6.779328595429282e-06,
"loss": 1.2759,
"step": 447
},
{
"epoch": 0.40396753832281335,
"grad_norm": 1.3429134308900144,
"learning_rate": 6.765665505606389e-06,
"loss": 1.2639,
"step": 448
},
{
"epoch": 0.4048692515779982,
"grad_norm": 1.358085645434167,
"learning_rate": 6.7519873361446475e-06,
"loss": 1.2709,
"step": 449
},
{
"epoch": 0.40577096483318303,
"grad_norm": 1.282126956537775,
"learning_rate": 6.738294203862255e-06,
"loss": 1.2801,
"step": 450
},
{
"epoch": 0.4066726780883679,
"grad_norm": 1.3820387277990962,
"learning_rate": 6.724586225705191e-06,
"loss": 1.2791,
"step": 451
},
{
"epoch": 0.4075743913435528,
"grad_norm": 1.3163223637459345,
"learning_rate": 6.710863518746233e-06,
"loss": 1.2556,
"step": 452
},
{
"epoch": 0.4084761045987376,
"grad_norm": 1.2796002323586544,
"learning_rate": 6.697126200183945e-06,
"loss": 1.2749,
"step": 453
},
{
"epoch": 0.40937781785392247,
"grad_norm": 1.3546933591445498,
"learning_rate": 6.683374387341688e-06,
"loss": 1.2883,
"step": 454
},
{
"epoch": 0.4102795311091073,
"grad_norm": 1.3487555368396058,
"learning_rate": 6.669608197666599e-06,
"loss": 1.2743,
"step": 455
},
{
"epoch": 0.41118124436429215,
"grad_norm": 1.266890989390273,
"learning_rate": 6.655827748728613e-06,
"loss": 1.2544,
"step": 456
},
{
"epoch": 0.412082957619477,
"grad_norm": 1.2531573983607907,
"learning_rate": 6.642033158219436e-06,
"loss": 1.2782,
"step": 457
},
{
"epoch": 0.41298467087466184,
"grad_norm": 1.2705610688755955,
"learning_rate": 6.628224543951558e-06,
"loss": 1.2573,
"step": 458
},
{
"epoch": 0.4138863841298467,
"grad_norm": 1.3037540862478307,
"learning_rate": 6.614402023857231e-06,
"loss": 1.2523,
"step": 459
},
{
"epoch": 0.4147880973850316,
"grad_norm": 1.315768394711074,
"learning_rate": 6.600565715987477e-06,
"loss": 1.3002,
"step": 460
},
{
"epoch": 0.41568981064021643,
"grad_norm": 1.2815374396487438,
"learning_rate": 6.586715738511067e-06,
"loss": 1.2452,
"step": 461
},
{
"epoch": 0.4165915238954013,
"grad_norm": 1.265492572389699,
"learning_rate": 6.5728522097135185e-06,
"loss": 1.2615,
"step": 462
},
{
"epoch": 0.4174932371505861,
"grad_norm": 1.3240543289156776,
"learning_rate": 6.558975247996082e-06,
"loss": 1.2809,
"step": 463
},
{
"epoch": 0.41839495040577096,
"grad_norm": 1.3155938565360743,
"learning_rate": 6.545084971874738e-06,
"loss": 1.2814,
"step": 464
},
{
"epoch": 0.4192966636609558,
"grad_norm": 1.373703900141433,
"learning_rate": 6.531181499979171e-06,
"loss": 1.2914,
"step": 465
},
{
"epoch": 0.42019837691614065,
"grad_norm": 1.240236493584311,
"learning_rate": 6.517264951051768e-06,
"loss": 1.2626,
"step": 466
},
{
"epoch": 0.4211000901713255,
"grad_norm": 1.2854276989826168,
"learning_rate": 6.503335443946599e-06,
"loss": 1.2403,
"step": 467
},
{
"epoch": 0.4220018034265104,
"grad_norm": 1.2747103544525322,
"learning_rate": 6.489393097628404e-06,
"loss": 1.2539,
"step": 468
},
{
"epoch": 0.42290351668169524,
"grad_norm": 1.2909245211989353,
"learning_rate": 6.475438031171574e-06,
"loss": 1.2429,
"step": 469
},
{
"epoch": 0.4238052299368801,
"grad_norm": 1.337002870116083,
"learning_rate": 6.461470363759138e-06,
"loss": 1.2849,
"step": 470
},
{
"epoch": 0.4247069431920649,
"grad_norm": 1.2988092746817106,
"learning_rate": 6.447490214681742e-06,
"loss": 1.2777,
"step": 471
},
{
"epoch": 0.42560865644724977,
"grad_norm": 1.317724826921231,
"learning_rate": 6.433497703336634e-06,
"loss": 1.2512,
"step": 472
},
{
"epoch": 0.4265103697024346,
"grad_norm": 1.2707143136330774,
"learning_rate": 6.419492949226639e-06,
"loss": 1.2728,
"step": 473
},
{
"epoch": 0.42741208295761945,
"grad_norm": 1.3083801478910981,
"learning_rate": 6.405476071959142e-06,
"loss": 1.292,
"step": 474
},
{
"epoch": 0.42831379621280435,
"grad_norm": 1.3054874743338112,
"learning_rate": 6.391447191245066e-06,
"loss": 1.2517,
"step": 475
},
{
"epoch": 0.4292155094679892,
"grad_norm": 1.2904740870179476,
"learning_rate": 6.3774064268978485e-06,
"loss": 1.2707,
"step": 476
},
{
"epoch": 0.43011722272317404,
"grad_norm": 1.2629518785414842,
"learning_rate": 6.363353898832421e-06,
"loss": 1.2582,
"step": 477
},
{
"epoch": 0.4310189359783589,
"grad_norm": 1.3089815906738431,
"learning_rate": 6.34928972706418e-06,
"loss": 1.2735,
"step": 478
},
{
"epoch": 0.43192064923354373,
"grad_norm": 1.290175664928981,
"learning_rate": 6.335214031707966e-06,
"loss": 1.2844,
"step": 479
},
{
"epoch": 0.4328223624887286,
"grad_norm": 1.2793599157516249,
"learning_rate": 6.321126932977035e-06,
"loss": 1.2853,
"step": 480
},
{
"epoch": 0.4337240757439134,
"grad_norm": 1.3000724086825444,
"learning_rate": 6.307028551182041e-06,
"loss": 1.2285,
"step": 481
},
{
"epoch": 0.43462578899909826,
"grad_norm": 1.3433631585110632,
"learning_rate": 6.292919006729988e-06,
"loss": 1.2548,
"step": 482
},
{
"epoch": 0.43552750225428316,
"grad_norm": 1.2938230816915852,
"learning_rate": 6.278798420123227e-06,
"loss": 1.2848,
"step": 483
},
{
"epoch": 0.436429215509468,
"grad_norm": 1.3968096528040583,
"learning_rate": 6.264666911958404e-06,
"loss": 1.277,
"step": 484
},
{
"epoch": 0.43733092876465285,
"grad_norm": 1.3270469545827397,
"learning_rate": 6.250524602925449e-06,
"loss": 1.2472,
"step": 485
},
{
"epoch": 0.4382326420198377,
"grad_norm": 1.3158907122253496,
"learning_rate": 6.23637161380653e-06,
"loss": 1.2371,
"step": 486
},
{
"epoch": 0.43913435527502254,
"grad_norm": 1.2974298920685672,
"learning_rate": 6.222208065475034e-06,
"loss": 1.2634,
"step": 487
},
{
"epoch": 0.4400360685302074,
"grad_norm": 1.2961515785195792,
"learning_rate": 6.208034078894523e-06,
"loss": 1.2948,
"step": 488
},
{
"epoch": 0.4409377817853922,
"grad_norm": 1.3046294501341769,
"learning_rate": 6.193849775117709e-06,
"loss": 1.2559,
"step": 489
},
{
"epoch": 0.4418394950405771,
"grad_norm": 1.3561539228341617,
"learning_rate": 6.179655275285422e-06,
"loss": 1.2522,
"step": 490
},
{
"epoch": 0.44274120829576197,
"grad_norm": 1.3657194147132745,
"learning_rate": 6.165450700625565e-06,
"loss": 1.2813,
"step": 491
},
{
"epoch": 0.4436429215509468,
"grad_norm": 1.3150013080989733,
"learning_rate": 6.151236172452086e-06,
"loss": 1.2724,
"step": 492
},
{
"epoch": 0.44454463480613166,
"grad_norm": 1.3131202427968371,
"learning_rate": 6.137011812163943e-06,
"loss": 1.2533,
"step": 493
},
{
"epoch": 0.4454463480613165,
"grad_norm": 1.3524564852985235,
"learning_rate": 6.122777741244067e-06,
"loss": 1.2631,
"step": 494
},
{
"epoch": 0.44634806131650134,
"grad_norm": 1.3458593194377417,
"learning_rate": 6.108534081258317e-06,
"loss": 1.2685,
"step": 495
},
{
"epoch": 0.4472497745716862,
"grad_norm": 1.330923016565149,
"learning_rate": 6.094280953854451e-06,
"loss": 1.2568,
"step": 496
},
{
"epoch": 0.44815148782687103,
"grad_norm": 1.328196318920164,
"learning_rate": 6.0800184807610815e-06,
"loss": 1.2646,
"step": 497
},
{
"epoch": 0.44905320108205593,
"grad_norm": 1.3543757841751654,
"learning_rate": 6.065746783786639e-06,
"loss": 1.2466,
"step": 498
},
{
"epoch": 0.4499549143372408,
"grad_norm": 1.3642324780253887,
"learning_rate": 6.051465984818332e-06,
"loss": 1.2723,
"step": 499
},
{
"epoch": 0.4508566275924256,
"grad_norm": 1.2543782903684808,
"learning_rate": 6.037176205821099e-06,
"loss": 1.265,
"step": 500
},
{
"epoch": 0.45175834084761046,
"grad_norm": 1.3520797825716413,
"learning_rate": 6.022877568836579e-06,
"loss": 1.271,
"step": 501
},
{
"epoch": 0.4526600541027953,
"grad_norm": 1.3381165664660035,
"learning_rate": 6.008570195982057e-06,
"loss": 1.2842,
"step": 502
},
{
"epoch": 0.45356176735798015,
"grad_norm": 1.3145159045552166,
"learning_rate": 5.9942542094494295e-06,
"loss": 1.2608,
"step": 503
},
{
"epoch": 0.454463480613165,
"grad_norm": 1.3776537193003155,
"learning_rate": 5.979929731504158e-06,
"loss": 1.2462,
"step": 504
},
{
"epoch": 0.45536519386834984,
"grad_norm": 1.3582248635145542,
"learning_rate": 5.9655968844842236e-06,
"loss": 1.2697,
"step": 505
},
{
"epoch": 0.45626690712353474,
"grad_norm": 1.397601016532863,
"learning_rate": 5.951255790799082e-06,
"loss": 1.2568,
"step": 506
},
{
"epoch": 0.4571686203787196,
"grad_norm": 1.3563867417958715,
"learning_rate": 5.936906572928625e-06,
"loss": 1.2427,
"step": 507
},
{
"epoch": 0.4580703336339044,
"grad_norm": 1.3042721613566737,
"learning_rate": 5.922549353422121e-06,
"loss": 1.2515,
"step": 508
},
{
"epoch": 0.45897204688908927,
"grad_norm": 1.3588624169364447,
"learning_rate": 5.908184254897183e-06,
"loss": 1.2818,
"step": 509
},
{
"epoch": 0.4598737601442741,
"grad_norm": 1.3477204486305108,
"learning_rate": 5.893811400038711e-06,
"loss": 1.2512,
"step": 510
},
{
"epoch": 0.46077547339945896,
"grad_norm": 1.2814432877128779,
"learning_rate": 5.87943091159785e-06,
"loss": 1.2307,
"step": 511
},
{
"epoch": 0.4616771866546438,
"grad_norm": 1.3786543590269573,
"learning_rate": 5.865042912390938e-06,
"loss": 1.2736,
"step": 512
},
{
"epoch": 0.4625788999098287,
"grad_norm": 1.2913891449053854,
"learning_rate": 5.850647525298457e-06,
"loss": 1.2452,
"step": 513
},
{
"epoch": 0.46348061316501354,
"grad_norm": 1.415181008314584,
"learning_rate": 5.836244873263989e-06,
"loss": 1.2264,
"step": 514
},
{
"epoch": 0.4643823264201984,
"grad_norm": 1.356445707006065,
"learning_rate": 5.8218350792931596e-06,
"loss": 1.2504,
"step": 515
},
{
"epoch": 0.46528403967538323,
"grad_norm": 1.3302986281953149,
"learning_rate": 5.807418266452591e-06,
"loss": 1.2422,
"step": 516
},
{
"epoch": 0.4661857529305681,
"grad_norm": 1.33730329817938,
"learning_rate": 5.792994557868851e-06,
"loss": 1.2566,
"step": 517
},
{
"epoch": 0.4670874661857529,
"grad_norm": 1.3745406587403888,
"learning_rate": 5.778564076727395e-06,
"loss": 1.2577,
"step": 518
},
{
"epoch": 0.46798917944093776,
"grad_norm": 1.3877226557278701,
"learning_rate": 5.764126946271526e-06,
"loss": 1.2332,
"step": 519
},
{
"epoch": 0.4688908926961226,
"grad_norm": 1.3523749190458996,
"learning_rate": 5.749683289801331e-06,
"loss": 1.2735,
"step": 520
},
{
"epoch": 0.4697926059513075,
"grad_norm": 1.2986294783132397,
"learning_rate": 5.735233230672636e-06,
"loss": 1.2509,
"step": 521
},
{
"epoch": 0.47069431920649235,
"grad_norm": 1.3292382069120443,
"learning_rate": 5.720776892295944e-06,
"loss": 1.2429,
"step": 522
},
{
"epoch": 0.4715960324616772,
"grad_norm": 1.3196463593122516,
"learning_rate": 5.70631439813539e-06,
"loss": 1.2614,
"step": 523
},
{
"epoch": 0.47249774571686204,
"grad_norm": 1.3365623305366012,
"learning_rate": 5.691845871707682e-06,
"loss": 1.2547,
"step": 524
},
{
"epoch": 0.4733994589720469,
"grad_norm": 1.4092965451878707,
"learning_rate": 5.677371436581044e-06,
"loss": 1.2522,
"step": 525
},
{
"epoch": 0.4743011722272317,
"grad_norm": 1.2958259510303567,
"learning_rate": 5.662891216374165e-06,
"loss": 1.2589,
"step": 526
},
{
"epoch": 0.47520288548241657,
"grad_norm": 1.314219195752724,
"learning_rate": 5.64840533475514e-06,
"loss": 1.264,
"step": 527
},
{
"epoch": 0.47610459873760147,
"grad_norm": 1.3183829593636753,
"learning_rate": 5.633913915440419e-06,
"loss": 1.2719,
"step": 528
},
{
"epoch": 0.4770063119927863,
"grad_norm": 1.363071460186982,
"learning_rate": 5.61941708219374e-06,
"loss": 1.2327,
"step": 529
},
{
"epoch": 0.47790802524797116,
"grad_norm": 1.328897114850557,
"learning_rate": 5.604914958825085e-06,
"loss": 1.2728,
"step": 530
},
{
"epoch": 0.478809738503156,
"grad_norm": 1.3490178940429087,
"learning_rate": 5.590407669189612e-06,
"loss": 1.2648,
"step": 531
},
{
"epoch": 0.47971145175834085,
"grad_norm": 1.3274020505027164,
"learning_rate": 5.575895337186605e-06,
"loss": 1.2312,
"step": 532
},
{
"epoch": 0.4806131650135257,
"grad_norm": 1.3042298628231705,
"learning_rate": 5.561378086758406e-06,
"loss": 1.2511,
"step": 533
},
{
"epoch": 0.48151487826871053,
"grad_norm": 1.2449161967710574,
"learning_rate": 5.546856041889374e-06,
"loss": 1.2528,
"step": 534
},
{
"epoch": 0.4824165915238954,
"grad_norm": 1.297681240745865,
"learning_rate": 5.5323293266047996e-06,
"loss": 1.2618,
"step": 535
},
{
"epoch": 0.4833183047790803,
"grad_norm": 1.25941931209134,
"learning_rate": 5.5177980649698744e-06,
"loss": 1.2449,
"step": 536
},
{
"epoch": 0.4842200180342651,
"grad_norm": 1.3103057695935634,
"learning_rate": 5.503262381088613e-06,
"loss": 1.2537,
"step": 537
},
{
"epoch": 0.48512173128944996,
"grad_norm": 1.2872216919055939,
"learning_rate": 5.488722399102796e-06,
"loss": 1.251,
"step": 538
},
{
"epoch": 0.4860234445446348,
"grad_norm": 1.335560503143788,
"learning_rate": 5.4741782431909144e-06,
"loss": 1.2464,
"step": 539
},
{
"epoch": 0.48692515779981965,
"grad_norm": 1.276852157297722,
"learning_rate": 5.459630037567105e-06,
"loss": 1.2418,
"step": 540
},
{
"epoch": 0.4878268710550045,
"grad_norm": 1.3990001265601495,
"learning_rate": 5.445077906480095e-06,
"loss": 1.2597,
"step": 541
},
{
"epoch": 0.48872858431018934,
"grad_norm": 1.2988789147578377,
"learning_rate": 5.430521974212132e-06,
"loss": 1.271,
"step": 542
},
{
"epoch": 0.4896302975653742,
"grad_norm": 1.289894149801735,
"learning_rate": 5.4159623650779305e-06,
"loss": 1.2396,
"step": 543
},
{
"epoch": 0.4905320108205591,
"grad_norm": 1.3361917628103448,
"learning_rate": 5.4013992034236065e-06,
"loss": 1.2806,
"step": 544
},
{
"epoch": 0.4914337240757439,
"grad_norm": 1.3851343658094326,
"learning_rate": 5.386832613625615e-06,
"loss": 1.2652,
"step": 545
},
{
"epoch": 0.49233543733092877,
"grad_norm": 1.3460734085077293,
"learning_rate": 5.3722627200896894e-06,
"loss": 1.2381,
"step": 546
},
{
"epoch": 0.4932371505861136,
"grad_norm": 1.3361567213666667,
"learning_rate": 5.357689647249782e-06,
"loss": 1.2388,
"step": 547
},
{
"epoch": 0.49413886384129846,
"grad_norm": 1.2889281104821497,
"learning_rate": 5.343113519566994e-06,
"loss": 1.2488,
"step": 548
},
{
"epoch": 0.4950405770964833,
"grad_norm": 1.2997408839425744,
"learning_rate": 5.328534461528515e-06,
"loss": 1.2575,
"step": 549
},
{
"epoch": 0.49594229035166815,
"grad_norm": 1.3549331356810177,
"learning_rate": 5.3139525976465675e-06,
"loss": 1.2639,
"step": 550
},
{
"epoch": 0.49684400360685305,
"grad_norm": 1.3051590759911373,
"learning_rate": 5.299368052457332e-06,
"loss": 1.2566,
"step": 551
},
{
"epoch": 0.4977457168620379,
"grad_norm": 1.3452343742881867,
"learning_rate": 5.284780950519892e-06,
"loss": 1.2587,
"step": 552
},
{
"epoch": 0.49864743011722273,
"grad_norm": 1.3127844704746279,
"learning_rate": 5.270191416415163e-06,
"loss": 1.2499,
"step": 553
},
{
"epoch": 0.4995491433724076,
"grad_norm": 1.3200976279887406,
"learning_rate": 5.255599574744836e-06,
"loss": 1.2732,
"step": 554
},
{
"epoch": 0.5004508566275925,
"grad_norm": 1.337854355373663,
"learning_rate": 5.241005550130308e-06,
"loss": 1.2649,
"step": 555
},
{
"epoch": 0.5013525698827773,
"grad_norm": 1.314236194849463,
"learning_rate": 5.2264094672116195e-06,
"loss": 1.2482,
"step": 556
},
{
"epoch": 0.5022542831379622,
"grad_norm": 1.2662421270865347,
"learning_rate": 5.211811450646392e-06,
"loss": 1.2555,
"step": 557
},
{
"epoch": 0.503155996393147,
"grad_norm": 1.3444708300857615,
"learning_rate": 5.197211625108755e-06,
"loss": 1.2855,
"step": 558
},
{
"epoch": 0.5040577096483319,
"grad_norm": 1.3117429792391575,
"learning_rate": 5.182610115288296e-06,
"loss": 1.2323,
"step": 559
},
{
"epoch": 0.5049594229035167,
"grad_norm": 1.3763818285742713,
"learning_rate": 5.16800704588898e-06,
"loss": 1.2401,
"step": 560
},
{
"epoch": 0.5058611361587015,
"grad_norm": 1.3139874556118811,
"learning_rate": 5.153402541628097e-06,
"loss": 1.2701,
"step": 561
},
{
"epoch": 0.5067628494138864,
"grad_norm": 1.2896440294650282,
"learning_rate": 5.138796727235188e-06,
"loss": 1.242,
"step": 562
},
{
"epoch": 0.5076645626690712,
"grad_norm": 1.3289435111036993,
"learning_rate": 5.124189727450985e-06,
"loss": 1.2483,
"step": 563
},
{
"epoch": 0.5085662759242561,
"grad_norm": 1.3883596860696592,
"learning_rate": 5.109581667026341e-06,
"loss": 1.2503,
"step": 564
},
{
"epoch": 0.5094679891794409,
"grad_norm": 1.3239009545532878,
"learning_rate": 5.094972670721171e-06,
"loss": 1.2401,
"step": 565
},
{
"epoch": 0.5103697024346258,
"grad_norm": 1.295778294127707,
"learning_rate": 5.080362863303379e-06,
"loss": 1.2423,
"step": 566
},
{
"epoch": 0.5112714156898106,
"grad_norm": 1.369226214795755,
"learning_rate": 5.065752369547803e-06,
"loss": 1.2225,
"step": 567
},
{
"epoch": 0.5121731289449954,
"grad_norm": 1.4045782383828402,
"learning_rate": 5.051141314235135e-06,
"loss": 1.255,
"step": 568
},
{
"epoch": 0.5130748422001803,
"grad_norm": 1.3943603790077395,
"learning_rate": 5.036529822150865e-06,
"loss": 1.2561,
"step": 569
},
{
"epoch": 0.5139765554553652,
"grad_norm": 1.3174744572295207,
"learning_rate": 5.021918018084217e-06,
"loss": 1.2606,
"step": 570
},
{
"epoch": 0.5148782687105501,
"grad_norm": 1.339569973719635,
"learning_rate": 5.007306026827076e-06,
"loss": 1.204,
"step": 571
},
{
"epoch": 0.5157799819657349,
"grad_norm": 1.3320754147517606,
"learning_rate": 4.992693973172925e-06,
"loss": 1.2509,
"step": 572
},
{
"epoch": 0.5166816952209198,
"grad_norm": 1.3051524852266552,
"learning_rate": 4.978081981915784e-06,
"loss": 1.2567,
"step": 573
},
{
"epoch": 0.5175834084761046,
"grad_norm": 1.3339398715435005,
"learning_rate": 4.963470177849135e-06,
"loss": 1.2611,
"step": 574
},
{
"epoch": 0.5184851217312895,
"grad_norm": 1.3586447809755204,
"learning_rate": 4.948858685764867e-06,
"loss": 1.2572,
"step": 575
},
{
"epoch": 0.5193868349864743,
"grad_norm": 1.3003847078321877,
"learning_rate": 4.934247630452198e-06,
"loss": 1.2395,
"step": 576
},
{
"epoch": 0.5202885482416592,
"grad_norm": 1.2589068276430717,
"learning_rate": 4.919637136696621e-06,
"loss": 1.2392,
"step": 577
},
{
"epoch": 0.521190261496844,
"grad_norm": 1.340867722878211,
"learning_rate": 4.905027329278831e-06,
"loss": 1.2476,
"step": 578
},
{
"epoch": 0.5220919747520288,
"grad_norm": 1.3726498349859046,
"learning_rate": 4.89041833297366e-06,
"loss": 1.2498,
"step": 579
},
{
"epoch": 0.5229936880072137,
"grad_norm": 1.3354475932049095,
"learning_rate": 4.875810272549017e-06,
"loss": 1.2521,
"step": 580
},
{
"epoch": 0.5238954012623985,
"grad_norm": 1.3329517177669807,
"learning_rate": 4.861203272764813e-06,
"loss": 1.269,
"step": 581
},
{
"epoch": 0.5247971145175834,
"grad_norm": 1.3565466102588846,
"learning_rate": 4.846597458371905e-06,
"loss": 1.2419,
"step": 582
},
{
"epoch": 0.5256988277727682,
"grad_norm": 1.4078312898982641,
"learning_rate": 4.831992954111022e-06,
"loss": 1.2509,
"step": 583
},
{
"epoch": 0.5266005410279531,
"grad_norm": 1.3295601064574625,
"learning_rate": 4.817389884711706e-06,
"loss": 1.2644,
"step": 584
},
{
"epoch": 0.527502254283138,
"grad_norm": 1.321165085338158,
"learning_rate": 4.802788374891246e-06,
"loss": 1.2556,
"step": 585
},
{
"epoch": 0.5284039675383229,
"grad_norm": 1.3743099444135773,
"learning_rate": 4.788188549353611e-06,
"loss": 1.2417,
"step": 586
},
{
"epoch": 0.5293056807935077,
"grad_norm": 1.332517658766984,
"learning_rate": 4.773590532788382e-06,
"loss": 1.2539,
"step": 587
},
{
"epoch": 0.5302073940486925,
"grad_norm": 1.2694313233555439,
"learning_rate": 4.758994449869693e-06,
"loss": 1.2736,
"step": 588
},
{
"epoch": 0.5311091073038774,
"grad_norm": 1.2742141092043229,
"learning_rate": 4.744400425255165e-06,
"loss": 1.2686,
"step": 589
},
{
"epoch": 0.5320108205590622,
"grad_norm": 1.3252301704980207,
"learning_rate": 4.7298085835848385e-06,
"loss": 1.2448,
"step": 590
},
{
"epoch": 0.5329125338142471,
"grad_norm": 1.3205652503863317,
"learning_rate": 4.71521904948011e-06,
"loss": 1.2445,
"step": 591
},
{
"epoch": 0.5338142470694319,
"grad_norm": 1.3075901906712277,
"learning_rate": 4.700631947542667e-06,
"loss": 1.2344,
"step": 592
},
{
"epoch": 0.5347159603246168,
"grad_norm": 1.2737322650247187,
"learning_rate": 4.686047402353433e-06,
"loss": 1.2524,
"step": 593
},
{
"epoch": 0.5356176735798016,
"grad_norm": 1.2756661202797257,
"learning_rate": 4.671465538471487e-06,
"loss": 1.2503,
"step": 594
},
{
"epoch": 0.5365193868349865,
"grad_norm": 1.325658160187221,
"learning_rate": 4.6568864804330095e-06,
"loss": 1.2465,
"step": 595
},
{
"epoch": 0.5374211000901713,
"grad_norm": 1.28854606631265,
"learning_rate": 4.64231035275022e-06,
"loss": 1.2605,
"step": 596
},
{
"epoch": 0.5383228133453561,
"grad_norm": 1.35727097357451,
"learning_rate": 4.627737279910311e-06,
"loss": 1.2563,
"step": 597
},
{
"epoch": 0.539224526600541,
"grad_norm": 1.3307866660108574,
"learning_rate": 4.613167386374386e-06,
"loss": 1.2746,
"step": 598
},
{
"epoch": 0.5401262398557258,
"grad_norm": 1.2981602983236322,
"learning_rate": 4.598600796576395e-06,
"loss": 1.2606,
"step": 599
},
{
"epoch": 0.5410279531109107,
"grad_norm": 1.2860239646762985,
"learning_rate": 4.58403763492207e-06,
"loss": 1.2577,
"step": 600
},
{
"epoch": 0.5419296663660956,
"grad_norm": 1.3065289252471795,
"learning_rate": 4.569478025787869e-06,
"loss": 1.2276,
"step": 601
},
{
"epoch": 0.5428313796212805,
"grad_norm": 1.2932249228962214,
"learning_rate": 4.554922093519906e-06,
"loss": 1.2472,
"step": 602
},
{
"epoch": 0.5437330928764653,
"grad_norm": 1.2565236651934977,
"learning_rate": 4.5403699624328955e-06,
"loss": 1.2303,
"step": 603
},
{
"epoch": 0.5446348061316502,
"grad_norm": 1.248067022227819,
"learning_rate": 4.525821756809088e-06,
"loss": 1.2453,
"step": 604
},
{
"epoch": 0.545536519386835,
"grad_norm": 1.3002341150666157,
"learning_rate": 4.511277600897205e-06,
"loss": 1.2157,
"step": 605
},
{
"epoch": 0.5464382326420198,
"grad_norm": 1.4187715891618866,
"learning_rate": 4.496737618911388e-06,
"loss": 1.2559,
"step": 606
},
{
"epoch": 0.5473399458972047,
"grad_norm": 1.3027783100524892,
"learning_rate": 4.482201935030126e-06,
"loss": 1.2335,
"step": 607
},
{
"epoch": 0.5482416591523895,
"grad_norm": 1.305987240699055,
"learning_rate": 4.467670673395202e-06,
"loss": 1.2561,
"step": 608
},
{
"epoch": 0.5491433724075744,
"grad_norm": 1.28216707654573,
"learning_rate": 4.4531439581106295e-06,
"loss": 1.2195,
"step": 609
},
{
"epoch": 0.5500450856627592,
"grad_norm": 1.300031525314949,
"learning_rate": 4.438621913241593e-06,
"loss": 1.2583,
"step": 610
},
{
"epoch": 0.5509467989179441,
"grad_norm": 1.3321215317104576,
"learning_rate": 4.424104662813396e-06,
"loss": 1.2331,
"step": 611
},
{
"epoch": 0.5518485121731289,
"grad_norm": 1.2888029553989442,
"learning_rate": 4.409592330810389e-06,
"loss": 1.2238,
"step": 612
},
{
"epoch": 0.5527502254283138,
"grad_norm": 1.2815847068063542,
"learning_rate": 4.3950850411749164e-06,
"loss": 1.2204,
"step": 613
},
{
"epoch": 0.5536519386834986,
"grad_norm": 1.325255076780753,
"learning_rate": 4.38058291780626e-06,
"loss": 1.2165,
"step": 614
},
{
"epoch": 0.5545536519386834,
"grad_norm": 1.2735817359390165,
"learning_rate": 4.366086084559582e-06,
"loss": 1.2599,
"step": 615
},
{
"epoch": 0.5554553651938684,
"grad_norm": 1.2850880285296393,
"learning_rate": 4.351594665244861e-06,
"loss": 1.2474,
"step": 616
},
{
"epoch": 0.5563570784490532,
"grad_norm": 1.321852801332939,
"learning_rate": 4.337108783625837e-06,
"loss": 1.2335,
"step": 617
},
{
"epoch": 0.5572587917042381,
"grad_norm": 1.29957738737894,
"learning_rate": 4.322628563418958e-06,
"loss": 1.2347,
"step": 618
},
{
"epoch": 0.5581605049594229,
"grad_norm": 1.2437967051806695,
"learning_rate": 4.308154128292318e-06,
"loss": 1.2319,
"step": 619
},
{
"epoch": 0.5590622182146078,
"grad_norm": 1.323952254471239,
"learning_rate": 4.29368560186461e-06,
"loss": 1.2393,
"step": 620
},
{
"epoch": 0.5599639314697926,
"grad_norm": 1.3364243863997782,
"learning_rate": 4.279223107704058e-06,
"loss": 1.2353,
"step": 621
},
{
"epoch": 0.5608656447249775,
"grad_norm": 1.223403461559393,
"learning_rate": 4.264766769327367e-06,
"loss": 1.2218,
"step": 622
},
{
"epoch": 0.5617673579801623,
"grad_norm": 1.3309304196344736,
"learning_rate": 4.2503167101986695e-06,
"loss": 1.2183,
"step": 623
},
{
"epoch": 0.5626690712353472,
"grad_norm": 1.3334404043758776,
"learning_rate": 4.235873053728475e-06,
"loss": 1.2517,
"step": 624
},
{
"epoch": 0.563570784490532,
"grad_norm": 1.2850759662357756,
"learning_rate": 4.221435923272606e-06,
"loss": 1.2495,
"step": 625
},
{
"epoch": 0.5644724977457168,
"grad_norm": 1.3029858639642806,
"learning_rate": 4.207005442131151e-06,
"loss": 1.2593,
"step": 626
},
{
"epoch": 0.5653742110009017,
"grad_norm": 1.2667764626567348,
"learning_rate": 4.1925817335474095e-06,
"loss": 1.2248,
"step": 627
},
{
"epoch": 0.5662759242560865,
"grad_norm": 1.2666696474712575,
"learning_rate": 4.17816492070684e-06,
"loss": 1.2328,
"step": 628
},
{
"epoch": 0.5671776375112714,
"grad_norm": 1.3025168642655727,
"learning_rate": 4.163755126736011e-06,
"loss": 1.243,
"step": 629
},
{
"epoch": 0.5680793507664562,
"grad_norm": 1.2526066400333988,
"learning_rate": 4.149352474701545e-06,
"loss": 1.2673,
"step": 630
},
{
"epoch": 0.5689810640216412,
"grad_norm": 1.3106005532588625,
"learning_rate": 4.134957087609065e-06,
"loss": 1.2461,
"step": 631
},
{
"epoch": 0.569882777276826,
"grad_norm": 1.345734152807863,
"learning_rate": 4.1205690884021506e-06,
"loss": 1.2622,
"step": 632
},
{
"epoch": 0.5707844905320109,
"grad_norm": 1.2847763091271833,
"learning_rate": 4.10618859996129e-06,
"loss": 1.2491,
"step": 633
},
{
"epoch": 0.5716862037871957,
"grad_norm": 1.3398059744530983,
"learning_rate": 4.091815745102818e-06,
"loss": 1.2341,
"step": 634
},
{
"epoch": 0.5725879170423805,
"grad_norm": 1.279245025275653,
"learning_rate": 4.077450646577881e-06,
"loss": 1.2276,
"step": 635
},
{
"epoch": 0.5734896302975654,
"grad_norm": 1.31103306359405,
"learning_rate": 4.063093427071376e-06,
"loss": 1.2622,
"step": 636
},
{
"epoch": 0.5743913435527502,
"grad_norm": 1.2869047741613928,
"learning_rate": 4.048744209200918e-06,
"loss": 1.2526,
"step": 637
},
{
"epoch": 0.5752930568079351,
"grad_norm": 1.262888543516136,
"learning_rate": 4.034403115515778e-06,
"loss": 1.2447,
"step": 638
},
{
"epoch": 0.5761947700631199,
"grad_norm": 1.318138827623911,
"learning_rate": 4.020070268495844e-06,
"loss": 1.2477,
"step": 639
},
{
"epoch": 0.5770964833183048,
"grad_norm": 1.3000660382986018,
"learning_rate": 4.005745790550572e-06,
"loss": 1.2348,
"step": 640
},
{
"epoch": 0.5779981965734896,
"grad_norm": 1.3109955465621879,
"learning_rate": 3.991429804017944e-06,
"loss": 1.2437,
"step": 641
},
{
"epoch": 0.5788999098286745,
"grad_norm": 1.3230840693922976,
"learning_rate": 3.9771224311634225e-06,
"loss": 1.2466,
"step": 642
},
{
"epoch": 0.5798016230838593,
"grad_norm": 1.2911053004123727,
"learning_rate": 3.962823794178902e-06,
"loss": 1.2205,
"step": 643
},
{
"epoch": 0.5807033363390441,
"grad_norm": 1.29394302161919,
"learning_rate": 3.948534015181671e-06,
"loss": 1.2436,
"step": 644
},
{
"epoch": 0.581605049594229,
"grad_norm": 1.338827552196303,
"learning_rate": 3.93425321621336e-06,
"loss": 1.2487,
"step": 645
},
{
"epoch": 0.5825067628494139,
"grad_norm": 1.2744443449332064,
"learning_rate": 3.919981519238919e-06,
"loss": 1.2182,
"step": 646
},
{
"epoch": 0.5834084761045988,
"grad_norm": 1.3142705170968756,
"learning_rate": 3.905719046145551e-06,
"loss": 1.2259,
"step": 647
},
{
"epoch": 0.5843101893597836,
"grad_norm": 1.3274687859286416,
"learning_rate": 3.891465918741685e-06,
"loss": 1.2403,
"step": 648
},
{
"epoch": 0.5852119026149685,
"grad_norm": 1.327857493053994,
"learning_rate": 3.8772222587559345e-06,
"loss": 1.2574,
"step": 649
},
{
"epoch": 0.5861136158701533,
"grad_norm": 1.3271105757469566,
"learning_rate": 3.862988187836057e-06,
"loss": 1.2588,
"step": 650
},
{
"epoch": 0.5870153291253382,
"grad_norm": 1.3154262472743066,
"learning_rate": 3.848763827547915e-06,
"loss": 1.2378,
"step": 651
},
{
"epoch": 0.587917042380523,
"grad_norm": 1.3272260190880967,
"learning_rate": 3.834549299374437e-06,
"loss": 1.2258,
"step": 652
},
{
"epoch": 0.5888187556357078,
"grad_norm": 1.2955291861582168,
"learning_rate": 3.8203447247145796e-06,
"loss": 1.249,
"step": 653
},
{
"epoch": 0.5897204688908927,
"grad_norm": 1.2960283704822624,
"learning_rate": 3.80615022488229e-06,
"loss": 1.2142,
"step": 654
},
{
"epoch": 0.5906221821460775,
"grad_norm": 1.2663491385871244,
"learning_rate": 3.7919659211054783e-06,
"loss": 1.2421,
"step": 655
},
{
"epoch": 0.5915238954012624,
"grad_norm": 1.256608347450423,
"learning_rate": 3.7777919345249675e-06,
"loss": 1.2287,
"step": 656
},
{
"epoch": 0.5924256086564472,
"grad_norm": 1.3082636395597274,
"learning_rate": 3.763628386193471e-06,
"loss": 1.2392,
"step": 657
},
{
"epoch": 0.5933273219116321,
"grad_norm": 1.2981758681506774,
"learning_rate": 3.7494753970745536e-06,
"loss": 1.2352,
"step": 658
},
{
"epoch": 0.5942290351668169,
"grad_norm": 1.3346817481845517,
"learning_rate": 3.7353330880415963e-06,
"loss": 1.215,
"step": 659
},
{
"epoch": 0.5951307484220018,
"grad_norm": 1.3310172482033298,
"learning_rate": 3.721201579876775e-06,
"loss": 1.2443,
"step": 660
},
{
"epoch": 0.5960324616771867,
"grad_norm": 1.308881273250555,
"learning_rate": 3.7070809932700134e-06,
"loss": 1.2274,
"step": 661
},
{
"epoch": 0.5969341749323716,
"grad_norm": 1.2636903677769276,
"learning_rate": 3.6929714488179617e-06,
"loss": 1.243,
"step": 662
},
{
"epoch": 0.5978358881875564,
"grad_norm": 1.3444767242506068,
"learning_rate": 3.6788730670229646e-06,
"loss": 1.2254,
"step": 663
},
{
"epoch": 0.5987376014427412,
"grad_norm": 1.3019319337418664,
"learning_rate": 3.664785968292036e-06,
"loss": 1.2551,
"step": 664
},
{
"epoch": 0.5996393146979261,
"grad_norm": 1.3364310515081759,
"learning_rate": 3.6507102729358224e-06,
"loss": 1.2375,
"step": 665
},
{
"epoch": 0.6005410279531109,
"grad_norm": 1.303553692706673,
"learning_rate": 3.6366461011675807e-06,
"loss": 1.2352,
"step": 666
},
{
"epoch": 0.6014427412082958,
"grad_norm": 1.3241505676835355,
"learning_rate": 3.622593573102153e-06,
"loss": 1.2358,
"step": 667
},
{
"epoch": 0.6023444544634806,
"grad_norm": 1.3015882655381004,
"learning_rate": 3.608552808754935e-06,
"loss": 1.2414,
"step": 668
},
{
"epoch": 0.6032461677186655,
"grad_norm": 1.3187048120287344,
"learning_rate": 3.5945239280408596e-06,
"loss": 1.2241,
"step": 669
},
{
"epoch": 0.6041478809738503,
"grad_norm": 1.2909937470324393,
"learning_rate": 3.580507050773363e-06,
"loss": 1.2344,
"step": 670
},
{
"epoch": 0.6050495942290351,
"grad_norm": 1.312803753965677,
"learning_rate": 3.5665022966633678e-06,
"loss": 1.2082,
"step": 671
},
{
"epoch": 0.60595130748422,
"grad_norm": 1.3273407315438335,
"learning_rate": 3.552509785318258e-06,
"loss": 1.2578,
"step": 672
},
{
"epoch": 0.6068530207394048,
"grad_norm": 1.2933303173914894,
"learning_rate": 3.538529636240863e-06,
"loss": 1.23,
"step": 673
},
{
"epoch": 0.6077547339945897,
"grad_norm": 1.2561311990765511,
"learning_rate": 3.5245619688284277e-06,
"loss": 1.22,
"step": 674
},
{
"epoch": 0.6086564472497745,
"grad_norm": 1.2956136355818522,
"learning_rate": 3.510606902371598e-06,
"loss": 1.2268,
"step": 675
},
{
"epoch": 0.6095581605049594,
"grad_norm": 1.2993861533198938,
"learning_rate": 3.496664556053401e-06,
"loss": 1.2594,
"step": 676
},
{
"epoch": 0.6104598737601443,
"grad_norm": 1.256934798888675,
"learning_rate": 3.4827350489482324e-06,
"loss": 1.2333,
"step": 677
},
{
"epoch": 0.6113615870153292,
"grad_norm": 1.2543400892427217,
"learning_rate": 3.4688185000208297e-06,
"loss": 1.228,
"step": 678
},
{
"epoch": 0.612263300270514,
"grad_norm": 1.3103327891219767,
"learning_rate": 3.4549150281252635e-06,
"loss": 1.2426,
"step": 679
},
{
"epoch": 0.6131650135256989,
"grad_norm": 1.3157993375630526,
"learning_rate": 3.441024752003919e-06,
"loss": 1.2386,
"step": 680
},
{
"epoch": 0.6140667267808837,
"grad_norm": 1.3538482564231207,
"learning_rate": 3.4271477902864836e-06,
"loss": 1.2216,
"step": 681
},
{
"epoch": 0.6149684400360685,
"grad_norm": 1.269784948028544,
"learning_rate": 3.413284261488935e-06,
"loss": 1.2162,
"step": 682
},
{
"epoch": 0.6158701532912534,
"grad_norm": 1.2704213319719941,
"learning_rate": 3.399434284012525e-06,
"loss": 1.2372,
"step": 683
},
{
"epoch": 0.6167718665464382,
"grad_norm": 1.383306427504306,
"learning_rate": 3.3855979761427705e-06,
"loss": 1.2345,
"step": 684
},
{
"epoch": 0.6176735798016231,
"grad_norm": 1.268997641633319,
"learning_rate": 3.3717754560484426e-06,
"loss": 1.2465,
"step": 685
},
{
"epoch": 0.6185752930568079,
"grad_norm": 1.27764409089746,
"learning_rate": 3.3579668417805643e-06,
"loss": 1.2301,
"step": 686
},
{
"epoch": 0.6194770063119928,
"grad_norm": 1.2627803061282448,
"learning_rate": 3.3441722512713893e-06,
"loss": 1.2109,
"step": 687
},
{
"epoch": 0.6203787195671776,
"grad_norm": 1.2713892796187032,
"learning_rate": 3.3303918023334024e-06,
"loss": 1.2354,
"step": 688
},
{
"epoch": 0.6212804328223624,
"grad_norm": 1.3364846439846891,
"learning_rate": 3.316625612658315e-06,
"loss": 1.2017,
"step": 689
},
{
"epoch": 0.6221821460775473,
"grad_norm": 1.2827748136981727,
"learning_rate": 3.302873799816054e-06,
"loss": 1.2033,
"step": 690
},
{
"epoch": 0.6230838593327321,
"grad_norm": 1.307953964844232,
"learning_rate": 3.2891364812537686e-06,
"loss": 1.2401,
"step": 691
},
{
"epoch": 0.6239855725879171,
"grad_norm": 1.3259218603144716,
"learning_rate": 3.2754137742948113e-06,
"loss": 1.2352,
"step": 692
},
{
"epoch": 0.6248872858431019,
"grad_norm": 1.229306275079686,
"learning_rate": 3.2617057961377486e-06,
"loss": 1.2558,
"step": 693
},
{
"epoch": 0.6257889990982868,
"grad_norm": 1.3009567818281342,
"learning_rate": 3.2480126638553533e-06,
"loss": 1.2514,
"step": 694
},
{
"epoch": 0.6266907123534716,
"grad_norm": 1.27577455658036,
"learning_rate": 3.234334494393613e-06,
"loss": 1.2358,
"step": 695
},
{
"epoch": 0.6275924256086565,
"grad_norm": 1.221813297508922,
"learning_rate": 3.220671404570719e-06,
"loss": 1.238,
"step": 696
},
{
"epoch": 0.6284941388638413,
"grad_norm": 1.2637233453633625,
"learning_rate": 3.207023511076079e-06,
"loss": 1.2434,
"step": 697
},
{
"epoch": 0.6293958521190262,
"grad_norm": 1.291554646733566,
"learning_rate": 3.1933909304693144e-06,
"loss": 1.2154,
"step": 698
},
{
"epoch": 0.630297565374211,
"grad_norm": 1.3178671426373603,
"learning_rate": 3.1797737791792672e-06,
"loss": 1.2352,
"step": 699
},
{
"epoch": 0.6311992786293958,
"grad_norm": 1.2872125571122581,
"learning_rate": 3.1661721735030105e-06,
"loss": 1.2354,
"step": 700
},
{
"epoch": 0.6321009918845807,
"grad_norm": 1.313884831911044,
"learning_rate": 3.1525862296048446e-06,
"loss": 1.2376,
"step": 701
},
{
"epoch": 0.6330027051397655,
"grad_norm": 1.274735255975163,
"learning_rate": 3.1390160635153123e-06,
"loss": 1.2294,
"step": 702
},
{
"epoch": 0.6339044183949504,
"grad_norm": 1.2614372405167664,
"learning_rate": 3.125461791130204e-06,
"loss": 1.2428,
"step": 703
},
{
"epoch": 0.6348061316501352,
"grad_norm": 1.272361361972863,
"learning_rate": 3.111923528209577e-06,
"loss": 1.2573,
"step": 704
},
{
"epoch": 0.6357078449053201,
"grad_norm": 1.3029565371645733,
"learning_rate": 3.098401390376755e-06,
"loss": 1.2271,
"step": 705
},
{
"epoch": 0.6366095581605049,
"grad_norm": 1.2904996939383162,
"learning_rate": 3.0848954931173437e-06,
"loss": 1.2249,
"step": 706
},
{
"epoch": 0.6375112714156899,
"grad_norm": 1.2898690058485842,
"learning_rate": 3.07140595177825e-06,
"loss": 1.2266,
"step": 707
},
{
"epoch": 0.6384129846708747,
"grad_norm": 1.3509643247178318,
"learning_rate": 3.0579328815666936e-06,
"loss": 1.2469,
"step": 708
},
{
"epoch": 0.6393146979260595,
"grad_norm": 1.2971311382634418,
"learning_rate": 3.044476397549221e-06,
"loss": 1.2222,
"step": 709
},
{
"epoch": 0.6402164111812444,
"grad_norm": 1.3074374357170047,
"learning_rate": 3.031036614650724e-06,
"loss": 1.2324,
"step": 710
},
{
"epoch": 0.6411181244364292,
"grad_norm": 1.320854679826083,
"learning_rate": 3.017613647653461e-06,
"loss": 1.2454,
"step": 711
},
{
"epoch": 0.6420198376916141,
"grad_norm": 1.2893653611762816,
"learning_rate": 3.0042076111960718e-06,
"loss": 1.2575,
"step": 712
},
{
"epoch": 0.6429215509467989,
"grad_norm": 1.2828409194239083,
"learning_rate": 2.9908186197726043e-06,
"loss": 1.2254,
"step": 713
},
{
"epoch": 0.6438232642019838,
"grad_norm": 1.258821924861263,
"learning_rate": 2.977446787731532e-06,
"loss": 1.2415,
"step": 714
},
{
"epoch": 0.6447249774571686,
"grad_norm": 1.3574129132624322,
"learning_rate": 2.9640922292747785e-06,
"loss": 1.2179,
"step": 715
},
{
"epoch": 0.6456266907123535,
"grad_norm": 1.3018397262453858,
"learning_rate": 2.9507550584567413e-06,
"loss": 1.2359,
"step": 716
},
{
"epoch": 0.6465284039675383,
"grad_norm": 1.3114784435553961,
"learning_rate": 2.937435389183324e-06,
"loss": 1.228,
"step": 717
},
{
"epoch": 0.6474301172227231,
"grad_norm": 1.2959756988384548,
"learning_rate": 2.9241333352109535e-06,
"loss": 1.2086,
"step": 718
},
{
"epoch": 0.648331830477908,
"grad_norm": 1.2799077133229382,
"learning_rate": 2.910849010145617e-06,
"loss": 1.2168,
"step": 719
},
{
"epoch": 0.6492335437330928,
"grad_norm": 1.2829633913200977,
"learning_rate": 2.897582527441883e-06,
"loss": 1.2191,
"step": 720
},
{
"epoch": 0.6501352569882777,
"grad_norm": 1.282434130038559,
"learning_rate": 2.8843340004019427e-06,
"loss": 1.2351,
"step": 721
},
{
"epoch": 0.6510369702434626,
"grad_norm": 1.3105069360123125,
"learning_rate": 2.871103542174637e-06,
"loss": 1.2283,
"step": 722
},
{
"epoch": 0.6519386834986475,
"grad_norm": 1.3345408753156254,
"learning_rate": 2.857891265754489e-06,
"loss": 1.234,
"step": 723
},
{
"epoch": 0.6528403967538323,
"grad_norm": 1.3442654702676227,
"learning_rate": 2.8446972839807384e-06,
"loss": 1.2216,
"step": 724
},
{
"epoch": 0.6537421100090172,
"grad_norm": 1.3564801233118708,
"learning_rate": 2.831521709536382e-06,
"loss": 1.2315,
"step": 725
},
{
"epoch": 0.654643823264202,
"grad_norm": 1.2746111775530709,
"learning_rate": 2.818364654947211e-06,
"loss": 1.2405,
"step": 726
},
{
"epoch": 0.6555455365193869,
"grad_norm": 1.2888789023115854,
"learning_rate": 2.8052262325808466e-06,
"loss": 1.1947,
"step": 727
},
{
"epoch": 0.6564472497745717,
"grad_norm": 1.297464906154778,
"learning_rate": 2.7921065546457773e-06,
"loss": 1.222,
"step": 728
},
{
"epoch": 0.6573489630297565,
"grad_norm": 1.3127768408774596,
"learning_rate": 2.779005733190412e-06,
"loss": 1.2199,
"step": 729
},
{
"epoch": 0.6582506762849414,
"grad_norm": 1.3191258871353029,
"learning_rate": 2.7659238801021105e-06,
"loss": 1.2365,
"step": 730
},
{
"epoch": 0.6591523895401262,
"grad_norm": 1.3023738457626162,
"learning_rate": 2.7528611071062366e-06,
"loss": 1.2262,
"step": 731
},
{
"epoch": 0.6600541027953111,
"grad_norm": 1.2807206465971785,
"learning_rate": 2.7398175257652036e-06,
"loss": 1.2256,
"step": 732
},
{
"epoch": 0.6609558160504959,
"grad_norm": 1.2660732605347753,
"learning_rate": 2.7267932474775115e-06,
"loss": 1.192,
"step": 733
},
{
"epoch": 0.6618575293056808,
"grad_norm": 1.3114174256047686,
"learning_rate": 2.7137883834768076e-06,
"loss": 1.2397,
"step": 734
},
{
"epoch": 0.6627592425608656,
"grad_norm": 1.3406088823582483,
"learning_rate": 2.7008030448309318e-06,
"loss": 1.2103,
"step": 735
},
{
"epoch": 0.6636609558160504,
"grad_norm": 1.2614996055747296,
"learning_rate": 2.6878373424409705e-06,
"loss": 1.2365,
"step": 736
},
{
"epoch": 0.6645626690712354,
"grad_norm": 1.2732704037998983,
"learning_rate": 2.674891387040298e-06,
"loss": 1.2243,
"step": 737
},
{
"epoch": 0.6654643823264202,
"grad_norm": 1.3360075379083336,
"learning_rate": 2.66196528919365e-06,
"loss": 1.2478,
"step": 738
},
{
"epoch": 0.6663660955816051,
"grad_norm": 1.3054505998468804,
"learning_rate": 2.649059159296158e-06,
"loss": 1.254,
"step": 739
},
{
"epoch": 0.6672678088367899,
"grad_norm": 1.2942891584243765,
"learning_rate": 2.6361731075724327e-06,
"loss": 1.2153,
"step": 740
},
{
"epoch": 0.6681695220919748,
"grad_norm": 1.3109493986388532,
"learning_rate": 2.6233072440755934e-06,
"loss": 1.2328,
"step": 741
},
{
"epoch": 0.6690712353471596,
"grad_norm": 1.3327195437209476,
"learning_rate": 2.6104616786863507e-06,
"loss": 1.2199,
"step": 742
},
{
"epoch": 0.6699729486023445,
"grad_norm": 1.2821126629709811,
"learning_rate": 2.597636521112053e-06,
"loss": 1.2045,
"step": 743
},
{
"epoch": 0.6708746618575293,
"grad_norm": 1.320388296773102,
"learning_rate": 2.584831880885761e-06,
"loss": 1.2243,
"step": 744
},
{
"epoch": 0.6717763751127142,
"grad_norm": 1.294843994895677,
"learning_rate": 2.572047867365308e-06,
"loss": 1.2069,
"step": 745
},
{
"epoch": 0.672678088367899,
"grad_norm": 1.3064477589046204,
"learning_rate": 2.5592845897323596e-06,
"loss": 1.2158,
"step": 746
},
{
"epoch": 0.6735798016230838,
"grad_norm": 1.3208905098729207,
"learning_rate": 2.5465421569914916e-06,
"loss": 1.2459,
"step": 747
},
{
"epoch": 0.6744815148782687,
"grad_norm": 1.278779451830435,
"learning_rate": 2.5338206779692536e-06,
"loss": 1.2359,
"step": 748
},
{
"epoch": 0.6753832281334535,
"grad_norm": 1.254543313346981,
"learning_rate": 2.5211202613132413e-06,
"loss": 1.1942,
"step": 749
},
{
"epoch": 0.6762849413886384,
"grad_norm": 1.3400421358634278,
"learning_rate": 2.508441015491162e-06,
"loss": 1.2401,
"step": 750
},
{
"epoch": 0.6771866546438232,
"grad_norm": 1.3576541062217489,
"learning_rate": 2.4957830487899224e-06,
"loss": 1.2319,
"step": 751
},
{
"epoch": 0.6780883678990082,
"grad_norm": 1.3010818454018325,
"learning_rate": 2.4831464693146845e-06,
"loss": 1.2321,
"step": 752
},
{
"epoch": 0.678990081154193,
"grad_norm": 1.270217200357556,
"learning_rate": 2.4705313849879663e-06,
"loss": 1.2109,
"step": 753
},
{
"epoch": 0.6798917944093779,
"grad_norm": 1.2871254174671725,
"learning_rate": 2.457937903548695e-06,
"loss": 1.2403,
"step": 754
},
{
"epoch": 0.6807935076645627,
"grad_norm": 1.267863712970668,
"learning_rate": 2.4453661325513065e-06,
"loss": 1.2247,
"step": 755
},
{
"epoch": 0.6816952209197475,
"grad_norm": 1.3253294626486456,
"learning_rate": 2.4328161793648126e-06,
"loss": 1.2333,
"step": 756
},
{
"epoch": 0.6825969341749324,
"grad_norm": 1.2949613306417762,
"learning_rate": 2.420288151171895e-06,
"loss": 1.2199,
"step": 757
},
{
"epoch": 0.6834986474301172,
"grad_norm": 1.2894731802418073,
"learning_rate": 2.407782154967986e-06,
"loss": 1.1996,
"step": 758
},
{
"epoch": 0.6844003606853021,
"grad_norm": 1.3266207348050199,
"learning_rate": 2.3952982975603494e-06,
"loss": 1.2265,
"step": 759
},
{
"epoch": 0.6853020739404869,
"grad_norm": 1.3404362571181616,
"learning_rate": 2.382836685567178e-06,
"loss": 1.2187,
"step": 760
},
{
"epoch": 0.6862037871956718,
"grad_norm": 1.2776517904799676,
"learning_rate": 2.3703974254166704e-06,
"loss": 1.227,
"step": 761
},
{
"epoch": 0.6871055004508566,
"grad_norm": 1.2838876524156215,
"learning_rate": 2.357980623346143e-06,
"loss": 1.2177,
"step": 762
},
{
"epoch": 0.6880072137060415,
"grad_norm": 1.2784263173497654,
"learning_rate": 2.345586385401094e-06,
"loss": 1.2218,
"step": 763
},
{
"epoch": 0.6889089269612263,
"grad_norm": 1.279430229152187,
"learning_rate": 2.3332148174343257e-06,
"loss": 1.2392,
"step": 764
},
{
"epoch": 0.6898106402164111,
"grad_norm": 1.2520563299524021,
"learning_rate": 2.320866025105016e-06,
"loss": 1.2092,
"step": 765
},
{
"epoch": 0.690712353471596,
"grad_norm": 1.3149467895203844,
"learning_rate": 2.3085401138778414e-06,
"loss": 1.2338,
"step": 766
},
{
"epoch": 0.6916140667267808,
"grad_norm": 1.2916597985967335,
"learning_rate": 2.2962371890220502e-06,
"loss": 1.2229,
"step": 767
},
{
"epoch": 0.6925157799819658,
"grad_norm": 1.3252286984828274,
"learning_rate": 2.283957355610584e-06,
"loss": 1.2095,
"step": 768
},
{
"epoch": 0.6934174932371506,
"grad_norm": 1.3270969648402997,
"learning_rate": 2.2717007185191673e-06,
"loss": 1.2239,
"step": 769
},
{
"epoch": 0.6943192064923355,
"grad_norm": 1.3262507945691961,
"learning_rate": 2.25946738242542e-06,
"loss": 1.221,
"step": 770
},
{
"epoch": 0.6952209197475203,
"grad_norm": 1.2744476798628572,
"learning_rate": 2.247257451807961e-06,
"loss": 1.2095,
"step": 771
},
{
"epoch": 0.6961226330027052,
"grad_norm": 1.257677104351814,
"learning_rate": 2.235071030945509e-06,
"loss": 1.2343,
"step": 772
},
{
"epoch": 0.69702434625789,
"grad_norm": 1.273279317005678,
"learning_rate": 2.2229082239160066e-06,
"loss": 1.2096,
"step": 773
},
{
"epoch": 0.6979260595130748,
"grad_norm": 1.3275403323151511,
"learning_rate": 2.2107691345957133e-06,
"loss": 1.2223,
"step": 774
},
{
"epoch": 0.6988277727682597,
"grad_norm": 1.2994686903488226,
"learning_rate": 2.198653866658339e-06,
"loss": 1.2383,
"step": 775
},
{
"epoch": 0.6997294860234445,
"grad_norm": 1.2946263968131735,
"learning_rate": 2.1865625235741376e-06,
"loss": 1.2316,
"step": 776
},
{
"epoch": 0.7006311992786294,
"grad_norm": 1.3260718116411006,
"learning_rate": 2.1744952086090396e-06,
"loss": 1.1987,
"step": 777
},
{
"epoch": 0.7015329125338142,
"grad_norm": 1.2928907842117559,
"learning_rate": 2.162452024823758e-06,
"loss": 1.2327,
"step": 778
},
{
"epoch": 0.7024346257889991,
"grad_norm": 1.3022114442202848,
"learning_rate": 2.1504330750729185e-06,
"loss": 1.2048,
"step": 779
},
{
"epoch": 0.7033363390441839,
"grad_norm": 1.2680584448769776,
"learning_rate": 2.1384384620041756e-06,
"loss": 1.2022,
"step": 780
},
{
"epoch": 0.7042380522993688,
"grad_norm": 1.2980425468161858,
"learning_rate": 2.1264682880573374e-06,
"loss": 1.2112,
"step": 781
},
{
"epoch": 0.7051397655545536,
"grad_norm": 1.3239977283045519,
"learning_rate": 2.1145226554634845e-06,
"loss": 1.2105,
"step": 782
},
{
"epoch": 0.7060414788097386,
"grad_norm": 1.2758563831905616,
"learning_rate": 2.1026016662441097e-06,
"loss": 1.2347,
"step": 783
},
{
"epoch": 0.7069431920649234,
"grad_norm": 1.2413307004498162,
"learning_rate": 2.0907054222102367e-06,
"loss": 1.2359,
"step": 784
},
{
"epoch": 0.7078449053201082,
"grad_norm": 1.2797262934378604,
"learning_rate": 2.0788340249615506e-06,
"loss": 1.2328,
"step": 785
},
{
"epoch": 0.7087466185752931,
"grad_norm": 1.2921593019754436,
"learning_rate": 2.066987575885539e-06,
"loss": 1.222,
"step": 786
},
{
"epoch": 0.7096483318304779,
"grad_norm": 1.3244519054779904,
"learning_rate": 2.0551661761566104e-06,
"loss": 1.2137,
"step": 787
},
{
"epoch": 0.7105500450856628,
"grad_norm": 1.2995133181578151,
"learning_rate": 2.0433699267352536e-06,
"loss": 1.2238,
"step": 788
},
{
"epoch": 0.7114517583408476,
"grad_norm": 1.3071033235317082,
"learning_rate": 2.0315989283671474e-06,
"loss": 1.199,
"step": 789
},
{
"epoch": 0.7123534715960325,
"grad_norm": 1.2721249987745593,
"learning_rate": 2.0198532815823247e-06,
"loss": 1.1905,
"step": 790
},
{
"epoch": 0.7132551848512173,
"grad_norm": 1.322827431541093,
"learning_rate": 2.0081330866942962e-06,
"loss": 1.2146,
"step": 791
},
{
"epoch": 0.7141568981064021,
"grad_norm": 1.3139334735005088,
"learning_rate": 1.9964384437992055e-06,
"loss": 1.2415,
"step": 792
},
{
"epoch": 0.715058611361587,
"grad_norm": 1.2680789806345396,
"learning_rate": 1.98476945277497e-06,
"loss": 1.2181,
"step": 793
},
{
"epoch": 0.7159603246167718,
"grad_norm": 1.2428341282201179,
"learning_rate": 1.9731262132804275e-06,
"loss": 1.2195,
"step": 794
},
{
"epoch": 0.7168620378719567,
"grad_norm": 1.3004003311062884,
"learning_rate": 1.9615088247544802e-06,
"loss": 1.223,
"step": 795
},
{
"epoch": 0.7177637511271415,
"grad_norm": 1.313334117650514,
"learning_rate": 1.9499173864152566e-06,
"loss": 1.2185,
"step": 796
},
{
"epoch": 0.7186654643823264,
"grad_norm": 1.3238878316428104,
"learning_rate": 1.938351997259258e-06,
"loss": 1.2319,
"step": 797
},
{
"epoch": 0.7195671776375113,
"grad_norm": 1.3043461142181643,
"learning_rate": 1.926812756060508e-06,
"loss": 1.23,
"step": 798
},
{
"epoch": 0.7204688908926962,
"grad_norm": 1.2858820326661842,
"learning_rate": 1.9152997613697184e-06,
"loss": 1.1903,
"step": 799
},
{
"epoch": 0.721370604147881,
"grad_norm": 1.3291400806149936,
"learning_rate": 1.9038131115134401e-06,
"loss": 1.2137,
"step": 800
},
{
"epoch": 0.7222723174030659,
"grad_norm": 1.2976270941930153,
"learning_rate": 1.8923529045932292e-06,
"loss": 1.2037,
"step": 801
},
{
"epoch": 0.7231740306582507,
"grad_norm": 1.280008791018806,
"learning_rate": 1.8809192384848046e-06,
"loss": 1.2346,
"step": 802
},
{
"epoch": 0.7240757439134355,
"grad_norm": 1.2520323037262666,
"learning_rate": 1.8695122108372166e-06,
"loss": 1.2157,
"step": 803
},
{
"epoch": 0.7249774571686204,
"grad_norm": 1.308194040096133,
"learning_rate": 1.8581319190720038e-06,
"loss": 1.2231,
"step": 804
},
{
"epoch": 0.7258791704238052,
"grad_norm": 1.2796364986114368,
"learning_rate": 1.8467784603823736e-06,
"loss": 1.2192,
"step": 805
},
{
"epoch": 0.7267808836789901,
"grad_norm": 1.2872786078348708,
"learning_rate": 1.8354519317323632e-06,
"loss": 1.2399,
"step": 806
},
{
"epoch": 0.7276825969341749,
"grad_norm": 1.2784211379965313,
"learning_rate": 1.824152429856017e-06,
"loss": 1.2403,
"step": 807
},
{
"epoch": 0.7285843101893598,
"grad_norm": 1.310396993012597,
"learning_rate": 1.8128800512565514e-06,
"loss": 1.2277,
"step": 808
},
{
"epoch": 0.7294860234445446,
"grad_norm": 1.2612581875182598,
"learning_rate": 1.8016348922055448e-06,
"loss": 1.2311,
"step": 809
},
{
"epoch": 0.7303877366997295,
"grad_norm": 1.3520209311069702,
"learning_rate": 1.7904170487421002e-06,
"loss": 1.2131,
"step": 810
},
{
"epoch": 0.7312894499549143,
"grad_norm": 1.3254416365883752,
"learning_rate": 1.7792266166720368e-06,
"loss": 1.2083,
"step": 811
},
{
"epoch": 0.7321911632100991,
"grad_norm": 1.308839493950598,
"learning_rate": 1.7680636915670673e-06,
"loss": 1.2397,
"step": 812
},
{
"epoch": 0.7330928764652841,
"grad_norm": 1.2861647653716632,
"learning_rate": 1.7569283687639782e-06,
"loss": 1.2047,
"step": 813
},
{
"epoch": 0.7339945897204689,
"grad_norm": 1.3102171488736987,
"learning_rate": 1.7458207433638225e-06,
"loss": 1.238,
"step": 814
},
{
"epoch": 0.7348963029756538,
"grad_norm": 1.279839582732384,
"learning_rate": 1.7347409102311013e-06,
"loss": 1.2363,
"step": 815
},
{
"epoch": 0.7357980162308386,
"grad_norm": 1.2513051547872285,
"learning_rate": 1.7236889639929604e-06,
"loss": 1.2206,
"step": 816
},
{
"epoch": 0.7366997294860235,
"grad_norm": 1.2571979965399165,
"learning_rate": 1.712664999038372e-06,
"loss": 1.2321,
"step": 817
},
{
"epoch": 0.7376014427412083,
"grad_norm": 1.2789411825150419,
"learning_rate": 1.7016691095173398e-06,
"loss": 1.226,
"step": 818
},
{
"epoch": 0.7385031559963932,
"grad_norm": 1.3081374507526442,
"learning_rate": 1.6907013893400838e-06,
"loss": 1.2483,
"step": 819
},
{
"epoch": 0.739404869251578,
"grad_norm": 1.3048788538202847,
"learning_rate": 1.6797619321762531e-06,
"loss": 1.199,
"step": 820
},
{
"epoch": 0.7403065825067628,
"grad_norm": 1.2811961903485563,
"learning_rate": 1.6688508314541086e-06,
"loss": 1.2262,
"step": 821
},
{
"epoch": 0.7412082957619477,
"grad_norm": 1.3386620335025967,
"learning_rate": 1.6579681803597392e-06,
"loss": 1.2517,
"step": 822
},
{
"epoch": 0.7421100090171325,
"grad_norm": 1.2968806903374512,
"learning_rate": 1.6471140718362538e-06,
"loss": 1.2066,
"step": 823
},
{
"epoch": 0.7430117222723174,
"grad_norm": 1.2752349391240716,
"learning_rate": 1.6362885985830001e-06,
"loss": 1.2239,
"step": 824
},
{
"epoch": 0.7439134355275022,
"grad_norm": 1.3261984374711637,
"learning_rate": 1.6254918530547663e-06,
"loss": 1.1986,
"step": 825
},
{
"epoch": 0.7448151487826871,
"grad_norm": 1.2550706964991916,
"learning_rate": 1.6147239274609865e-06,
"loss": 1.2283,
"step": 826
},
{
"epoch": 0.7457168620378719,
"grad_norm": 1.2924267601556008,
"learning_rate": 1.6039849137649633e-06,
"loss": 1.2284,
"step": 827
},
{
"epoch": 0.7466185752930569,
"grad_norm": 1.2625304550791376,
"learning_rate": 1.593274903683077e-06,
"loss": 1.2056,
"step": 828
},
{
"epoch": 0.7475202885482417,
"grad_norm": 1.2690837771273074,
"learning_rate": 1.5825939886840036e-06,
"loss": 1.2255,
"step": 829
},
{
"epoch": 0.7484220018034266,
"grad_norm": 1.2762767081796036,
"learning_rate": 1.571942259987929e-06,
"loss": 1.2353,
"step": 830
},
{
"epoch": 0.7493237150586114,
"grad_norm": 1.2727759423442815,
"learning_rate": 1.5613198085657804e-06,
"loss": 1.2143,
"step": 831
},
{
"epoch": 0.7502254283137962,
"grad_norm": 1.3036977265961338,
"learning_rate": 1.5507267251384334e-06,
"loss": 1.206,
"step": 832
},
{
"epoch": 0.7511271415689811,
"grad_norm": 1.2740183699658059,
"learning_rate": 1.5401631001759604e-06,
"loss": 1.2408,
"step": 833
},
{
"epoch": 0.7520288548241659,
"grad_norm": 1.2799961121807295,
"learning_rate": 1.5296290238968303e-06,
"loss": 1.2259,
"step": 834
},
{
"epoch": 0.7529305680793508,
"grad_norm": 1.2946371850023939,
"learning_rate": 1.5191245862671627e-06,
"loss": 1.2378,
"step": 835
},
{
"epoch": 0.7538322813345356,
"grad_norm": 1.2736534447504666,
"learning_rate": 1.5086498769999397e-06,
"loss": 1.2069,
"step": 836
},
{
"epoch": 0.7547339945897205,
"grad_norm": 1.2931271877296926,
"learning_rate": 1.4982049855542553e-06,
"loss": 1.2431,
"step": 837
},
{
"epoch": 0.7556357078449053,
"grad_norm": 1.2741459793452181,
"learning_rate": 1.4877900011345442e-06,
"loss": 1.2203,
"step": 838
},
{
"epoch": 0.7565374211000901,
"grad_norm": 1.2558676126546313,
"learning_rate": 1.4774050126898164e-06,
"loss": 1.2137,
"step": 839
},
{
"epoch": 0.757439134355275,
"grad_norm": 1.2745001559561364,
"learning_rate": 1.4670501089129075e-06,
"loss": 1.2066,
"step": 840
},
{
"epoch": 0.7583408476104598,
"grad_norm": 1.2911834530742523,
"learning_rate": 1.4567253782397073e-06,
"loss": 1.2179,
"step": 841
},
{
"epoch": 0.7592425608656447,
"grad_norm": 1.274307360634065,
"learning_rate": 1.4464309088484252e-06,
"loss": 1.2313,
"step": 842
},
{
"epoch": 0.7601442741208295,
"grad_norm": 1.2923011502987385,
"learning_rate": 1.4361667886588116e-06,
"loss": 1.1962,
"step": 843
},
{
"epoch": 0.7610459873760145,
"grad_norm": 1.301882038769701,
"learning_rate": 1.425933105331429e-06,
"loss": 1.2223,
"step": 844
},
{
"epoch": 0.7619477006311993,
"grad_norm": 1.2863546659971987,
"learning_rate": 1.4157299462668872e-06,
"loss": 1.2043,
"step": 845
},
{
"epoch": 0.7628494138863842,
"grad_norm": 1.3053223183086544,
"learning_rate": 1.4055573986051125e-06,
"loss": 1.2321,
"step": 846
},
{
"epoch": 0.763751127141569,
"grad_norm": 1.3159435801199877,
"learning_rate": 1.395415549224587e-06,
"loss": 1.211,
"step": 847
},
{
"epoch": 0.7646528403967539,
"grad_norm": 1.2851907687256028,
"learning_rate": 1.3853044847416208e-06,
"loss": 1.2144,
"step": 848
},
{
"epoch": 0.7655545536519387,
"grad_norm": 1.2799253757664457,
"learning_rate": 1.3752242915095993e-06,
"loss": 1.2162,
"step": 849
},
{
"epoch": 0.7664562669071235,
"grad_norm": 1.2875109268543516,
"learning_rate": 1.3651750556182586e-06,
"loss": 1.2125,
"step": 850
},
{
"epoch": 0.7673579801623084,
"grad_norm": 1.3080584590334174,
"learning_rate": 1.3551568628929434e-06,
"loss": 1.225,
"step": 851
},
{
"epoch": 0.7682596934174932,
"grad_norm": 1.2860096387559667,
"learning_rate": 1.34516979889387e-06,
"loss": 1.2079,
"step": 852
},
{
"epoch": 0.7691614066726781,
"grad_norm": 1.235344160104314,
"learning_rate": 1.3352139489154064e-06,
"loss": 1.2131,
"step": 853
},
{
"epoch": 0.7700631199278629,
"grad_norm": 1.327651942534106,
"learning_rate": 1.3252893979853304e-06,
"loss": 1.2,
"step": 854
},
{
"epoch": 0.7709648331830478,
"grad_norm": 1.2568962321346648,
"learning_rate": 1.315396230864121e-06,
"loss": 1.2499,
"step": 855
},
{
"epoch": 0.7718665464382326,
"grad_norm": 1.3369013029699717,
"learning_rate": 1.3055345320442142e-06,
"loss": 1.2521,
"step": 856
},
{
"epoch": 0.7727682596934174,
"grad_norm": 1.3204783254303465,
"learning_rate": 1.295704385749299e-06,
"loss": 1.2109,
"step": 857
},
{
"epoch": 0.7736699729486023,
"grad_norm": 1.250606473191615,
"learning_rate": 1.2859058759335835e-06,
"loss": 1.2117,
"step": 858
},
{
"epoch": 0.7745716862037872,
"grad_norm": 1.2629471608700342,
"learning_rate": 1.2761390862810907e-06,
"loss": 1.2066,
"step": 859
},
{
"epoch": 0.7754733994589721,
"grad_norm": 1.3409577687424445,
"learning_rate": 1.2664041002049366e-06,
"loss": 1.2136,
"step": 860
},
{
"epoch": 0.7763751127141569,
"grad_norm": 1.2868230389731257,
"learning_rate": 1.256701000846619e-06,
"loss": 1.1905,
"step": 861
},
{
"epoch": 0.7772768259693418,
"grad_norm": 1.2406015334116862,
"learning_rate": 1.2470298710753047e-06,
"loss": 1.2296,
"step": 862
},
{
"epoch": 0.7781785392245266,
"grad_norm": 1.2811225594832343,
"learning_rate": 1.2373907934871292e-06,
"loss": 1.2087,
"step": 863
},
{
"epoch": 0.7790802524797115,
"grad_norm": 1.2846826323381735,
"learning_rate": 1.227783850404487e-06,
"loss": 1.2182,
"step": 864
},
{
"epoch": 0.7799819657348963,
"grad_norm": 1.2634245507700415,
"learning_rate": 1.218209123875323e-06,
"loss": 1.2383,
"step": 865
},
{
"epoch": 0.7808836789900812,
"grad_norm": 1.3138866338710329,
"learning_rate": 1.2086666956724425e-06,
"loss": 1.2467,
"step": 866
},
{
"epoch": 0.781785392245266,
"grad_norm": 1.3186911496412215,
"learning_rate": 1.1991566472928028e-06,
"loss": 1.2289,
"step": 867
},
{
"epoch": 0.7826871055004508,
"grad_norm": 1.2884060019272627,
"learning_rate": 1.1896790599568291e-06,
"loss": 1.2203,
"step": 868
},
{
"epoch": 0.7835888187556357,
"grad_norm": 1.3059275711703233,
"learning_rate": 1.1802340146077045e-06,
"loss": 1.2169,
"step": 869
},
{
"epoch": 0.7844905320108205,
"grad_norm": 1.301415764999824,
"learning_rate": 1.1708215919106963e-06,
"loss": 1.2373,
"step": 870
},
{
"epoch": 0.7853922452660054,
"grad_norm": 1.2923142951378839,
"learning_rate": 1.1614418722524506e-06,
"loss": 1.2093,
"step": 871
},
{
"epoch": 0.7862939585211902,
"grad_norm": 1.3064111928829703,
"learning_rate": 1.1520949357403194e-06,
"loss": 1.2056,
"step": 872
},
{
"epoch": 0.7871956717763751,
"grad_norm": 1.2810652585045075,
"learning_rate": 1.1427808622016683e-06,
"loss": 1.2287,
"step": 873
},
{
"epoch": 0.78809738503156,
"grad_norm": 1.2914490392277977,
"learning_rate": 1.1334997311832003e-06,
"loss": 1.2412,
"step": 874
},
{
"epoch": 0.7889990982867449,
"grad_norm": 1.255451413387033,
"learning_rate": 1.1242516219502663e-06,
"loss": 1.2131,
"step": 875
},
{
"epoch": 0.7899008115419297,
"grad_norm": 1.2556143337911658,
"learning_rate": 1.1150366134862033e-06,
"loss": 1.2126,
"step": 876
},
{
"epoch": 0.7908025247971145,
"grad_norm": 1.3313063769408204,
"learning_rate": 1.105854784491648e-06,
"loss": 1.2468,
"step": 877
},
{
"epoch": 0.7917042380522994,
"grad_norm": 1.298214254858563,
"learning_rate": 1.0967062133838658e-06,
"loss": 1.2137,
"step": 878
},
{
"epoch": 0.7926059513074842,
"grad_norm": 1.2746933883075344,
"learning_rate": 1.0875909782960887e-06,
"loss": 1.2039,
"step": 879
},
{
"epoch": 0.7935076645626691,
"grad_norm": 1.3540595796355972,
"learning_rate": 1.0785091570768386e-06,
"loss": 1.2191,
"step": 880
},
{
"epoch": 0.7944093778178539,
"grad_norm": 1.3563137733418598,
"learning_rate": 1.0694608272892698e-06,
"loss": 1.2376,
"step": 881
},
{
"epoch": 0.7953110910730388,
"grad_norm": 1.2647689876029176,
"learning_rate": 1.0604460662105022e-06,
"loss": 1.1925,
"step": 882
},
{
"epoch": 0.7962128043282236,
"grad_norm": 1.3125086631687228,
"learning_rate": 1.0514649508309642e-06,
"loss": 1.2144,
"step": 883
},
{
"epoch": 0.7971145175834085,
"grad_norm": 1.2897071180116173,
"learning_rate": 1.04251755785373e-06,
"loss": 1.2244,
"step": 884
},
{
"epoch": 0.7980162308385933,
"grad_norm": 1.283139531262602,
"learning_rate": 1.0336039636938716e-06,
"loss": 1.1859,
"step": 885
},
{
"epoch": 0.7989179440937781,
"grad_norm": 1.2765723038043117,
"learning_rate": 1.024724244477801e-06,
"loss": 1.209,
"step": 886
},
{
"epoch": 0.799819657348963,
"grad_norm": 1.29015886657531,
"learning_rate": 1.0158784760426243e-06,
"loss": 1.2101,
"step": 887
},
{
"epoch": 0.8007213706041478,
"grad_norm": 1.2589137070190157,
"learning_rate": 1.0070667339354873e-06,
"loss": 1.207,
"step": 888
},
{
"epoch": 0.8016230838593328,
"grad_norm": 1.277316552734332,
"learning_rate": 9.98289093412938e-07,
"loss": 1.2457,
"step": 889
},
{
"epoch": 0.8025247971145176,
"grad_norm": 1.3008418492196654,
"learning_rate": 9.895456294402778e-07,
"loss": 1.2113,
"step": 890
},
{
"epoch": 0.8034265103697025,
"grad_norm": 1.2807960216818002,
"learning_rate": 9.808364166909256e-07,
"loss": 1.197,
"step": 891
},
{
"epoch": 0.8043282236248873,
"grad_norm": 1.2564207138926697,
"learning_rate": 9.721615295457775e-07,
"loss": 1.1898,
"step": 892
},
{
"epoch": 0.8052299368800722,
"grad_norm": 1.3041160534809693,
"learning_rate": 9.63521042092575e-07,
"loss": 1.2209,
"step": 893
},
{
"epoch": 0.806131650135257,
"grad_norm": 1.294351117986239,
"learning_rate": 9.549150281252633e-07,
"loss": 1.2086,
"step": 894
},
{
"epoch": 0.8070333633904418,
"grad_norm": 1.2894951265659893,
"learning_rate": 9.46343561143373e-07,
"loss": 1.1988,
"step": 895
},
{
"epoch": 0.8079350766456267,
"grad_norm": 1.2872240658238072,
"learning_rate": 9.378067143513858e-07,
"loss": 1.227,
"step": 896
},
{
"epoch": 0.8088367899008115,
"grad_norm": 1.2773245018646944,
"learning_rate": 9.29304560658107e-07,
"loss": 1.2261,
"step": 897
},
{
"epoch": 0.8097385031559964,
"grad_norm": 1.240601055718308,
"learning_rate": 9.20837172676049e-07,
"loss": 1.2217,
"step": 898
},
{
"epoch": 0.8106402164111812,
"grad_norm": 1.321794367808805,
"learning_rate": 9.124046227208083e-07,
"loss": 1.1978,
"step": 899
},
{
"epoch": 0.8115419296663661,
"grad_norm": 1.3149000543559988,
"learning_rate": 9.040069828104475e-07,
"loss": 1.229,
"step": 900
},
{
"epoch": 0.8124436429215509,
"grad_norm": 1.3125901611372035,
"learning_rate": 8.956443246648771e-07,
"loss": 1.2368,
"step": 901
},
{
"epoch": 0.8133453561767358,
"grad_norm": 1.2785850919554989,
"learning_rate": 8.873167197052529e-07,
"loss": 1.2306,
"step": 902
},
{
"epoch": 0.8142470694319206,
"grad_norm": 1.3000250879578916,
"learning_rate": 8.790242390533521e-07,
"loss": 1.225,
"step": 903
},
{
"epoch": 0.8151487826871056,
"grad_norm": 1.2818311674334737,
"learning_rate": 8.707669535309793e-07,
"loss": 1.2047,
"step": 904
},
{
"epoch": 0.8160504959422904,
"grad_norm": 1.271805790254973,
"learning_rate": 8.625449336593522e-07,
"loss": 1.2172,
"step": 905
},
{
"epoch": 0.8169522091974752,
"grad_norm": 1.2761977162628635,
"learning_rate": 8.543582496585063e-07,
"loss": 1.1918,
"step": 906
},
{
"epoch": 0.8178539224526601,
"grad_norm": 1.3410080816109553,
"learning_rate": 8.462069714466858e-07,
"loss": 1.22,
"step": 907
},
{
"epoch": 0.8187556357078449,
"grad_norm": 1.2521910265828438,
"learning_rate": 8.380911686397581e-07,
"loss": 1.2199,
"step": 908
},
{
"epoch": 0.8196573489630298,
"grad_norm": 1.3638833178851848,
"learning_rate": 8.30010910550611e-07,
"loss": 1.2307,
"step": 909
},
{
"epoch": 0.8205590622182146,
"grad_norm": 1.2852248499047008,
"learning_rate": 8.219662661885619e-07,
"loss": 1.2033,
"step": 910
},
{
"epoch": 0.8214607754733995,
"grad_norm": 1.2652628132587298,
"learning_rate": 8.139573042587729e-07,
"loss": 1.2028,
"step": 911
},
{
"epoch": 0.8223624887285843,
"grad_norm": 1.2762952391981852,
"learning_rate": 8.059840931616558e-07,
"loss": 1.1733,
"step": 912
},
{
"epoch": 0.8232642019837692,
"grad_norm": 1.2774076567333978,
"learning_rate": 7.980467009923009e-07,
"loss": 1.2039,
"step": 913
},
{
"epoch": 0.824165915238954,
"grad_norm": 1.3040355830465697,
"learning_rate": 7.901451955398792e-07,
"loss": 1.2161,
"step": 914
},
{
"epoch": 0.8250676284941388,
"grad_norm": 1.3095023386402835,
"learning_rate": 7.822796442870784e-07,
"loss": 1.2345,
"step": 915
},
{
"epoch": 0.8259693417493237,
"grad_norm": 1.28427660003993,
"learning_rate": 7.744501144095135e-07,
"loss": 1.2107,
"step": 916
},
{
"epoch": 0.8268710550045085,
"grad_norm": 1.2878227831037923,
"learning_rate": 7.666566727751645e-07,
"loss": 1.211,
"step": 917
},
{
"epoch": 0.8277727682596934,
"grad_norm": 1.3124410623046319,
"learning_rate": 7.588993859437988e-07,
"loss": 1.2459,
"step": 918
},
{
"epoch": 0.8286744815148782,
"grad_norm": 1.263563579523678,
"learning_rate": 7.511783201664053e-07,
"loss": 1.204,
"step": 919
},
{
"epoch": 0.8295761947700632,
"grad_norm": 1.2767223067970443,
"learning_rate": 7.434935413846245e-07,
"loss": 1.2043,
"step": 920
},
{
"epoch": 0.830477908025248,
"grad_norm": 1.27323540409098,
"learning_rate": 7.35845115230191e-07,
"loss": 1.1902,
"step": 921
},
{
"epoch": 0.8313796212804329,
"grad_norm": 1.2856585107991603,
"learning_rate": 7.282331070243703e-07,
"loss": 1.214,
"step": 922
},
{
"epoch": 0.8322813345356177,
"grad_norm": 1.2777930103434787,
"learning_rate": 7.206575817773992e-07,
"loss": 1.2162,
"step": 923
},
{
"epoch": 0.8331830477908025,
"grad_norm": 1.2695250453108164,
"learning_rate": 7.131186041879357e-07,
"loss": 1.206,
"step": 924
},
{
"epoch": 0.8340847610459874,
"grad_norm": 1.2670670761716276,
"learning_rate": 7.056162386424964e-07,
"loss": 1.199,
"step": 925
},
{
"epoch": 0.8349864743011722,
"grad_norm": 1.2738982457981094,
"learning_rate": 6.981505492149232e-07,
"loss": 1.1969,
"step": 926
},
{
"epoch": 0.8358881875563571,
"grad_norm": 1.264231758299848,
"learning_rate": 6.907215996658174e-07,
"loss": 1.2045,
"step": 927
},
{
"epoch": 0.8367899008115419,
"grad_norm": 1.2954200749690095,
"learning_rate": 6.833294534420093e-07,
"loss": 1.2117,
"step": 928
},
{
"epoch": 0.8376916140667268,
"grad_norm": 1.401407562366026,
"learning_rate": 6.759741736760062e-07,
"loss": 1.2149,
"step": 929
},
{
"epoch": 0.8385933273219116,
"grad_norm": 1.2545881088743782,
"learning_rate": 6.686558231854634e-07,
"loss": 1.1956,
"step": 930
},
{
"epoch": 0.8394950405770965,
"grad_norm": 1.2878052921321332,
"learning_rate": 6.613744644726383e-07,
"loss": 1.2128,
"step": 931
},
{
"epoch": 0.8403967538322813,
"grad_norm": 1.2746377137243443,
"learning_rate": 6.541301597238636e-07,
"loss": 1.2344,
"step": 932
},
{
"epoch": 0.8412984670874661,
"grad_norm": 1.258707795620394,
"learning_rate": 6.469229708090091e-07,
"loss": 1.2212,
"step": 933
},
{
"epoch": 0.842200180342651,
"grad_norm": 1.272285625648656,
"learning_rate": 6.397529592809615e-07,
"loss": 1.2071,
"step": 934
},
{
"epoch": 0.8431018935978359,
"grad_norm": 1.2769280073303368,
"learning_rate": 6.326201863750942e-07,
"loss": 1.2162,
"step": 935
},
{
"epoch": 0.8440036068530208,
"grad_norm": 1.281458182940483,
"learning_rate": 6.255247130087405e-07,
"loss": 1.2103,
"step": 936
},
{
"epoch": 0.8449053201082056,
"grad_norm": 1.2921797567709183,
"learning_rate": 6.184665997806832e-07,
"loss": 1.2108,
"step": 937
},
{
"epoch": 0.8458070333633905,
"grad_norm": 1.2846162043149028,
"learning_rate": 6.114459069706252e-07,
"loss": 1.2147,
"step": 938
},
{
"epoch": 0.8467087466185753,
"grad_norm": 1.310085155671471,
"learning_rate": 6.044626945386894e-07,
"loss": 1.2141,
"step": 939
},
{
"epoch": 0.8476104598737602,
"grad_norm": 1.2712954656833793,
"learning_rate": 5.975170221248894e-07,
"loss": 1.2311,
"step": 940
},
{
"epoch": 0.848512173128945,
"grad_norm": 1.2955924380936459,
"learning_rate": 5.90608949048635e-07,
"loss": 1.2223,
"step": 941
},
{
"epoch": 0.8494138863841298,
"grad_norm": 1.3049752396017495,
"learning_rate": 5.837385343082152e-07,
"loss": 1.2381,
"step": 942
},
{
"epoch": 0.8503155996393147,
"grad_norm": 1.2931102898743785,
"learning_rate": 5.769058365803016e-07,
"loss": 1.2164,
"step": 943
},
{
"epoch": 0.8512173128944995,
"grad_norm": 1.289678632558847,
"learning_rate": 5.701109142194422e-07,
"loss": 1.1922,
"step": 944
},
{
"epoch": 0.8521190261496844,
"grad_norm": 1.3321126371269403,
"learning_rate": 5.633538252575677e-07,
"loss": 1.1958,
"step": 945
},
{
"epoch": 0.8530207394048692,
"grad_norm": 1.2720222221403463,
"learning_rate": 5.566346274034895e-07,
"loss": 1.2272,
"step": 946
},
{
"epoch": 0.8539224526600541,
"grad_norm": 1.23704892854696,
"learning_rate": 5.499533780424138e-07,
"loss": 1.2108,
"step": 947
},
{
"epoch": 0.8548241659152389,
"grad_norm": 1.2541502216827884,
"learning_rate": 5.433101342354474e-07,
"loss": 1.2108,
"step": 948
},
{
"epoch": 0.8557258791704238,
"grad_norm": 1.2885605008569092,
"learning_rate": 5.367049527191093e-07,
"loss": 1.2257,
"step": 949
},
{
"epoch": 0.8566275924256087,
"grad_norm": 1.2927700166567266,
"learning_rate": 5.301378899048514e-07,
"loss": 1.2112,
"step": 950
},
{
"epoch": 0.8575293056807936,
"grad_norm": 1.2839929669287422,
"learning_rate": 5.236090018785705e-07,
"loss": 1.2026,
"step": 951
},
{
"epoch": 0.8584310189359784,
"grad_norm": 1.3136146492814051,
"learning_rate": 5.171183444001337e-07,
"loss": 1.2331,
"step": 952
},
{
"epoch": 0.8593327321911632,
"grad_norm": 1.3001428661698673,
"learning_rate": 5.106659729029007e-07,
"loss": 1.1918,
"step": 953
},
{
"epoch": 0.8602344454463481,
"grad_norm": 1.2839843931089516,
"learning_rate": 5.042519424932512e-07,
"loss": 1.2202,
"step": 954
},
{
"epoch": 0.8611361587015329,
"grad_norm": 1.270892973171116,
"learning_rate": 4.978763079501109e-07,
"loss": 1.2201,
"step": 955
},
{
"epoch": 0.8620378719567178,
"grad_norm": 1.2916321550437255,
"learning_rate": 4.915391237244876e-07,
"loss": 1.2364,
"step": 956
},
{
"epoch": 0.8629395852119026,
"grad_norm": 1.3047521772223711,
"learning_rate": 4.852404439390051e-07,
"loss": 1.2193,
"step": 957
},
{
"epoch": 0.8638412984670875,
"grad_norm": 1.3099220594258418,
"learning_rate": 4.789803223874423e-07,
"loss": 1.2021,
"step": 958
},
{
"epoch": 0.8647430117222723,
"grad_norm": 1.3059600685193462,
"learning_rate": 4.727588125342669e-07,
"loss": 1.2213,
"step": 959
},
{
"epoch": 0.8656447249774571,
"grad_norm": 1.3207903999542119,
"learning_rate": 4.665759675141901e-07,
"loss": 1.2244,
"step": 960
},
{
"epoch": 0.866546438232642,
"grad_norm": 1.288523708061658,
"learning_rate": 4.604318401317009e-07,
"loss": 1.2316,
"step": 961
},
{
"epoch": 0.8674481514878268,
"grad_norm": 1.2572262151165579,
"learning_rate": 4.543264828606264e-07,
"loss": 1.2207,
"step": 962
},
{
"epoch": 0.8683498647430117,
"grad_norm": 1.2815957832640745,
"learning_rate": 4.48259947843675e-07,
"loss": 1.201,
"step": 963
},
{
"epoch": 0.8692515779981965,
"grad_norm": 1.2426149475176893,
"learning_rate": 4.422322868919937e-07,
"loss": 1.174,
"step": 964
},
{
"epoch": 0.8701532912533815,
"grad_norm": 1.2917796970754292,
"learning_rate": 4.3624355148472796e-07,
"loss": 1.2154,
"step": 965
},
{
"epoch": 0.8710550045085663,
"grad_norm": 1.2804388471447807,
"learning_rate": 4.302937927685802e-07,
"loss": 1.1898,
"step": 966
},
{
"epoch": 0.8719567177637512,
"grad_norm": 1.2965540391390407,
"learning_rate": 4.2438306155737243e-07,
"loss": 1.2193,
"step": 967
},
{
"epoch": 0.872858431018936,
"grad_norm": 1.3095171070129454,
"learning_rate": 4.1851140833161163e-07,
"loss": 1.2035,
"step": 968
},
{
"epoch": 0.8737601442741209,
"grad_norm": 1.3406437797884603,
"learning_rate": 4.1267888323806294e-07,
"loss": 1.2361,
"step": 969
},
{
"epoch": 0.8746618575293057,
"grad_norm": 1.2968176832526788,
"learning_rate": 4.0688553608931313e-07,
"loss": 1.2081,
"step": 970
},
{
"epoch": 0.8755635707844905,
"grad_norm": 1.2398826667404843,
"learning_rate": 4.011314163633573e-07,
"loss": 1.19,
"step": 971
},
{
"epoch": 0.8764652840396754,
"grad_norm": 1.277753523037616,
"learning_rate": 3.954165732031634e-07,
"loss": 1.1806,
"step": 972
},
{
"epoch": 0.8773669972948602,
"grad_norm": 1.2786206929414288,
"learning_rate": 3.897410554162623e-07,
"loss": 1.2338,
"step": 973
},
{
"epoch": 0.8782687105500451,
"grad_norm": 1.291914303763212,
"learning_rate": 3.841049114743239e-07,
"loss": 1.2323,
"step": 974
},
{
"epoch": 0.8791704238052299,
"grad_norm": 1.2796108765905125,
"learning_rate": 3.7850818951274903e-07,
"loss": 1.2232,
"step": 975
},
{
"epoch": 0.8800721370604148,
"grad_norm": 1.259816182886979,
"learning_rate": 3.729509373302548e-07,
"loss": 1.1889,
"step": 976
},
{
"epoch": 0.8809738503155996,
"grad_norm": 1.2792451995025527,
"learning_rate": 3.674332023884664e-07,
"loss": 1.2116,
"step": 977
},
{
"epoch": 0.8818755635707844,
"grad_norm": 1.274657278905521,
"learning_rate": 3.619550318115145e-07,
"loss": 1.235,
"step": 978
},
{
"epoch": 0.8827772768259693,
"grad_norm": 1.257454729532636,
"learning_rate": 3.5651647238562904e-07,
"loss": 1.2106,
"step": 979
},
{
"epoch": 0.8836789900811542,
"grad_norm": 1.252377545649136,
"learning_rate": 3.511175705587433e-07,
"loss": 1.2043,
"step": 980
},
{
"epoch": 0.8845807033363391,
"grad_norm": 1.2521024978130257,
"learning_rate": 3.4575837244009367e-07,
"loss": 1.1983,
"step": 981
},
{
"epoch": 0.8854824165915239,
"grad_norm": 1.2923042352880336,
"learning_rate": 3.4043892379982956e-07,
"loss": 1.2339,
"step": 982
},
{
"epoch": 0.8863841298467088,
"grad_norm": 1.226700149978839,
"learning_rate": 3.351592700686168e-07,
"loss": 1.2028,
"step": 983
},
{
"epoch": 0.8872858431018936,
"grad_norm": 1.2665798312269472,
"learning_rate": 3.299194563372604e-07,
"loss": 1.2185,
"step": 984
},
{
"epoch": 0.8881875563570785,
"grad_norm": 1.253667104013189,
"learning_rate": 3.247195273563047e-07,
"loss": 1.191,
"step": 985
},
{
"epoch": 0.8890892696122633,
"grad_norm": 1.2429872892333635,
"learning_rate": 3.1955952753566445e-07,
"loss": 1.209,
"step": 986
},
{
"epoch": 0.8899909828674482,
"grad_norm": 1.325045978009465,
"learning_rate": 3.144395009442369e-07,
"loss": 1.224,
"step": 987
},
{
"epoch": 0.890892696122633,
"grad_norm": 1.329496806355262,
"learning_rate": 3.093594913095299e-07,
"loss": 1.211,
"step": 988
},
{
"epoch": 0.8917944093778178,
"grad_norm": 1.2305655303732355,
"learning_rate": 3.043195420172879e-07,
"loss": 1.2036,
"step": 989
},
{
"epoch": 0.8926961226330027,
"grad_norm": 1.2574131675915197,
"learning_rate": 2.9931969611111777e-07,
"loss": 1.2032,
"step": 990
},
{
"epoch": 0.8935978358881875,
"grad_norm": 1.2662488779050474,
"learning_rate": 2.943599962921279e-07,
"loss": 1.2251,
"step": 991
},
{
"epoch": 0.8944995491433724,
"grad_norm": 1.330940318105111,
"learning_rate": 2.89440484918555e-07,
"loss": 1.2036,
"step": 992
},
{
"epoch": 0.8954012623985572,
"grad_norm": 1.2724811054244518,
"learning_rate": 2.84561204005413e-07,
"loss": 1.2275,
"step": 993
},
{
"epoch": 0.8963029756537421,
"grad_norm": 1.2428562642394823,
"learning_rate": 2.7972219522412194e-07,
"loss": 1.2087,
"step": 994
},
{
"epoch": 0.8972046889089269,
"grad_norm": 1.2550505429558814,
"learning_rate": 2.7492349990216327e-07,
"loss": 1.1932,
"step": 995
},
{
"epoch": 0.8981064021641119,
"grad_norm": 1.2889360575719786,
"learning_rate": 2.701651590227178e-07,
"loss": 1.2001,
"step": 996
},
{
"epoch": 0.8990081154192967,
"grad_norm": 1.3377452346886367,
"learning_rate": 2.654472132243241e-07,
"loss": 1.2136,
"step": 997
},
{
"epoch": 0.8999098286744815,
"grad_norm": 1.2974101369208686,
"learning_rate": 2.6076970280052295e-07,
"loss": 1.199,
"step": 998
},
{
"epoch": 0.9008115419296664,
"grad_norm": 1.2734787152019011,
"learning_rate": 2.5613266769952183e-07,
"loss": 1.2127,
"step": 999
},
{
"epoch": 0.9017132551848512,
"grad_norm": 1.2985739168739143,
"learning_rate": 2.5153614752384534e-07,
"loss": 1.1983,
"step": 1000
},
{
"epoch": 0.9026149684400361,
"grad_norm": 1.2573502235072782,
"learning_rate": 2.469801815300027e-07,
"loss": 1.2135,
"step": 1001
},
{
"epoch": 0.9035166816952209,
"grad_norm": 1.2748461049930755,
"learning_rate": 2.4246480862815226e-07,
"loss": 1.2245,
"step": 1002
},
{
"epoch": 0.9044183949504058,
"grad_norm": 1.2884368599511475,
"learning_rate": 2.3799006738176422e-07,
"loss": 1.2142,
"step": 1003
},
{
"epoch": 0.9053201082055906,
"grad_norm": 1.2743626671513446,
"learning_rate": 2.3355599600729916e-07,
"loss": 1.2163,
"step": 1004
},
{
"epoch": 0.9062218214607755,
"grad_norm": 1.313060889818672,
"learning_rate": 2.2916263237387104e-07,
"loss": 1.2059,
"step": 1005
},
{
"epoch": 0.9071235347159603,
"grad_norm": 1.2833792613487849,
"learning_rate": 2.2481001400293855e-07,
"loss": 1.2131,
"step": 1006
},
{
"epoch": 0.9080252479711451,
"grad_norm": 1.3170354684360084,
"learning_rate": 2.204981780679677e-07,
"loss": 1.2095,
"step": 1007
},
{
"epoch": 0.90892696122633,
"grad_norm": 1.3165273872030707,
"learning_rate": 2.1622716139412803e-07,
"loss": 1.2189,
"step": 1008
},
{
"epoch": 0.9098286744815148,
"grad_norm": 1.2612221063278017,
"learning_rate": 2.1199700045797077e-07,
"loss": 1.1943,
"step": 1009
},
{
"epoch": 0.9107303877366997,
"grad_norm": 1.254731155862502,
"learning_rate": 2.0780773138711908e-07,
"loss": 1.2084,
"step": 1010
},
{
"epoch": 0.9116321009918846,
"grad_norm": 1.2638790136002085,
"learning_rate": 2.036593899599615e-07,
"loss": 1.195,
"step": 1011
},
{
"epoch": 0.9125338142470695,
"grad_norm": 1.2774446957229728,
"learning_rate": 1.9955201160534342e-07,
"loss": 1.2388,
"step": 1012
},
{
"epoch": 0.9134355275022543,
"grad_norm": 1.2814569340864863,
"learning_rate": 1.9548563140226518e-07,
"loss": 1.212,
"step": 1013
},
{
"epoch": 0.9143372407574392,
"grad_norm": 1.265465404596977,
"learning_rate": 1.9146028407958483e-07,
"loss": 1.2067,
"step": 1014
},
{
"epoch": 0.915238954012624,
"grad_norm": 1.269985400786495,
"learning_rate": 1.874760040157181e-07,
"loss": 1.2273,
"step": 1015
},
{
"epoch": 0.9161406672678089,
"grad_norm": 1.321443696897108,
"learning_rate": 1.8353282523834671e-07,
"loss": 1.2235,
"step": 1016
},
{
"epoch": 0.9170423805229937,
"grad_norm": 1.261142508109314,
"learning_rate": 1.7963078142412883e-07,
"loss": 1.203,
"step": 1017
},
{
"epoch": 0.9179440937781785,
"grad_norm": 1.27184781211763,
"learning_rate": 1.7576990589840747e-07,
"loss": 1.2091,
"step": 1018
},
{
"epoch": 0.9188458070333634,
"grad_norm": 1.2921875252777293,
"learning_rate": 1.7195023163493253e-07,
"loss": 1.2069,
"step": 1019
},
{
"epoch": 0.9197475202885482,
"grad_norm": 1.2942084641451423,
"learning_rate": 1.6817179125557026e-07,
"loss": 1.2291,
"step": 1020
},
{
"epoch": 0.9206492335437331,
"grad_norm": 1.2439106153543784,
"learning_rate": 1.6443461703003427e-07,
"loss": 1.2141,
"step": 1021
},
{
"epoch": 0.9215509467989179,
"grad_norm": 1.2883747080006507,
"learning_rate": 1.6073874087560115e-07,
"loss": 1.2058,
"step": 1022
},
{
"epoch": 0.9224526600541028,
"grad_norm": 1.237116734838997,
"learning_rate": 1.5708419435684463e-07,
"loss": 1.2,
"step": 1023
},
{
"epoch": 0.9233543733092876,
"grad_norm": 1.2705941080800744,
"learning_rate": 1.5347100868536246e-07,
"loss": 1.1878,
"step": 1024
},
{
"epoch": 0.9242560865644724,
"grad_norm": 1.289668475702933,
"learning_rate": 1.4989921471951163e-07,
"loss": 1.2059,
"step": 1025
},
{
"epoch": 0.9251577998196574,
"grad_norm": 1.252243810732813,
"learning_rate": 1.4636884296414133e-07,
"loss": 1.1894,
"step": 1026
},
{
"epoch": 0.9260595130748422,
"grad_norm": 1.239862577289118,
"learning_rate": 1.428799235703382e-07,
"loss": 1.2062,
"step": 1027
},
{
"epoch": 0.9269612263300271,
"grad_norm": 1.2792086558499869,
"learning_rate": 1.3943248633516426e-07,
"loss": 1.2289,
"step": 1028
},
{
"epoch": 0.9278629395852119,
"grad_norm": 1.2585322782352686,
"learning_rate": 1.3602656070140275e-07,
"loss": 1.2187,
"step": 1029
},
{
"epoch": 0.9287646528403968,
"grad_norm": 1.2649978962250352,
"learning_rate": 1.3266217575730934e-07,
"loss": 1.2335,
"step": 1030
},
{
"epoch": 0.9296663660955816,
"grad_norm": 1.2709565276249286,
"learning_rate": 1.2933936023636073e-07,
"loss": 1.2253,
"step": 1031
},
{
"epoch": 0.9305680793507665,
"grad_norm": 1.2741243155450708,
"learning_rate": 1.2605814251701154e-07,
"loss": 1.2155,
"step": 1032
},
{
"epoch": 0.9314697926059513,
"grad_norm": 1.2962554005547353,
"learning_rate": 1.2281855062245163e-07,
"loss": 1.2323,
"step": 1033
},
{
"epoch": 0.9323715058611362,
"grad_norm": 1.222038672850551,
"learning_rate": 1.196206122203647e-07,
"loss": 1.2294,
"step": 1034
},
{
"epoch": 0.933273219116321,
"grad_norm": 1.273022165563276,
"learning_rate": 1.1646435462269346e-07,
"loss": 1.209,
"step": 1035
},
{
"epoch": 0.9341749323715058,
"grad_norm": 1.317712793510341,
"learning_rate": 1.1334980478540758e-07,
"loss": 1.2239,
"step": 1036
},
{
"epoch": 0.9350766456266907,
"grad_norm": 1.29768734193814,
"learning_rate": 1.1027698930827169e-07,
"loss": 1.2089,
"step": 1037
},
{
"epoch": 0.9359783588818755,
"grad_norm": 1.2748192341276068,
"learning_rate": 1.0724593443461883e-07,
"loss": 1.2161,
"step": 1038
},
{
"epoch": 0.9368800721370604,
"grad_norm": 1.3078946894653718,
"learning_rate": 1.0425666605112516e-07,
"loss": 1.2134,
"step": 1039
},
{
"epoch": 0.9377817853922452,
"grad_norm": 1.2943299119489553,
"learning_rate": 1.0130920968759228e-07,
"loss": 1.2191,
"step": 1040
},
{
"epoch": 0.9386834986474302,
"grad_norm": 1.2524693445516681,
"learning_rate": 9.84035905167241e-08,
"loss": 1.2023,
"step": 1041
},
{
"epoch": 0.939585211902615,
"grad_norm": 1.2254001564356944,
"learning_rate": 9.553983335391647e-08,
"loss": 1.191,
"step": 1042
},
{
"epoch": 0.9404869251577999,
"grad_norm": 1.283320757635457,
"learning_rate": 9.271796265704403e-08,
"loss": 1.2217,
"step": 1043
},
{
"epoch": 0.9413886384129847,
"grad_norm": 1.3413375530222929,
"learning_rate": 8.993800252624863e-08,
"loss": 1.2107,
"step": 1044
},
{
"epoch": 0.9422903516681695,
"grad_norm": 1.2395249209659651,
"learning_rate": 8.719997670373682e-08,
"loss": 1.2085,
"step": 1045
},
{
"epoch": 0.9431920649233544,
"grad_norm": 1.3019154798956944,
"learning_rate": 8.450390857357549e-08,
"loss": 1.2187,
"step": 1046
},
{
"epoch": 0.9440937781785392,
"grad_norm": 1.2742821693649515,
"learning_rate": 8.18498211614932e-08,
"loss": 1.2058,
"step": 1047
},
{
"epoch": 0.9449954914337241,
"grad_norm": 1.3231134618421645,
"learning_rate": 7.923773713468197e-08,
"loss": 1.2127,
"step": 1048
},
{
"epoch": 0.9458972046889089,
"grad_norm": 1.277096810367392,
"learning_rate": 7.666767880160464e-08,
"loss": 1.2289,
"step": 1049
},
{
"epoch": 0.9467989179440938,
"grad_norm": 1.2665767697644446,
"learning_rate": 7.413966811180451e-08,
"loss": 1.2099,
"step": 1050
},
{
"epoch": 0.9477006311992786,
"grad_norm": 1.3063066976618636,
"learning_rate": 7.165372665571879e-08,
"loss": 1.2369,
"step": 1051
},
{
"epoch": 0.9486023444544635,
"grad_norm": 1.2888722722107182,
"learning_rate": 6.920987566448989e-08,
"loss": 1.1898,
"step": 1052
},
{
"epoch": 0.9495040577096483,
"grad_norm": 1.2529437153897622,
"learning_rate": 6.680813600979164e-08,
"loss": 1.1879,
"step": 1053
},
{
"epoch": 0.9504057709648331,
"grad_norm": 1.2660744074547863,
"learning_rate": 6.444852820364222e-08,
"loss": 1.2249,
"step": 1054
},
{
"epoch": 0.951307484220018,
"grad_norm": 1.2731308456724335,
"learning_rate": 6.213107239823602e-08,
"loss": 1.1905,
"step": 1055
},
{
"epoch": 0.9522091974752029,
"grad_norm": 1.2847581977653115,
"learning_rate": 5.985578838576978e-08,
"loss": 1.21,
"step": 1056
},
{
"epoch": 0.9531109107303878,
"grad_norm": 1.2858831339292007,
"learning_rate": 5.762269559826894e-08,
"loss": 1.2408,
"step": 1057
},
{
"epoch": 0.9540126239855726,
"grad_norm": 1.2835136436122037,
"learning_rate": 5.54318131074294e-08,
"loss": 1.2223,
"step": 1058
},
{
"epoch": 0.9549143372407575,
"grad_norm": 1.2443952821022994,
"learning_rate": 5.3283159624448745e-08,
"loss": 1.1958,
"step": 1059
},
{
"epoch": 0.9558160504959423,
"grad_norm": 1.264268819104271,
"learning_rate": 5.117675349986917e-08,
"loss": 1.1901,
"step": 1060
},
{
"epoch": 0.9567177637511272,
"grad_norm": 1.2956493028514946,
"learning_rate": 4.911261272341872e-08,
"loss": 1.197,
"step": 1061
},
{
"epoch": 0.957619477006312,
"grad_norm": 1.269520271669753,
"learning_rate": 4.7090754923859725e-08,
"loss": 1.1895,
"step": 1062
},
{
"epoch": 0.9585211902614968,
"grad_norm": 1.2979719372812786,
"learning_rate": 4.511119736883729e-08,
"loss": 1.2298,
"step": 1063
},
{
"epoch": 0.9594229035166817,
"grad_norm": 1.288532218908057,
"learning_rate": 4.3173956964732145e-08,
"loss": 1.2037,
"step": 1064
},
{
"epoch": 0.9603246167718665,
"grad_norm": 1.2797908398125752,
"learning_rate": 4.127905025651635e-08,
"loss": 1.241,
"step": 1065
},
{
"epoch": 0.9612263300270514,
"grad_norm": 1.3040381654756092,
"learning_rate": 3.9426493427611177e-08,
"loss": 1.1916,
"step": 1066
},
{
"epoch": 0.9621280432822362,
"grad_norm": 1.2808361333680878,
"learning_rate": 3.761630229974833e-08,
"loss": 1.2143,
"step": 1067
},
{
"epoch": 0.9630297565374211,
"grad_norm": 1.2950532754186528,
"learning_rate": 3.584849233283838e-08,
"loss": 1.2227,
"step": 1068
},
{
"epoch": 0.9639314697926059,
"grad_norm": 1.2550421927753321,
"learning_rate": 3.4123078624834214e-08,
"loss": 1.2139,
"step": 1069
},
{
"epoch": 0.9648331830477908,
"grad_norm": 1.2701351461973567,
"learning_rate": 3.244007591160503e-08,
"loss": 1.2109,
"step": 1070
},
{
"epoch": 0.9657348963029756,
"grad_norm": 1.2571051407543998,
"learning_rate": 3.079949856680975e-08,
"loss": 1.2068,
"step": 1071
},
{
"epoch": 0.9666366095581606,
"grad_norm": 1.2481617283700457,
"learning_rate": 2.9201360601772698e-08,
"loss": 1.2402,
"step": 1072
},
{
"epoch": 0.9675383228133454,
"grad_norm": 1.237772880329741,
"learning_rate": 2.7645675665367578e-08,
"loss": 1.2181,
"step": 1073
},
{
"epoch": 0.9684400360685302,
"grad_norm": 1.2782845837706895,
"learning_rate": 2.6132457043896442e-08,
"loss": 1.1945,
"step": 1074
},
{
"epoch": 0.9693417493237151,
"grad_norm": 1.289759744788445,
"learning_rate": 2.4661717660980356e-08,
"loss": 1.2033,
"step": 1075
},
{
"epoch": 0.9702434625788999,
"grad_norm": 1.311942673721432,
"learning_rate": 2.323347007744503e-08,
"loss": 1.2219,
"step": 1076
},
{
"epoch": 0.9711451758340848,
"grad_norm": 1.2727763297469359,
"learning_rate": 2.184772649121758e-08,
"loss": 1.1994,
"step": 1077
},
{
"epoch": 0.9720468890892696,
"grad_norm": 1.2557611757173046,
"learning_rate": 2.0504498737219936e-08,
"loss": 1.1953,
"step": 1078
},
{
"epoch": 0.9729486023444545,
"grad_norm": 1.2781594119302995,
"learning_rate": 1.920379828726726e-08,
"loss": 1.1996,
"step": 1079
},
{
"epoch": 0.9738503155996393,
"grad_norm": 1.3054939219702595,
"learning_rate": 1.7945636249971364e-08,
"loss": 1.2329,
"step": 1080
},
{
"epoch": 0.9747520288548241,
"grad_norm": 1.310407071986216,
"learning_rate": 1.6730023370645775e-08,
"loss": 1.2282,
"step": 1081
},
{
"epoch": 0.975653742110009,
"grad_norm": 1.2478522708231785,
"learning_rate": 1.5556970031214145e-08,
"loss": 1.2347,
"step": 1082
},
{
"epoch": 0.9765554553651938,
"grad_norm": 1.272238571063739,
"learning_rate": 1.4426486250119776e-08,
"loss": 1.2115,
"step": 1083
},
{
"epoch": 0.9774571686203787,
"grad_norm": 1.2597189160490305,
"learning_rate": 1.333858168224178e-08,
"loss": 1.1915,
"step": 1084
},
{
"epoch": 0.9783588818755635,
"grad_norm": 1.2574926511170699,
"learning_rate": 1.2293265618811834e-08,
"loss": 1.2163,
"step": 1085
},
{
"epoch": 0.9792605951307484,
"grad_norm": 1.2527364255680953,
"learning_rate": 1.1290546987336448e-08,
"loss": 1.2086,
"step": 1086
},
{
"epoch": 0.9801623083859333,
"grad_norm": 1.27362762517652,
"learning_rate": 1.0330434351518149e-08,
"loss": 1.1843,
"step": 1087
},
{
"epoch": 0.9810640216411182,
"grad_norm": 1.2973034143378401,
"learning_rate": 9.412935911183863e-09,
"loss": 1.1956,
"step": 1088
},
{
"epoch": 0.981965734896303,
"grad_norm": 1.3018989918977304,
"learning_rate": 8.538059502214979e-09,
"loss": 1.2319,
"step": 1089
},
{
"epoch": 0.9828674481514879,
"grad_norm": 1.2479095491543981,
"learning_rate": 7.705812596479623e-09,
"loss": 1.2188,
"step": 1090
},
{
"epoch": 0.9837691614066727,
"grad_norm": 1.3272869423642981,
"learning_rate": 6.9162023017699255e-09,
"loss": 1.2234,
"step": 1091
},
{
"epoch": 0.9846708746618575,
"grad_norm": 1.2488700072518422,
"learning_rate": 6.169235361739856e-09,
"loss": 1.1952,
"step": 1092
},
{
"epoch": 0.9855725879170424,
"grad_norm": 1.2700332989633605,
"learning_rate": 5.464918155849708e-09,
"loss": 1.2117,
"step": 1093
},
{
"epoch": 0.9864743011722272,
"grad_norm": 1.2768089943898375,
"learning_rate": 4.803256699308923e-09,
"loss": 1.1925,
"step": 1094
},
{
"epoch": 0.9873760144274121,
"grad_norm": 1.2755150189086724,
"learning_rate": 4.18425664302724e-09,
"loss": 1.2056,
"step": 1095
},
{
"epoch": 0.9882777276825969,
"grad_norm": 1.2451176059668059,
"learning_rate": 3.6079232735647398e-09,
"loss": 1.1898,
"step": 1096
},
{
"epoch": 0.9891794409377818,
"grad_norm": 1.2758758036274904,
"learning_rate": 3.074261513087984e-09,
"loss": 1.2147,
"step": 1097
},
{
"epoch": 0.9900811541929666,
"grad_norm": 1.2630686054047537,
"learning_rate": 2.583275919327277e-09,
"loss": 1.2157,
"step": 1098
},
{
"epoch": 0.9909828674481514,
"grad_norm": 1.291774655594905,
"learning_rate": 2.134970685536697e-09,
"loss": 1.232,
"step": 1099
},
{
"epoch": 0.9918845807033363,
"grad_norm": 1.2811485111546765,
"learning_rate": 1.7293496404602316e-09,
"loss": 1.2154,
"step": 1100
},
{
"epoch": 0.9927862939585211,
"grad_norm": 1.292187830956411,
"learning_rate": 1.3664162482990296e-09,
"loss": 1.1845,
"step": 1101
},
{
"epoch": 0.9936880072137061,
"grad_norm": 1.2505142052902283,
"learning_rate": 1.0461736086786467e-09,
"loss": 1.2096,
"step": 1102
},
{
"epoch": 0.9945897204688909,
"grad_norm": 1.2457145139334065,
"learning_rate": 7.686244566273981e-10,
"loss": 1.1985,
"step": 1103
},
{
"epoch": 0.9954914337240758,
"grad_norm": 1.250521582131511,
"learning_rate": 5.337711625497122e-10,
"loss": 1.2144,
"step": 1104
},
{
"epoch": 0.9963931469792606,
"grad_norm": 1.3207469421718456,
"learning_rate": 3.416157322055913e-10,
"loss": 1.2173,
"step": 1105
},
{
"epoch": 0.9972948602344455,
"grad_norm": 1.305287692170028,
"learning_rate": 1.921598066961794e-10,
"loss": 1.1969,
"step": 1106
},
{
"epoch": 0.9981965734896303,
"grad_norm": 1.281374648115917,
"learning_rate": 8.540466244710832e-11,
"loss": 1.2162,
"step": 1107
},
{
"epoch": 0.9990982867448152,
"grad_norm": 1.2559201459727685,
"learning_rate": 2.1351211199061028e-11,
"loss": 1.2056,
"step": 1108
},
{
"epoch": 1.0,
"grad_norm": 1.2684289549241197,
"learning_rate": 0.0,
"loss": 1.2128,
"step": 1109
},
{
"epoch": 1.0,
"step": 1109,
"total_flos": 1474711721345024.0,
"train_loss": 1.2891367367520001,
"train_runtime": 18995.5344,
"train_samples_per_second": 0.467,
"train_steps_per_second": 0.058
}
],
"logging_steps": 1.0,
"max_steps": 1109,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1474711721345024.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}