PrepAI's picture
Upload folder using huggingface_hub
9a3b244 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9999002576676919,
"eval_steps": 10000,
"global_step": 30076,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033247444102734603,
"grad_norm": 1.3035300970077515,
"learning_rate": 1.6622340425531915e-06,
"loss": 5.822,
"step": 50
},
{
"epoch": 0.006649488820546921,
"grad_norm": 1.1972568035125732,
"learning_rate": 3.324468085106383e-06,
"loss": 5.9169,
"step": 100
},
{
"epoch": 0.009974233230820381,
"grad_norm": 1.352640986442566,
"learning_rate": 4.986702127659574e-06,
"loss": 5.2194,
"step": 150
},
{
"epoch": 0.013298977641093841,
"grad_norm": 1.7489923238754272,
"learning_rate": 6.648936170212766e-06,
"loss": 4.452,
"step": 200
},
{
"epoch": 0.0166237220513673,
"grad_norm": 1.8752517700195312,
"learning_rate": 8.311170212765958e-06,
"loss": 3.6432,
"step": 250
},
{
"epoch": 0.019948466461640762,
"grad_norm": 1.9665223360061646,
"learning_rate": 9.973404255319148e-06,
"loss": 3.174,
"step": 300
},
{
"epoch": 0.02327321087191422,
"grad_norm": 2.6562719345092773,
"learning_rate": 1.1635638297872341e-05,
"loss": 2.5593,
"step": 350
},
{
"epoch": 0.026597955282187682,
"grad_norm": 2.408365488052368,
"learning_rate": 1.3297872340425532e-05,
"loss": 2.1801,
"step": 400
},
{
"epoch": 0.02992269969246114,
"grad_norm": 2.3265933990478516,
"learning_rate": 1.4960106382978726e-05,
"loss": 1.8083,
"step": 450
},
{
"epoch": 0.0332474441027346,
"grad_norm": 2.1040735244750977,
"learning_rate": 1.6622340425531915e-05,
"loss": 1.6401,
"step": 500
},
{
"epoch": 0.036572188513008065,
"grad_norm": 3.4362895488739014,
"learning_rate": 1.8284574468085108e-05,
"loss": 1.5832,
"step": 550
},
{
"epoch": 0.039896932923281524,
"grad_norm": 1.546397089958191,
"learning_rate": 1.9946808510638297e-05,
"loss": 1.4445,
"step": 600
},
{
"epoch": 0.04322167733355498,
"grad_norm": 1.6556838750839233,
"learning_rate": 2.1609042553191493e-05,
"loss": 1.3488,
"step": 650
},
{
"epoch": 0.04654642174382844,
"grad_norm": 1.819988489151001,
"learning_rate": 2.3271276595744682e-05,
"loss": 1.2645,
"step": 700
},
{
"epoch": 0.049871166154101906,
"grad_norm": 1.4990456104278564,
"learning_rate": 2.4933510638297874e-05,
"loss": 1.2705,
"step": 750
},
{
"epoch": 0.053195910564375365,
"grad_norm": 1.2020275592803955,
"learning_rate": 2.6595744680851064e-05,
"loss": 1.1177,
"step": 800
},
{
"epoch": 0.056520654974648823,
"grad_norm": 3.6433067321777344,
"learning_rate": 2.8257978723404256e-05,
"loss": 1.1117,
"step": 850
},
{
"epoch": 0.05984539938492228,
"grad_norm": 1.214572548866272,
"learning_rate": 2.9920212765957452e-05,
"loss": 1.0738,
"step": 900
},
{
"epoch": 0.06317014379519574,
"grad_norm": 1.198177456855774,
"learning_rate": 3.158244680851064e-05,
"loss": 0.9417,
"step": 950
},
{
"epoch": 0.0664948882054692,
"grad_norm": 2.4085018634796143,
"learning_rate": 3.324468085106383e-05,
"loss": 0.9209,
"step": 1000
},
{
"epoch": 0.06981963261574267,
"grad_norm": 1.7089121341705322,
"learning_rate": 3.490691489361702e-05,
"loss": 0.923,
"step": 1050
},
{
"epoch": 0.07314437702601613,
"grad_norm": 1.4717004299163818,
"learning_rate": 3.6569148936170215e-05,
"loss": 0.8675,
"step": 1100
},
{
"epoch": 0.07646912143628959,
"grad_norm": 0.943592369556427,
"learning_rate": 3.823138297872341e-05,
"loss": 0.8474,
"step": 1150
},
{
"epoch": 0.07979386584656305,
"grad_norm": 1.0817354917526245,
"learning_rate": 3.9893617021276594e-05,
"loss": 0.9049,
"step": 1200
},
{
"epoch": 0.0831186102568365,
"grad_norm": 1.4040069580078125,
"learning_rate": 4.1555851063829786e-05,
"loss": 0.8135,
"step": 1250
},
{
"epoch": 0.08644335466710996,
"grad_norm": 1.383522868156433,
"learning_rate": 4.3218085106382986e-05,
"loss": 0.8804,
"step": 1300
},
{
"epoch": 0.08976809907738342,
"grad_norm": 2.222273111343384,
"learning_rate": 4.488031914893617e-05,
"loss": 0.8195,
"step": 1350
},
{
"epoch": 0.09309284348765688,
"grad_norm": 1.0952123403549194,
"learning_rate": 4.6542553191489364e-05,
"loss": 0.7564,
"step": 1400
},
{
"epoch": 0.09641758789793034,
"grad_norm": 1.3211369514465332,
"learning_rate": 4.8204787234042556e-05,
"loss": 0.8551,
"step": 1450
},
{
"epoch": 0.09974233230820381,
"grad_norm": 1.8156408071517944,
"learning_rate": 4.986702127659575e-05,
"loss": 0.7798,
"step": 1500
},
{
"epoch": 0.10306707671847727,
"grad_norm": 0.7081910967826843,
"learning_rate": 5.152925531914894e-05,
"loss": 0.7384,
"step": 1550
},
{
"epoch": 0.10639182112875073,
"grad_norm": 1.294580340385437,
"learning_rate": 5.319148936170213e-05,
"loss": 0.792,
"step": 1600
},
{
"epoch": 0.10971656553902419,
"grad_norm": 1.5583733320236206,
"learning_rate": 5.485372340425532e-05,
"loss": 0.7462,
"step": 1650
},
{
"epoch": 0.11304130994929765,
"grad_norm": 1.0178766250610352,
"learning_rate": 5.651595744680851e-05,
"loss": 0.7477,
"step": 1700
},
{
"epoch": 0.1163660543595711,
"grad_norm": 3.021415948867798,
"learning_rate": 5.81781914893617e-05,
"loss": 0.6861,
"step": 1750
},
{
"epoch": 0.11969079876984456,
"grad_norm": 1.043920636177063,
"learning_rate": 5.9840425531914904e-05,
"loss": 0.7214,
"step": 1800
},
{
"epoch": 0.12301554318011802,
"grad_norm": 2.1260740756988525,
"learning_rate": 6.150265957446809e-05,
"loss": 0.6917,
"step": 1850
},
{
"epoch": 0.12634028759039148,
"grad_norm": 1.0638266801834106,
"learning_rate": 6.316489361702128e-05,
"loss": 0.6948,
"step": 1900
},
{
"epoch": 0.12966503200066495,
"grad_norm": 1.7689710855484009,
"learning_rate": 6.482712765957447e-05,
"loss": 0.7296,
"step": 1950
},
{
"epoch": 0.1329897764109384,
"grad_norm": 1.052815556526184,
"learning_rate": 6.648936170212766e-05,
"loss": 0.732,
"step": 2000
},
{
"epoch": 0.13631452082121187,
"grad_norm": 0.7970194220542908,
"learning_rate": 6.815159574468085e-05,
"loss": 0.6978,
"step": 2050
},
{
"epoch": 0.13963926523148534,
"grad_norm": 0.7442440986633301,
"learning_rate": 6.981382978723405e-05,
"loss": 0.7093,
"step": 2100
},
{
"epoch": 0.1429640096417588,
"grad_norm": 1.5074230432510376,
"learning_rate": 7.147606382978723e-05,
"loss": 0.7114,
"step": 2150
},
{
"epoch": 0.14628875405203226,
"grad_norm": 1.4102712869644165,
"learning_rate": 7.313829787234043e-05,
"loss": 0.7156,
"step": 2200
},
{
"epoch": 0.1496134984623057,
"grad_norm": 1.1298854351043701,
"learning_rate": 7.480053191489363e-05,
"loss": 0.6908,
"step": 2250
},
{
"epoch": 0.15293824287257918,
"grad_norm": 1.149941325187683,
"learning_rate": 7.646276595744682e-05,
"loss": 0.7239,
"step": 2300
},
{
"epoch": 0.15626298728285262,
"grad_norm": 0.860246479511261,
"learning_rate": 7.8125e-05,
"loss": 0.7164,
"step": 2350
},
{
"epoch": 0.1595877316931261,
"grad_norm": 0.9874492883682251,
"learning_rate": 7.978723404255319e-05,
"loss": 0.7372,
"step": 2400
},
{
"epoch": 0.16291247610339954,
"grad_norm": 1.1497923135757446,
"learning_rate": 8.144946808510639e-05,
"loss": 0.6838,
"step": 2450
},
{
"epoch": 0.166237220513673,
"grad_norm": 0.8075393438339233,
"learning_rate": 8.311170212765957e-05,
"loss": 0.6905,
"step": 2500
},
{
"epoch": 0.16956196492394648,
"grad_norm": 1.4072059392929077,
"learning_rate": 8.477393617021277e-05,
"loss": 0.6824,
"step": 2550
},
{
"epoch": 0.17288670933421993,
"grad_norm": 1.4905030727386475,
"learning_rate": 8.643617021276597e-05,
"loss": 0.6813,
"step": 2600
},
{
"epoch": 0.1762114537444934,
"grad_norm": 1.1402111053466797,
"learning_rate": 8.809840425531916e-05,
"loss": 0.6849,
"step": 2650
},
{
"epoch": 0.17953619815476685,
"grad_norm": 1.8863836526870728,
"learning_rate": 8.976063829787234e-05,
"loss": 0.6488,
"step": 2700
},
{
"epoch": 0.18286094256504032,
"grad_norm": 1.0747898817062378,
"learning_rate": 9.142287234042554e-05,
"loss": 0.6468,
"step": 2750
},
{
"epoch": 0.18618568697531376,
"grad_norm": 1.4408470392227173,
"learning_rate": 9.308510638297873e-05,
"loss": 0.6661,
"step": 2800
},
{
"epoch": 0.18951043138558724,
"grad_norm": 0.6010861396789551,
"learning_rate": 9.474734042553191e-05,
"loss": 0.6519,
"step": 2850
},
{
"epoch": 0.19283517579586068,
"grad_norm": 1.3736897706985474,
"learning_rate": 9.640957446808511e-05,
"loss": 0.6852,
"step": 2900
},
{
"epoch": 0.19615992020613415,
"grad_norm": 0.7717883586883545,
"learning_rate": 9.807180851063831e-05,
"loss": 0.6286,
"step": 2950
},
{
"epoch": 0.19948466461640763,
"grad_norm": 2.024017572402954,
"learning_rate": 9.97340425531915e-05,
"loss": 0.7135,
"step": 3000
},
{
"epoch": 0.20280940902668107,
"grad_norm": 0.49820956587791443,
"learning_rate": 9.999940594707412e-05,
"loss": 0.6312,
"step": 3050
},
{
"epoch": 0.20613415343695454,
"grad_norm": 0.9031746983528137,
"learning_rate": 9.999714964504067e-05,
"loss": 0.651,
"step": 3100
},
{
"epoch": 0.209458897847228,
"grad_norm": 0.7573064565658569,
"learning_rate": 9.999320961690213e-05,
"loss": 0.6373,
"step": 3150
},
{
"epoch": 0.21278364225750146,
"grad_norm": 0.5948196649551392,
"learning_rate": 9.998758599534463e-05,
"loss": 0.7085,
"step": 3200
},
{
"epoch": 0.2161083866677749,
"grad_norm": 1.0268886089324951,
"learning_rate": 9.998027896975173e-05,
"loss": 0.684,
"step": 3250
},
{
"epoch": 0.21943313107804838,
"grad_norm": 0.7842527031898499,
"learning_rate": 9.997128878619808e-05,
"loss": 0.6139,
"step": 3300
},
{
"epoch": 0.22275787548832182,
"grad_norm": 0.5904123187065125,
"learning_rate": 9.996061574744102e-05,
"loss": 0.6458,
"step": 3350
},
{
"epoch": 0.2260826198985953,
"grad_norm": 1.0376912355422974,
"learning_rate": 9.994826021291056e-05,
"loss": 0.6125,
"step": 3400
},
{
"epoch": 0.22940736430886877,
"grad_norm": 0.8252595663070679,
"learning_rate": 9.993422259869713e-05,
"loss": 0.6086,
"step": 3450
},
{
"epoch": 0.2327321087191422,
"grad_norm": 0.7072761654853821,
"learning_rate": 9.991850337753762e-05,
"loss": 0.6516,
"step": 3500
},
{
"epoch": 0.23605685312941568,
"grad_norm": 0.5640501976013184,
"learning_rate": 9.990110307879952e-05,
"loss": 0.5939,
"step": 3550
},
{
"epoch": 0.23938159753968913,
"grad_norm": 0.9937716126441956,
"learning_rate": 9.988202228846291e-05,
"loss": 0.5968,
"step": 3600
},
{
"epoch": 0.2427063419499626,
"grad_norm": 0.6056251525878906,
"learning_rate": 9.986126164910094e-05,
"loss": 0.6498,
"step": 3650
},
{
"epoch": 0.24603108636023605,
"grad_norm": 0.5068459510803223,
"learning_rate": 9.983882185985808e-05,
"loss": 0.6477,
"step": 3700
},
{
"epoch": 0.24935583077050952,
"grad_norm": 0.6225175857543945,
"learning_rate": 9.98147036764266e-05,
"loss": 0.6328,
"step": 3750
},
{
"epoch": 0.25268057518078296,
"grad_norm": 0.5870462656021118,
"learning_rate": 9.978890791102109e-05,
"loss": 0.649,
"step": 3800
},
{
"epoch": 0.25600531959105643,
"grad_norm": 3.570667028427124,
"learning_rate": 9.976143543235114e-05,
"loss": 0.6618,
"step": 3850
},
{
"epoch": 0.2593300640013299,
"grad_norm": 0.6308077573776245,
"learning_rate": 9.973228716559209e-05,
"loss": 0.6205,
"step": 3900
},
{
"epoch": 0.2626548084116034,
"grad_norm": 0.4385659396648407,
"learning_rate": 9.970146409235386e-05,
"loss": 0.5935,
"step": 3950
},
{
"epoch": 0.2659795528218768,
"grad_norm": 0.5376986861228943,
"learning_rate": 9.966896725064786e-05,
"loss": 0.596,
"step": 4000
},
{
"epoch": 0.26930429723215027,
"grad_norm": 0.7176681160926819,
"learning_rate": 9.963479773485211e-05,
"loss": 0.6313,
"step": 4050
},
{
"epoch": 0.27262904164242374,
"grad_norm": 0.6590093374252319,
"learning_rate": 9.959895669567435e-05,
"loss": 0.6126,
"step": 4100
},
{
"epoch": 0.2759537860526972,
"grad_norm": 0.5095696449279785,
"learning_rate": 9.956144534011318e-05,
"loss": 0.6132,
"step": 4150
},
{
"epoch": 0.2792785304629707,
"grad_norm": 0.7277359962463379,
"learning_rate": 9.952226493141765e-05,
"loss": 0.6075,
"step": 4200
},
{
"epoch": 0.2826032748732441,
"grad_norm": 0.41294175386428833,
"learning_rate": 9.94814167890445e-05,
"loss": 0.5422,
"step": 4250
},
{
"epoch": 0.2859280192835176,
"grad_norm": 0.7728482484817505,
"learning_rate": 9.943890228861383e-05,
"loss": 0.573,
"step": 4300
},
{
"epoch": 0.28925276369379105,
"grad_norm": 0.6309620141983032,
"learning_rate": 9.939472286186271e-05,
"loss": 0.6314,
"step": 4350
},
{
"epoch": 0.2925775081040645,
"grad_norm": 0.4572204649448395,
"learning_rate": 9.934887999659707e-05,
"loss": 0.5865,
"step": 4400
},
{
"epoch": 0.29590225251433794,
"grad_norm": 0.44096603989601135,
"learning_rate": 9.930137523664149e-05,
"loss": 0.5994,
"step": 4450
},
{
"epoch": 0.2992269969246114,
"grad_norm": 0.8694889545440674,
"learning_rate": 9.925221018178728e-05,
"loss": 0.6174,
"step": 4500
},
{
"epoch": 0.3025517413348849,
"grad_norm": 0.41049107909202576,
"learning_rate": 9.920138648773852e-05,
"loss": 0.5778,
"step": 4550
},
{
"epoch": 0.30587648574515836,
"grad_norm": 0.5933430790901184,
"learning_rate": 9.914890586605638e-05,
"loss": 0.5745,
"step": 4600
},
{
"epoch": 0.3092012301554318,
"grad_norm": 0.8841090798377991,
"learning_rate": 9.90947700841015e-05,
"loss": 0.6356,
"step": 4650
},
{
"epoch": 0.31252597456570524,
"grad_norm": 0.6952012181282043,
"learning_rate": 9.903898096497441e-05,
"loss": 0.6593,
"step": 4700
},
{
"epoch": 0.3158507189759787,
"grad_norm": 0.43332594633102417,
"learning_rate": 9.898154038745408e-05,
"loss": 0.637,
"step": 4750
},
{
"epoch": 0.3191754633862522,
"grad_norm": 0.5256862640380859,
"learning_rate": 9.892245028593483e-05,
"loss": 0.6007,
"step": 4800
},
{
"epoch": 0.32250020779652566,
"grad_norm": 0.643079400062561,
"learning_rate": 9.886171265036102e-05,
"loss": 0.5526,
"step": 4850
},
{
"epoch": 0.3258249522067991,
"grad_norm": 0.9413782954216003,
"learning_rate": 9.879932952616009e-05,
"loss": 0.5863,
"step": 4900
},
{
"epoch": 0.32914969661707255,
"grad_norm": 0.46808409690856934,
"learning_rate": 9.873530301417373e-05,
"loss": 0.579,
"step": 4950
},
{
"epoch": 0.332474441027346,
"grad_norm": 0.6770108342170715,
"learning_rate": 9.8669635270587e-05,
"loss": 0.6135,
"step": 5000
},
{
"epoch": 0.3357991854376195,
"grad_norm": 0.443143367767334,
"learning_rate": 9.860232850685589e-05,
"loss": 0.5849,
"step": 5050
},
{
"epoch": 0.33912392984789297,
"grad_norm": 0.4198506772518158,
"learning_rate": 9.853338498963272e-05,
"loss": 0.591,
"step": 5100
},
{
"epoch": 0.3424486742581664,
"grad_norm": 0.3660542666912079,
"learning_rate": 9.846280704068982e-05,
"loss": 0.6121,
"step": 5150
},
{
"epoch": 0.34577341866843986,
"grad_norm": 0.5003857016563416,
"learning_rate": 9.839059703684139e-05,
"loss": 0.5705,
"step": 5200
},
{
"epoch": 0.34909816307871333,
"grad_norm": 0.5552676916122437,
"learning_rate": 9.831675740986346e-05,
"loss": 0.633,
"step": 5250
},
{
"epoch": 0.3524229074889868,
"grad_norm": 0.2692893445491791,
"learning_rate": 9.82412906464119e-05,
"loss": 0.5706,
"step": 5300
},
{
"epoch": 0.3557476518992602,
"grad_norm": 1.03011953830719,
"learning_rate": 9.816419928793879e-05,
"loss": 0.5357,
"step": 5350
},
{
"epoch": 0.3590723963095337,
"grad_norm": 0.4879101812839508,
"learning_rate": 9.808548593060681e-05,
"loss": 0.6065,
"step": 5400
},
{
"epoch": 0.36239714071980716,
"grad_norm": 0.4781530797481537,
"learning_rate": 9.800515322520174e-05,
"loss": 0.5832,
"step": 5450
},
{
"epoch": 0.36572188513008064,
"grad_norm": 0.519418478012085,
"learning_rate": 9.792320387704328e-05,
"loss": 0.5808,
"step": 5500
},
{
"epoch": 0.3690466295403541,
"grad_norm": 0.5514370203018188,
"learning_rate": 9.783964064589387e-05,
"loss": 0.6015,
"step": 5550
},
{
"epoch": 0.3723713739506275,
"grad_norm": 0.8520498871803284,
"learning_rate": 9.775446634586584e-05,
"loss": 0.6086,
"step": 5600
},
{
"epoch": 0.375696118360901,
"grad_norm": 0.6045146584510803,
"learning_rate": 9.766768384532654e-05,
"loss": 0.5674,
"step": 5650
},
{
"epoch": 0.37902086277117447,
"grad_norm": 0.44037437438964844,
"learning_rate": 9.757929606680181e-05,
"loss": 0.5112,
"step": 5700
},
{
"epoch": 0.38234560718144794,
"grad_norm": 0.41407912969589233,
"learning_rate": 9.748930598687752e-05,
"loss": 0.6066,
"step": 5750
},
{
"epoch": 0.38567035159172136,
"grad_norm": 0.33725085854530334,
"learning_rate": 9.73977166360994e-05,
"loss": 0.5997,
"step": 5800
},
{
"epoch": 0.38899509600199483,
"grad_norm": 0.9541948437690735,
"learning_rate": 9.730453109887087e-05,
"loss": 0.5986,
"step": 5850
},
{
"epoch": 0.3923198404122683,
"grad_norm": 0.31891727447509766,
"learning_rate": 9.720975251334929e-05,
"loss": 0.5235,
"step": 5900
},
{
"epoch": 0.3956445848225418,
"grad_norm": 0.869501531124115,
"learning_rate": 9.711338407134016e-05,
"loss": 0.62,
"step": 5950
},
{
"epoch": 0.39896932923281525,
"grad_norm": 0.38659653067588806,
"learning_rate": 9.701542901818974e-05,
"loss": 0.583,
"step": 6000
},
{
"epoch": 0.40229407364308867,
"grad_norm": 0.5901491045951843,
"learning_rate": 9.691589065267568e-05,
"loss": 0.5456,
"step": 6050
},
{
"epoch": 0.40561881805336214,
"grad_norm": 0.6315745711326599,
"learning_rate": 9.681477232689596e-05,
"loss": 0.5725,
"step": 6100
},
{
"epoch": 0.4089435624636356,
"grad_norm": 0.48777422308921814,
"learning_rate": 9.671207744615598e-05,
"loss": 0.6161,
"step": 6150
},
{
"epoch": 0.4122683068739091,
"grad_norm": 0.3584806025028229,
"learning_rate": 9.660780946885397e-05,
"loss": 0.5519,
"step": 6200
},
{
"epoch": 0.4155930512841825,
"grad_norm": 0.7234945297241211,
"learning_rate": 9.650197190636438e-05,
"loss": 0.6336,
"step": 6250
},
{
"epoch": 0.418917795694456,
"grad_norm": 1.169434905052185,
"learning_rate": 9.639456832291974e-05,
"loss": 0.5666,
"step": 6300
},
{
"epoch": 0.42224254010472945,
"grad_norm": 0.5370940566062927,
"learning_rate": 9.628560233549058e-05,
"loss": 0.55,
"step": 6350
},
{
"epoch": 0.4255672845150029,
"grad_norm": 1.2353452444076538,
"learning_rate": 9.617507761366367e-05,
"loss": 0.5756,
"step": 6400
},
{
"epoch": 0.4288920289252764,
"grad_norm": 0.4101187288761139,
"learning_rate": 9.606299787951836e-05,
"loss": 0.6014,
"step": 6450
},
{
"epoch": 0.4322167733355498,
"grad_norm": 0.38137727975845337,
"learning_rate": 9.594936690750129e-05,
"loss": 0.5764,
"step": 6500
},
{
"epoch": 0.4355415177458233,
"grad_norm": 0.618617057800293,
"learning_rate": 9.583418852429933e-05,
"loss": 0.5548,
"step": 6550
},
{
"epoch": 0.43886626215609675,
"grad_norm": 0.4934926927089691,
"learning_rate": 9.571746660871058e-05,
"loss": 0.5769,
"step": 6600
},
{
"epoch": 0.4421910065663702,
"grad_norm": 0.33685383200645447,
"learning_rate": 9.559920509151386e-05,
"loss": 0.562,
"step": 6650
},
{
"epoch": 0.44551575097664364,
"grad_norm": 0.43346357345581055,
"learning_rate": 9.547940795533627e-05,
"loss": 0.5478,
"step": 6700
},
{
"epoch": 0.4488404953869171,
"grad_norm": 0.5250598192214966,
"learning_rate": 9.535807923451911e-05,
"loss": 0.5292,
"step": 6750
},
{
"epoch": 0.4521652397971906,
"grad_norm": 0.3458341062068939,
"learning_rate": 9.523522301498202e-05,
"loss": 0.608,
"step": 6800
},
{
"epoch": 0.45548998420746406,
"grad_norm": 0.7067184448242188,
"learning_rate": 9.511084343408531e-05,
"loss": 0.5555,
"step": 6850
},
{
"epoch": 0.45881472861773753,
"grad_norm": 0.3423425853252411,
"learning_rate": 9.498494468049072e-05,
"loss": 0.5309,
"step": 6900
},
{
"epoch": 0.46213947302801095,
"grad_norm": 0.4263427257537842,
"learning_rate": 9.485753099402031e-05,
"loss": 0.5725,
"step": 6950
},
{
"epoch": 0.4654642174382844,
"grad_norm": 0.3699227273464203,
"learning_rate": 9.472860666551369e-05,
"loss": 0.544,
"step": 7000
},
{
"epoch": 0.4687889618485579,
"grad_norm": 0.34789547324180603,
"learning_rate": 9.459817603668351e-05,
"loss": 0.5701,
"step": 7050
},
{
"epoch": 0.47211370625883137,
"grad_norm": 0.3918140232563019,
"learning_rate": 9.446624349996929e-05,
"loss": 0.5523,
"step": 7100
},
{
"epoch": 0.47543845066910484,
"grad_norm": 0.39375773072242737,
"learning_rate": 9.433281349838941e-05,
"loss": 0.5224,
"step": 7150
},
{
"epoch": 0.47876319507937826,
"grad_norm": 0.49626776576042175,
"learning_rate": 9.419789052539157e-05,
"loss": 0.5807,
"step": 7200
},
{
"epoch": 0.48208793948965173,
"grad_norm": 0.4889478087425232,
"learning_rate": 9.406147912470143e-05,
"loss": 0.5496,
"step": 7250
},
{
"epoch": 0.4854126838999252,
"grad_norm": 0.8686904311180115,
"learning_rate": 9.392358389016961e-05,
"loss": 0.5681,
"step": 7300
},
{
"epoch": 0.4887374283101987,
"grad_norm": 1.1051130294799805,
"learning_rate": 9.378420946561697e-05,
"loss": 0.5595,
"step": 7350
},
{
"epoch": 0.4920621727204721,
"grad_norm": 0.3066134452819824,
"learning_rate": 9.364336054467819e-05,
"loss": 0.5436,
"step": 7400
},
{
"epoch": 0.49538691713074556,
"grad_norm": 0.5851086378097534,
"learning_rate": 9.350104187064379e-05,
"loss": 0.5452,
"step": 7450
},
{
"epoch": 0.49871166154101904,
"grad_norm": 0.5407485365867615,
"learning_rate": 9.335725823630035e-05,
"loss": 0.5603,
"step": 7500
},
{
"epoch": 0.5020364059512925,
"grad_norm": 0.4865974187850952,
"learning_rate": 9.321201448376904e-05,
"loss": 0.517,
"step": 7550
},
{
"epoch": 0.5053611503615659,
"grad_norm": 0.7569780945777893,
"learning_rate": 9.306531550434268e-05,
"loss": 0.5428,
"step": 7600
},
{
"epoch": 0.5086858947718395,
"grad_norm": 0.3547607958316803,
"learning_rate": 9.291716623832091e-05,
"loss": 0.5486,
"step": 7650
},
{
"epoch": 0.5120106391821129,
"grad_norm": 0.9511945843696594,
"learning_rate": 9.276757167484389e-05,
"loss": 0.5383,
"step": 7700
},
{
"epoch": 0.5153353835923863,
"grad_norm": 0.5263503789901733,
"learning_rate": 9.261653685172422e-05,
"loss": 0.6017,
"step": 7750
},
{
"epoch": 0.5186601280026598,
"grad_norm": 0.4225033223628998,
"learning_rate": 9.246406685527739e-05,
"loss": 0.5711,
"step": 7800
},
{
"epoch": 0.5219848724129332,
"grad_norm": 0.523815393447876,
"learning_rate": 9.231016682015035e-05,
"loss": 0.5859,
"step": 7850
},
{
"epoch": 0.5253096168232068,
"grad_norm": 0.48155075311660767,
"learning_rate": 9.21548419291487e-05,
"loss": 0.5151,
"step": 7900
},
{
"epoch": 0.5286343612334802,
"grad_norm": 0.7636247873306274,
"learning_rate": 9.19980974130621e-05,
"loss": 0.5288,
"step": 7950
},
{
"epoch": 0.5319591056437536,
"grad_norm": 0.3996843993663788,
"learning_rate": 9.183993855048811e-05,
"loss": 0.554,
"step": 8000
},
{
"epoch": 0.5352838500540271,
"grad_norm": 0.49857622385025024,
"learning_rate": 9.168037066765453e-05,
"loss": 0.5566,
"step": 8050
},
{
"epoch": 0.5386085944643005,
"grad_norm": 0.5007392764091492,
"learning_rate": 9.151939913823988e-05,
"loss": 0.5464,
"step": 8100
},
{
"epoch": 0.5419333388745741,
"grad_norm": 0.4842822551727295,
"learning_rate": 9.135702938319251e-05,
"loss": 0.5381,
"step": 8150
},
{
"epoch": 0.5452580832848475,
"grad_norm": 0.645003616809845,
"learning_rate": 9.119326687054802e-05,
"loss": 0.528,
"step": 8200
},
{
"epoch": 0.5485828276951209,
"grad_norm": 0.5707802176475525,
"learning_rate": 9.102811711524519e-05,
"loss": 0.5613,
"step": 8250
},
{
"epoch": 0.5519075721053944,
"grad_norm": 0.29192325472831726,
"learning_rate": 9.086158567894013e-05,
"loss": 0.5576,
"step": 8300
},
{
"epoch": 0.5552323165156678,
"grad_norm": 0.3091285228729248,
"learning_rate": 9.069367816981911e-05,
"loss": 0.54,
"step": 8350
},
{
"epoch": 0.5585570609259414,
"grad_norm": 0.4691781997680664,
"learning_rate": 9.052440024240956e-05,
"loss": 0.4902,
"step": 8400
},
{
"epoch": 0.5618818053362148,
"grad_norm": 0.38676175475120544,
"learning_rate": 9.03537575973898e-05,
"loss": 0.509,
"step": 8450
},
{
"epoch": 0.5652065497464882,
"grad_norm": 0.38752368092536926,
"learning_rate": 9.018175598139696e-05,
"loss": 0.5154,
"step": 8500
},
{
"epoch": 0.5685312941567617,
"grad_norm": 0.5562826991081238,
"learning_rate": 9.000840118683344e-05,
"loss": 0.535,
"step": 8550
},
{
"epoch": 0.5718560385670352,
"grad_norm": 0.8270835280418396,
"learning_rate": 8.983369905167191e-05,
"loss": 0.4827,
"step": 8600
},
{
"epoch": 0.5751807829773086,
"grad_norm": 0.4083782434463501,
"learning_rate": 8.965765545925869e-05,
"loss": 0.5161,
"step": 8650
},
{
"epoch": 0.5785055273875821,
"grad_norm": 0.47276222705841064,
"learning_rate": 8.948027633811557e-05,
"loss": 0.5239,
"step": 8700
},
{
"epoch": 0.5818302717978555,
"grad_norm": 0.6050196886062622,
"learning_rate": 8.930156766174025e-05,
"loss": 0.5577,
"step": 8750
},
{
"epoch": 0.585155016208129,
"grad_norm": 0.8670181632041931,
"learning_rate": 8.912153544840507e-05,
"loss": 0.5693,
"step": 8800
},
{
"epoch": 0.5884797606184025,
"grad_norm": 0.8589004874229431,
"learning_rate": 8.894018576095439e-05,
"loss": 0.4972,
"step": 8850
},
{
"epoch": 0.5918045050286759,
"grad_norm": 0.47463271021842957,
"learning_rate": 8.875752470660043e-05,
"loss": 0.5021,
"step": 8900
},
{
"epoch": 0.5951292494389494,
"grad_norm": 0.27892622351646423,
"learning_rate": 8.857355843671757e-05,
"loss": 0.5546,
"step": 8950
},
{
"epoch": 0.5984539938492228,
"grad_norm": 0.5175593495368958,
"learning_rate": 8.838829314663522e-05,
"loss": 0.5434,
"step": 9000
},
{
"epoch": 0.6017787382594963,
"grad_norm": 0.6045388579368591,
"learning_rate": 8.820173507542915e-05,
"loss": 0.5041,
"step": 9050
},
{
"epoch": 0.6051034826697698,
"grad_norm": 0.39441245794296265,
"learning_rate": 8.80138905057114e-05,
"loss": 0.5354,
"step": 9100
},
{
"epoch": 0.6084282270800432,
"grad_norm": 0.6685227751731873,
"learning_rate": 8.782476576341873e-05,
"loss": 0.5127,
"step": 9150
},
{
"epoch": 0.6117529714903167,
"grad_norm": 0.9093782305717468,
"learning_rate": 8.763436721759952e-05,
"loss": 0.4883,
"step": 9200
},
{
"epoch": 0.6150777159005901,
"grad_norm": 0.4950058162212372,
"learning_rate": 8.744270128019934e-05,
"loss": 0.4566,
"step": 9250
},
{
"epoch": 0.6184024603108637,
"grad_norm": 0.649726927280426,
"learning_rate": 8.724977440584497e-05,
"loss": 0.5758,
"step": 9300
},
{
"epoch": 0.6217272047211371,
"grad_norm": 0.6150277256965637,
"learning_rate": 8.705559309162712e-05,
"loss": 0.5346,
"step": 9350
},
{
"epoch": 0.6250519491314105,
"grad_norm": 0.35310274362564087,
"learning_rate": 8.686016387688153e-05,
"loss": 0.5128,
"step": 9400
},
{
"epoch": 0.628376693541684,
"grad_norm": 0.47013986110687256,
"learning_rate": 8.666349334296877e-05,
"loss": 0.4906,
"step": 9450
},
{
"epoch": 0.6317014379519574,
"grad_norm": 0.30959010124206543,
"learning_rate": 8.646558811305268e-05,
"loss": 0.5378,
"step": 9500
},
{
"epoch": 0.6350261823622309,
"grad_norm": 0.337326318025589,
"learning_rate": 8.626645485187722e-05,
"loss": 0.5361,
"step": 9550
},
{
"epoch": 0.6383509267725044,
"grad_norm": 0.6151895523071289,
"learning_rate": 8.60661002655421e-05,
"loss": 0.509,
"step": 9600
},
{
"epoch": 0.6416756711827778,
"grad_norm": 0.355437308549881,
"learning_rate": 8.586453110127688e-05,
"loss": 0.536,
"step": 9650
},
{
"epoch": 0.6450004155930513,
"grad_norm": 0.4256291687488556,
"learning_rate": 8.566175414721384e-05,
"loss": 0.4913,
"step": 9700
},
{
"epoch": 0.6483251600033247,
"grad_norm": 0.5116075277328491,
"learning_rate": 8.545777623215927e-05,
"loss": 0.5193,
"step": 9750
},
{
"epoch": 0.6516499044135982,
"grad_norm": 0.5095045566558838,
"learning_rate": 8.525260422536358e-05,
"loss": 0.5351,
"step": 9800
},
{
"epoch": 0.6549746488238717,
"grad_norm": 0.41416656970977783,
"learning_rate": 8.504624503628995e-05,
"loss": 0.5594,
"step": 9850
},
{
"epoch": 0.6582993932341451,
"grad_norm": 0.2669268846511841,
"learning_rate": 8.483870561438161e-05,
"loss": 0.5219,
"step": 9900
},
{
"epoch": 0.6616241376444186,
"grad_norm": 0.576519250869751,
"learning_rate": 8.462999294882783e-05,
"loss": 0.5606,
"step": 9950
},
{
"epoch": 0.664948882054692,
"grad_norm": 0.8234946727752686,
"learning_rate": 8.442011406832859e-05,
"loss": 0.5011,
"step": 10000
},
{
"epoch": 0.664948882054692,
"eval_loss": 0.4348411560058594,
"eval_runtime": 4362.332,
"eval_samples_per_second": 1.532,
"eval_steps_per_second": 1.532,
"step": 10000
},
{
"epoch": 0.6682736264649655,
"grad_norm": 0.11554688215255737,
"learning_rate": 8.420907604085781e-05,
"loss": 0.5451,
"step": 10050
},
{
"epoch": 0.671598370875239,
"grad_norm": 0.3037506937980652,
"learning_rate": 8.399688597342535e-05,
"loss": 0.5388,
"step": 10100
},
{
"epoch": 0.6749231152855124,
"grad_norm": 0.32672300934791565,
"learning_rate": 8.378355101183769e-05,
"loss": 0.5205,
"step": 10150
},
{
"epoch": 0.6782478596957859,
"grad_norm": 0.7825640439987183,
"learning_rate": 8.356907834045726e-05,
"loss": 0.5349,
"step": 10200
},
{
"epoch": 0.6815726041060594,
"grad_norm": 0.43441250920295715,
"learning_rate": 8.335347518196052e-05,
"loss": 0.4955,
"step": 10250
},
{
"epoch": 0.6848973485163328,
"grad_norm": 0.48924630880355835,
"learning_rate": 8.313674879709475e-05,
"loss": 0.5571,
"step": 10300
},
{
"epoch": 0.6882220929266063,
"grad_norm": 0.574004590511322,
"learning_rate": 8.29189064844334e-05,
"loss": 0.5335,
"step": 10350
},
{
"epoch": 0.6915468373368797,
"grad_norm": 0.46639284491539,
"learning_rate": 8.269995558013049e-05,
"loss": 0.5151,
"step": 10400
},
{
"epoch": 0.6948715817471531,
"grad_norm": 0.9258661866188049,
"learning_rate": 8.24799034576734e-05,
"loss": 0.4983,
"step": 10450
},
{
"epoch": 0.6981963261574267,
"grad_norm": 0.2828778922557831,
"learning_rate": 8.225875752763468e-05,
"loss": 0.5027,
"step": 10500
},
{
"epoch": 0.7015210705677001,
"grad_norm": 0.40603527426719666,
"learning_rate": 8.203652523742237e-05,
"loss": 0.4735,
"step": 10550
},
{
"epoch": 0.7048458149779736,
"grad_norm": 0.6398336291313171,
"learning_rate": 8.181321407102929e-05,
"loss": 0.4945,
"step": 10600
},
{
"epoch": 0.708170559388247,
"grad_norm": 0.5521181225776672,
"learning_rate": 8.158883154878094e-05,
"loss": 0.5094,
"step": 10650
},
{
"epoch": 0.7114953037985204,
"grad_norm": 0.41034767031669617,
"learning_rate": 8.136338522708233e-05,
"loss": 0.5064,
"step": 10700
},
{
"epoch": 0.714820048208794,
"grad_norm": 0.34174844622612,
"learning_rate": 8.11368826981634e-05,
"loss": 0.4934,
"step": 10750
},
{
"epoch": 0.7181447926190674,
"grad_norm": 0.43941041827201843,
"learning_rate": 8.090933158982338e-05,
"loss": 0.5097,
"step": 10800
},
{
"epoch": 0.7214695370293409,
"grad_norm": 0.7018864154815674,
"learning_rate": 8.068073956517397e-05,
"loss": 0.4923,
"step": 10850
},
{
"epoch": 0.7247942814396143,
"grad_norm": 0.6354297995567322,
"learning_rate": 8.045111432238121e-05,
"loss": 0.4611,
"step": 10900
},
{
"epoch": 0.7281190258498877,
"grad_norm": 0.49247485399246216,
"learning_rate": 8.022046359440623e-05,
"loss": 0.5119,
"step": 10950
},
{
"epoch": 0.7314437702601613,
"grad_norm": 0.7940396666526794,
"learning_rate": 7.998879514874491e-05,
"loss": 0.5359,
"step": 11000
},
{
"epoch": 0.7347685146704347,
"grad_norm": 0.35591453313827515,
"learning_rate": 7.975611678716615e-05,
"loss": 0.532,
"step": 11050
},
{
"epoch": 0.7380932590807082,
"grad_norm": 0.3358200490474701,
"learning_rate": 7.952243634544929e-05,
"loss": 0.4793,
"step": 11100
},
{
"epoch": 0.7414180034909816,
"grad_norm": 0.6496360898017883,
"learning_rate": 7.928776169312016e-05,
"loss": 0.4981,
"step": 11150
},
{
"epoch": 0.744742747901255,
"grad_norm": 0.646221935749054,
"learning_rate": 7.905210073318605e-05,
"loss": 0.4578,
"step": 11200
},
{
"epoch": 0.7480674923115286,
"grad_norm": 1.0316184759140015,
"learning_rate": 7.881546140186958e-05,
"loss": 0.5101,
"step": 11250
},
{
"epoch": 0.751392236721802,
"grad_norm": 0.6004906296730042,
"learning_rate": 7.857785166834144e-05,
"loss": 0.4905,
"step": 11300
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.3192722797393799,
"learning_rate": 7.833927953445202e-05,
"loss": 0.4868,
"step": 11350
},
{
"epoch": 0.7580417255423489,
"grad_norm": 0.591340959072113,
"learning_rate": 7.809975303446195e-05,
"loss": 0.5107,
"step": 11400
},
{
"epoch": 0.7613664699526224,
"grad_norm": 1.268475890159607,
"learning_rate": 7.785928023477142e-05,
"loss": 0.4964,
"step": 11450
},
{
"epoch": 0.7646912143628959,
"grad_norm": 0.5102896690368652,
"learning_rate": 7.761786923364878e-05,
"loss": 0.5179,
"step": 11500
},
{
"epoch": 0.7680159587731693,
"grad_norm": 0.6084123849868774,
"learning_rate": 7.737552816095754e-05,
"loss": 0.4624,
"step": 11550
},
{
"epoch": 0.7713407031834427,
"grad_norm": 0.668404221534729,
"learning_rate": 7.713226517788275e-05,
"loss": 0.4917,
"step": 11600
},
{
"epoch": 0.7746654475937162,
"grad_norm": 0.6722292900085449,
"learning_rate": 7.688808847665612e-05,
"loss": 0.523,
"step": 11650
},
{
"epoch": 0.7779901920039897,
"grad_norm": 0.7151288390159607,
"learning_rate": 7.664300628028017e-05,
"loss": 0.5019,
"step": 11700
},
{
"epoch": 0.7813149364142632,
"grad_norm": 0.5944408178329468,
"learning_rate": 7.639702684225123e-05,
"loss": 0.5049,
"step": 11750
},
{
"epoch": 0.7846396808245366,
"grad_norm": 0.6515059471130371,
"learning_rate": 7.615015844628157e-05,
"loss": 0.4823,
"step": 11800
},
{
"epoch": 0.78796442523481,
"grad_norm": 0.2709754705429077,
"learning_rate": 7.590240940602036e-05,
"loss": 0.4591,
"step": 11850
},
{
"epoch": 0.7912891696450836,
"grad_norm": 0.42835110425949097,
"learning_rate": 7.565378806477377e-05,
"loss": 0.4851,
"step": 11900
},
{
"epoch": 0.794613914055357,
"grad_norm": 0.4259116053581238,
"learning_rate": 7.540430279522395e-05,
"loss": 0.4735,
"step": 11950
},
{
"epoch": 0.7979386584656305,
"grad_norm": 0.8144826292991638,
"learning_rate": 7.515396199914708e-05,
"loss": 0.4941,
"step": 12000
},
{
"epoch": 0.8012634028759039,
"grad_norm": 0.6449247002601624,
"learning_rate": 7.490277410713044e-05,
"loss": 0.4753,
"step": 12050
},
{
"epoch": 0.8045881472861773,
"grad_norm": 0.45336267352104187,
"learning_rate": 7.46507475782885e-05,
"loss": 0.5152,
"step": 12100
},
{
"epoch": 0.8079128916964509,
"grad_norm": 0.4385122060775757,
"learning_rate": 7.439789089997796e-05,
"loss": 0.523,
"step": 12150
},
{
"epoch": 0.8112376361067243,
"grad_norm": 0.36548131704330444,
"learning_rate": 7.414421258751212e-05,
"loss": 0.4939,
"step": 12200
},
{
"epoch": 0.8145623805169978,
"grad_norm": 0.5231236219406128,
"learning_rate": 7.38897211838739e-05,
"loss": 0.4787,
"step": 12250
},
{
"epoch": 0.8178871249272712,
"grad_norm": 0.4674762785434723,
"learning_rate": 7.363442525942826e-05,
"loss": 0.4973,
"step": 12300
},
{
"epoch": 0.8212118693375446,
"grad_norm": 0.4448126554489136,
"learning_rate": 7.337833341163358e-05,
"loss": 0.51,
"step": 12350
},
{
"epoch": 0.8245366137478182,
"grad_norm": 0.3027734160423279,
"learning_rate": 7.31214542647521e-05,
"loss": 0.5298,
"step": 12400
},
{
"epoch": 0.8278613581580916,
"grad_norm": 0.7488745450973511,
"learning_rate": 7.286379646955946e-05,
"loss": 0.492,
"step": 12450
},
{
"epoch": 0.831186102568365,
"grad_norm": 0.7628294229507446,
"learning_rate": 7.260536870305347e-05,
"loss": 0.4854,
"step": 12500
},
{
"epoch": 0.8345108469786385,
"grad_norm": 0.5619131326675415,
"learning_rate": 7.234617966816174e-05,
"loss": 0.4439,
"step": 12550
},
{
"epoch": 0.837835591388912,
"grad_norm": 0.2590934634208679,
"learning_rate": 7.208623809344879e-05,
"loss": 0.4949,
"step": 12600
},
{
"epoch": 0.8411603357991855,
"grad_norm": 0.47846803069114685,
"learning_rate": 7.182555273282193e-05,
"loss": 0.5091,
"step": 12650
},
{
"epoch": 0.8444850802094589,
"grad_norm": 0.5931444764137268,
"learning_rate": 7.156413236523656e-05,
"loss": 0.4732,
"step": 12700
},
{
"epoch": 0.8478098246197323,
"grad_norm": 0.4458071291446686,
"learning_rate": 7.130198579440052e-05,
"loss": 0.4639,
"step": 12750
},
{
"epoch": 0.8511345690300058,
"grad_norm": 0.35852745175361633,
"learning_rate": 7.103912184847757e-05,
"loss": 0.4818,
"step": 12800
},
{
"epoch": 0.8544593134402793,
"grad_norm": 0.496389776468277,
"learning_rate": 7.07755493797901e-05,
"loss": 0.4424,
"step": 12850
},
{
"epoch": 0.8577840578505528,
"grad_norm": 0.5191590189933777,
"learning_rate": 7.051127726452102e-05,
"loss": 0.4551,
"step": 12900
},
{
"epoch": 0.8611088022608262,
"grad_norm": 0.4241909682750702,
"learning_rate": 7.024631440241491e-05,
"loss": 0.4496,
"step": 12950
},
{
"epoch": 0.8644335466710996,
"grad_norm": 0.5711332559585571,
"learning_rate": 6.998066971647817e-05,
"loss": 0.49,
"step": 13000
},
{
"epoch": 0.8677582910813731,
"grad_norm": 0.29857337474823,
"learning_rate": 6.971435215267866e-05,
"loss": 0.4822,
"step": 13050
},
{
"epoch": 0.8710830354916466,
"grad_norm": 0.40240055322647095,
"learning_rate": 6.944737067964429e-05,
"loss": 0.4665,
"step": 13100
},
{
"epoch": 0.8744077799019201,
"grad_norm": 0.3824640214443207,
"learning_rate": 6.917973428836118e-05,
"loss": 0.4762,
"step": 13150
},
{
"epoch": 0.8777325243121935,
"grad_norm": 0.5857983231544495,
"learning_rate": 6.891145199187065e-05,
"loss": 0.4301,
"step": 13200
},
{
"epoch": 0.8810572687224669,
"grad_norm": 0.43042436242103577,
"learning_rate": 6.864253282496595e-05,
"loss": 0.5202,
"step": 13250
},
{
"epoch": 0.8843820131327405,
"grad_norm": 0.47307106852531433,
"learning_rate": 6.837298584388771e-05,
"loss": 0.4664,
"step": 13300
},
{
"epoch": 0.8877067575430139,
"grad_norm": 0.6773985624313354,
"learning_rate": 6.810282012601923e-05,
"loss": 0.4748,
"step": 13350
},
{
"epoch": 0.8910315019532873,
"grad_norm": 0.35346561670303345,
"learning_rate": 6.783204476958058e-05,
"loss": 0.4798,
"step": 13400
},
{
"epoch": 0.8943562463635608,
"grad_norm": 0.3380667269229889,
"learning_rate": 6.75606688933223e-05,
"loss": 0.5189,
"step": 13450
},
{
"epoch": 0.8976809907738342,
"grad_norm": 0.7384742498397827,
"learning_rate": 6.728870163621836e-05,
"loss": 0.4913,
"step": 13500
},
{
"epoch": 0.9010057351841078,
"grad_norm": 0.4812994599342346,
"learning_rate": 6.701615215715829e-05,
"loss": 0.453,
"step": 13550
},
{
"epoch": 0.9043304795943812,
"grad_norm": 0.3846656382083893,
"learning_rate": 6.674302963463876e-05,
"loss": 0.459,
"step": 13600
},
{
"epoch": 0.9076552240046546,
"grad_norm": 0.49682462215423584,
"learning_rate": 6.646934326645452e-05,
"loss": 0.5119,
"step": 13650
},
{
"epoch": 0.9109799684149281,
"grad_norm": 0.2614552974700928,
"learning_rate": 6.61951022693887e-05,
"loss": 0.4414,
"step": 13700
},
{
"epoch": 0.9143047128252015,
"grad_norm": 0.4591997563838959,
"learning_rate": 6.592031587890224e-05,
"loss": 0.5148,
"step": 13750
},
{
"epoch": 0.9176294572354751,
"grad_norm": 0.4885028600692749,
"learning_rate": 6.564499334882312e-05,
"loss": 0.4998,
"step": 13800
},
{
"epoch": 0.9209542016457485,
"grad_norm": 0.7544069290161133,
"learning_rate": 6.536914395103446e-05,
"loss": 0.5493,
"step": 13850
},
{
"epoch": 0.9242789460560219,
"grad_norm": 0.6468596458435059,
"learning_rate": 6.509277697516255e-05,
"loss": 0.4527,
"step": 13900
},
{
"epoch": 0.9276036904662954,
"grad_norm": 0.6907210350036621,
"learning_rate": 6.481590172826371e-05,
"loss": 0.4917,
"step": 13950
},
{
"epoch": 0.9309284348765688,
"grad_norm": 0.5351805090904236,
"learning_rate": 6.453852753451119e-05,
"loss": 0.5103,
"step": 14000
},
{
"epoch": 0.9342531792868424,
"grad_norm": 0.7480131983757019,
"learning_rate": 6.426066373488084e-05,
"loss": 0.4945,
"step": 14050
},
{
"epoch": 0.9375779236971158,
"grad_norm": 0.30603644251823425,
"learning_rate": 6.39823196868368e-05,
"loss": 0.5453,
"step": 14100
},
{
"epoch": 0.9409026681073892,
"grad_norm": 0.7746521830558777,
"learning_rate": 6.370350476401624e-05,
"loss": 0.4735,
"step": 14150
},
{
"epoch": 0.9442274125176627,
"grad_norm": 1.2858827114105225,
"learning_rate": 6.342422835591368e-05,
"loss": 0.5343,
"step": 14200
},
{
"epoch": 0.9475521569279362,
"grad_norm": 0.45832210779190063,
"learning_rate": 6.314449986756489e-05,
"loss": 0.5078,
"step": 14250
},
{
"epoch": 0.9508769013382097,
"grad_norm": 0.5275512337684631,
"learning_rate": 6.286432871923e-05,
"loss": 0.4985,
"step": 14300
},
{
"epoch": 0.9542016457484831,
"grad_norm": 0.25901734828948975,
"learning_rate": 6.258372434607645e-05,
"loss": 0.4495,
"step": 14350
},
{
"epoch": 0.9575263901587565,
"grad_norm": 0.5617000460624695,
"learning_rate": 6.230269619786111e-05,
"loss": 0.4606,
"step": 14400
},
{
"epoch": 0.96085113456903,
"grad_norm": 0.7701844573020935,
"learning_rate": 6.202125373861207e-05,
"loss": 0.4699,
"step": 14450
},
{
"epoch": 0.9641758789793035,
"grad_norm": 0.33021387457847595,
"learning_rate": 6.173940644630996e-05,
"loss": 0.4802,
"step": 14500
},
{
"epoch": 0.9675006233895769,
"grad_norm": 0.40716060996055603,
"learning_rate": 6.145716381256873e-05,
"loss": 0.4557,
"step": 14550
},
{
"epoch": 0.9708253677998504,
"grad_norm": 0.49702873826026917,
"learning_rate": 6.117453534231606e-05,
"loss": 0.4472,
"step": 14600
},
{
"epoch": 0.9741501122101238,
"grad_norm": 0.4306732714176178,
"learning_rate": 6.0891530553473195e-05,
"loss": 0.4498,
"step": 14650
},
{
"epoch": 0.9774748566203973,
"grad_norm": 0.43764013051986694,
"learning_rate": 6.060815897663447e-05,
"loss": 0.4741,
"step": 14700
},
{
"epoch": 0.9807996010306708,
"grad_norm": 0.6434122323989868,
"learning_rate": 6.0324430154746316e-05,
"loss": 0.4821,
"step": 14750
},
{
"epoch": 0.9841243454409442,
"grad_norm": 0.5440848469734192,
"learning_rate": 6.004035364278593e-05,
"loss": 0.4687,
"step": 14800
},
{
"epoch": 0.9874490898512177,
"grad_norm": 0.7061350345611572,
"learning_rate": 5.9755939007439445e-05,
"loss": 0.427,
"step": 14850
},
{
"epoch": 0.9907738342614911,
"grad_norm": 1.1848245859146118,
"learning_rate": 5.9471195826779834e-05,
"loss": 0.4594,
"step": 14900
},
{
"epoch": 0.9940985786717647,
"grad_norm": 0.5134018659591675,
"learning_rate": 5.918613368994423e-05,
"loss": 0.482,
"step": 14950
},
{
"epoch": 0.9974233230820381,
"grad_norm": 0.4567798674106598,
"learning_rate": 5.8900762196811175e-05,
"loss": 0.4322,
"step": 15000
},
{
"epoch": 1.0007480674923115,
"grad_norm": 0.49805304408073425,
"learning_rate": 5.861509095767714e-05,
"loss": 0.4441,
"step": 15050
},
{
"epoch": 1.004072811902585,
"grad_norm": 0.7849738597869873,
"learning_rate": 5.832912959293304e-05,
"loss": 0.4052,
"step": 15100
},
{
"epoch": 1.0073975563128585,
"grad_norm": 0.6468079686164856,
"learning_rate": 5.804288773274011e-05,
"loss": 0.4061,
"step": 15150
},
{
"epoch": 1.0107223007231319,
"grad_norm": 0.3043822944164276,
"learning_rate": 5.775637501670579e-05,
"loss": 0.3922,
"step": 15200
},
{
"epoch": 1.0140470451334054,
"grad_norm": 0.3926864266395569,
"learning_rate": 5.7469601093558854e-05,
"loss": 0.444,
"step": 15250
},
{
"epoch": 1.017371789543679,
"grad_norm": 0.2536259889602661,
"learning_rate": 5.718257562082471e-05,
"loss": 0.4149,
"step": 15300
},
{
"epoch": 1.0206965339539522,
"grad_norm": 0.9208493232727051,
"learning_rate": 5.689530826449997e-05,
"loss": 0.3901,
"step": 15350
},
{
"epoch": 1.0240212783642257,
"grad_norm": 0.34819620847702026,
"learning_rate": 5.660780869872711e-05,
"loss": 0.4268,
"step": 15400
},
{
"epoch": 1.0273460227744993,
"grad_norm": 1.2284760475158691,
"learning_rate": 5.632008660546853e-05,
"loss": 0.4328,
"step": 15450
},
{
"epoch": 1.0306707671847726,
"grad_norm": 0.3599700629711151,
"learning_rate": 5.6032151674180575e-05,
"loss": 0.3798,
"step": 15500
},
{
"epoch": 1.033995511595046,
"grad_norm": 0.31814679503440857,
"learning_rate": 5.574401360148727e-05,
"loss": 0.4215,
"step": 15550
},
{
"epoch": 1.0373202560053196,
"grad_norm": 0.6101735234260559,
"learning_rate": 5.5455682090853624e-05,
"loss": 0.4242,
"step": 15600
},
{
"epoch": 1.0406450004155932,
"grad_norm": 0.8657062649726868,
"learning_rate": 5.5167166852259055e-05,
"loss": 0.4163,
"step": 15650
},
{
"epoch": 1.0439697448258665,
"grad_norm": 0.4173007309436798,
"learning_rate": 5.4878477601870194e-05,
"loss": 0.3637,
"step": 15700
},
{
"epoch": 1.04729448923614,
"grad_norm": 0.13609103858470917,
"learning_rate": 5.458962406171384e-05,
"loss": 0.4117,
"step": 15750
},
{
"epoch": 1.0506192336464135,
"grad_norm": 0.4456997513771057,
"learning_rate": 5.430061595934941e-05,
"loss": 0.4177,
"step": 15800
},
{
"epoch": 1.0539439780566868,
"grad_norm": 0.549238383769989,
"learning_rate": 5.401146302754153e-05,
"loss": 0.4446,
"step": 15850
},
{
"epoch": 1.0572687224669604,
"grad_norm": 0.4045184850692749,
"learning_rate": 5.372217500393205e-05,
"loss": 0.4097,
"step": 15900
},
{
"epoch": 1.0605934668772339,
"grad_norm": 0.3764871060848236,
"learning_rate": 5.3432761630712335e-05,
"loss": 0.378,
"step": 15950
},
{
"epoch": 1.0639182112875072,
"grad_norm": 0.46142441034317017,
"learning_rate": 5.314323265429501e-05,
"loss": 0.373,
"step": 16000
},
{
"epoch": 1.0672429556977807,
"grad_norm": 0.671380341053009,
"learning_rate": 5.285359782498582e-05,
"loss": 0.4159,
"step": 16050
},
{
"epoch": 1.0705677001080542,
"grad_norm": 0.5095057487487793,
"learning_rate": 5.2563866896655275e-05,
"loss": 0.4017,
"step": 16100
},
{
"epoch": 1.0738924445183278,
"grad_norm": 0.8826057314872742,
"learning_rate": 5.227404962641016e-05,
"loss": 0.4627,
"step": 16150
},
{
"epoch": 1.077217188928601,
"grad_norm": 0.740010142326355,
"learning_rate": 5.198415577426493e-05,
"loss": 0.4073,
"step": 16200
},
{
"epoch": 1.0805419333388746,
"grad_norm": 0.4958501160144806,
"learning_rate": 5.1694195102813046e-05,
"loss": 0.4024,
"step": 16250
},
{
"epoch": 1.0838666777491481,
"grad_norm": 0.8271908760070801,
"learning_rate": 5.140417737689822e-05,
"loss": 0.4322,
"step": 16300
},
{
"epoch": 1.0871914221594214,
"grad_norm": 0.43836158514022827,
"learning_rate": 5.111411236328555e-05,
"loss": 0.3967,
"step": 16350
},
{
"epoch": 1.090516166569695,
"grad_norm": 0.6598096489906311,
"learning_rate": 5.0824009830332606e-05,
"loss": 0.4123,
"step": 16400
},
{
"epoch": 1.0938409109799685,
"grad_norm": 0.5519174337387085,
"learning_rate": 5.053387954766049e-05,
"loss": 0.398,
"step": 16450
},
{
"epoch": 1.0971656553902418,
"grad_norm": 0.4203943610191345,
"learning_rate": 5.02437312858248e-05,
"loss": 0.4162,
"step": 16500
},
{
"epoch": 1.1004903998005153,
"grad_norm": 1.1314325332641602,
"learning_rate": 4.995357481598663e-05,
"loss": 0.3965,
"step": 16550
},
{
"epoch": 1.1038151442107889,
"grad_norm": 0.35250428318977356,
"learning_rate": 4.966341990958347e-05,
"loss": 0.4073,
"step": 16600
},
{
"epoch": 1.1071398886210622,
"grad_norm": 0.3103015720844269,
"learning_rate": 4.937327633800018e-05,
"loss": 0.4228,
"step": 16650
},
{
"epoch": 1.1104646330313357,
"grad_norm": 0.7171920537948608,
"learning_rate": 4.908315387223985e-05,
"loss": 0.3919,
"step": 16700
},
{
"epoch": 1.1137893774416092,
"grad_norm": 0.4805893003940582,
"learning_rate": 4.87930622825949e-05,
"loss": 0.3981,
"step": 16750
},
{
"epoch": 1.1171141218518827,
"grad_norm": 0.5666382312774658,
"learning_rate": 4.850301133831786e-05,
"loss": 0.4203,
"step": 16800
},
{
"epoch": 1.120438866262156,
"grad_norm": 0.18436360359191895,
"learning_rate": 4.821301080729249e-05,
"loss": 0.4215,
"step": 16850
},
{
"epoch": 1.1237636106724296,
"grad_norm": 0.4461723566055298,
"learning_rate": 4.792307045570486e-05,
"loss": 0.4055,
"step": 16900
},
{
"epoch": 1.127088355082703,
"grad_norm": 0.3168890178203583,
"learning_rate": 4.7633200047714345e-05,
"loss": 0.3586,
"step": 16950
},
{
"epoch": 1.1304130994929764,
"grad_norm": 0.2920137941837311,
"learning_rate": 4.734340934512492e-05,
"loss": 0.4116,
"step": 17000
},
{
"epoch": 1.13373784390325,
"grad_norm": 0.40534549951553345,
"learning_rate": 4.70537081070563e-05,
"loss": 0.4024,
"step": 17050
},
{
"epoch": 1.1370625883135235,
"grad_norm": 0.5520055294036865,
"learning_rate": 4.6764106089615454e-05,
"loss": 0.4162,
"step": 17100
},
{
"epoch": 1.1403873327237968,
"grad_norm": 0.3157692849636078,
"learning_rate": 4.647461304556787e-05,
"loss": 0.3925,
"step": 17150
},
{
"epoch": 1.1437120771340703,
"grad_norm": 0.8194869756698608,
"learning_rate": 4.618523872400921e-05,
"loss": 0.4147,
"step": 17200
},
{
"epoch": 1.1470368215443438,
"grad_norm": 0.3826686441898346,
"learning_rate": 4.589599287003703e-05,
"loss": 0.4036,
"step": 17250
},
{
"epoch": 1.1503615659546171,
"grad_norm": 0.8618173599243164,
"learning_rate": 4.56068852244225e-05,
"loss": 0.4285,
"step": 17300
},
{
"epoch": 1.1536863103648907,
"grad_norm": 0.996113657951355,
"learning_rate": 4.5317925523282464e-05,
"loss": 0.3751,
"step": 17350
},
{
"epoch": 1.1570110547751642,
"grad_norm": 0.5433736443519592,
"learning_rate": 4.5029123497751514e-05,
"loss": 0.408,
"step": 17400
},
{
"epoch": 1.1603357991854377,
"grad_norm": 0.6233689188957214,
"learning_rate": 4.474048887365426e-05,
"loss": 0.4105,
"step": 17450
},
{
"epoch": 1.163660543595711,
"grad_norm": 0.6037063002586365,
"learning_rate": 4.445203137117788e-05,
"loss": 0.3618,
"step": 17500
},
{
"epoch": 1.1669852880059846,
"grad_norm": 0.8507609367370605,
"learning_rate": 4.4163760704544675e-05,
"loss": 0.4433,
"step": 17550
},
{
"epoch": 1.170310032416258,
"grad_norm": 0.4909146726131439,
"learning_rate": 4.3875686581685e-05,
"loss": 0.4128,
"step": 17600
},
{
"epoch": 1.1736347768265314,
"grad_norm": 0.4087628424167633,
"learning_rate": 4.358781870391033e-05,
"loss": 0.4035,
"step": 17650
},
{
"epoch": 1.176959521236805,
"grad_norm": 0.5856008529663086,
"learning_rate": 4.330016676558651e-05,
"loss": 0.3809,
"step": 17700
},
{
"epoch": 1.1802842656470784,
"grad_norm": 0.14058536291122437,
"learning_rate": 4.3012740453807346e-05,
"loss": 0.3875,
"step": 17750
},
{
"epoch": 1.1836090100573518,
"grad_norm": 0.2947339415550232,
"learning_rate": 4.272554944806831e-05,
"loss": 0.4059,
"step": 17800
},
{
"epoch": 1.1869337544676253,
"grad_norm": 0.6987840533256531,
"learning_rate": 4.243860341994062e-05,
"loss": 0.385,
"step": 17850
},
{
"epoch": 1.1902584988778988,
"grad_norm": 0.4702407717704773,
"learning_rate": 4.2151912032745547e-05,
"loss": 0.433,
"step": 17900
},
{
"epoch": 1.1935832432881721,
"grad_norm": 0.28774410486221313,
"learning_rate": 4.18654849412289e-05,
"loss": 0.3464,
"step": 17950
},
{
"epoch": 1.1969079876984456,
"grad_norm": 0.47577986121177673,
"learning_rate": 4.157933179123599e-05,
"loss": 0.406,
"step": 18000
},
{
"epoch": 1.2002327321087192,
"grad_norm": 0.674921989440918,
"learning_rate": 4.129346221938676e-05,
"loss": 0.4521,
"step": 18050
},
{
"epoch": 1.2035574765189927,
"grad_norm": 0.4696671962738037,
"learning_rate": 4.100788585275125e-05,
"loss": 0.3983,
"step": 18100
},
{
"epoch": 1.206882220929266,
"grad_norm": 0.7673335075378418,
"learning_rate": 4.0722612308525335e-05,
"loss": 0.4084,
"step": 18150
},
{
"epoch": 1.2102069653395395,
"grad_norm": 0.39890438318252563,
"learning_rate": 4.043765119370699e-05,
"loss": 0.3673,
"step": 18200
},
{
"epoch": 1.213531709749813,
"grad_norm": 0.5470781326293945,
"learning_rate": 4.0153012104772635e-05,
"loss": 0.3686,
"step": 18250
},
{
"epoch": 1.2168564541600864,
"grad_norm": 0.5802851319313049,
"learning_rate": 3.9868704627354e-05,
"loss": 0.4034,
"step": 18300
},
{
"epoch": 1.22018119857036,
"grad_norm": 0.5744081139564514,
"learning_rate": 3.9584738335915314e-05,
"loss": 0.3896,
"step": 18350
},
{
"epoch": 1.2235059429806334,
"grad_norm": 0.6031488180160522,
"learning_rate": 3.930112279343094e-05,
"loss": 0.3943,
"step": 18400
},
{
"epoch": 1.226830687390907,
"grad_norm": 0.42322850227355957,
"learning_rate": 3.9017867551063184e-05,
"loss": 0.3821,
"step": 18450
},
{
"epoch": 1.2301554318011803,
"grad_norm": 1.014979600906372,
"learning_rate": 3.8734982147840756e-05,
"loss": 0.3888,
"step": 18500
},
{
"epoch": 1.2334801762114538,
"grad_norm": 0.6480023860931396,
"learning_rate": 3.845247611033749e-05,
"loss": 0.4109,
"step": 18550
},
{
"epoch": 1.236804920621727,
"grad_norm": 0.8400156497955322,
"learning_rate": 3.817035895235159e-05,
"loss": 0.3897,
"step": 18600
},
{
"epoch": 1.2401296650320006,
"grad_norm": 0.7448896765708923,
"learning_rate": 3.7888640174585096e-05,
"loss": 0.3637,
"step": 18650
},
{
"epoch": 1.2434544094422741,
"grad_norm": 0.7204906940460205,
"learning_rate": 3.760732926432407e-05,
"loss": 0.3688,
"step": 18700
},
{
"epoch": 1.2467791538525477,
"grad_norm": 0.21433959901332855,
"learning_rate": 3.732643569511901e-05,
"loss": 0.412,
"step": 18750
},
{
"epoch": 1.250103898262821,
"grad_norm": 0.6840627789497375,
"learning_rate": 3.704596892646593e-05,
"loss": 0.4127,
"step": 18800
},
{
"epoch": 1.2534286426730945,
"grad_norm": 0.5724749565124512,
"learning_rate": 3.676593840348765e-05,
"loss": 0.3849,
"step": 18850
},
{
"epoch": 1.256753387083368,
"grad_norm": 1.0407353639602661,
"learning_rate": 3.648635355661577e-05,
"loss": 0.412,
"step": 18900
},
{
"epoch": 1.2600781314936413,
"grad_norm": 0.6900772452354431,
"learning_rate": 3.6207223801273196e-05,
"loss": 0.4414,
"step": 18950
},
{
"epoch": 1.2634028759039149,
"grad_norm": 0.5711185932159424,
"learning_rate": 3.5928558537556895e-05,
"loss": 0.3557,
"step": 19000
},
{
"epoch": 1.2667276203141884,
"grad_norm": 0.928859293460846,
"learning_rate": 3.565036714992142e-05,
"loss": 0.3692,
"step": 19050
},
{
"epoch": 1.270052364724462,
"grad_norm": 0.4256090521812439,
"learning_rate": 3.537265900686286e-05,
"loss": 0.3895,
"step": 19100
},
{
"epoch": 1.2733771091347352,
"grad_norm": 0.3450973629951477,
"learning_rate": 3.5095443460603405e-05,
"loss": 0.3645,
"step": 19150
},
{
"epoch": 1.2767018535450088,
"grad_norm": 0.48858773708343506,
"learning_rate": 3.4818729846776254e-05,
"loss": 0.3473,
"step": 19200
},
{
"epoch": 1.280026597955282,
"grad_norm": 0.44542333483695984,
"learning_rate": 3.4542527484111365e-05,
"loss": 0.3837,
"step": 19250
},
{
"epoch": 1.2833513423655556,
"grad_norm": 0.54665207862854,
"learning_rate": 3.426684567412153e-05,
"loss": 0.3911,
"step": 19300
},
{
"epoch": 1.2866760867758291,
"grad_norm": 0.46556198596954346,
"learning_rate": 3.3991693700789235e-05,
"loss": 0.4085,
"step": 19350
},
{
"epoch": 1.2900008311861026,
"grad_norm": 0.5987225770950317,
"learning_rate": 3.371708083025392e-05,
"loss": 0.37,
"step": 19400
},
{
"epoch": 1.293325575596376,
"grad_norm": 0.0574885755777359,
"learning_rate": 3.344301631049993e-05,
"loss": 0.359,
"step": 19450
},
{
"epoch": 1.2966503200066495,
"grad_norm": 0.39397045969963074,
"learning_rate": 3.316950937104518e-05,
"loss": 0.3657,
"step": 19500
},
{
"epoch": 1.299975064416923,
"grad_norm": 0.7825318574905396,
"learning_rate": 3.2896569222630224e-05,
"loss": 0.3981,
"step": 19550
},
{
"epoch": 1.3032998088271963,
"grad_norm": 0.6932367086410522,
"learning_rate": 3.26242050569081e-05,
"loss": 0.3821,
"step": 19600
},
{
"epoch": 1.3066245532374698,
"grad_norm": 0.613335907459259,
"learning_rate": 3.235242604613478e-05,
"loss": 0.3534,
"step": 19650
},
{
"epoch": 1.3099492976477434,
"grad_norm": 0.9380619525909424,
"learning_rate": 3.208124134286038e-05,
"loss": 0.3691,
"step": 19700
},
{
"epoch": 1.313274042058017,
"grad_norm": 0.6845571398735046,
"learning_rate": 3.181066007962079e-05,
"loss": 0.3995,
"step": 19750
},
{
"epoch": 1.3165987864682902,
"grad_norm": 0.40944433212280273,
"learning_rate": 3.1540691368630185e-05,
"loss": 0.422,
"step": 19800
},
{
"epoch": 1.3199235308785637,
"grad_norm": 0.34895792603492737,
"learning_rate": 3.127134430147417e-05,
"loss": 0.4023,
"step": 19850
},
{
"epoch": 1.323248275288837,
"grad_norm": 0.6736898422241211,
"learning_rate": 3.100262794880363e-05,
"loss": 0.4225,
"step": 19900
},
{
"epoch": 1.3265730196991106,
"grad_norm": 0.595956563949585,
"learning_rate": 3.073455136002919e-05,
"loss": 0.4148,
"step": 19950
},
{
"epoch": 1.329897764109384,
"grad_norm": 1.9619659185409546,
"learning_rate": 3.0467123563016513e-05,
"loss": 0.4008,
"step": 20000
},
{
"epoch": 1.329897764109384,
"eval_loss": 0.36843690276145935,
"eval_runtime": 4369.5216,
"eval_samples_per_second": 1.53,
"eval_steps_per_second": 1.53,
"step": 20000
},
{
"epoch": 1.3332225085196576,
"grad_norm": 0.3691612780094147,
"learning_rate": 3.0200353563782248e-05,
"loss": 0.3904,
"step": 20050
},
{
"epoch": 1.336547252929931,
"grad_norm": 0.30493393540382385,
"learning_rate": 2.9934250346190818e-05,
"loss": 0.3746,
"step": 20100
},
{
"epoch": 1.3398719973402045,
"grad_norm": 0.6750917434692383,
"learning_rate": 2.9668822871651736e-05,
"loss": 0.3831,
"step": 20150
},
{
"epoch": 1.343196741750478,
"grad_norm": 0.4708922207355499,
"learning_rate": 2.9404080078817924e-05,
"loss": 0.376,
"step": 20200
},
{
"epoch": 1.3465214861607513,
"grad_norm": 0.4739364981651306,
"learning_rate": 2.9140030883284684e-05,
"loss": 0.3932,
"step": 20250
},
{
"epoch": 1.3498462305710248,
"grad_norm": 0.2959195375442505,
"learning_rate": 2.8876684177289404e-05,
"loss": 0.4033,
"step": 20300
},
{
"epoch": 1.3531709749812983,
"grad_norm": 0.680323600769043,
"learning_rate": 2.861404882941212e-05,
"loss": 0.3659,
"step": 20350
},
{
"epoch": 1.3564957193915719,
"grad_norm": 0.4198705554008484,
"learning_rate": 2.8352133684276853e-05,
"loss": 0.3681,
"step": 20400
},
{
"epoch": 1.3598204638018452,
"grad_norm": 0.6400431394577026,
"learning_rate": 2.8090947562253807e-05,
"loss": 0.4492,
"step": 20450
},
{
"epoch": 1.3631452082121187,
"grad_norm": 0.5183029770851135,
"learning_rate": 2.7830499259162213e-05,
"loss": 0.387,
"step": 20500
},
{
"epoch": 1.3664699526223922,
"grad_norm": 0.31550315022468567,
"learning_rate": 2.7570797545974235e-05,
"loss": 0.4326,
"step": 20550
},
{
"epoch": 1.3697946970326655,
"grad_norm": 0.3952213227748871,
"learning_rate": 2.7311851168519496e-05,
"loss": 0.4243,
"step": 20600
},
{
"epoch": 1.373119441442939,
"grad_norm": 0.45342549681663513,
"learning_rate": 2.7053668847190672e-05,
"loss": 0.3845,
"step": 20650
},
{
"epoch": 1.3764441858532126,
"grad_norm": 0.8320136070251465,
"learning_rate": 2.6796259276649693e-05,
"loss": 0.3915,
"step": 20700
},
{
"epoch": 1.3797689302634861,
"grad_norm": 0.4563830494880676,
"learning_rate": 2.653963112553498e-05,
"loss": 0.3915,
"step": 20750
},
{
"epoch": 1.3830936746737594,
"grad_norm": 0.24706579744815826,
"learning_rate": 2.6283793036169603e-05,
"loss": 0.3984,
"step": 20800
},
{
"epoch": 1.386418419084033,
"grad_norm": 0.5812034010887146,
"learning_rate": 2.6028753624270074e-05,
"loss": 0.3679,
"step": 20850
},
{
"epoch": 1.3897431634943063,
"grad_norm": 0.3415985107421875,
"learning_rate": 2.5774521478656343e-05,
"loss": 0.3808,
"step": 20900
},
{
"epoch": 1.3930679079045798,
"grad_norm": 0.2992309629917145,
"learning_rate": 2.5521105160962473e-05,
"loss": 0.3669,
"step": 20950
},
{
"epoch": 1.3963926523148533,
"grad_norm": 0.7009733319282532,
"learning_rate": 2.52685132053484e-05,
"loss": 0.3882,
"step": 21000
},
{
"epoch": 1.3997173967251269,
"grad_norm": 0.4481956660747528,
"learning_rate": 2.501675411821241e-05,
"loss": 0.4083,
"step": 21050
},
{
"epoch": 1.4030421411354002,
"grad_norm": 0.3502480089664459,
"learning_rate": 2.4765836377904787e-05,
"loss": 0.4103,
"step": 21100
},
{
"epoch": 1.4063668855456737,
"grad_norm": 0.7800574898719788,
"learning_rate": 2.4515768434442215e-05,
"loss": 0.3978,
"step": 21150
},
{
"epoch": 1.4096916299559472,
"grad_norm": 0.4401569664478302,
"learning_rate": 2.4266558709223293e-05,
"loss": 0.3907,
"step": 21200
},
{
"epoch": 1.4130163743662205,
"grad_norm": 0.25381216406822205,
"learning_rate": 2.4018215594744835e-05,
"loss": 0.4001,
"step": 21250
},
{
"epoch": 1.416341118776494,
"grad_norm": 1.5524805784225464,
"learning_rate": 2.377074745431931e-05,
"loss": 0.3897,
"step": 21300
},
{
"epoch": 1.4196658631867676,
"grad_norm": 0.49514323472976685,
"learning_rate": 2.352416262179315e-05,
"loss": 0.3693,
"step": 21350
},
{
"epoch": 1.422990607597041,
"grad_norm": 0.5744829177856445,
"learning_rate": 2.3278469401266178e-05,
"loss": 0.3648,
"step": 21400
},
{
"epoch": 1.4263153520073144,
"grad_norm": 0.7969784736633301,
"learning_rate": 2.3033676066811845e-05,
"loss": 0.3768,
"step": 21450
},
{
"epoch": 1.429640096417588,
"grad_norm": 0.9067749977111816,
"learning_rate": 2.2789790862198628e-05,
"loss": 0.3326,
"step": 21500
},
{
"epoch": 1.4329648408278612,
"grad_norm": 0.6318944692611694,
"learning_rate": 2.2546822000612495e-05,
"loss": 0.37,
"step": 21550
},
{
"epoch": 1.4362895852381348,
"grad_norm": 0.3009655177593231,
"learning_rate": 2.2304777664380176e-05,
"loss": 0.3777,
"step": 21600
},
{
"epoch": 1.4396143296484083,
"grad_norm": 0.7818790674209595,
"learning_rate": 2.2063666004693695e-05,
"loss": 0.3809,
"step": 21650
},
{
"epoch": 1.4429390740586818,
"grad_norm": 0.3410826623439789,
"learning_rate": 2.182349514133583e-05,
"loss": 0.3746,
"step": 21700
},
{
"epoch": 1.4462638184689551,
"grad_norm": 0.20603881776332855,
"learning_rate": 2.1584273162406755e-05,
"loss": 0.3536,
"step": 21750
},
{
"epoch": 1.4495885628792287,
"grad_norm": 0.37000730633735657,
"learning_rate": 2.134600812405151e-05,
"loss": 0.3886,
"step": 21800
},
{
"epoch": 1.4529133072895022,
"grad_norm": 0.25155916810035706,
"learning_rate": 2.1108708050188825e-05,
"loss": 0.3688,
"step": 21850
},
{
"epoch": 1.4562380516997755,
"grad_norm": 0.6626400351524353,
"learning_rate": 2.0872380932240832e-05,
"loss": 0.3716,
"step": 21900
},
{
"epoch": 1.459562796110049,
"grad_norm": 0.9164525270462036,
"learning_rate": 2.063703472886402e-05,
"loss": 0.3939,
"step": 21950
},
{
"epoch": 1.4628875405203225,
"grad_norm": 0.3983005881309509,
"learning_rate": 2.0402677365681112e-05,
"loss": 0.361,
"step": 22000
},
{
"epoch": 1.466212284930596,
"grad_norm": 0.787325918674469,
"learning_rate": 2.0169316735014236e-05,
"loss": 0.4137,
"step": 22050
},
{
"epoch": 1.4695370293408694,
"grad_norm": 0.7404587268829346,
"learning_rate": 1.99369606956191e-05,
"loss": 0.4188,
"step": 22100
},
{
"epoch": 1.472861773751143,
"grad_norm": 0.3947995603084564,
"learning_rate": 1.9705617072420392e-05,
"loss": 0.373,
"step": 22150
},
{
"epoch": 1.4761865181614162,
"grad_norm": 0.5493362545967102,
"learning_rate": 1.9475293656248182e-05,
"loss": 0.3778,
"step": 22200
},
{
"epoch": 1.4795112625716897,
"grad_norm": 0.4823583960533142,
"learning_rate": 1.9245998203575593e-05,
"loss": 0.4243,
"step": 22250
},
{
"epoch": 1.4828360069819633,
"grad_norm": 0.334708034992218,
"learning_rate": 1.9017738436257655e-05,
"loss": 0.344,
"step": 22300
},
{
"epoch": 1.4861607513922368,
"grad_norm": 0.4683977961540222,
"learning_rate": 1.879052204127114e-05,
"loss": 0.3771,
"step": 22350
},
{
"epoch": 1.48948549580251,
"grad_norm": 0.38489750027656555,
"learning_rate": 1.8564356670455767e-05,
"loss": 0.3922,
"step": 22400
},
{
"epoch": 1.4928102402127836,
"grad_norm": 0.6407677531242371,
"learning_rate": 1.8339249940256492e-05,
"loss": 0.3877,
"step": 22450
},
{
"epoch": 1.4961349846230572,
"grad_norm": 0.46461015939712524,
"learning_rate": 1.8115209431467074e-05,
"loss": 0.3898,
"step": 22500
},
{
"epoch": 1.4994597290333305,
"grad_norm": 0.9419897794723511,
"learning_rate": 1.7892242688974664e-05,
"loss": 0.4022,
"step": 22550
},
{
"epoch": 1.502784473443604,
"grad_norm": 0.32548418641090393,
"learning_rate": 1.767035722150582e-05,
"loss": 0.3609,
"step": 22600
},
{
"epoch": 1.5061092178538775,
"grad_norm": 1.439875602722168,
"learning_rate": 1.7449560501373567e-05,
"loss": 0.3637,
"step": 22650
},
{
"epoch": 1.509433962264151,
"grad_norm": 0.6430267095565796,
"learning_rate": 1.7229859964225868e-05,
"loss": 0.3746,
"step": 22700
},
{
"epoch": 1.5127587066744244,
"grad_norm": 0.41092389822006226,
"learning_rate": 1.7011263008795075e-05,
"loss": 0.4118,
"step": 22750
},
{
"epoch": 1.5160834510846979,
"grad_norm": 0.49290186166763306,
"learning_rate": 1.679377699664884e-05,
"loss": 0.3622,
"step": 22800
},
{
"epoch": 1.5194081954949712,
"grad_norm": 0.48516589403152466,
"learning_rate": 1.657740925194225e-05,
"loss": 0.3815,
"step": 22850
},
{
"epoch": 1.5227329399052447,
"grad_norm": 0.6224206686019897,
"learning_rate": 1.6362167061171063e-05,
"loss": 0.3837,
"step": 22900
},
{
"epoch": 1.5260576843155182,
"grad_norm": 0.40503615140914917,
"learning_rate": 1.614805767292642e-05,
"loss": 0.3932,
"step": 22950
},
{
"epoch": 1.5293824287257918,
"grad_norm": 0.46341225504875183,
"learning_rate": 1.5935088297650674e-05,
"loss": 0.3485,
"step": 23000
},
{
"epoch": 1.5327071731360653,
"grad_norm": 0.7125037312507629,
"learning_rate": 1.5723266107394653e-05,
"loss": 0.3887,
"step": 23050
},
{
"epoch": 1.5360319175463386,
"grad_norm": 0.36392441391944885,
"learning_rate": 1.551259823557602e-05,
"loss": 0.3908,
"step": 23100
},
{
"epoch": 1.5393566619566121,
"grad_norm": 0.4638826847076416,
"learning_rate": 1.530309177673912e-05,
"loss": 0.4156,
"step": 23150
},
{
"epoch": 1.5426814063668854,
"grad_norm": 0.46034330129623413,
"learning_rate": 1.509475378631603e-05,
"loss": 0.3439,
"step": 23200
},
{
"epoch": 1.546006150777159,
"grad_norm": 0.28411605954170227,
"learning_rate": 1.4887591280389007e-05,
"loss": 0.3763,
"step": 23250
},
{
"epoch": 1.5493308951874325,
"grad_norm": 0.3564077615737915,
"learning_rate": 1.468161123545413e-05,
"loss": 0.4004,
"step": 23300
},
{
"epoch": 1.552655639597706,
"grad_norm": 0.5136293172836304,
"learning_rate": 1.4476820588186412e-05,
"loss": 0.3433,
"step": 23350
},
{
"epoch": 1.5559803840079793,
"grad_norm": 0.8439062237739563,
"learning_rate": 1.4273226235206178e-05,
"loss": 0.3838,
"step": 23400
},
{
"epoch": 1.5593051284182529,
"grad_norm": 0.6947528719902039,
"learning_rate": 1.4070835032846852e-05,
"loss": 0.3627,
"step": 23450
},
{
"epoch": 1.5626298728285262,
"grad_norm": 0.418443500995636,
"learning_rate": 1.3869653796923993e-05,
"loss": 0.3698,
"step": 23500
},
{
"epoch": 1.5659546172387997,
"grad_norm": 0.5064214468002319,
"learning_rate": 1.3669689302505778e-05,
"loss": 0.3827,
"step": 23550
},
{
"epoch": 1.5692793616490732,
"grad_norm": 0.2299993336200714,
"learning_rate": 1.3470948283684925e-05,
"loss": 0.363,
"step": 23600
},
{
"epoch": 1.5726041060593468,
"grad_norm": 0.3324210047721863,
"learning_rate": 1.3273437433351787e-05,
"loss": 0.3504,
"step": 23650
},
{
"epoch": 1.5759288504696203,
"grad_norm": 0.5440026521682739,
"learning_rate": 1.307716340296904e-05,
"loss": 0.4031,
"step": 23700
},
{
"epoch": 1.5792535948798936,
"grad_norm": 0.5328998565673828,
"learning_rate": 1.2882132802347647e-05,
"loss": 0.3945,
"step": 23750
},
{
"epoch": 1.5825783392901671,
"grad_norm": 0.9955481290817261,
"learning_rate": 1.268835219942433e-05,
"loss": 0.3742,
"step": 23800
},
{
"epoch": 1.5859030837004404,
"grad_norm": 0.2555365562438965,
"learning_rate": 1.2495828120040288e-05,
"loss": 0.412,
"step": 23850
},
{
"epoch": 1.589227828110714,
"grad_norm": 0.7778675556182861,
"learning_rate": 1.23045670477215e-05,
"loss": 0.3863,
"step": 23900
},
{
"epoch": 1.5925525725209875,
"grad_norm": 0.19272594153881073,
"learning_rate": 1.2114575423460333e-05,
"loss": 0.3391,
"step": 23950
},
{
"epoch": 1.595877316931261,
"grad_norm": 0.5186722278594971,
"learning_rate": 1.1925859645498722e-05,
"loss": 0.3796,
"step": 24000
},
{
"epoch": 1.5992020613415345,
"grad_norm": 0.508628249168396,
"learning_rate": 1.1738426069112573e-05,
"loss": 0.4019,
"step": 24050
},
{
"epoch": 1.6025268057518078,
"grad_norm": 0.37196823954582214,
"learning_rate": 1.1552281006397819e-05,
"loss": 0.3652,
"step": 24100
},
{
"epoch": 1.6058515501620811,
"grad_norm": 0.2910582721233368,
"learning_rate": 1.1367430726057887e-05,
"loss": 0.3499,
"step": 24150
},
{
"epoch": 1.6091762945723547,
"grad_norm": 0.6717352271080017,
"learning_rate": 1.1183881453192479e-05,
"loss": 0.3619,
"step": 24200
},
{
"epoch": 1.6125010389826282,
"grad_norm": 0.34437698125839233,
"learning_rate": 1.1001639369088018e-05,
"loss": 0.3463,
"step": 24250
},
{
"epoch": 1.6158257833929017,
"grad_norm": 0.46413883566856384,
"learning_rate": 1.082071061100945e-05,
"loss": 0.3765,
"step": 24300
},
{
"epoch": 1.6191505278031753,
"grad_norm": 0.49501270055770874,
"learning_rate": 1.0641101271993614e-05,
"loss": 0.3561,
"step": 24350
},
{
"epoch": 1.6224752722134486,
"grad_norm": 0.8731350302696228,
"learning_rate": 1.0462817400643959e-05,
"loss": 0.3863,
"step": 24400
},
{
"epoch": 1.625800016623722,
"grad_norm": 0.25603896379470825,
"learning_rate": 1.0285865000926925e-05,
"loss": 0.3678,
"step": 24450
},
{
"epoch": 1.6291247610339954,
"grad_norm": 0.7849488854408264,
"learning_rate": 1.0110250031969709e-05,
"loss": 0.3705,
"step": 24500
},
{
"epoch": 1.632449505444269,
"grad_norm": 0.5623769760131836,
"learning_rate": 9.935978407859624e-06,
"loss": 0.3429,
"step": 24550
},
{
"epoch": 1.6357742498545425,
"grad_norm": 0.3793054521083832,
"learning_rate": 9.763055997444897e-06,
"loss": 0.3985,
"step": 24600
},
{
"epoch": 1.639098994264816,
"grad_norm": 1.2295911312103271,
"learning_rate": 9.591488624137023e-06,
"loss": 0.3575,
"step": 24650
},
{
"epoch": 1.6424237386750895,
"grad_norm": 0.23833027482032776,
"learning_rate": 9.421282065714676e-06,
"loss": 0.3721,
"step": 24700
},
{
"epoch": 1.6457484830853628,
"grad_norm": 0.9506494402885437,
"learning_rate": 9.25244205412915e-06,
"loss": 0.3741,
"step": 24750
},
{
"epoch": 1.6490732274956361,
"grad_norm": 0.5477185845375061,
"learning_rate": 9.08497427531128e-06,
"loss": 0.3259,
"step": 24800
},
{
"epoch": 1.6523979719059096,
"grad_norm": 0.6023584604263306,
"learning_rate": 8.91888436897997e-06,
"loss": 0.396,
"step": 24850
},
{
"epoch": 1.6557227163161832,
"grad_norm": 0.5275429487228394,
"learning_rate": 8.754177928452328e-06,
"loss": 0.3445,
"step": 24900
},
{
"epoch": 1.6590474607264567,
"grad_norm": 0.41783201694488525,
"learning_rate": 8.590860500455217e-06,
"loss": 0.387,
"step": 24950
},
{
"epoch": 1.6623722051367302,
"grad_norm": 0.19075682759284973,
"learning_rate": 8.428937584938496e-06,
"loss": 0.3951,
"step": 25000
},
{
"epoch": 1.6656969495470035,
"grad_norm": 0.2861451804637909,
"learning_rate": 8.268414634889848e-06,
"loss": 0.3673,
"step": 25050
},
{
"epoch": 1.669021693957277,
"grad_norm": 0.8673615455627441,
"learning_rate": 8.109297056151067e-06,
"loss": 0.3975,
"step": 25100
},
{
"epoch": 1.6723464383675504,
"grad_norm": 0.6853104829788208,
"learning_rate": 7.951590207236038e-06,
"loss": 0.3967,
"step": 25150
},
{
"epoch": 1.675671182777824,
"grad_norm": 0.7709905505180359,
"learning_rate": 7.79529939915029e-06,
"loss": 0.3482,
"step": 25200
},
{
"epoch": 1.6789959271880974,
"grad_norm": 0.250615656375885,
"learning_rate": 7.640429895212164e-06,
"loss": 0.3693,
"step": 25250
},
{
"epoch": 1.682320671598371,
"grad_norm": 0.725862443447113,
"learning_rate": 7.486986910875499e-06,
"loss": 0.325,
"step": 25300
},
{
"epoch": 1.6856454160086445,
"grad_norm": 0.4505915343761444,
"learning_rate": 7.3349756135540235e-06,
"loss": 0.3634,
"step": 25350
},
{
"epoch": 1.6889701604189178,
"grad_norm": 0.7828101515769958,
"learning_rate": 7.184401122447398e-06,
"loss": 0.3927,
"step": 25400
},
{
"epoch": 1.692294904829191,
"grad_norm": 0.38519877195358276,
"learning_rate": 7.035268508368697e-06,
"loss": 0.3676,
"step": 25450
},
{
"epoch": 1.6956196492394646,
"grad_norm": 0.687976598739624,
"learning_rate": 6.887582793573727e-06,
"loss": 0.3897,
"step": 25500
},
{
"epoch": 1.6989443936497381,
"grad_norm": 0.38796254992485046,
"learning_rate": 6.741348951591908e-06,
"loss": 0.3922,
"step": 25550
},
{
"epoch": 1.7022691380600117,
"grad_norm": 0.8533971309661865,
"learning_rate": 6.596571907058707e-06,
"loss": 0.374,
"step": 25600
},
{
"epoch": 1.7055938824702852,
"grad_norm": 0.9028803110122681,
"learning_rate": 6.453256535549846e-06,
"loss": 0.4181,
"step": 25650
},
{
"epoch": 1.7089186268805585,
"grad_norm": 0.4268290102481842,
"learning_rate": 6.31140766341713e-06,
"loss": 0.3611,
"step": 25700
},
{
"epoch": 1.712243371290832,
"grad_norm": 0.6220707893371582,
"learning_rate": 6.1710300676258385e-06,
"loss": 0.3328,
"step": 25750
},
{
"epoch": 1.7155681157011053,
"grad_norm": 0.4467557668685913,
"learning_rate": 6.032128475593924e-06,
"loss": 0.3704,
"step": 25800
},
{
"epoch": 1.7188928601113789,
"grad_norm": 0.4108564555644989,
"learning_rate": 5.894707565032776e-06,
"loss": 0.3486,
"step": 25850
},
{
"epoch": 1.7222176045216524,
"grad_norm": 0.5933622121810913,
"learning_rate": 5.758771963789722e-06,
"loss": 0.3668,
"step": 25900
},
{
"epoch": 1.725542348931926,
"grad_norm": 0.2982137203216553,
"learning_rate": 5.6243262496921245e-06,
"loss": 0.3143,
"step": 25950
},
{
"epoch": 1.7288670933421995,
"grad_norm": 0.6367640495300293,
"learning_rate": 5.4913749503932575e-06,
"loss": 0.3452,
"step": 26000
},
{
"epoch": 1.7321918377524728,
"grad_norm": 0.4822899103164673,
"learning_rate": 5.359922543219848e-06,
"loss": 0.3903,
"step": 26050
},
{
"epoch": 1.7355165821627463,
"grad_norm": 0.7650361061096191,
"learning_rate": 5.229973455021231e-06,
"loss": 0.3691,
"step": 26100
},
{
"epoch": 1.7388413265730196,
"grad_norm": 0.4355323910713196,
"learning_rate": 5.101532062020325e-06,
"loss": 0.3174,
"step": 26150
},
{
"epoch": 1.7421660709832931,
"grad_norm": 0.49863699078559875,
"learning_rate": 4.974602689666252e-06,
"loss": 0.3693,
"step": 26200
},
{
"epoch": 1.7454908153935667,
"grad_norm": 0.455340713262558,
"learning_rate": 4.8491896124886416e-06,
"loss": 0.3869,
"step": 26250
},
{
"epoch": 1.7488155598038402,
"grad_norm": 0.3978399932384491,
"learning_rate": 4.725297053953692e-06,
"loss": 0.3925,
"step": 26300
},
{
"epoch": 1.7521403042141135,
"grad_norm": 0.3163425922393799,
"learning_rate": 4.602929186321947e-06,
"loss": 0.3563,
"step": 26350
},
{
"epoch": 1.755465048624387,
"grad_norm": 0.5234238505363464,
"learning_rate": 4.48209013050781e-06,
"loss": 0.4169,
"step": 26400
},
{
"epoch": 1.7587897930346603,
"grad_norm": 0.42517679929733276,
"learning_rate": 4.362783955940719e-06,
"loss": 0.363,
"step": 26450
},
{
"epoch": 1.7621145374449338,
"grad_norm": 0.6609143018722534,
"learning_rate": 4.245014680428117e-06,
"loss": 0.3572,
"step": 26500
},
{
"epoch": 1.7654392818552074,
"grad_norm": 0.21873889863491058,
"learning_rate": 4.128786270020174e-06,
"loss": 0.3497,
"step": 26550
},
{
"epoch": 1.768764026265481,
"grad_norm": 0.8427754044532776,
"learning_rate": 4.014102638876205e-06,
"loss": 0.3702,
"step": 26600
},
{
"epoch": 1.7720887706757544,
"grad_norm": 0.8649630546569824,
"learning_rate": 3.900967649132847e-06,
"loss": 0.3662,
"step": 26650
},
{
"epoch": 1.7754135150860277,
"grad_norm": 0.43425253033638,
"learning_rate": 3.789385110774013e-06,
"loss": 0.3643,
"step": 26700
},
{
"epoch": 1.7787382594963013,
"grad_norm": 0.3920991122722626,
"learning_rate": 3.679358781502562e-06,
"loss": 0.3834,
"step": 26750
},
{
"epoch": 1.7820630039065746,
"grad_norm": 0.7349820137023926,
"learning_rate": 3.5708923666137927e-06,
"loss": 0.3632,
"step": 26800
},
{
"epoch": 1.785387748316848,
"grad_norm": 0.6672165989875793,
"learning_rate": 3.4639895188706195e-06,
"loss": 0.3702,
"step": 26850
},
{
"epoch": 1.7887124927271216,
"grad_norm": 0.537217915058136,
"learning_rate": 3.358653838380571e-06,
"loss": 0.3397,
"step": 26900
},
{
"epoch": 1.7920372371373952,
"grad_norm": 0.8017503023147583,
"learning_rate": 3.254888872474593e-06,
"loss": 0.3762,
"step": 26950
},
{
"epoch": 1.7953619815476687,
"grad_norm": 0.16730186343193054,
"learning_rate": 3.1526981155875156e-06,
"loss": 0.3425,
"step": 27000
},
{
"epoch": 1.798686725957942,
"grad_norm": 0.5476846694946289,
"learning_rate": 3.0520850091404263e-06,
"loss": 0.3708,
"step": 27050
},
{
"epoch": 1.8020114703682153,
"grad_norm": 0.15695548057556152,
"learning_rate": 2.9530529414247608e-06,
"loss": 0.3675,
"step": 27100
},
{
"epoch": 1.8053362147784888,
"grad_norm": 0.5888222455978394,
"learning_rate": 2.8556052474881967e-06,
"loss": 0.3647,
"step": 27150
},
{
"epoch": 1.8086609591887624,
"grad_norm": 0.9941563010215759,
"learning_rate": 2.7597452090223354e-06,
"loss": 0.3456,
"step": 27200
},
{
"epoch": 1.8119857035990359,
"grad_norm": 0.610040009021759,
"learning_rate": 2.6654760542521917e-06,
"loss": 0.3746,
"step": 27250
},
{
"epoch": 1.8153104480093094,
"grad_norm": 0.8063670992851257,
"learning_rate": 2.572800957827476e-06,
"loss": 0.3798,
"step": 27300
},
{
"epoch": 1.8186351924195827,
"grad_norm": 0.28963837027549744,
"learning_rate": 2.4817230407156946e-06,
"loss": 0.3713,
"step": 27350
},
{
"epoch": 1.8219599368298562,
"grad_norm": 0.39352625608444214,
"learning_rate": 2.3922453700970295e-06,
"loss": 0.3976,
"step": 27400
},
{
"epoch": 1.8252846812401295,
"grad_norm": 0.9289838671684265,
"learning_rate": 2.3043709592610485e-06,
"loss": 0.3788,
"step": 27450
},
{
"epoch": 1.828609425650403,
"grad_norm": 0.42766475677490234,
"learning_rate": 2.2181027675052534e-06,
"loss": 0.3672,
"step": 27500
},
{
"epoch": 1.8319341700606766,
"grad_norm": 0.4662071168422699,
"learning_rate": 2.133443700035387e-06,
"loss": 0.3382,
"step": 27550
},
{
"epoch": 1.8352589144709501,
"grad_norm": 0.5831243991851807,
"learning_rate": 2.0503966078676217e-06,
"loss": 0.4051,
"step": 27600
},
{
"epoch": 1.8385836588812237,
"grad_norm": 0.4857243299484253,
"learning_rate": 1.9689642877325165e-06,
"loss": 0.3855,
"step": 27650
},
{
"epoch": 1.841908403291497,
"grad_norm": 0.44161850214004517,
"learning_rate": 1.8891494819808841e-06,
"loss": 0.366,
"step": 27700
},
{
"epoch": 1.8452331477017703,
"grad_norm": 0.3997364640235901,
"learning_rate": 1.8109548784913887e-06,
"loss": 0.3469,
"step": 27750
},
{
"epoch": 1.8485578921120438,
"grad_norm": 0.7303462028503418,
"learning_rate": 1.7343831105800511e-06,
"loss": 0.3724,
"step": 27800
},
{
"epoch": 1.8518826365223173,
"grad_norm": 0.5361658930778503,
"learning_rate": 1.6594367569115532e-06,
"loss": 0.353,
"step": 27850
},
{
"epoch": 1.8552073809325909,
"grad_norm": 0.7074758410453796,
"learning_rate": 1.5861183414124403e-06,
"loss": 0.3689,
"step": 27900
},
{
"epoch": 1.8585321253428644,
"grad_norm": 0.9188947081565857,
"learning_rate": 1.514430333186062e-06,
"loss": 0.3341,
"step": 27950
},
{
"epoch": 1.8618568697531377,
"grad_norm": 0.8179706931114197,
"learning_rate": 1.4443751464294664e-06,
"loss": 0.3709,
"step": 28000
},
{
"epoch": 1.8651816141634112,
"grad_norm": 0.7332112193107605,
"learning_rate": 1.3759551403520643e-06,
"loss": 0.3433,
"step": 28050
},
{
"epoch": 1.8685063585736845,
"grad_norm": 0.4983903765678406,
"learning_rate": 1.3091726190962329e-06,
"loss": 0.3337,
"step": 28100
},
{
"epoch": 1.871831102983958,
"grad_norm": 0.2123403400182724,
"learning_rate": 1.2440298316596654e-06,
"loss": 0.3475,
"step": 28150
},
{
"epoch": 1.8751558473942316,
"grad_norm": 0.19655907154083252,
"learning_rate": 1.18052897181965e-06,
"loss": 0.3441,
"step": 28200
},
{
"epoch": 1.878480591804505,
"grad_norm": 0.40445396304130554,
"learning_rate": 1.1186721780592102e-06,
"loss": 0.3793,
"step": 28250
},
{
"epoch": 1.8818053362147786,
"grad_norm": 0.3509872853755951,
"learning_rate": 1.0584615334950643e-06,
"loss": 0.3656,
"step": 28300
},
{
"epoch": 1.885130080625052,
"grad_norm": 0.25542914867401123,
"learning_rate": 9.998990658074914e-07,
"loss": 0.3368,
"step": 28350
},
{
"epoch": 1.8884548250353252,
"grad_norm": 0.5417547225952148,
"learning_rate": 9.429867471720255e-07,
"loss": 0.3631,
"step": 28400
},
{
"epoch": 1.8917795694455988,
"grad_norm": 0.2370055764913559,
"learning_rate": 8.877264941930586e-07,
"loss": 0.3569,
"step": 28450
},
{
"epoch": 1.8951043138558723,
"grad_norm": 0.7116398811340332,
"learning_rate": 8.341201678392974e-07,
"loss": 0.4227,
"step": 28500
},
{
"epoch": 1.8984290582661458,
"grad_norm": 0.9473690390586853,
"learning_rate": 7.821695733810641e-07,
"loss": 0.3959,
"step": 28550
},
{
"epoch": 1.9017538026764194,
"grad_norm": 0.446325421333313,
"learning_rate": 7.318764603295447e-07,
"loss": 0.3363,
"step": 28600
},
{
"epoch": 1.9050785470866927,
"grad_norm": 1.2480918169021606,
"learning_rate": 6.832425223778304e-07,
"loss": 0.395,
"step": 28650
},
{
"epoch": 1.9084032914969662,
"grad_norm": 0.8201688528060913,
"learning_rate": 6.362693973439193e-07,
"loss": 0.352,
"step": 28700
},
{
"epoch": 1.9117280359072395,
"grad_norm": 0.546881377696991,
"learning_rate": 5.909586671155098e-07,
"loss": 0.3362,
"step": 28750
},
{
"epoch": 1.915052780317513,
"grad_norm": 0.8294301629066467,
"learning_rate": 5.47311857596794e-07,
"loss": 0.3586,
"step": 28800
},
{
"epoch": 1.9183775247277866,
"grad_norm": 0.7555782794952393,
"learning_rate": 5.05330438657009e-07,
"loss": 0.3757,
"step": 28850
},
{
"epoch": 1.92170226913806,
"grad_norm": 1.1964213848114014,
"learning_rate": 4.6501582408096657e-07,
"loss": 0.3783,
"step": 28900
},
{
"epoch": 1.9250270135483336,
"grad_norm": 0.5259461998939514,
"learning_rate": 4.263693715214456e-07,
"loss": 0.3516,
"step": 28950
},
{
"epoch": 1.928351757958607,
"grad_norm": 0.23752902448177338,
"learning_rate": 3.893923824534629e-07,
"loss": 0.3414,
"step": 29000
},
{
"epoch": 1.9316765023688804,
"grad_norm": 0.7082974314689636,
"learning_rate": 3.5408610213043536e-07,
"loss": 0.3867,
"step": 29050
},
{
"epoch": 1.9350012467791537,
"grad_norm": 0.5476464033126831,
"learning_rate": 3.204517195422696e-07,
"loss": 0.3588,
"step": 29100
},
{
"epoch": 1.9383259911894273,
"grad_norm": 0.7200431823730469,
"learning_rate": 2.8849036737528813e-07,
"loss": 0.3816,
"step": 29150
},
{
"epoch": 1.9416507355997008,
"grad_norm": 0.41176721453666687,
"learning_rate": 2.5820312197411543e-07,
"loss": 0.3547,
"step": 29200
},
{
"epoch": 1.9449754800099743,
"grad_norm": 1.0158333778381348,
"learning_rate": 2.2959100330541273e-07,
"loss": 0.3763,
"step": 29250
},
{
"epoch": 1.9483002244202476,
"grad_norm": 1.136353850364685,
"learning_rate": 2.0265497492352735e-07,
"loss": 0.3524,
"step": 29300
},
{
"epoch": 1.9516249688305212,
"grad_norm": 0.6701260805130005,
"learning_rate": 1.7739594393805793e-07,
"loss": 0.3532,
"step": 29350
},
{
"epoch": 1.9549497132407945,
"grad_norm": 0.5202396512031555,
"learning_rate": 1.538147609832896e-07,
"loss": 0.3787,
"step": 29400
},
{
"epoch": 1.958274457651068,
"grad_norm": 0.3793032467365265,
"learning_rate": 1.3191222018956174e-07,
"loss": 0.3813,
"step": 29450
},
{
"epoch": 1.9615992020613415,
"grad_norm": 0.266659140586853,
"learning_rate": 1.1168905915652228e-07,
"loss": 0.3346,
"step": 29500
},
{
"epoch": 1.964923946471615,
"grad_norm": 0.4805378317832947,
"learning_rate": 9.314595892827016e-08,
"loss": 0.3695,
"step": 29550
},
{
"epoch": 1.9682486908818886,
"grad_norm": 0.9481486082077026,
"learning_rate": 7.628354397045123e-08,
"loss": 0.3579,
"step": 29600
},
{
"epoch": 1.971573435292162,
"grad_norm": 0.3038884997367859,
"learning_rate": 6.110238214919739e-08,
"loss": 0.3449,
"step": 29650
},
{
"epoch": 1.9748981797024354,
"grad_norm": 0.6856468319892883,
"learning_rate": 4.760298471201963e-08,
"loss": 0.3598,
"step": 29700
},
{
"epoch": 1.9782229241127087,
"grad_norm": 0.4969344437122345,
"learning_rate": 3.5785806270599575e-08,
"loss": 0.3437,
"step": 29750
},
{
"epoch": 1.9815476685229823,
"grad_norm": 0.6707144975662231,
"learning_rate": 2.565124478545733e-08,
"loss": 0.361,
"step": 29800
},
{
"epoch": 1.9848724129332558,
"grad_norm": 0.4501785933971405,
"learning_rate": 1.719964155256215e-08,
"loss": 0.3668,
"step": 29850
},
{
"epoch": 1.9881971573435293,
"grad_norm": 0.31389063596725464,
"learning_rate": 1.043128119184167e-08,
"loss": 0.3647,
"step": 29900
},
{
"epoch": 1.9915219017538028,
"grad_norm": 0.9812319278717041,
"learning_rate": 5.346391637583992e-09,
"loss": 0.4068,
"step": 29950
},
{
"epoch": 1.9948466461640761,
"grad_norm": 0.5165169835090637,
"learning_rate": 1.945144130788279e-09,
"loss": 0.4035,
"step": 30000
},
{
"epoch": 1.9948466461640761,
"eval_loss": 0.3463568687438965,
"eval_runtime": 4374.0175,
"eval_samples_per_second": 1.528,
"eval_steps_per_second": 1.528,
"step": 30000
},
{
"epoch": 1.9981713905743494,
"grad_norm": 0.814073383808136,
"learning_rate": 2.2765321335826983e-10,
"loss": 0.3829,
"step": 30050
},
{
"epoch": 1.9999002576676919,
"step": 30076,
"total_flos": 1.0617208245100216e+19,
"train_loss": 0.5353738934709316,
"train_runtime": 238041.8859,
"train_samples_per_second": 0.505,
"train_steps_per_second": 0.126
}
],
"logging_steps": 50,
"max_steps": 30076,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0617208245100216e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}