diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4273 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9999002576676919, + "eval_steps": 10000, + "global_step": 30076, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0033247444102734603, + "grad_norm": 1.3035300970077515, + "learning_rate": 1.6622340425531915e-06, + "loss": 5.822, + "step": 50 + }, + { + "epoch": 0.006649488820546921, + "grad_norm": 1.1972568035125732, + "learning_rate": 3.324468085106383e-06, + "loss": 5.9169, + "step": 100 + }, + { + "epoch": 0.009974233230820381, + "grad_norm": 1.352640986442566, + "learning_rate": 4.986702127659574e-06, + "loss": 5.2194, + "step": 150 + }, + { + "epoch": 0.013298977641093841, + "grad_norm": 1.7489923238754272, + "learning_rate": 6.648936170212766e-06, + "loss": 4.452, + "step": 200 + }, + { + "epoch": 0.0166237220513673, + "grad_norm": 1.8752517700195312, + "learning_rate": 8.311170212765958e-06, + "loss": 3.6432, + "step": 250 + }, + { + "epoch": 0.019948466461640762, + "grad_norm": 1.9665223360061646, + "learning_rate": 9.973404255319148e-06, + "loss": 3.174, + "step": 300 + }, + { + "epoch": 0.02327321087191422, + "grad_norm": 2.6562719345092773, + "learning_rate": 1.1635638297872341e-05, + "loss": 2.5593, + "step": 350 + }, + { + "epoch": 0.026597955282187682, + "grad_norm": 2.408365488052368, + "learning_rate": 1.3297872340425532e-05, + "loss": 2.1801, + "step": 400 + }, + { + "epoch": 0.02992269969246114, + "grad_norm": 2.3265933990478516, + "learning_rate": 1.4960106382978726e-05, + "loss": 1.8083, + "step": 450 + }, + { + "epoch": 0.0332474441027346, + "grad_norm": 2.1040735244750977, + "learning_rate": 1.6622340425531915e-05, + "loss": 1.6401, + "step": 500 + }, + { + "epoch": 0.036572188513008065, + "grad_norm": 3.4362895488739014, + "learning_rate": 1.8284574468085108e-05, + "loss": 1.5832, + "step": 550 + }, + { + "epoch": 0.039896932923281524, + "grad_norm": 1.546397089958191, + "learning_rate": 1.9946808510638297e-05, + "loss": 1.4445, + "step": 600 + }, + { + "epoch": 0.04322167733355498, + "grad_norm": 1.6556838750839233, + "learning_rate": 2.1609042553191493e-05, + "loss": 1.3488, + "step": 650 + }, + { + "epoch": 0.04654642174382844, + "grad_norm": 1.819988489151001, + "learning_rate": 2.3271276595744682e-05, + "loss": 1.2645, + "step": 700 + }, + { + "epoch": 0.049871166154101906, + "grad_norm": 1.4990456104278564, + "learning_rate": 2.4933510638297874e-05, + "loss": 1.2705, + "step": 750 + }, + { + "epoch": 0.053195910564375365, + "grad_norm": 1.2020275592803955, + "learning_rate": 2.6595744680851064e-05, + "loss": 1.1177, + "step": 800 + }, + { + "epoch": 0.056520654974648823, + "grad_norm": 3.6433067321777344, + "learning_rate": 2.8257978723404256e-05, + "loss": 1.1117, + "step": 850 + }, + { + "epoch": 0.05984539938492228, + "grad_norm": 1.214572548866272, + "learning_rate": 2.9920212765957452e-05, + "loss": 1.0738, + "step": 900 + }, + { + "epoch": 0.06317014379519574, + "grad_norm": 1.198177456855774, + "learning_rate": 3.158244680851064e-05, + "loss": 0.9417, + "step": 950 + }, + { + "epoch": 0.0664948882054692, + "grad_norm": 2.4085018634796143, + "learning_rate": 3.324468085106383e-05, + "loss": 0.9209, + "step": 1000 + }, + { + "epoch": 0.06981963261574267, + "grad_norm": 1.7089121341705322, + "learning_rate": 3.490691489361702e-05, + "loss": 0.923, + "step": 1050 + }, + { + "epoch": 0.07314437702601613, + "grad_norm": 1.4717004299163818, + "learning_rate": 3.6569148936170215e-05, + "loss": 0.8675, + "step": 1100 + }, + { + "epoch": 0.07646912143628959, + "grad_norm": 0.943592369556427, + "learning_rate": 3.823138297872341e-05, + "loss": 0.8474, + "step": 1150 + }, + { + "epoch": 0.07979386584656305, + "grad_norm": 1.0817354917526245, + "learning_rate": 3.9893617021276594e-05, + "loss": 0.9049, + "step": 1200 + }, + { + "epoch": 0.0831186102568365, + "grad_norm": 1.4040069580078125, + "learning_rate": 4.1555851063829786e-05, + "loss": 0.8135, + "step": 1250 + }, + { + "epoch": 0.08644335466710996, + "grad_norm": 1.383522868156433, + "learning_rate": 4.3218085106382986e-05, + "loss": 0.8804, + "step": 1300 + }, + { + "epoch": 0.08976809907738342, + "grad_norm": 2.222273111343384, + "learning_rate": 4.488031914893617e-05, + "loss": 0.8195, + "step": 1350 + }, + { + "epoch": 0.09309284348765688, + "grad_norm": 1.0952123403549194, + "learning_rate": 4.6542553191489364e-05, + "loss": 0.7564, + "step": 1400 + }, + { + "epoch": 0.09641758789793034, + "grad_norm": 1.3211369514465332, + "learning_rate": 4.8204787234042556e-05, + "loss": 0.8551, + "step": 1450 + }, + { + "epoch": 0.09974233230820381, + "grad_norm": 1.8156408071517944, + "learning_rate": 4.986702127659575e-05, + "loss": 0.7798, + "step": 1500 + }, + { + "epoch": 0.10306707671847727, + "grad_norm": 0.7081910967826843, + "learning_rate": 5.152925531914894e-05, + "loss": 0.7384, + "step": 1550 + }, + { + "epoch": 0.10639182112875073, + "grad_norm": 1.294580340385437, + "learning_rate": 5.319148936170213e-05, + "loss": 0.792, + "step": 1600 + }, + { + "epoch": 0.10971656553902419, + "grad_norm": 1.5583733320236206, + "learning_rate": 5.485372340425532e-05, + "loss": 0.7462, + "step": 1650 + }, + { + "epoch": 0.11304130994929765, + "grad_norm": 1.0178766250610352, + "learning_rate": 5.651595744680851e-05, + "loss": 0.7477, + "step": 1700 + }, + { + "epoch": 0.1163660543595711, + "grad_norm": 3.021415948867798, + "learning_rate": 5.81781914893617e-05, + "loss": 0.6861, + "step": 1750 + }, + { + "epoch": 0.11969079876984456, + "grad_norm": 1.043920636177063, + "learning_rate": 5.9840425531914904e-05, + "loss": 0.7214, + "step": 1800 + }, + { + "epoch": 0.12301554318011802, + "grad_norm": 2.1260740756988525, + "learning_rate": 6.150265957446809e-05, + "loss": 0.6917, + "step": 1850 + }, + { + "epoch": 0.12634028759039148, + "grad_norm": 1.0638266801834106, + "learning_rate": 6.316489361702128e-05, + "loss": 0.6948, + "step": 1900 + }, + { + "epoch": 0.12966503200066495, + "grad_norm": 1.7689710855484009, + "learning_rate": 6.482712765957447e-05, + "loss": 0.7296, + "step": 1950 + }, + { + "epoch": 0.1329897764109384, + "grad_norm": 1.052815556526184, + "learning_rate": 6.648936170212766e-05, + "loss": 0.732, + "step": 2000 + }, + { + "epoch": 0.13631452082121187, + "grad_norm": 0.7970194220542908, + "learning_rate": 6.815159574468085e-05, + "loss": 0.6978, + "step": 2050 + }, + { + "epoch": 0.13963926523148534, + "grad_norm": 0.7442440986633301, + "learning_rate": 6.981382978723405e-05, + "loss": 0.7093, + "step": 2100 + }, + { + "epoch": 0.1429640096417588, + "grad_norm": 1.5074230432510376, + "learning_rate": 7.147606382978723e-05, + "loss": 0.7114, + "step": 2150 + }, + { + "epoch": 0.14628875405203226, + "grad_norm": 1.4102712869644165, + "learning_rate": 7.313829787234043e-05, + "loss": 0.7156, + "step": 2200 + }, + { + "epoch": 0.1496134984623057, + "grad_norm": 1.1298854351043701, + "learning_rate": 7.480053191489363e-05, + "loss": 0.6908, + "step": 2250 + }, + { + "epoch": 0.15293824287257918, + "grad_norm": 1.149941325187683, + "learning_rate": 7.646276595744682e-05, + "loss": 0.7239, + "step": 2300 + }, + { + "epoch": 0.15626298728285262, + "grad_norm": 0.860246479511261, + "learning_rate": 7.8125e-05, + "loss": 0.7164, + "step": 2350 + }, + { + "epoch": 0.1595877316931261, + "grad_norm": 0.9874492883682251, + "learning_rate": 7.978723404255319e-05, + "loss": 0.7372, + "step": 2400 + }, + { + "epoch": 0.16291247610339954, + "grad_norm": 1.1497923135757446, + "learning_rate": 8.144946808510639e-05, + "loss": 0.6838, + "step": 2450 + }, + { + "epoch": 0.166237220513673, + "grad_norm": 0.8075393438339233, + "learning_rate": 8.311170212765957e-05, + "loss": 0.6905, + "step": 2500 + }, + { + "epoch": 0.16956196492394648, + "grad_norm": 1.4072059392929077, + "learning_rate": 8.477393617021277e-05, + "loss": 0.6824, + "step": 2550 + }, + { + "epoch": 0.17288670933421993, + "grad_norm": 1.4905030727386475, + "learning_rate": 8.643617021276597e-05, + "loss": 0.6813, + "step": 2600 + }, + { + "epoch": 0.1762114537444934, + "grad_norm": 1.1402111053466797, + "learning_rate": 8.809840425531916e-05, + "loss": 0.6849, + "step": 2650 + }, + { + "epoch": 0.17953619815476685, + "grad_norm": 1.8863836526870728, + "learning_rate": 8.976063829787234e-05, + "loss": 0.6488, + "step": 2700 + }, + { + "epoch": 0.18286094256504032, + "grad_norm": 1.0747898817062378, + "learning_rate": 9.142287234042554e-05, + "loss": 0.6468, + "step": 2750 + }, + { + "epoch": 0.18618568697531376, + "grad_norm": 1.4408470392227173, + "learning_rate": 9.308510638297873e-05, + "loss": 0.6661, + "step": 2800 + }, + { + "epoch": 0.18951043138558724, + "grad_norm": 0.6010861396789551, + "learning_rate": 9.474734042553191e-05, + "loss": 0.6519, + "step": 2850 + }, + { + "epoch": 0.19283517579586068, + "grad_norm": 1.3736897706985474, + "learning_rate": 9.640957446808511e-05, + "loss": 0.6852, + "step": 2900 + }, + { + "epoch": 0.19615992020613415, + "grad_norm": 0.7717883586883545, + "learning_rate": 9.807180851063831e-05, + "loss": 0.6286, + "step": 2950 + }, + { + "epoch": 0.19948466461640763, + "grad_norm": 2.024017572402954, + "learning_rate": 9.97340425531915e-05, + "loss": 0.7135, + "step": 3000 + }, + { + "epoch": 0.20280940902668107, + "grad_norm": 0.49820956587791443, + "learning_rate": 9.999940594707412e-05, + "loss": 0.6312, + "step": 3050 + }, + { + "epoch": 0.20613415343695454, + "grad_norm": 0.9031746983528137, + "learning_rate": 9.999714964504067e-05, + "loss": 0.651, + "step": 3100 + }, + { + "epoch": 0.209458897847228, + "grad_norm": 0.7573064565658569, + "learning_rate": 9.999320961690213e-05, + "loss": 0.6373, + "step": 3150 + }, + { + "epoch": 0.21278364225750146, + "grad_norm": 0.5948196649551392, + "learning_rate": 9.998758599534463e-05, + "loss": 0.7085, + "step": 3200 + }, + { + "epoch": 0.2161083866677749, + "grad_norm": 1.0268886089324951, + "learning_rate": 9.998027896975173e-05, + "loss": 0.684, + "step": 3250 + }, + { + "epoch": 0.21943313107804838, + "grad_norm": 0.7842527031898499, + "learning_rate": 9.997128878619808e-05, + "loss": 0.6139, + "step": 3300 + }, + { + "epoch": 0.22275787548832182, + "grad_norm": 0.5904123187065125, + "learning_rate": 9.996061574744102e-05, + "loss": 0.6458, + "step": 3350 + }, + { + "epoch": 0.2260826198985953, + "grad_norm": 1.0376912355422974, + "learning_rate": 9.994826021291056e-05, + "loss": 0.6125, + "step": 3400 + }, + { + "epoch": 0.22940736430886877, + "grad_norm": 0.8252595663070679, + "learning_rate": 9.993422259869713e-05, + "loss": 0.6086, + "step": 3450 + }, + { + "epoch": 0.2327321087191422, + "grad_norm": 0.7072761654853821, + "learning_rate": 9.991850337753762e-05, + "loss": 0.6516, + "step": 3500 + }, + { + "epoch": 0.23605685312941568, + "grad_norm": 0.5640501976013184, + "learning_rate": 9.990110307879952e-05, + "loss": 0.5939, + "step": 3550 + }, + { + "epoch": 0.23938159753968913, + "grad_norm": 0.9937716126441956, + "learning_rate": 9.988202228846291e-05, + "loss": 0.5968, + "step": 3600 + }, + { + "epoch": 0.2427063419499626, + "grad_norm": 0.6056251525878906, + "learning_rate": 9.986126164910094e-05, + "loss": 0.6498, + "step": 3650 + }, + { + "epoch": 0.24603108636023605, + "grad_norm": 0.5068459510803223, + "learning_rate": 9.983882185985808e-05, + "loss": 0.6477, + "step": 3700 + }, + { + "epoch": 0.24935583077050952, + "grad_norm": 0.6225175857543945, + "learning_rate": 9.98147036764266e-05, + "loss": 0.6328, + "step": 3750 + }, + { + "epoch": 0.25268057518078296, + "grad_norm": 0.5870462656021118, + "learning_rate": 9.978890791102109e-05, + "loss": 0.649, + "step": 3800 + }, + { + "epoch": 0.25600531959105643, + "grad_norm": 3.570667028427124, + "learning_rate": 9.976143543235114e-05, + "loss": 0.6618, + "step": 3850 + }, + { + "epoch": 0.2593300640013299, + "grad_norm": 0.6308077573776245, + "learning_rate": 9.973228716559209e-05, + "loss": 0.6205, + "step": 3900 + }, + { + "epoch": 0.2626548084116034, + "grad_norm": 0.4385659396648407, + "learning_rate": 9.970146409235386e-05, + "loss": 0.5935, + "step": 3950 + }, + { + "epoch": 0.2659795528218768, + "grad_norm": 0.5376986861228943, + "learning_rate": 9.966896725064786e-05, + "loss": 0.596, + "step": 4000 + }, + { + "epoch": 0.26930429723215027, + "grad_norm": 0.7176681160926819, + "learning_rate": 9.963479773485211e-05, + "loss": 0.6313, + "step": 4050 + }, + { + "epoch": 0.27262904164242374, + "grad_norm": 0.6590093374252319, + "learning_rate": 9.959895669567435e-05, + "loss": 0.6126, + "step": 4100 + }, + { + "epoch": 0.2759537860526972, + "grad_norm": 0.5095696449279785, + "learning_rate": 9.956144534011318e-05, + "loss": 0.6132, + "step": 4150 + }, + { + "epoch": 0.2792785304629707, + "grad_norm": 0.7277359962463379, + "learning_rate": 9.952226493141765e-05, + "loss": 0.6075, + "step": 4200 + }, + { + "epoch": 0.2826032748732441, + "grad_norm": 0.41294175386428833, + "learning_rate": 9.94814167890445e-05, + "loss": 0.5422, + "step": 4250 + }, + { + "epoch": 0.2859280192835176, + "grad_norm": 0.7728482484817505, + "learning_rate": 9.943890228861383e-05, + "loss": 0.573, + "step": 4300 + }, + { + "epoch": 0.28925276369379105, + "grad_norm": 0.6309620141983032, + "learning_rate": 9.939472286186271e-05, + "loss": 0.6314, + "step": 4350 + }, + { + "epoch": 0.2925775081040645, + "grad_norm": 0.4572204649448395, + "learning_rate": 9.934887999659707e-05, + "loss": 0.5865, + "step": 4400 + }, + { + "epoch": 0.29590225251433794, + "grad_norm": 0.44096603989601135, + "learning_rate": 9.930137523664149e-05, + "loss": 0.5994, + "step": 4450 + }, + { + "epoch": 0.2992269969246114, + "grad_norm": 0.8694889545440674, + "learning_rate": 9.925221018178728e-05, + "loss": 0.6174, + "step": 4500 + }, + { + "epoch": 0.3025517413348849, + "grad_norm": 0.41049107909202576, + "learning_rate": 9.920138648773852e-05, + "loss": 0.5778, + "step": 4550 + }, + { + "epoch": 0.30587648574515836, + "grad_norm": 0.5933430790901184, + "learning_rate": 9.914890586605638e-05, + "loss": 0.5745, + "step": 4600 + }, + { + "epoch": 0.3092012301554318, + "grad_norm": 0.8841090798377991, + "learning_rate": 9.90947700841015e-05, + "loss": 0.6356, + "step": 4650 + }, + { + "epoch": 0.31252597456570524, + "grad_norm": 0.6952012181282043, + "learning_rate": 9.903898096497441e-05, + "loss": 0.6593, + "step": 4700 + }, + { + "epoch": 0.3158507189759787, + "grad_norm": 0.43332594633102417, + "learning_rate": 9.898154038745408e-05, + "loss": 0.637, + "step": 4750 + }, + { + "epoch": 0.3191754633862522, + "grad_norm": 0.5256862640380859, + "learning_rate": 9.892245028593483e-05, + "loss": 0.6007, + "step": 4800 + }, + { + "epoch": 0.32250020779652566, + "grad_norm": 0.643079400062561, + "learning_rate": 9.886171265036102e-05, + "loss": 0.5526, + "step": 4850 + }, + { + "epoch": 0.3258249522067991, + "grad_norm": 0.9413782954216003, + "learning_rate": 9.879932952616009e-05, + "loss": 0.5863, + "step": 4900 + }, + { + "epoch": 0.32914969661707255, + "grad_norm": 0.46808409690856934, + "learning_rate": 9.873530301417373e-05, + "loss": 0.579, + "step": 4950 + }, + { + "epoch": 0.332474441027346, + "grad_norm": 0.6770108342170715, + "learning_rate": 9.8669635270587e-05, + "loss": 0.6135, + "step": 5000 + }, + { + "epoch": 0.3357991854376195, + "grad_norm": 0.443143367767334, + "learning_rate": 9.860232850685589e-05, + "loss": 0.5849, + "step": 5050 + }, + { + "epoch": 0.33912392984789297, + "grad_norm": 0.4198506772518158, + "learning_rate": 9.853338498963272e-05, + "loss": 0.591, + "step": 5100 + }, + { + "epoch": 0.3424486742581664, + "grad_norm": 0.3660542666912079, + "learning_rate": 9.846280704068982e-05, + "loss": 0.6121, + "step": 5150 + }, + { + "epoch": 0.34577341866843986, + "grad_norm": 0.5003857016563416, + "learning_rate": 9.839059703684139e-05, + "loss": 0.5705, + "step": 5200 + }, + { + "epoch": 0.34909816307871333, + "grad_norm": 0.5552676916122437, + "learning_rate": 9.831675740986346e-05, + "loss": 0.633, + "step": 5250 + }, + { + "epoch": 0.3524229074889868, + "grad_norm": 0.2692893445491791, + "learning_rate": 9.82412906464119e-05, + "loss": 0.5706, + "step": 5300 + }, + { + "epoch": 0.3557476518992602, + "grad_norm": 1.03011953830719, + "learning_rate": 9.816419928793879e-05, + "loss": 0.5357, + "step": 5350 + }, + { + "epoch": 0.3590723963095337, + "grad_norm": 0.4879101812839508, + "learning_rate": 9.808548593060681e-05, + "loss": 0.6065, + "step": 5400 + }, + { + "epoch": 0.36239714071980716, + "grad_norm": 0.4781530797481537, + "learning_rate": 9.800515322520174e-05, + "loss": 0.5832, + "step": 5450 + }, + { + "epoch": 0.36572188513008064, + "grad_norm": 0.519418478012085, + "learning_rate": 9.792320387704328e-05, + "loss": 0.5808, + "step": 5500 + }, + { + "epoch": 0.3690466295403541, + "grad_norm": 0.5514370203018188, + "learning_rate": 9.783964064589387e-05, + "loss": 0.6015, + "step": 5550 + }, + { + "epoch": 0.3723713739506275, + "grad_norm": 0.8520498871803284, + "learning_rate": 9.775446634586584e-05, + "loss": 0.6086, + "step": 5600 + }, + { + "epoch": 0.375696118360901, + "grad_norm": 0.6045146584510803, + "learning_rate": 9.766768384532654e-05, + "loss": 0.5674, + "step": 5650 + }, + { + "epoch": 0.37902086277117447, + "grad_norm": 0.44037437438964844, + "learning_rate": 9.757929606680181e-05, + "loss": 0.5112, + "step": 5700 + }, + { + "epoch": 0.38234560718144794, + "grad_norm": 0.41407912969589233, + "learning_rate": 9.748930598687752e-05, + "loss": 0.6066, + "step": 5750 + }, + { + "epoch": 0.38567035159172136, + "grad_norm": 0.33725085854530334, + "learning_rate": 9.73977166360994e-05, + "loss": 0.5997, + "step": 5800 + }, + { + "epoch": 0.38899509600199483, + "grad_norm": 0.9541948437690735, + "learning_rate": 9.730453109887087e-05, + "loss": 0.5986, + "step": 5850 + }, + { + "epoch": 0.3923198404122683, + "grad_norm": 0.31891727447509766, + "learning_rate": 9.720975251334929e-05, + "loss": 0.5235, + "step": 5900 + }, + { + "epoch": 0.3956445848225418, + "grad_norm": 0.869501531124115, + "learning_rate": 9.711338407134016e-05, + "loss": 0.62, + "step": 5950 + }, + { + "epoch": 0.39896932923281525, + "grad_norm": 0.38659653067588806, + "learning_rate": 9.701542901818974e-05, + "loss": 0.583, + "step": 6000 + }, + { + "epoch": 0.40229407364308867, + "grad_norm": 0.5901491045951843, + "learning_rate": 9.691589065267568e-05, + "loss": 0.5456, + "step": 6050 + }, + { + "epoch": 0.40561881805336214, + "grad_norm": 0.6315745711326599, + "learning_rate": 9.681477232689596e-05, + "loss": 0.5725, + "step": 6100 + }, + { + "epoch": 0.4089435624636356, + "grad_norm": 0.48777422308921814, + "learning_rate": 9.671207744615598e-05, + "loss": 0.6161, + "step": 6150 + }, + { + "epoch": 0.4122683068739091, + "grad_norm": 0.3584806025028229, + "learning_rate": 9.660780946885397e-05, + "loss": 0.5519, + "step": 6200 + }, + { + "epoch": 0.4155930512841825, + "grad_norm": 0.7234945297241211, + "learning_rate": 9.650197190636438e-05, + "loss": 0.6336, + "step": 6250 + }, + { + "epoch": 0.418917795694456, + "grad_norm": 1.169434905052185, + "learning_rate": 9.639456832291974e-05, + "loss": 0.5666, + "step": 6300 + }, + { + "epoch": 0.42224254010472945, + "grad_norm": 0.5370940566062927, + "learning_rate": 9.628560233549058e-05, + "loss": 0.55, + "step": 6350 + }, + { + "epoch": 0.4255672845150029, + "grad_norm": 1.2353452444076538, + "learning_rate": 9.617507761366367e-05, + "loss": 0.5756, + "step": 6400 + }, + { + "epoch": 0.4288920289252764, + "grad_norm": 0.4101187288761139, + "learning_rate": 9.606299787951836e-05, + "loss": 0.6014, + "step": 6450 + }, + { + "epoch": 0.4322167733355498, + "grad_norm": 0.38137727975845337, + "learning_rate": 9.594936690750129e-05, + "loss": 0.5764, + "step": 6500 + }, + { + "epoch": 0.4355415177458233, + "grad_norm": 0.618617057800293, + "learning_rate": 9.583418852429933e-05, + "loss": 0.5548, + "step": 6550 + }, + { + "epoch": 0.43886626215609675, + "grad_norm": 0.4934926927089691, + "learning_rate": 9.571746660871058e-05, + "loss": 0.5769, + "step": 6600 + }, + { + "epoch": 0.4421910065663702, + "grad_norm": 0.33685383200645447, + "learning_rate": 9.559920509151386e-05, + "loss": 0.562, + "step": 6650 + }, + { + "epoch": 0.44551575097664364, + "grad_norm": 0.43346357345581055, + "learning_rate": 9.547940795533627e-05, + "loss": 0.5478, + "step": 6700 + }, + { + "epoch": 0.4488404953869171, + "grad_norm": 0.5250598192214966, + "learning_rate": 9.535807923451911e-05, + "loss": 0.5292, + "step": 6750 + }, + { + "epoch": 0.4521652397971906, + "grad_norm": 0.3458341062068939, + "learning_rate": 9.523522301498202e-05, + "loss": 0.608, + "step": 6800 + }, + { + "epoch": 0.45548998420746406, + "grad_norm": 0.7067184448242188, + "learning_rate": 9.511084343408531e-05, + "loss": 0.5555, + "step": 6850 + }, + { + "epoch": 0.45881472861773753, + "grad_norm": 0.3423425853252411, + "learning_rate": 9.498494468049072e-05, + "loss": 0.5309, + "step": 6900 + }, + { + "epoch": 0.46213947302801095, + "grad_norm": 0.4263427257537842, + "learning_rate": 9.485753099402031e-05, + "loss": 0.5725, + "step": 6950 + }, + { + "epoch": 0.4654642174382844, + "grad_norm": 0.3699227273464203, + "learning_rate": 9.472860666551369e-05, + "loss": 0.544, + "step": 7000 + }, + { + "epoch": 0.4687889618485579, + "grad_norm": 0.34789547324180603, + "learning_rate": 9.459817603668351e-05, + "loss": 0.5701, + "step": 7050 + }, + { + "epoch": 0.47211370625883137, + "grad_norm": 0.3918140232563019, + "learning_rate": 9.446624349996929e-05, + "loss": 0.5523, + "step": 7100 + }, + { + "epoch": 0.47543845066910484, + "grad_norm": 0.39375773072242737, + "learning_rate": 9.433281349838941e-05, + "loss": 0.5224, + "step": 7150 + }, + { + "epoch": 0.47876319507937826, + "grad_norm": 0.49626776576042175, + "learning_rate": 9.419789052539157e-05, + "loss": 0.5807, + "step": 7200 + }, + { + "epoch": 0.48208793948965173, + "grad_norm": 0.4889478087425232, + "learning_rate": 9.406147912470143e-05, + "loss": 0.5496, + "step": 7250 + }, + { + "epoch": 0.4854126838999252, + "grad_norm": 0.8686904311180115, + "learning_rate": 9.392358389016961e-05, + "loss": 0.5681, + "step": 7300 + }, + { + "epoch": 0.4887374283101987, + "grad_norm": 1.1051130294799805, + "learning_rate": 9.378420946561697e-05, + "loss": 0.5595, + "step": 7350 + }, + { + "epoch": 0.4920621727204721, + "grad_norm": 0.3066134452819824, + "learning_rate": 9.364336054467819e-05, + "loss": 0.5436, + "step": 7400 + }, + { + "epoch": 0.49538691713074556, + "grad_norm": 0.5851086378097534, + "learning_rate": 9.350104187064379e-05, + "loss": 0.5452, + "step": 7450 + }, + { + "epoch": 0.49871166154101904, + "grad_norm": 0.5407485365867615, + "learning_rate": 9.335725823630035e-05, + "loss": 0.5603, + "step": 7500 + }, + { + "epoch": 0.5020364059512925, + "grad_norm": 0.4865974187850952, + "learning_rate": 9.321201448376904e-05, + "loss": 0.517, + "step": 7550 + }, + { + "epoch": 0.5053611503615659, + "grad_norm": 0.7569780945777893, + "learning_rate": 9.306531550434268e-05, + "loss": 0.5428, + "step": 7600 + }, + { + "epoch": 0.5086858947718395, + "grad_norm": 0.3547607958316803, + "learning_rate": 9.291716623832091e-05, + "loss": 0.5486, + "step": 7650 + }, + { + "epoch": 0.5120106391821129, + "grad_norm": 0.9511945843696594, + "learning_rate": 9.276757167484389e-05, + "loss": 0.5383, + "step": 7700 + }, + { + "epoch": 0.5153353835923863, + "grad_norm": 0.5263503789901733, + "learning_rate": 9.261653685172422e-05, + "loss": 0.6017, + "step": 7750 + }, + { + "epoch": 0.5186601280026598, + "grad_norm": 0.4225033223628998, + "learning_rate": 9.246406685527739e-05, + "loss": 0.5711, + "step": 7800 + }, + { + "epoch": 0.5219848724129332, + "grad_norm": 0.523815393447876, + "learning_rate": 9.231016682015035e-05, + "loss": 0.5859, + "step": 7850 + }, + { + "epoch": 0.5253096168232068, + "grad_norm": 0.48155075311660767, + "learning_rate": 9.21548419291487e-05, + "loss": 0.5151, + "step": 7900 + }, + { + "epoch": 0.5286343612334802, + "grad_norm": 0.7636247873306274, + "learning_rate": 9.19980974130621e-05, + "loss": 0.5288, + "step": 7950 + }, + { + "epoch": 0.5319591056437536, + "grad_norm": 0.3996843993663788, + "learning_rate": 9.183993855048811e-05, + "loss": 0.554, + "step": 8000 + }, + { + "epoch": 0.5352838500540271, + "grad_norm": 0.49857622385025024, + "learning_rate": 9.168037066765453e-05, + "loss": 0.5566, + "step": 8050 + }, + { + "epoch": 0.5386085944643005, + "grad_norm": 0.5007392764091492, + "learning_rate": 9.151939913823988e-05, + "loss": 0.5464, + "step": 8100 + }, + { + "epoch": 0.5419333388745741, + "grad_norm": 0.4842822551727295, + "learning_rate": 9.135702938319251e-05, + "loss": 0.5381, + "step": 8150 + }, + { + "epoch": 0.5452580832848475, + "grad_norm": 0.645003616809845, + "learning_rate": 9.119326687054802e-05, + "loss": 0.528, + "step": 8200 + }, + { + "epoch": 0.5485828276951209, + "grad_norm": 0.5707802176475525, + "learning_rate": 9.102811711524519e-05, + "loss": 0.5613, + "step": 8250 + }, + { + "epoch": 0.5519075721053944, + "grad_norm": 0.29192325472831726, + "learning_rate": 9.086158567894013e-05, + "loss": 0.5576, + "step": 8300 + }, + { + "epoch": 0.5552323165156678, + "grad_norm": 0.3091285228729248, + "learning_rate": 9.069367816981911e-05, + "loss": 0.54, + "step": 8350 + }, + { + "epoch": 0.5585570609259414, + "grad_norm": 0.4691781997680664, + "learning_rate": 9.052440024240956e-05, + "loss": 0.4902, + "step": 8400 + }, + { + "epoch": 0.5618818053362148, + "grad_norm": 0.38676175475120544, + "learning_rate": 9.03537575973898e-05, + "loss": 0.509, + "step": 8450 + }, + { + "epoch": 0.5652065497464882, + "grad_norm": 0.38752368092536926, + "learning_rate": 9.018175598139696e-05, + "loss": 0.5154, + "step": 8500 + }, + { + "epoch": 0.5685312941567617, + "grad_norm": 0.5562826991081238, + "learning_rate": 9.000840118683344e-05, + "loss": 0.535, + "step": 8550 + }, + { + "epoch": 0.5718560385670352, + "grad_norm": 0.8270835280418396, + "learning_rate": 8.983369905167191e-05, + "loss": 0.4827, + "step": 8600 + }, + { + "epoch": 0.5751807829773086, + "grad_norm": 0.4083782434463501, + "learning_rate": 8.965765545925869e-05, + "loss": 0.5161, + "step": 8650 + }, + { + "epoch": 0.5785055273875821, + "grad_norm": 0.47276222705841064, + "learning_rate": 8.948027633811557e-05, + "loss": 0.5239, + "step": 8700 + }, + { + "epoch": 0.5818302717978555, + "grad_norm": 0.6050196886062622, + "learning_rate": 8.930156766174025e-05, + "loss": 0.5577, + "step": 8750 + }, + { + "epoch": 0.585155016208129, + "grad_norm": 0.8670181632041931, + "learning_rate": 8.912153544840507e-05, + "loss": 0.5693, + "step": 8800 + }, + { + "epoch": 0.5884797606184025, + "grad_norm": 0.8589004874229431, + "learning_rate": 8.894018576095439e-05, + "loss": 0.4972, + "step": 8850 + }, + { + "epoch": 0.5918045050286759, + "grad_norm": 0.47463271021842957, + "learning_rate": 8.875752470660043e-05, + "loss": 0.5021, + "step": 8900 + }, + { + "epoch": 0.5951292494389494, + "grad_norm": 0.27892622351646423, + "learning_rate": 8.857355843671757e-05, + "loss": 0.5546, + "step": 8950 + }, + { + "epoch": 0.5984539938492228, + "grad_norm": 0.5175593495368958, + "learning_rate": 8.838829314663522e-05, + "loss": 0.5434, + "step": 9000 + }, + { + "epoch": 0.6017787382594963, + "grad_norm": 0.6045388579368591, + "learning_rate": 8.820173507542915e-05, + "loss": 0.5041, + "step": 9050 + }, + { + "epoch": 0.6051034826697698, + "grad_norm": 0.39441245794296265, + "learning_rate": 8.80138905057114e-05, + "loss": 0.5354, + "step": 9100 + }, + { + "epoch": 0.6084282270800432, + "grad_norm": 0.6685227751731873, + "learning_rate": 8.782476576341873e-05, + "loss": 0.5127, + "step": 9150 + }, + { + "epoch": 0.6117529714903167, + "grad_norm": 0.9093782305717468, + "learning_rate": 8.763436721759952e-05, + "loss": 0.4883, + "step": 9200 + }, + { + "epoch": 0.6150777159005901, + "grad_norm": 0.4950058162212372, + "learning_rate": 8.744270128019934e-05, + "loss": 0.4566, + "step": 9250 + }, + { + "epoch": 0.6184024603108637, + "grad_norm": 0.649726927280426, + "learning_rate": 8.724977440584497e-05, + "loss": 0.5758, + "step": 9300 + }, + { + "epoch": 0.6217272047211371, + "grad_norm": 0.6150277256965637, + "learning_rate": 8.705559309162712e-05, + "loss": 0.5346, + "step": 9350 + }, + { + "epoch": 0.6250519491314105, + "grad_norm": 0.35310274362564087, + "learning_rate": 8.686016387688153e-05, + "loss": 0.5128, + "step": 9400 + }, + { + "epoch": 0.628376693541684, + "grad_norm": 0.47013986110687256, + "learning_rate": 8.666349334296877e-05, + "loss": 0.4906, + "step": 9450 + }, + { + "epoch": 0.6317014379519574, + "grad_norm": 0.30959010124206543, + "learning_rate": 8.646558811305268e-05, + "loss": 0.5378, + "step": 9500 + }, + { + "epoch": 0.6350261823622309, + "grad_norm": 0.337326318025589, + "learning_rate": 8.626645485187722e-05, + "loss": 0.5361, + "step": 9550 + }, + { + "epoch": 0.6383509267725044, + "grad_norm": 0.6151895523071289, + "learning_rate": 8.60661002655421e-05, + "loss": 0.509, + "step": 9600 + }, + { + "epoch": 0.6416756711827778, + "grad_norm": 0.355437308549881, + "learning_rate": 8.586453110127688e-05, + "loss": 0.536, + "step": 9650 + }, + { + "epoch": 0.6450004155930513, + "grad_norm": 0.4256291687488556, + "learning_rate": 8.566175414721384e-05, + "loss": 0.4913, + "step": 9700 + }, + { + "epoch": 0.6483251600033247, + "grad_norm": 0.5116075277328491, + "learning_rate": 8.545777623215927e-05, + "loss": 0.5193, + "step": 9750 + }, + { + "epoch": 0.6516499044135982, + "grad_norm": 0.5095045566558838, + "learning_rate": 8.525260422536358e-05, + "loss": 0.5351, + "step": 9800 + }, + { + "epoch": 0.6549746488238717, + "grad_norm": 0.41416656970977783, + "learning_rate": 8.504624503628995e-05, + "loss": 0.5594, + "step": 9850 + }, + { + "epoch": 0.6582993932341451, + "grad_norm": 0.2669268846511841, + "learning_rate": 8.483870561438161e-05, + "loss": 0.5219, + "step": 9900 + }, + { + "epoch": 0.6616241376444186, + "grad_norm": 0.576519250869751, + "learning_rate": 8.462999294882783e-05, + "loss": 0.5606, + "step": 9950 + }, + { + "epoch": 0.664948882054692, + "grad_norm": 0.8234946727752686, + "learning_rate": 8.442011406832859e-05, + "loss": 0.5011, + "step": 10000 + }, + { + "epoch": 0.664948882054692, + "eval_loss": 0.4348411560058594, + "eval_runtime": 4362.332, + "eval_samples_per_second": 1.532, + "eval_steps_per_second": 1.532, + "step": 10000 + }, + { + "epoch": 0.6682736264649655, + "grad_norm": 0.11554688215255737, + "learning_rate": 8.420907604085781e-05, + "loss": 0.5451, + "step": 10050 + }, + { + "epoch": 0.671598370875239, + "grad_norm": 0.3037506937980652, + "learning_rate": 8.399688597342535e-05, + "loss": 0.5388, + "step": 10100 + }, + { + "epoch": 0.6749231152855124, + "grad_norm": 0.32672300934791565, + "learning_rate": 8.378355101183769e-05, + "loss": 0.5205, + "step": 10150 + }, + { + "epoch": 0.6782478596957859, + "grad_norm": 0.7825640439987183, + "learning_rate": 8.356907834045726e-05, + "loss": 0.5349, + "step": 10200 + }, + { + "epoch": 0.6815726041060594, + "grad_norm": 0.43441250920295715, + "learning_rate": 8.335347518196052e-05, + "loss": 0.4955, + "step": 10250 + }, + { + "epoch": 0.6848973485163328, + "grad_norm": 0.48924630880355835, + "learning_rate": 8.313674879709475e-05, + "loss": 0.5571, + "step": 10300 + }, + { + "epoch": 0.6882220929266063, + "grad_norm": 0.574004590511322, + "learning_rate": 8.29189064844334e-05, + "loss": 0.5335, + "step": 10350 + }, + { + "epoch": 0.6915468373368797, + "grad_norm": 0.46639284491539, + "learning_rate": 8.269995558013049e-05, + "loss": 0.5151, + "step": 10400 + }, + { + "epoch": 0.6948715817471531, + "grad_norm": 0.9258661866188049, + "learning_rate": 8.24799034576734e-05, + "loss": 0.4983, + "step": 10450 + }, + { + "epoch": 0.6981963261574267, + "grad_norm": 0.2828778922557831, + "learning_rate": 8.225875752763468e-05, + "loss": 0.5027, + "step": 10500 + }, + { + "epoch": 0.7015210705677001, + "grad_norm": 0.40603527426719666, + "learning_rate": 8.203652523742237e-05, + "loss": 0.4735, + "step": 10550 + }, + { + "epoch": 0.7048458149779736, + "grad_norm": 0.6398336291313171, + "learning_rate": 8.181321407102929e-05, + "loss": 0.4945, + "step": 10600 + }, + { + "epoch": 0.708170559388247, + "grad_norm": 0.5521181225776672, + "learning_rate": 8.158883154878094e-05, + "loss": 0.5094, + "step": 10650 + }, + { + "epoch": 0.7114953037985204, + "grad_norm": 0.41034767031669617, + "learning_rate": 8.136338522708233e-05, + "loss": 0.5064, + "step": 10700 + }, + { + "epoch": 0.714820048208794, + "grad_norm": 0.34174844622612, + "learning_rate": 8.11368826981634e-05, + "loss": 0.4934, + "step": 10750 + }, + { + "epoch": 0.7181447926190674, + "grad_norm": 0.43941041827201843, + "learning_rate": 8.090933158982338e-05, + "loss": 0.5097, + "step": 10800 + }, + { + "epoch": 0.7214695370293409, + "grad_norm": 0.7018864154815674, + "learning_rate": 8.068073956517397e-05, + "loss": 0.4923, + "step": 10850 + }, + { + "epoch": 0.7247942814396143, + "grad_norm": 0.6354297995567322, + "learning_rate": 8.045111432238121e-05, + "loss": 0.4611, + "step": 10900 + }, + { + "epoch": 0.7281190258498877, + "grad_norm": 0.49247485399246216, + "learning_rate": 8.022046359440623e-05, + "loss": 0.5119, + "step": 10950 + }, + { + "epoch": 0.7314437702601613, + "grad_norm": 0.7940396666526794, + "learning_rate": 7.998879514874491e-05, + "loss": 0.5359, + "step": 11000 + }, + { + "epoch": 0.7347685146704347, + "grad_norm": 0.35591453313827515, + "learning_rate": 7.975611678716615e-05, + "loss": 0.532, + "step": 11050 + }, + { + "epoch": 0.7380932590807082, + "grad_norm": 0.3358200490474701, + "learning_rate": 7.952243634544929e-05, + "loss": 0.4793, + "step": 11100 + }, + { + "epoch": 0.7414180034909816, + "grad_norm": 0.6496360898017883, + "learning_rate": 7.928776169312016e-05, + "loss": 0.4981, + "step": 11150 + }, + { + "epoch": 0.744742747901255, + "grad_norm": 0.646221935749054, + "learning_rate": 7.905210073318605e-05, + "loss": 0.4578, + "step": 11200 + }, + { + "epoch": 0.7480674923115286, + "grad_norm": 1.0316184759140015, + "learning_rate": 7.881546140186958e-05, + "loss": 0.5101, + "step": 11250 + }, + { + "epoch": 0.751392236721802, + "grad_norm": 0.6004906296730042, + "learning_rate": 7.857785166834144e-05, + "loss": 0.4905, + "step": 11300 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.3192722797393799, + "learning_rate": 7.833927953445202e-05, + "loss": 0.4868, + "step": 11350 + }, + { + "epoch": 0.7580417255423489, + "grad_norm": 0.591340959072113, + "learning_rate": 7.809975303446195e-05, + "loss": 0.5107, + "step": 11400 + }, + { + "epoch": 0.7613664699526224, + "grad_norm": 1.268475890159607, + "learning_rate": 7.785928023477142e-05, + "loss": 0.4964, + "step": 11450 + }, + { + "epoch": 0.7646912143628959, + "grad_norm": 0.5102896690368652, + "learning_rate": 7.761786923364878e-05, + "loss": 0.5179, + "step": 11500 + }, + { + "epoch": 0.7680159587731693, + "grad_norm": 0.6084123849868774, + "learning_rate": 7.737552816095754e-05, + "loss": 0.4624, + "step": 11550 + }, + { + "epoch": 0.7713407031834427, + "grad_norm": 0.668404221534729, + "learning_rate": 7.713226517788275e-05, + "loss": 0.4917, + "step": 11600 + }, + { + "epoch": 0.7746654475937162, + "grad_norm": 0.6722292900085449, + "learning_rate": 7.688808847665612e-05, + "loss": 0.523, + "step": 11650 + }, + { + "epoch": 0.7779901920039897, + "grad_norm": 0.7151288390159607, + "learning_rate": 7.664300628028017e-05, + "loss": 0.5019, + "step": 11700 + }, + { + "epoch": 0.7813149364142632, + "grad_norm": 0.5944408178329468, + "learning_rate": 7.639702684225123e-05, + "loss": 0.5049, + "step": 11750 + }, + { + "epoch": 0.7846396808245366, + "grad_norm": 0.6515059471130371, + "learning_rate": 7.615015844628157e-05, + "loss": 0.4823, + "step": 11800 + }, + { + "epoch": 0.78796442523481, + "grad_norm": 0.2709754705429077, + "learning_rate": 7.590240940602036e-05, + "loss": 0.4591, + "step": 11850 + }, + { + "epoch": 0.7912891696450836, + "grad_norm": 0.42835110425949097, + "learning_rate": 7.565378806477377e-05, + "loss": 0.4851, + "step": 11900 + }, + { + "epoch": 0.794613914055357, + "grad_norm": 0.4259116053581238, + "learning_rate": 7.540430279522395e-05, + "loss": 0.4735, + "step": 11950 + }, + { + "epoch": 0.7979386584656305, + "grad_norm": 0.8144826292991638, + "learning_rate": 7.515396199914708e-05, + "loss": 0.4941, + "step": 12000 + }, + { + "epoch": 0.8012634028759039, + "grad_norm": 0.6449247002601624, + "learning_rate": 7.490277410713044e-05, + "loss": 0.4753, + "step": 12050 + }, + { + "epoch": 0.8045881472861773, + "grad_norm": 0.45336267352104187, + "learning_rate": 7.46507475782885e-05, + "loss": 0.5152, + "step": 12100 + }, + { + "epoch": 0.8079128916964509, + "grad_norm": 0.4385122060775757, + "learning_rate": 7.439789089997796e-05, + "loss": 0.523, + "step": 12150 + }, + { + "epoch": 0.8112376361067243, + "grad_norm": 0.36548131704330444, + "learning_rate": 7.414421258751212e-05, + "loss": 0.4939, + "step": 12200 + }, + { + "epoch": 0.8145623805169978, + "grad_norm": 0.5231236219406128, + "learning_rate": 7.38897211838739e-05, + "loss": 0.4787, + "step": 12250 + }, + { + "epoch": 0.8178871249272712, + "grad_norm": 0.4674762785434723, + "learning_rate": 7.363442525942826e-05, + "loss": 0.4973, + "step": 12300 + }, + { + "epoch": 0.8212118693375446, + "grad_norm": 0.4448126554489136, + "learning_rate": 7.337833341163358e-05, + "loss": 0.51, + "step": 12350 + }, + { + "epoch": 0.8245366137478182, + "grad_norm": 0.3027734160423279, + "learning_rate": 7.31214542647521e-05, + "loss": 0.5298, + "step": 12400 + }, + { + "epoch": 0.8278613581580916, + "grad_norm": 0.7488745450973511, + "learning_rate": 7.286379646955946e-05, + "loss": 0.492, + "step": 12450 + }, + { + "epoch": 0.831186102568365, + "grad_norm": 0.7628294229507446, + "learning_rate": 7.260536870305347e-05, + "loss": 0.4854, + "step": 12500 + }, + { + "epoch": 0.8345108469786385, + "grad_norm": 0.5619131326675415, + "learning_rate": 7.234617966816174e-05, + "loss": 0.4439, + "step": 12550 + }, + { + "epoch": 0.837835591388912, + "grad_norm": 0.2590934634208679, + "learning_rate": 7.208623809344879e-05, + "loss": 0.4949, + "step": 12600 + }, + { + "epoch": 0.8411603357991855, + "grad_norm": 0.47846803069114685, + "learning_rate": 7.182555273282193e-05, + "loss": 0.5091, + "step": 12650 + }, + { + "epoch": 0.8444850802094589, + "grad_norm": 0.5931444764137268, + "learning_rate": 7.156413236523656e-05, + "loss": 0.4732, + "step": 12700 + }, + { + "epoch": 0.8478098246197323, + "grad_norm": 0.4458071291446686, + "learning_rate": 7.130198579440052e-05, + "loss": 0.4639, + "step": 12750 + }, + { + "epoch": 0.8511345690300058, + "grad_norm": 0.35852745175361633, + "learning_rate": 7.103912184847757e-05, + "loss": 0.4818, + "step": 12800 + }, + { + "epoch": 0.8544593134402793, + "grad_norm": 0.496389776468277, + "learning_rate": 7.07755493797901e-05, + "loss": 0.4424, + "step": 12850 + }, + { + "epoch": 0.8577840578505528, + "grad_norm": 0.5191590189933777, + "learning_rate": 7.051127726452102e-05, + "loss": 0.4551, + "step": 12900 + }, + { + "epoch": 0.8611088022608262, + "grad_norm": 0.4241909682750702, + "learning_rate": 7.024631440241491e-05, + "loss": 0.4496, + "step": 12950 + }, + { + "epoch": 0.8644335466710996, + "grad_norm": 0.5711332559585571, + "learning_rate": 6.998066971647817e-05, + "loss": 0.49, + "step": 13000 + }, + { + "epoch": 0.8677582910813731, + "grad_norm": 0.29857337474823, + "learning_rate": 6.971435215267866e-05, + "loss": 0.4822, + "step": 13050 + }, + { + "epoch": 0.8710830354916466, + "grad_norm": 0.40240055322647095, + "learning_rate": 6.944737067964429e-05, + "loss": 0.4665, + "step": 13100 + }, + { + "epoch": 0.8744077799019201, + "grad_norm": 0.3824640214443207, + "learning_rate": 6.917973428836118e-05, + "loss": 0.4762, + "step": 13150 + }, + { + "epoch": 0.8777325243121935, + "grad_norm": 0.5857983231544495, + "learning_rate": 6.891145199187065e-05, + "loss": 0.4301, + "step": 13200 + }, + { + "epoch": 0.8810572687224669, + "grad_norm": 0.43042436242103577, + "learning_rate": 6.864253282496595e-05, + "loss": 0.5202, + "step": 13250 + }, + { + "epoch": 0.8843820131327405, + "grad_norm": 0.47307106852531433, + "learning_rate": 6.837298584388771e-05, + "loss": 0.4664, + "step": 13300 + }, + { + "epoch": 0.8877067575430139, + "grad_norm": 0.6773985624313354, + "learning_rate": 6.810282012601923e-05, + "loss": 0.4748, + "step": 13350 + }, + { + "epoch": 0.8910315019532873, + "grad_norm": 0.35346561670303345, + "learning_rate": 6.783204476958058e-05, + "loss": 0.4798, + "step": 13400 + }, + { + "epoch": 0.8943562463635608, + "grad_norm": 0.3380667269229889, + "learning_rate": 6.75606688933223e-05, + "loss": 0.5189, + "step": 13450 + }, + { + "epoch": 0.8976809907738342, + "grad_norm": 0.7384742498397827, + "learning_rate": 6.728870163621836e-05, + "loss": 0.4913, + "step": 13500 + }, + { + "epoch": 0.9010057351841078, + "grad_norm": 0.4812994599342346, + "learning_rate": 6.701615215715829e-05, + "loss": 0.453, + "step": 13550 + }, + { + "epoch": 0.9043304795943812, + "grad_norm": 0.3846656382083893, + "learning_rate": 6.674302963463876e-05, + "loss": 0.459, + "step": 13600 + }, + { + "epoch": 0.9076552240046546, + "grad_norm": 0.49682462215423584, + "learning_rate": 6.646934326645452e-05, + "loss": 0.5119, + "step": 13650 + }, + { + "epoch": 0.9109799684149281, + "grad_norm": 0.2614552974700928, + "learning_rate": 6.61951022693887e-05, + "loss": 0.4414, + "step": 13700 + }, + { + "epoch": 0.9143047128252015, + "grad_norm": 0.4591997563838959, + "learning_rate": 6.592031587890224e-05, + "loss": 0.5148, + "step": 13750 + }, + { + "epoch": 0.9176294572354751, + "grad_norm": 0.4885028600692749, + "learning_rate": 6.564499334882312e-05, + "loss": 0.4998, + "step": 13800 + }, + { + "epoch": 0.9209542016457485, + "grad_norm": 0.7544069290161133, + "learning_rate": 6.536914395103446e-05, + "loss": 0.5493, + "step": 13850 + }, + { + "epoch": 0.9242789460560219, + "grad_norm": 0.6468596458435059, + "learning_rate": 6.509277697516255e-05, + "loss": 0.4527, + "step": 13900 + }, + { + "epoch": 0.9276036904662954, + "grad_norm": 0.6907210350036621, + "learning_rate": 6.481590172826371e-05, + "loss": 0.4917, + "step": 13950 + }, + { + "epoch": 0.9309284348765688, + "grad_norm": 0.5351805090904236, + "learning_rate": 6.453852753451119e-05, + "loss": 0.5103, + "step": 14000 + }, + { + "epoch": 0.9342531792868424, + "grad_norm": 0.7480131983757019, + "learning_rate": 6.426066373488084e-05, + "loss": 0.4945, + "step": 14050 + }, + { + "epoch": 0.9375779236971158, + "grad_norm": 0.30603644251823425, + "learning_rate": 6.39823196868368e-05, + "loss": 0.5453, + "step": 14100 + }, + { + "epoch": 0.9409026681073892, + "grad_norm": 0.7746521830558777, + "learning_rate": 6.370350476401624e-05, + "loss": 0.4735, + "step": 14150 + }, + { + "epoch": 0.9442274125176627, + "grad_norm": 1.2858827114105225, + "learning_rate": 6.342422835591368e-05, + "loss": 0.5343, + "step": 14200 + }, + { + "epoch": 0.9475521569279362, + "grad_norm": 0.45832210779190063, + "learning_rate": 6.314449986756489e-05, + "loss": 0.5078, + "step": 14250 + }, + { + "epoch": 0.9508769013382097, + "grad_norm": 0.5275512337684631, + "learning_rate": 6.286432871923e-05, + "loss": 0.4985, + "step": 14300 + }, + { + "epoch": 0.9542016457484831, + "grad_norm": 0.25901734828948975, + "learning_rate": 6.258372434607645e-05, + "loss": 0.4495, + "step": 14350 + }, + { + "epoch": 0.9575263901587565, + "grad_norm": 0.5617000460624695, + "learning_rate": 6.230269619786111e-05, + "loss": 0.4606, + "step": 14400 + }, + { + "epoch": 0.96085113456903, + "grad_norm": 0.7701844573020935, + "learning_rate": 6.202125373861207e-05, + "loss": 0.4699, + "step": 14450 + }, + { + "epoch": 0.9641758789793035, + "grad_norm": 0.33021387457847595, + "learning_rate": 6.173940644630996e-05, + "loss": 0.4802, + "step": 14500 + }, + { + "epoch": 0.9675006233895769, + "grad_norm": 0.40716060996055603, + "learning_rate": 6.145716381256873e-05, + "loss": 0.4557, + "step": 14550 + }, + { + "epoch": 0.9708253677998504, + "grad_norm": 0.49702873826026917, + "learning_rate": 6.117453534231606e-05, + "loss": 0.4472, + "step": 14600 + }, + { + "epoch": 0.9741501122101238, + "grad_norm": 0.4306732714176178, + "learning_rate": 6.0891530553473195e-05, + "loss": 0.4498, + "step": 14650 + }, + { + "epoch": 0.9774748566203973, + "grad_norm": 0.43764013051986694, + "learning_rate": 6.060815897663447e-05, + "loss": 0.4741, + "step": 14700 + }, + { + "epoch": 0.9807996010306708, + "grad_norm": 0.6434122323989868, + "learning_rate": 6.0324430154746316e-05, + "loss": 0.4821, + "step": 14750 + }, + { + "epoch": 0.9841243454409442, + "grad_norm": 0.5440848469734192, + "learning_rate": 6.004035364278593e-05, + "loss": 0.4687, + "step": 14800 + }, + { + "epoch": 0.9874490898512177, + "grad_norm": 0.7061350345611572, + "learning_rate": 5.9755939007439445e-05, + "loss": 0.427, + "step": 14850 + }, + { + "epoch": 0.9907738342614911, + "grad_norm": 1.1848245859146118, + "learning_rate": 5.9471195826779834e-05, + "loss": 0.4594, + "step": 14900 + }, + { + "epoch": 0.9940985786717647, + "grad_norm": 0.5134018659591675, + "learning_rate": 5.918613368994423e-05, + "loss": 0.482, + "step": 14950 + }, + { + "epoch": 0.9974233230820381, + "grad_norm": 0.4567798674106598, + "learning_rate": 5.8900762196811175e-05, + "loss": 0.4322, + "step": 15000 + }, + { + "epoch": 1.0007480674923115, + "grad_norm": 0.49805304408073425, + "learning_rate": 5.861509095767714e-05, + "loss": 0.4441, + "step": 15050 + }, + { + "epoch": 1.004072811902585, + "grad_norm": 0.7849738597869873, + "learning_rate": 5.832912959293304e-05, + "loss": 0.4052, + "step": 15100 + }, + { + "epoch": 1.0073975563128585, + "grad_norm": 0.6468079686164856, + "learning_rate": 5.804288773274011e-05, + "loss": 0.4061, + "step": 15150 + }, + { + "epoch": 1.0107223007231319, + "grad_norm": 0.3043822944164276, + "learning_rate": 5.775637501670579e-05, + "loss": 0.3922, + "step": 15200 + }, + { + "epoch": 1.0140470451334054, + "grad_norm": 0.3926864266395569, + "learning_rate": 5.7469601093558854e-05, + "loss": 0.444, + "step": 15250 + }, + { + "epoch": 1.017371789543679, + "grad_norm": 0.2536259889602661, + "learning_rate": 5.718257562082471e-05, + "loss": 0.4149, + "step": 15300 + }, + { + "epoch": 1.0206965339539522, + "grad_norm": 0.9208493232727051, + "learning_rate": 5.689530826449997e-05, + "loss": 0.3901, + "step": 15350 + }, + { + "epoch": 1.0240212783642257, + "grad_norm": 0.34819620847702026, + "learning_rate": 5.660780869872711e-05, + "loss": 0.4268, + "step": 15400 + }, + { + "epoch": 1.0273460227744993, + "grad_norm": 1.2284760475158691, + "learning_rate": 5.632008660546853e-05, + "loss": 0.4328, + "step": 15450 + }, + { + "epoch": 1.0306707671847726, + "grad_norm": 0.3599700629711151, + "learning_rate": 5.6032151674180575e-05, + "loss": 0.3798, + "step": 15500 + }, + { + "epoch": 1.033995511595046, + "grad_norm": 0.31814679503440857, + "learning_rate": 5.574401360148727e-05, + "loss": 0.4215, + "step": 15550 + }, + { + "epoch": 1.0373202560053196, + "grad_norm": 0.6101735234260559, + "learning_rate": 5.5455682090853624e-05, + "loss": 0.4242, + "step": 15600 + }, + { + "epoch": 1.0406450004155932, + "grad_norm": 0.8657062649726868, + "learning_rate": 5.5167166852259055e-05, + "loss": 0.4163, + "step": 15650 + }, + { + "epoch": 1.0439697448258665, + "grad_norm": 0.4173007309436798, + "learning_rate": 5.4878477601870194e-05, + "loss": 0.3637, + "step": 15700 + }, + { + "epoch": 1.04729448923614, + "grad_norm": 0.13609103858470917, + "learning_rate": 5.458962406171384e-05, + "loss": 0.4117, + "step": 15750 + }, + { + "epoch": 1.0506192336464135, + "grad_norm": 0.4456997513771057, + "learning_rate": 5.430061595934941e-05, + "loss": 0.4177, + "step": 15800 + }, + { + "epoch": 1.0539439780566868, + "grad_norm": 0.549238383769989, + "learning_rate": 5.401146302754153e-05, + "loss": 0.4446, + "step": 15850 + }, + { + "epoch": 1.0572687224669604, + "grad_norm": 0.4045184850692749, + "learning_rate": 5.372217500393205e-05, + "loss": 0.4097, + "step": 15900 + }, + { + "epoch": 1.0605934668772339, + "grad_norm": 0.3764871060848236, + "learning_rate": 5.3432761630712335e-05, + "loss": 0.378, + "step": 15950 + }, + { + "epoch": 1.0639182112875072, + "grad_norm": 0.46142441034317017, + "learning_rate": 5.314323265429501e-05, + "loss": 0.373, + "step": 16000 + }, + { + "epoch": 1.0672429556977807, + "grad_norm": 0.671380341053009, + "learning_rate": 5.285359782498582e-05, + "loss": 0.4159, + "step": 16050 + }, + { + "epoch": 1.0705677001080542, + "grad_norm": 0.5095057487487793, + "learning_rate": 5.2563866896655275e-05, + "loss": 0.4017, + "step": 16100 + }, + { + "epoch": 1.0738924445183278, + "grad_norm": 0.8826057314872742, + "learning_rate": 5.227404962641016e-05, + "loss": 0.4627, + "step": 16150 + }, + { + "epoch": 1.077217188928601, + "grad_norm": 0.740010142326355, + "learning_rate": 5.198415577426493e-05, + "loss": 0.4073, + "step": 16200 + }, + { + "epoch": 1.0805419333388746, + "grad_norm": 0.4958501160144806, + "learning_rate": 5.1694195102813046e-05, + "loss": 0.4024, + "step": 16250 + }, + { + "epoch": 1.0838666777491481, + "grad_norm": 0.8271908760070801, + "learning_rate": 5.140417737689822e-05, + "loss": 0.4322, + "step": 16300 + }, + { + "epoch": 1.0871914221594214, + "grad_norm": 0.43836158514022827, + "learning_rate": 5.111411236328555e-05, + "loss": 0.3967, + "step": 16350 + }, + { + "epoch": 1.090516166569695, + "grad_norm": 0.6598096489906311, + "learning_rate": 5.0824009830332606e-05, + "loss": 0.4123, + "step": 16400 + }, + { + "epoch": 1.0938409109799685, + "grad_norm": 0.5519174337387085, + "learning_rate": 5.053387954766049e-05, + "loss": 0.398, + "step": 16450 + }, + { + "epoch": 1.0971656553902418, + "grad_norm": 0.4203943610191345, + "learning_rate": 5.02437312858248e-05, + "loss": 0.4162, + "step": 16500 + }, + { + "epoch": 1.1004903998005153, + "grad_norm": 1.1314325332641602, + "learning_rate": 4.995357481598663e-05, + "loss": 0.3965, + "step": 16550 + }, + { + "epoch": 1.1038151442107889, + "grad_norm": 0.35250428318977356, + "learning_rate": 4.966341990958347e-05, + "loss": 0.4073, + "step": 16600 + }, + { + "epoch": 1.1071398886210622, + "grad_norm": 0.3103015720844269, + "learning_rate": 4.937327633800018e-05, + "loss": 0.4228, + "step": 16650 + }, + { + "epoch": 1.1104646330313357, + "grad_norm": 0.7171920537948608, + "learning_rate": 4.908315387223985e-05, + "loss": 0.3919, + "step": 16700 + }, + { + "epoch": 1.1137893774416092, + "grad_norm": 0.4805893003940582, + "learning_rate": 4.87930622825949e-05, + "loss": 0.3981, + "step": 16750 + }, + { + "epoch": 1.1171141218518827, + "grad_norm": 0.5666382312774658, + "learning_rate": 4.850301133831786e-05, + "loss": 0.4203, + "step": 16800 + }, + { + "epoch": 1.120438866262156, + "grad_norm": 0.18436360359191895, + "learning_rate": 4.821301080729249e-05, + "loss": 0.4215, + "step": 16850 + }, + { + "epoch": 1.1237636106724296, + "grad_norm": 0.4461723566055298, + "learning_rate": 4.792307045570486e-05, + "loss": 0.4055, + "step": 16900 + }, + { + "epoch": 1.127088355082703, + "grad_norm": 0.3168890178203583, + "learning_rate": 4.7633200047714345e-05, + "loss": 0.3586, + "step": 16950 + }, + { + "epoch": 1.1304130994929764, + "grad_norm": 0.2920137941837311, + "learning_rate": 4.734340934512492e-05, + "loss": 0.4116, + "step": 17000 + }, + { + "epoch": 1.13373784390325, + "grad_norm": 0.40534549951553345, + "learning_rate": 4.70537081070563e-05, + "loss": 0.4024, + "step": 17050 + }, + { + "epoch": 1.1370625883135235, + "grad_norm": 0.5520055294036865, + "learning_rate": 4.6764106089615454e-05, + "loss": 0.4162, + "step": 17100 + }, + { + "epoch": 1.1403873327237968, + "grad_norm": 0.3157692849636078, + "learning_rate": 4.647461304556787e-05, + "loss": 0.3925, + "step": 17150 + }, + { + "epoch": 1.1437120771340703, + "grad_norm": 0.8194869756698608, + "learning_rate": 4.618523872400921e-05, + "loss": 0.4147, + "step": 17200 + }, + { + "epoch": 1.1470368215443438, + "grad_norm": 0.3826686441898346, + "learning_rate": 4.589599287003703e-05, + "loss": 0.4036, + "step": 17250 + }, + { + "epoch": 1.1503615659546171, + "grad_norm": 0.8618173599243164, + "learning_rate": 4.56068852244225e-05, + "loss": 0.4285, + "step": 17300 + }, + { + "epoch": 1.1536863103648907, + "grad_norm": 0.996113657951355, + "learning_rate": 4.5317925523282464e-05, + "loss": 0.3751, + "step": 17350 + }, + { + "epoch": 1.1570110547751642, + "grad_norm": 0.5433736443519592, + "learning_rate": 4.5029123497751514e-05, + "loss": 0.408, + "step": 17400 + }, + { + "epoch": 1.1603357991854377, + "grad_norm": 0.6233689188957214, + "learning_rate": 4.474048887365426e-05, + "loss": 0.4105, + "step": 17450 + }, + { + "epoch": 1.163660543595711, + "grad_norm": 0.6037063002586365, + "learning_rate": 4.445203137117788e-05, + "loss": 0.3618, + "step": 17500 + }, + { + "epoch": 1.1669852880059846, + "grad_norm": 0.8507609367370605, + "learning_rate": 4.4163760704544675e-05, + "loss": 0.4433, + "step": 17550 + }, + { + "epoch": 1.170310032416258, + "grad_norm": 0.4909146726131439, + "learning_rate": 4.3875686581685e-05, + "loss": 0.4128, + "step": 17600 + }, + { + "epoch": 1.1736347768265314, + "grad_norm": 0.4087628424167633, + "learning_rate": 4.358781870391033e-05, + "loss": 0.4035, + "step": 17650 + }, + { + "epoch": 1.176959521236805, + "grad_norm": 0.5856008529663086, + "learning_rate": 4.330016676558651e-05, + "loss": 0.3809, + "step": 17700 + }, + { + "epoch": 1.1802842656470784, + "grad_norm": 0.14058536291122437, + "learning_rate": 4.3012740453807346e-05, + "loss": 0.3875, + "step": 17750 + }, + { + "epoch": 1.1836090100573518, + "grad_norm": 0.2947339415550232, + "learning_rate": 4.272554944806831e-05, + "loss": 0.4059, + "step": 17800 + }, + { + "epoch": 1.1869337544676253, + "grad_norm": 0.6987840533256531, + "learning_rate": 4.243860341994062e-05, + "loss": 0.385, + "step": 17850 + }, + { + "epoch": 1.1902584988778988, + "grad_norm": 0.4702407717704773, + "learning_rate": 4.2151912032745547e-05, + "loss": 0.433, + "step": 17900 + }, + { + "epoch": 1.1935832432881721, + "grad_norm": 0.28774410486221313, + "learning_rate": 4.18654849412289e-05, + "loss": 0.3464, + "step": 17950 + }, + { + "epoch": 1.1969079876984456, + "grad_norm": 0.47577986121177673, + "learning_rate": 4.157933179123599e-05, + "loss": 0.406, + "step": 18000 + }, + { + "epoch": 1.2002327321087192, + "grad_norm": 0.674921989440918, + "learning_rate": 4.129346221938676e-05, + "loss": 0.4521, + "step": 18050 + }, + { + "epoch": 1.2035574765189927, + "grad_norm": 0.4696671962738037, + "learning_rate": 4.100788585275125e-05, + "loss": 0.3983, + "step": 18100 + }, + { + "epoch": 1.206882220929266, + "grad_norm": 0.7673335075378418, + "learning_rate": 4.0722612308525335e-05, + "loss": 0.4084, + "step": 18150 + }, + { + "epoch": 1.2102069653395395, + "grad_norm": 0.39890438318252563, + "learning_rate": 4.043765119370699e-05, + "loss": 0.3673, + "step": 18200 + }, + { + "epoch": 1.213531709749813, + "grad_norm": 0.5470781326293945, + "learning_rate": 4.0153012104772635e-05, + "loss": 0.3686, + "step": 18250 + }, + { + "epoch": 1.2168564541600864, + "grad_norm": 0.5802851319313049, + "learning_rate": 3.9868704627354e-05, + "loss": 0.4034, + "step": 18300 + }, + { + "epoch": 1.22018119857036, + "grad_norm": 0.5744081139564514, + "learning_rate": 3.9584738335915314e-05, + "loss": 0.3896, + "step": 18350 + }, + { + "epoch": 1.2235059429806334, + "grad_norm": 0.6031488180160522, + "learning_rate": 3.930112279343094e-05, + "loss": 0.3943, + "step": 18400 + }, + { + "epoch": 1.226830687390907, + "grad_norm": 0.42322850227355957, + "learning_rate": 3.9017867551063184e-05, + "loss": 0.3821, + "step": 18450 + }, + { + "epoch": 1.2301554318011803, + "grad_norm": 1.014979600906372, + "learning_rate": 3.8734982147840756e-05, + "loss": 0.3888, + "step": 18500 + }, + { + "epoch": 1.2334801762114538, + "grad_norm": 0.6480023860931396, + "learning_rate": 3.845247611033749e-05, + "loss": 0.4109, + "step": 18550 + }, + { + "epoch": 1.236804920621727, + "grad_norm": 0.8400156497955322, + "learning_rate": 3.817035895235159e-05, + "loss": 0.3897, + "step": 18600 + }, + { + "epoch": 1.2401296650320006, + "grad_norm": 0.7448896765708923, + "learning_rate": 3.7888640174585096e-05, + "loss": 0.3637, + "step": 18650 + }, + { + "epoch": 1.2434544094422741, + "grad_norm": 0.7204906940460205, + "learning_rate": 3.760732926432407e-05, + "loss": 0.3688, + "step": 18700 + }, + { + "epoch": 1.2467791538525477, + "grad_norm": 0.21433959901332855, + "learning_rate": 3.732643569511901e-05, + "loss": 0.412, + "step": 18750 + }, + { + "epoch": 1.250103898262821, + "grad_norm": 0.6840627789497375, + "learning_rate": 3.704596892646593e-05, + "loss": 0.4127, + "step": 18800 + }, + { + "epoch": 1.2534286426730945, + "grad_norm": 0.5724749565124512, + "learning_rate": 3.676593840348765e-05, + "loss": 0.3849, + "step": 18850 + }, + { + "epoch": 1.256753387083368, + "grad_norm": 1.0407353639602661, + "learning_rate": 3.648635355661577e-05, + "loss": 0.412, + "step": 18900 + }, + { + "epoch": 1.2600781314936413, + "grad_norm": 0.6900772452354431, + "learning_rate": 3.6207223801273196e-05, + "loss": 0.4414, + "step": 18950 + }, + { + "epoch": 1.2634028759039149, + "grad_norm": 0.5711185932159424, + "learning_rate": 3.5928558537556895e-05, + "loss": 0.3557, + "step": 19000 + }, + { + "epoch": 1.2667276203141884, + "grad_norm": 0.928859293460846, + "learning_rate": 3.565036714992142e-05, + "loss": 0.3692, + "step": 19050 + }, + { + "epoch": 1.270052364724462, + "grad_norm": 0.4256090521812439, + "learning_rate": 3.537265900686286e-05, + "loss": 0.3895, + "step": 19100 + }, + { + "epoch": 1.2733771091347352, + "grad_norm": 0.3450973629951477, + "learning_rate": 3.5095443460603405e-05, + "loss": 0.3645, + "step": 19150 + }, + { + "epoch": 1.2767018535450088, + "grad_norm": 0.48858773708343506, + "learning_rate": 3.4818729846776254e-05, + "loss": 0.3473, + "step": 19200 + }, + { + "epoch": 1.280026597955282, + "grad_norm": 0.44542333483695984, + "learning_rate": 3.4542527484111365e-05, + "loss": 0.3837, + "step": 19250 + }, + { + "epoch": 1.2833513423655556, + "grad_norm": 0.54665207862854, + "learning_rate": 3.426684567412153e-05, + "loss": 0.3911, + "step": 19300 + }, + { + "epoch": 1.2866760867758291, + "grad_norm": 0.46556198596954346, + "learning_rate": 3.3991693700789235e-05, + "loss": 0.4085, + "step": 19350 + }, + { + "epoch": 1.2900008311861026, + "grad_norm": 0.5987225770950317, + "learning_rate": 3.371708083025392e-05, + "loss": 0.37, + "step": 19400 + }, + { + "epoch": 1.293325575596376, + "grad_norm": 0.0574885755777359, + "learning_rate": 3.344301631049993e-05, + "loss": 0.359, + "step": 19450 + }, + { + "epoch": 1.2966503200066495, + "grad_norm": 0.39397045969963074, + "learning_rate": 3.316950937104518e-05, + "loss": 0.3657, + "step": 19500 + }, + { + "epoch": 1.299975064416923, + "grad_norm": 0.7825318574905396, + "learning_rate": 3.2896569222630224e-05, + "loss": 0.3981, + "step": 19550 + }, + { + "epoch": 1.3032998088271963, + "grad_norm": 0.6932367086410522, + "learning_rate": 3.26242050569081e-05, + "loss": 0.3821, + "step": 19600 + }, + { + "epoch": 1.3066245532374698, + "grad_norm": 0.613335907459259, + "learning_rate": 3.235242604613478e-05, + "loss": 0.3534, + "step": 19650 + }, + { + "epoch": 1.3099492976477434, + "grad_norm": 0.9380619525909424, + "learning_rate": 3.208124134286038e-05, + "loss": 0.3691, + "step": 19700 + }, + { + "epoch": 1.313274042058017, + "grad_norm": 0.6845571398735046, + "learning_rate": 3.181066007962079e-05, + "loss": 0.3995, + "step": 19750 + }, + { + "epoch": 1.3165987864682902, + "grad_norm": 0.40944433212280273, + "learning_rate": 3.1540691368630185e-05, + "loss": 0.422, + "step": 19800 + }, + { + "epoch": 1.3199235308785637, + "grad_norm": 0.34895792603492737, + "learning_rate": 3.127134430147417e-05, + "loss": 0.4023, + "step": 19850 + }, + { + "epoch": 1.323248275288837, + "grad_norm": 0.6736898422241211, + "learning_rate": 3.100262794880363e-05, + "loss": 0.4225, + "step": 19900 + }, + { + "epoch": 1.3265730196991106, + "grad_norm": 0.595956563949585, + "learning_rate": 3.073455136002919e-05, + "loss": 0.4148, + "step": 19950 + }, + { + "epoch": 1.329897764109384, + "grad_norm": 1.9619659185409546, + "learning_rate": 3.0467123563016513e-05, + "loss": 0.4008, + "step": 20000 + }, + { + "epoch": 1.329897764109384, + "eval_loss": 0.36843690276145935, + "eval_runtime": 4369.5216, + "eval_samples_per_second": 1.53, + "eval_steps_per_second": 1.53, + "step": 20000 + }, + { + "epoch": 1.3332225085196576, + "grad_norm": 0.3691612780094147, + "learning_rate": 3.0200353563782248e-05, + "loss": 0.3904, + "step": 20050 + }, + { + "epoch": 1.336547252929931, + "grad_norm": 0.30493393540382385, + "learning_rate": 2.9934250346190818e-05, + "loss": 0.3746, + "step": 20100 + }, + { + "epoch": 1.3398719973402045, + "grad_norm": 0.6750917434692383, + "learning_rate": 2.9668822871651736e-05, + "loss": 0.3831, + "step": 20150 + }, + { + "epoch": 1.343196741750478, + "grad_norm": 0.4708922207355499, + "learning_rate": 2.9404080078817924e-05, + "loss": 0.376, + "step": 20200 + }, + { + "epoch": 1.3465214861607513, + "grad_norm": 0.4739364981651306, + "learning_rate": 2.9140030883284684e-05, + "loss": 0.3932, + "step": 20250 + }, + { + "epoch": 1.3498462305710248, + "grad_norm": 0.2959195375442505, + "learning_rate": 2.8876684177289404e-05, + "loss": 0.4033, + "step": 20300 + }, + { + "epoch": 1.3531709749812983, + "grad_norm": 0.680323600769043, + "learning_rate": 2.861404882941212e-05, + "loss": 0.3659, + "step": 20350 + }, + { + "epoch": 1.3564957193915719, + "grad_norm": 0.4198705554008484, + "learning_rate": 2.8352133684276853e-05, + "loss": 0.3681, + "step": 20400 + }, + { + "epoch": 1.3598204638018452, + "grad_norm": 0.6400431394577026, + "learning_rate": 2.8090947562253807e-05, + "loss": 0.4492, + "step": 20450 + }, + { + "epoch": 1.3631452082121187, + "grad_norm": 0.5183029770851135, + "learning_rate": 2.7830499259162213e-05, + "loss": 0.387, + "step": 20500 + }, + { + "epoch": 1.3664699526223922, + "grad_norm": 0.31550315022468567, + "learning_rate": 2.7570797545974235e-05, + "loss": 0.4326, + "step": 20550 + }, + { + "epoch": 1.3697946970326655, + "grad_norm": 0.3952213227748871, + "learning_rate": 2.7311851168519496e-05, + "loss": 0.4243, + "step": 20600 + }, + { + "epoch": 1.373119441442939, + "grad_norm": 0.45342549681663513, + "learning_rate": 2.7053668847190672e-05, + "loss": 0.3845, + "step": 20650 + }, + { + "epoch": 1.3764441858532126, + "grad_norm": 0.8320136070251465, + "learning_rate": 2.6796259276649693e-05, + "loss": 0.3915, + "step": 20700 + }, + { + "epoch": 1.3797689302634861, + "grad_norm": 0.4563830494880676, + "learning_rate": 2.653963112553498e-05, + "loss": 0.3915, + "step": 20750 + }, + { + "epoch": 1.3830936746737594, + "grad_norm": 0.24706579744815826, + "learning_rate": 2.6283793036169603e-05, + "loss": 0.3984, + "step": 20800 + }, + { + "epoch": 1.386418419084033, + "grad_norm": 0.5812034010887146, + "learning_rate": 2.6028753624270074e-05, + "loss": 0.3679, + "step": 20850 + }, + { + "epoch": 1.3897431634943063, + "grad_norm": 0.3415985107421875, + "learning_rate": 2.5774521478656343e-05, + "loss": 0.3808, + "step": 20900 + }, + { + "epoch": 1.3930679079045798, + "grad_norm": 0.2992309629917145, + "learning_rate": 2.5521105160962473e-05, + "loss": 0.3669, + "step": 20950 + }, + { + "epoch": 1.3963926523148533, + "grad_norm": 0.7009733319282532, + "learning_rate": 2.52685132053484e-05, + "loss": 0.3882, + "step": 21000 + }, + { + "epoch": 1.3997173967251269, + "grad_norm": 0.4481956660747528, + "learning_rate": 2.501675411821241e-05, + "loss": 0.4083, + "step": 21050 + }, + { + "epoch": 1.4030421411354002, + "grad_norm": 0.3502480089664459, + "learning_rate": 2.4765836377904787e-05, + "loss": 0.4103, + "step": 21100 + }, + { + "epoch": 1.4063668855456737, + "grad_norm": 0.7800574898719788, + "learning_rate": 2.4515768434442215e-05, + "loss": 0.3978, + "step": 21150 + }, + { + "epoch": 1.4096916299559472, + "grad_norm": 0.4401569664478302, + "learning_rate": 2.4266558709223293e-05, + "loss": 0.3907, + "step": 21200 + }, + { + "epoch": 1.4130163743662205, + "grad_norm": 0.25381216406822205, + "learning_rate": 2.4018215594744835e-05, + "loss": 0.4001, + "step": 21250 + }, + { + "epoch": 1.416341118776494, + "grad_norm": 1.5524805784225464, + "learning_rate": 2.377074745431931e-05, + "loss": 0.3897, + "step": 21300 + }, + { + "epoch": 1.4196658631867676, + "grad_norm": 0.49514323472976685, + "learning_rate": 2.352416262179315e-05, + "loss": 0.3693, + "step": 21350 + }, + { + "epoch": 1.422990607597041, + "grad_norm": 0.5744829177856445, + "learning_rate": 2.3278469401266178e-05, + "loss": 0.3648, + "step": 21400 + }, + { + "epoch": 1.4263153520073144, + "grad_norm": 0.7969784736633301, + "learning_rate": 2.3033676066811845e-05, + "loss": 0.3768, + "step": 21450 + }, + { + "epoch": 1.429640096417588, + "grad_norm": 0.9067749977111816, + "learning_rate": 2.2789790862198628e-05, + "loss": 0.3326, + "step": 21500 + }, + { + "epoch": 1.4329648408278612, + "grad_norm": 0.6318944692611694, + "learning_rate": 2.2546822000612495e-05, + "loss": 0.37, + "step": 21550 + }, + { + "epoch": 1.4362895852381348, + "grad_norm": 0.3009655177593231, + "learning_rate": 2.2304777664380176e-05, + "loss": 0.3777, + "step": 21600 + }, + { + "epoch": 1.4396143296484083, + "grad_norm": 0.7818790674209595, + "learning_rate": 2.2063666004693695e-05, + "loss": 0.3809, + "step": 21650 + }, + { + "epoch": 1.4429390740586818, + "grad_norm": 0.3410826623439789, + "learning_rate": 2.182349514133583e-05, + "loss": 0.3746, + "step": 21700 + }, + { + "epoch": 1.4462638184689551, + "grad_norm": 0.20603881776332855, + "learning_rate": 2.1584273162406755e-05, + "loss": 0.3536, + "step": 21750 + }, + { + "epoch": 1.4495885628792287, + "grad_norm": 0.37000730633735657, + "learning_rate": 2.134600812405151e-05, + "loss": 0.3886, + "step": 21800 + }, + { + "epoch": 1.4529133072895022, + "grad_norm": 0.25155916810035706, + "learning_rate": 2.1108708050188825e-05, + "loss": 0.3688, + "step": 21850 + }, + { + "epoch": 1.4562380516997755, + "grad_norm": 0.6626400351524353, + "learning_rate": 2.0872380932240832e-05, + "loss": 0.3716, + "step": 21900 + }, + { + "epoch": 1.459562796110049, + "grad_norm": 0.9164525270462036, + "learning_rate": 2.063703472886402e-05, + "loss": 0.3939, + "step": 21950 + }, + { + "epoch": 1.4628875405203225, + "grad_norm": 0.3983005881309509, + "learning_rate": 2.0402677365681112e-05, + "loss": 0.361, + "step": 22000 + }, + { + "epoch": 1.466212284930596, + "grad_norm": 0.787325918674469, + "learning_rate": 2.0169316735014236e-05, + "loss": 0.4137, + "step": 22050 + }, + { + "epoch": 1.4695370293408694, + "grad_norm": 0.7404587268829346, + "learning_rate": 1.99369606956191e-05, + "loss": 0.4188, + "step": 22100 + }, + { + "epoch": 1.472861773751143, + "grad_norm": 0.3947995603084564, + "learning_rate": 1.9705617072420392e-05, + "loss": 0.373, + "step": 22150 + }, + { + "epoch": 1.4761865181614162, + "grad_norm": 0.5493362545967102, + "learning_rate": 1.9475293656248182e-05, + "loss": 0.3778, + "step": 22200 + }, + { + "epoch": 1.4795112625716897, + "grad_norm": 0.4823583960533142, + "learning_rate": 1.9245998203575593e-05, + "loss": 0.4243, + "step": 22250 + }, + { + "epoch": 1.4828360069819633, + "grad_norm": 0.334708034992218, + "learning_rate": 1.9017738436257655e-05, + "loss": 0.344, + "step": 22300 + }, + { + "epoch": 1.4861607513922368, + "grad_norm": 0.4683977961540222, + "learning_rate": 1.879052204127114e-05, + "loss": 0.3771, + "step": 22350 + }, + { + "epoch": 1.48948549580251, + "grad_norm": 0.38489750027656555, + "learning_rate": 1.8564356670455767e-05, + "loss": 0.3922, + "step": 22400 + }, + { + "epoch": 1.4928102402127836, + "grad_norm": 0.6407677531242371, + "learning_rate": 1.8339249940256492e-05, + "loss": 0.3877, + "step": 22450 + }, + { + "epoch": 1.4961349846230572, + "grad_norm": 0.46461015939712524, + "learning_rate": 1.8115209431467074e-05, + "loss": 0.3898, + "step": 22500 + }, + { + "epoch": 1.4994597290333305, + "grad_norm": 0.9419897794723511, + "learning_rate": 1.7892242688974664e-05, + "loss": 0.4022, + "step": 22550 + }, + { + "epoch": 1.502784473443604, + "grad_norm": 0.32548418641090393, + "learning_rate": 1.767035722150582e-05, + "loss": 0.3609, + "step": 22600 + }, + { + "epoch": 1.5061092178538775, + "grad_norm": 1.439875602722168, + "learning_rate": 1.7449560501373567e-05, + "loss": 0.3637, + "step": 22650 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 0.6430267095565796, + "learning_rate": 1.7229859964225868e-05, + "loss": 0.3746, + "step": 22700 + }, + { + "epoch": 1.5127587066744244, + "grad_norm": 0.41092389822006226, + "learning_rate": 1.7011263008795075e-05, + "loss": 0.4118, + "step": 22750 + }, + { + "epoch": 1.5160834510846979, + "grad_norm": 0.49290186166763306, + "learning_rate": 1.679377699664884e-05, + "loss": 0.3622, + "step": 22800 + }, + { + "epoch": 1.5194081954949712, + "grad_norm": 0.48516589403152466, + "learning_rate": 1.657740925194225e-05, + "loss": 0.3815, + "step": 22850 + }, + { + "epoch": 1.5227329399052447, + "grad_norm": 0.6224206686019897, + "learning_rate": 1.6362167061171063e-05, + "loss": 0.3837, + "step": 22900 + }, + { + "epoch": 1.5260576843155182, + "grad_norm": 0.40503615140914917, + "learning_rate": 1.614805767292642e-05, + "loss": 0.3932, + "step": 22950 + }, + { + "epoch": 1.5293824287257918, + "grad_norm": 0.46341225504875183, + "learning_rate": 1.5935088297650674e-05, + "loss": 0.3485, + "step": 23000 + }, + { + "epoch": 1.5327071731360653, + "grad_norm": 0.7125037312507629, + "learning_rate": 1.5723266107394653e-05, + "loss": 0.3887, + "step": 23050 + }, + { + "epoch": 1.5360319175463386, + "grad_norm": 0.36392441391944885, + "learning_rate": 1.551259823557602e-05, + "loss": 0.3908, + "step": 23100 + }, + { + "epoch": 1.5393566619566121, + "grad_norm": 0.4638826847076416, + "learning_rate": 1.530309177673912e-05, + "loss": 0.4156, + "step": 23150 + }, + { + "epoch": 1.5426814063668854, + "grad_norm": 0.46034330129623413, + "learning_rate": 1.509475378631603e-05, + "loss": 0.3439, + "step": 23200 + }, + { + "epoch": 1.546006150777159, + "grad_norm": 0.28411605954170227, + "learning_rate": 1.4887591280389007e-05, + "loss": 0.3763, + "step": 23250 + }, + { + "epoch": 1.5493308951874325, + "grad_norm": 0.3564077615737915, + "learning_rate": 1.468161123545413e-05, + "loss": 0.4004, + "step": 23300 + }, + { + "epoch": 1.552655639597706, + "grad_norm": 0.5136293172836304, + "learning_rate": 1.4476820588186412e-05, + "loss": 0.3433, + "step": 23350 + }, + { + "epoch": 1.5559803840079793, + "grad_norm": 0.8439062237739563, + "learning_rate": 1.4273226235206178e-05, + "loss": 0.3838, + "step": 23400 + }, + { + "epoch": 1.5593051284182529, + "grad_norm": 0.6947528719902039, + "learning_rate": 1.4070835032846852e-05, + "loss": 0.3627, + "step": 23450 + }, + { + "epoch": 1.5626298728285262, + "grad_norm": 0.418443500995636, + "learning_rate": 1.3869653796923993e-05, + "loss": 0.3698, + "step": 23500 + }, + { + "epoch": 1.5659546172387997, + "grad_norm": 0.5064214468002319, + "learning_rate": 1.3669689302505778e-05, + "loss": 0.3827, + "step": 23550 + }, + { + "epoch": 1.5692793616490732, + "grad_norm": 0.2299993336200714, + "learning_rate": 1.3470948283684925e-05, + "loss": 0.363, + "step": 23600 + }, + { + "epoch": 1.5726041060593468, + "grad_norm": 0.3324210047721863, + "learning_rate": 1.3273437433351787e-05, + "loss": 0.3504, + "step": 23650 + }, + { + "epoch": 1.5759288504696203, + "grad_norm": 0.5440026521682739, + "learning_rate": 1.307716340296904e-05, + "loss": 0.4031, + "step": 23700 + }, + { + "epoch": 1.5792535948798936, + "grad_norm": 0.5328998565673828, + "learning_rate": 1.2882132802347647e-05, + "loss": 0.3945, + "step": 23750 + }, + { + "epoch": 1.5825783392901671, + "grad_norm": 0.9955481290817261, + "learning_rate": 1.268835219942433e-05, + "loss": 0.3742, + "step": 23800 + }, + { + "epoch": 1.5859030837004404, + "grad_norm": 0.2555365562438965, + "learning_rate": 1.2495828120040288e-05, + "loss": 0.412, + "step": 23850 + }, + { + "epoch": 1.589227828110714, + "grad_norm": 0.7778675556182861, + "learning_rate": 1.23045670477215e-05, + "loss": 0.3863, + "step": 23900 + }, + { + "epoch": 1.5925525725209875, + "grad_norm": 0.19272594153881073, + "learning_rate": 1.2114575423460333e-05, + "loss": 0.3391, + "step": 23950 + }, + { + "epoch": 1.595877316931261, + "grad_norm": 0.5186722278594971, + "learning_rate": 1.1925859645498722e-05, + "loss": 0.3796, + "step": 24000 + }, + { + "epoch": 1.5992020613415345, + "grad_norm": 0.508628249168396, + "learning_rate": 1.1738426069112573e-05, + "loss": 0.4019, + "step": 24050 + }, + { + "epoch": 1.6025268057518078, + "grad_norm": 0.37196823954582214, + "learning_rate": 1.1552281006397819e-05, + "loss": 0.3652, + "step": 24100 + }, + { + "epoch": 1.6058515501620811, + "grad_norm": 0.2910582721233368, + "learning_rate": 1.1367430726057887e-05, + "loss": 0.3499, + "step": 24150 + }, + { + "epoch": 1.6091762945723547, + "grad_norm": 0.6717352271080017, + "learning_rate": 1.1183881453192479e-05, + "loss": 0.3619, + "step": 24200 + }, + { + "epoch": 1.6125010389826282, + "grad_norm": 0.34437698125839233, + "learning_rate": 1.1001639369088018e-05, + "loss": 0.3463, + "step": 24250 + }, + { + "epoch": 1.6158257833929017, + "grad_norm": 0.46413883566856384, + "learning_rate": 1.082071061100945e-05, + "loss": 0.3765, + "step": 24300 + }, + { + "epoch": 1.6191505278031753, + "grad_norm": 0.49501270055770874, + "learning_rate": 1.0641101271993614e-05, + "loss": 0.3561, + "step": 24350 + }, + { + "epoch": 1.6224752722134486, + "grad_norm": 0.8731350302696228, + "learning_rate": 1.0462817400643959e-05, + "loss": 0.3863, + "step": 24400 + }, + { + "epoch": 1.625800016623722, + "grad_norm": 0.25603896379470825, + "learning_rate": 1.0285865000926925e-05, + "loss": 0.3678, + "step": 24450 + }, + { + "epoch": 1.6291247610339954, + "grad_norm": 0.7849488854408264, + "learning_rate": 1.0110250031969709e-05, + "loss": 0.3705, + "step": 24500 + }, + { + "epoch": 1.632449505444269, + "grad_norm": 0.5623769760131836, + "learning_rate": 9.935978407859624e-06, + "loss": 0.3429, + "step": 24550 + }, + { + "epoch": 1.6357742498545425, + "grad_norm": 0.3793054521083832, + "learning_rate": 9.763055997444897e-06, + "loss": 0.3985, + "step": 24600 + }, + { + "epoch": 1.639098994264816, + "grad_norm": 1.2295911312103271, + "learning_rate": 9.591488624137023e-06, + "loss": 0.3575, + "step": 24650 + }, + { + "epoch": 1.6424237386750895, + "grad_norm": 0.23833027482032776, + "learning_rate": 9.421282065714676e-06, + "loss": 0.3721, + "step": 24700 + }, + { + "epoch": 1.6457484830853628, + "grad_norm": 0.9506494402885437, + "learning_rate": 9.25244205412915e-06, + "loss": 0.3741, + "step": 24750 + }, + { + "epoch": 1.6490732274956361, + "grad_norm": 0.5477185845375061, + "learning_rate": 9.08497427531128e-06, + "loss": 0.3259, + "step": 24800 + }, + { + "epoch": 1.6523979719059096, + "grad_norm": 0.6023584604263306, + "learning_rate": 8.91888436897997e-06, + "loss": 0.396, + "step": 24850 + }, + { + "epoch": 1.6557227163161832, + "grad_norm": 0.5275429487228394, + "learning_rate": 8.754177928452328e-06, + "loss": 0.3445, + "step": 24900 + }, + { + "epoch": 1.6590474607264567, + "grad_norm": 0.41783201694488525, + "learning_rate": 8.590860500455217e-06, + "loss": 0.387, + "step": 24950 + }, + { + "epoch": 1.6623722051367302, + "grad_norm": 0.19075682759284973, + "learning_rate": 8.428937584938496e-06, + "loss": 0.3951, + "step": 25000 + }, + { + "epoch": 1.6656969495470035, + "grad_norm": 0.2861451804637909, + "learning_rate": 8.268414634889848e-06, + "loss": 0.3673, + "step": 25050 + }, + { + "epoch": 1.669021693957277, + "grad_norm": 0.8673615455627441, + "learning_rate": 8.109297056151067e-06, + "loss": 0.3975, + "step": 25100 + }, + { + "epoch": 1.6723464383675504, + "grad_norm": 0.6853104829788208, + "learning_rate": 7.951590207236038e-06, + "loss": 0.3967, + "step": 25150 + }, + { + "epoch": 1.675671182777824, + "grad_norm": 0.7709905505180359, + "learning_rate": 7.79529939915029e-06, + "loss": 0.3482, + "step": 25200 + }, + { + "epoch": 1.6789959271880974, + "grad_norm": 0.250615656375885, + "learning_rate": 7.640429895212164e-06, + "loss": 0.3693, + "step": 25250 + }, + { + "epoch": 1.682320671598371, + "grad_norm": 0.725862443447113, + "learning_rate": 7.486986910875499e-06, + "loss": 0.325, + "step": 25300 + }, + { + "epoch": 1.6856454160086445, + "grad_norm": 0.4505915343761444, + "learning_rate": 7.3349756135540235e-06, + "loss": 0.3634, + "step": 25350 + }, + { + "epoch": 1.6889701604189178, + "grad_norm": 0.7828101515769958, + "learning_rate": 7.184401122447398e-06, + "loss": 0.3927, + "step": 25400 + }, + { + "epoch": 1.692294904829191, + "grad_norm": 0.38519877195358276, + "learning_rate": 7.035268508368697e-06, + "loss": 0.3676, + "step": 25450 + }, + { + "epoch": 1.6956196492394646, + "grad_norm": 0.687976598739624, + "learning_rate": 6.887582793573727e-06, + "loss": 0.3897, + "step": 25500 + }, + { + "epoch": 1.6989443936497381, + "grad_norm": 0.38796254992485046, + "learning_rate": 6.741348951591908e-06, + "loss": 0.3922, + "step": 25550 + }, + { + "epoch": 1.7022691380600117, + "grad_norm": 0.8533971309661865, + "learning_rate": 6.596571907058707e-06, + "loss": 0.374, + "step": 25600 + }, + { + "epoch": 1.7055938824702852, + "grad_norm": 0.9028803110122681, + "learning_rate": 6.453256535549846e-06, + "loss": 0.4181, + "step": 25650 + }, + { + "epoch": 1.7089186268805585, + "grad_norm": 0.4268290102481842, + "learning_rate": 6.31140766341713e-06, + "loss": 0.3611, + "step": 25700 + }, + { + "epoch": 1.712243371290832, + "grad_norm": 0.6220707893371582, + "learning_rate": 6.1710300676258385e-06, + "loss": 0.3328, + "step": 25750 + }, + { + "epoch": 1.7155681157011053, + "grad_norm": 0.4467557668685913, + "learning_rate": 6.032128475593924e-06, + "loss": 0.3704, + "step": 25800 + }, + { + "epoch": 1.7188928601113789, + "grad_norm": 0.4108564555644989, + "learning_rate": 5.894707565032776e-06, + "loss": 0.3486, + "step": 25850 + }, + { + "epoch": 1.7222176045216524, + "grad_norm": 0.5933622121810913, + "learning_rate": 5.758771963789722e-06, + "loss": 0.3668, + "step": 25900 + }, + { + "epoch": 1.725542348931926, + "grad_norm": 0.2982137203216553, + "learning_rate": 5.6243262496921245e-06, + "loss": 0.3143, + "step": 25950 + }, + { + "epoch": 1.7288670933421995, + "grad_norm": 0.6367640495300293, + "learning_rate": 5.4913749503932575e-06, + "loss": 0.3452, + "step": 26000 + }, + { + "epoch": 1.7321918377524728, + "grad_norm": 0.4822899103164673, + "learning_rate": 5.359922543219848e-06, + "loss": 0.3903, + "step": 26050 + }, + { + "epoch": 1.7355165821627463, + "grad_norm": 0.7650361061096191, + "learning_rate": 5.229973455021231e-06, + "loss": 0.3691, + "step": 26100 + }, + { + "epoch": 1.7388413265730196, + "grad_norm": 0.4355323910713196, + "learning_rate": 5.101532062020325e-06, + "loss": 0.3174, + "step": 26150 + }, + { + "epoch": 1.7421660709832931, + "grad_norm": 0.49863699078559875, + "learning_rate": 4.974602689666252e-06, + "loss": 0.3693, + "step": 26200 + }, + { + "epoch": 1.7454908153935667, + "grad_norm": 0.455340713262558, + "learning_rate": 4.8491896124886416e-06, + "loss": 0.3869, + "step": 26250 + }, + { + "epoch": 1.7488155598038402, + "grad_norm": 0.3978399932384491, + "learning_rate": 4.725297053953692e-06, + "loss": 0.3925, + "step": 26300 + }, + { + "epoch": 1.7521403042141135, + "grad_norm": 0.3163425922393799, + "learning_rate": 4.602929186321947e-06, + "loss": 0.3563, + "step": 26350 + }, + { + "epoch": 1.755465048624387, + "grad_norm": 0.5234238505363464, + "learning_rate": 4.48209013050781e-06, + "loss": 0.4169, + "step": 26400 + }, + { + "epoch": 1.7587897930346603, + "grad_norm": 0.42517679929733276, + "learning_rate": 4.362783955940719e-06, + "loss": 0.363, + "step": 26450 + }, + { + "epoch": 1.7621145374449338, + "grad_norm": 0.6609143018722534, + "learning_rate": 4.245014680428117e-06, + "loss": 0.3572, + "step": 26500 + }, + { + "epoch": 1.7654392818552074, + "grad_norm": 0.21873889863491058, + "learning_rate": 4.128786270020174e-06, + "loss": 0.3497, + "step": 26550 + }, + { + "epoch": 1.768764026265481, + "grad_norm": 0.8427754044532776, + "learning_rate": 4.014102638876205e-06, + "loss": 0.3702, + "step": 26600 + }, + { + "epoch": 1.7720887706757544, + "grad_norm": 0.8649630546569824, + "learning_rate": 3.900967649132847e-06, + "loss": 0.3662, + "step": 26650 + }, + { + "epoch": 1.7754135150860277, + "grad_norm": 0.43425253033638, + "learning_rate": 3.789385110774013e-06, + "loss": 0.3643, + "step": 26700 + }, + { + "epoch": 1.7787382594963013, + "grad_norm": 0.3920991122722626, + "learning_rate": 3.679358781502562e-06, + "loss": 0.3834, + "step": 26750 + }, + { + "epoch": 1.7820630039065746, + "grad_norm": 0.7349820137023926, + "learning_rate": 3.5708923666137927e-06, + "loss": 0.3632, + "step": 26800 + }, + { + "epoch": 1.785387748316848, + "grad_norm": 0.6672165989875793, + "learning_rate": 3.4639895188706195e-06, + "loss": 0.3702, + "step": 26850 + }, + { + "epoch": 1.7887124927271216, + "grad_norm": 0.537217915058136, + "learning_rate": 3.358653838380571e-06, + "loss": 0.3397, + "step": 26900 + }, + { + "epoch": 1.7920372371373952, + "grad_norm": 0.8017503023147583, + "learning_rate": 3.254888872474593e-06, + "loss": 0.3762, + "step": 26950 + }, + { + "epoch": 1.7953619815476687, + "grad_norm": 0.16730186343193054, + "learning_rate": 3.1526981155875156e-06, + "loss": 0.3425, + "step": 27000 + }, + { + "epoch": 1.798686725957942, + "grad_norm": 0.5476846694946289, + "learning_rate": 3.0520850091404263e-06, + "loss": 0.3708, + "step": 27050 + }, + { + "epoch": 1.8020114703682153, + "grad_norm": 0.15695548057556152, + "learning_rate": 2.9530529414247608e-06, + "loss": 0.3675, + "step": 27100 + }, + { + "epoch": 1.8053362147784888, + "grad_norm": 0.5888222455978394, + "learning_rate": 2.8556052474881967e-06, + "loss": 0.3647, + "step": 27150 + }, + { + "epoch": 1.8086609591887624, + "grad_norm": 0.9941563010215759, + "learning_rate": 2.7597452090223354e-06, + "loss": 0.3456, + "step": 27200 + }, + { + "epoch": 1.8119857035990359, + "grad_norm": 0.610040009021759, + "learning_rate": 2.6654760542521917e-06, + "loss": 0.3746, + "step": 27250 + }, + { + "epoch": 1.8153104480093094, + "grad_norm": 0.8063670992851257, + "learning_rate": 2.572800957827476e-06, + "loss": 0.3798, + "step": 27300 + }, + { + "epoch": 1.8186351924195827, + "grad_norm": 0.28963837027549744, + "learning_rate": 2.4817230407156946e-06, + "loss": 0.3713, + "step": 27350 + }, + { + "epoch": 1.8219599368298562, + "grad_norm": 0.39352625608444214, + "learning_rate": 2.3922453700970295e-06, + "loss": 0.3976, + "step": 27400 + }, + { + "epoch": 1.8252846812401295, + "grad_norm": 0.9289838671684265, + "learning_rate": 2.3043709592610485e-06, + "loss": 0.3788, + "step": 27450 + }, + { + "epoch": 1.828609425650403, + "grad_norm": 0.42766475677490234, + "learning_rate": 2.2181027675052534e-06, + "loss": 0.3672, + "step": 27500 + }, + { + "epoch": 1.8319341700606766, + "grad_norm": 0.4662071168422699, + "learning_rate": 2.133443700035387e-06, + "loss": 0.3382, + "step": 27550 + }, + { + "epoch": 1.8352589144709501, + "grad_norm": 0.5831243991851807, + "learning_rate": 2.0503966078676217e-06, + "loss": 0.4051, + "step": 27600 + }, + { + "epoch": 1.8385836588812237, + "grad_norm": 0.4857243299484253, + "learning_rate": 1.9689642877325165e-06, + "loss": 0.3855, + "step": 27650 + }, + { + "epoch": 1.841908403291497, + "grad_norm": 0.44161850214004517, + "learning_rate": 1.8891494819808841e-06, + "loss": 0.366, + "step": 27700 + }, + { + "epoch": 1.8452331477017703, + "grad_norm": 0.3997364640235901, + "learning_rate": 1.8109548784913887e-06, + "loss": 0.3469, + "step": 27750 + }, + { + "epoch": 1.8485578921120438, + "grad_norm": 0.7303462028503418, + "learning_rate": 1.7343831105800511e-06, + "loss": 0.3724, + "step": 27800 + }, + { + "epoch": 1.8518826365223173, + "grad_norm": 0.5361658930778503, + "learning_rate": 1.6594367569115532e-06, + "loss": 0.353, + "step": 27850 + }, + { + "epoch": 1.8552073809325909, + "grad_norm": 0.7074758410453796, + "learning_rate": 1.5861183414124403e-06, + "loss": 0.3689, + "step": 27900 + }, + { + "epoch": 1.8585321253428644, + "grad_norm": 0.9188947081565857, + "learning_rate": 1.514430333186062e-06, + "loss": 0.3341, + "step": 27950 + }, + { + "epoch": 1.8618568697531377, + "grad_norm": 0.8179706931114197, + "learning_rate": 1.4443751464294664e-06, + "loss": 0.3709, + "step": 28000 + }, + { + "epoch": 1.8651816141634112, + "grad_norm": 0.7332112193107605, + "learning_rate": 1.3759551403520643e-06, + "loss": 0.3433, + "step": 28050 + }, + { + "epoch": 1.8685063585736845, + "grad_norm": 0.4983903765678406, + "learning_rate": 1.3091726190962329e-06, + "loss": 0.3337, + "step": 28100 + }, + { + "epoch": 1.871831102983958, + "grad_norm": 0.2123403400182724, + "learning_rate": 1.2440298316596654e-06, + "loss": 0.3475, + "step": 28150 + }, + { + "epoch": 1.8751558473942316, + "grad_norm": 0.19655907154083252, + "learning_rate": 1.18052897181965e-06, + "loss": 0.3441, + "step": 28200 + }, + { + "epoch": 1.878480591804505, + "grad_norm": 0.40445396304130554, + "learning_rate": 1.1186721780592102e-06, + "loss": 0.3793, + "step": 28250 + }, + { + "epoch": 1.8818053362147786, + "grad_norm": 0.3509872853755951, + "learning_rate": 1.0584615334950643e-06, + "loss": 0.3656, + "step": 28300 + }, + { + "epoch": 1.885130080625052, + "grad_norm": 0.25542914867401123, + "learning_rate": 9.998990658074914e-07, + "loss": 0.3368, + "step": 28350 + }, + { + "epoch": 1.8884548250353252, + "grad_norm": 0.5417547225952148, + "learning_rate": 9.429867471720255e-07, + "loss": 0.3631, + "step": 28400 + }, + { + "epoch": 1.8917795694455988, + "grad_norm": 0.2370055764913559, + "learning_rate": 8.877264941930586e-07, + "loss": 0.3569, + "step": 28450 + }, + { + "epoch": 1.8951043138558723, + "grad_norm": 0.7116398811340332, + "learning_rate": 8.341201678392974e-07, + "loss": 0.4227, + "step": 28500 + }, + { + "epoch": 1.8984290582661458, + "grad_norm": 0.9473690390586853, + "learning_rate": 7.821695733810641e-07, + "loss": 0.3959, + "step": 28550 + }, + { + "epoch": 1.9017538026764194, + "grad_norm": 0.446325421333313, + "learning_rate": 7.318764603295447e-07, + "loss": 0.3363, + "step": 28600 + }, + { + "epoch": 1.9050785470866927, + "grad_norm": 1.2480918169021606, + "learning_rate": 6.832425223778304e-07, + "loss": 0.395, + "step": 28650 + }, + { + "epoch": 1.9084032914969662, + "grad_norm": 0.8201688528060913, + "learning_rate": 6.362693973439193e-07, + "loss": 0.352, + "step": 28700 + }, + { + "epoch": 1.9117280359072395, + "grad_norm": 0.546881377696991, + "learning_rate": 5.909586671155098e-07, + "loss": 0.3362, + "step": 28750 + }, + { + "epoch": 1.915052780317513, + "grad_norm": 0.8294301629066467, + "learning_rate": 5.47311857596794e-07, + "loss": 0.3586, + "step": 28800 + }, + { + "epoch": 1.9183775247277866, + "grad_norm": 0.7555782794952393, + "learning_rate": 5.05330438657009e-07, + "loss": 0.3757, + "step": 28850 + }, + { + "epoch": 1.92170226913806, + "grad_norm": 1.1964213848114014, + "learning_rate": 4.6501582408096657e-07, + "loss": 0.3783, + "step": 28900 + }, + { + "epoch": 1.9250270135483336, + "grad_norm": 0.5259461998939514, + "learning_rate": 4.263693715214456e-07, + "loss": 0.3516, + "step": 28950 + }, + { + "epoch": 1.928351757958607, + "grad_norm": 0.23752902448177338, + "learning_rate": 3.893923824534629e-07, + "loss": 0.3414, + "step": 29000 + }, + { + "epoch": 1.9316765023688804, + "grad_norm": 0.7082974314689636, + "learning_rate": 3.5408610213043536e-07, + "loss": 0.3867, + "step": 29050 + }, + { + "epoch": 1.9350012467791537, + "grad_norm": 0.5476464033126831, + "learning_rate": 3.204517195422696e-07, + "loss": 0.3588, + "step": 29100 + }, + { + "epoch": 1.9383259911894273, + "grad_norm": 0.7200431823730469, + "learning_rate": 2.8849036737528813e-07, + "loss": 0.3816, + "step": 29150 + }, + { + "epoch": 1.9416507355997008, + "grad_norm": 0.41176721453666687, + "learning_rate": 2.5820312197411543e-07, + "loss": 0.3547, + "step": 29200 + }, + { + "epoch": 1.9449754800099743, + "grad_norm": 1.0158333778381348, + "learning_rate": 2.2959100330541273e-07, + "loss": 0.3763, + "step": 29250 + }, + { + "epoch": 1.9483002244202476, + "grad_norm": 1.136353850364685, + "learning_rate": 2.0265497492352735e-07, + "loss": 0.3524, + "step": 29300 + }, + { + "epoch": 1.9516249688305212, + "grad_norm": 0.6701260805130005, + "learning_rate": 1.7739594393805793e-07, + "loss": 0.3532, + "step": 29350 + }, + { + "epoch": 1.9549497132407945, + "grad_norm": 0.5202396512031555, + "learning_rate": 1.538147609832896e-07, + "loss": 0.3787, + "step": 29400 + }, + { + "epoch": 1.958274457651068, + "grad_norm": 0.3793032467365265, + "learning_rate": 1.3191222018956174e-07, + "loss": 0.3813, + "step": 29450 + }, + { + "epoch": 1.9615992020613415, + "grad_norm": 0.266659140586853, + "learning_rate": 1.1168905915652228e-07, + "loss": 0.3346, + "step": 29500 + }, + { + "epoch": 1.964923946471615, + "grad_norm": 0.4805378317832947, + "learning_rate": 9.314595892827016e-08, + "loss": 0.3695, + "step": 29550 + }, + { + "epoch": 1.9682486908818886, + "grad_norm": 0.9481486082077026, + "learning_rate": 7.628354397045123e-08, + "loss": 0.3579, + "step": 29600 + }, + { + "epoch": 1.971573435292162, + "grad_norm": 0.3038884997367859, + "learning_rate": 6.110238214919739e-08, + "loss": 0.3449, + "step": 29650 + }, + { + "epoch": 1.9748981797024354, + "grad_norm": 0.6856468319892883, + "learning_rate": 4.760298471201963e-08, + "loss": 0.3598, + "step": 29700 + }, + { + "epoch": 1.9782229241127087, + "grad_norm": 0.4969344437122345, + "learning_rate": 3.5785806270599575e-08, + "loss": 0.3437, + "step": 29750 + }, + { + "epoch": 1.9815476685229823, + "grad_norm": 0.6707144975662231, + "learning_rate": 2.565124478545733e-08, + "loss": 0.361, + "step": 29800 + }, + { + "epoch": 1.9848724129332558, + "grad_norm": 0.4501785933971405, + "learning_rate": 1.719964155256215e-08, + "loss": 0.3668, + "step": 29850 + }, + { + "epoch": 1.9881971573435293, + "grad_norm": 0.31389063596725464, + "learning_rate": 1.043128119184167e-08, + "loss": 0.3647, + "step": 29900 + }, + { + "epoch": 1.9915219017538028, + "grad_norm": 0.9812319278717041, + "learning_rate": 5.346391637583992e-09, + "loss": 0.4068, + "step": 29950 + }, + { + "epoch": 1.9948466461640761, + "grad_norm": 0.5165169835090637, + "learning_rate": 1.945144130788279e-09, + "loss": 0.4035, + "step": 30000 + }, + { + "epoch": 1.9948466461640761, + "eval_loss": 0.3463568687438965, + "eval_runtime": 4374.0175, + "eval_samples_per_second": 1.528, + "eval_steps_per_second": 1.528, + "step": 30000 + }, + { + "epoch": 1.9981713905743494, + "grad_norm": 0.814073383808136, + "learning_rate": 2.2765321335826983e-10, + "loss": 0.3829, + "step": 30050 + }, + { + "epoch": 1.9999002576676919, + "step": 30076, + "total_flos": 1.0617208245100216e+19, + "train_loss": 0.5353738934709316, + "train_runtime": 238041.8859, + "train_samples_per_second": 0.505, + "train_steps_per_second": 0.126 + } + ], + "logging_steps": 50, + "max_steps": 30076, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0617208245100216e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}