{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4002468627854351, "eval_steps": 500, "global_step": 4864, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.228759514503188e-05, "grad_norm": 9.532528095057138, "learning_rate": 5.479452054794521e-08, "loss": 0.7901, "step": 1 }, { "epoch": 0.00016457519029006376, "grad_norm": 30.026945671831577, "learning_rate": 1.0958904109589042e-07, "loss": 2.1253, "step": 2 }, { "epoch": 0.00024686278543509563, "grad_norm": 8.88519815829157, "learning_rate": 1.6438356164383561e-07, "loss": 0.7715, "step": 3 }, { "epoch": 0.00032915038058012753, "grad_norm": 29.197616305414858, "learning_rate": 2.1917808219178084e-07, "loss": 2.1284, "step": 4 }, { "epoch": 0.0004114379757251594, "grad_norm": 29.892559190290434, "learning_rate": 2.73972602739726e-07, "loss": 2.0685, "step": 5 }, { "epoch": 0.0004937255708701913, "grad_norm": 10.567782598278942, "learning_rate": 3.2876712328767123e-07, "loss": 0.8122, "step": 6 }, { "epoch": 0.0005760131660152232, "grad_norm": 28.912763215741734, "learning_rate": 3.835616438356165e-07, "loss": 2.1056, "step": 7 }, { "epoch": 0.0006583007611602551, "grad_norm": 29.51664131482477, "learning_rate": 4.383561643835617e-07, "loss": 2.0418, "step": 8 }, { "epoch": 0.000740588356305287, "grad_norm": 28.30266632286417, "learning_rate": 4.931506849315068e-07, "loss": 2.0237, "step": 9 }, { "epoch": 0.0008228759514503189, "grad_norm": 27.46875103243188, "learning_rate": 5.47945205479452e-07, "loss": 1.9595, "step": 10 }, { "epoch": 0.0009051635465953507, "grad_norm": 24.865752165641698, "learning_rate": 6.027397260273974e-07, "loss": 1.9174, "step": 11 }, { "epoch": 0.0009874511417403825, "grad_norm": 24.328147714809518, "learning_rate": 6.575342465753425e-07, "loss": 1.9307, "step": 12 }, { "epoch": 0.0010697387368854144, "grad_norm": 5.5234808874616395, "learning_rate": 7.123287671232878e-07, "loss": 0.8138, "step": 13 }, { "epoch": 0.0011520263320304463, "grad_norm": 24.035678143620423, "learning_rate": 7.67123287671233e-07, "loss": 1.9803, "step": 14 }, { "epoch": 0.0012343139271754782, "grad_norm": 20.7270429685146, "learning_rate": 8.219178082191781e-07, "loss": 1.8216, "step": 15 }, { "epoch": 0.0013166015223205101, "grad_norm": 3.1954913902580597, "learning_rate": 8.767123287671234e-07, "loss": 0.7577, "step": 16 }, { "epoch": 0.001398889117465542, "grad_norm": 19.0932823831642, "learning_rate": 9.315068493150686e-07, "loss": 1.8765, "step": 17 }, { "epoch": 0.001481176712610574, "grad_norm": 17.783753558169572, "learning_rate": 9.863013698630137e-07, "loss": 1.7423, "step": 18 }, { "epoch": 0.0015634643077556058, "grad_norm": 13.929582396803928, "learning_rate": 1.041095890410959e-06, "loss": 1.5683, "step": 19 }, { "epoch": 0.0016457519029006377, "grad_norm": 10.860155069125868, "learning_rate": 1.095890410958904e-06, "loss": 1.5344, "step": 20 }, { "epoch": 0.0017280394980456696, "grad_norm": 10.868210550382598, "learning_rate": 1.1506849315068494e-06, "loss": 1.4788, "step": 21 }, { "epoch": 0.0018103270931907015, "grad_norm": 9.306619668804826, "learning_rate": 1.2054794520547947e-06, "loss": 1.4831, "step": 22 }, { "epoch": 0.0018926146883357334, "grad_norm": 2.4601086961337857, "learning_rate": 1.26027397260274e-06, "loss": 0.7305, "step": 23 }, { "epoch": 0.001974902283480765, "grad_norm": 7.6886950923134005, "learning_rate": 1.315068493150685e-06, "loss": 1.4257, "step": 24 }, { "epoch": 0.002057189878625797, "grad_norm": 6.220708397685521, "learning_rate": 1.3698630136986302e-06, "loss": 1.3468, "step": 25 }, { "epoch": 0.002139477473770829, "grad_norm": 4.674476253548759, "learning_rate": 1.4246575342465755e-06, "loss": 1.3151, "step": 26 }, { "epoch": 0.002221765068915861, "grad_norm": 3.895214381538298, "learning_rate": 1.4794520547945206e-06, "loss": 1.3041, "step": 27 }, { "epoch": 0.0023040526640608927, "grad_norm": 3.527134956076901, "learning_rate": 1.534246575342466e-06, "loss": 1.2878, "step": 28 }, { "epoch": 0.0023863402592059248, "grad_norm": 3.5362809667326522, "learning_rate": 1.5890410958904112e-06, "loss": 1.2726, "step": 29 }, { "epoch": 0.0024686278543509564, "grad_norm": 2.966450361552696, "learning_rate": 1.6438356164383561e-06, "loss": 1.2993, "step": 30 }, { "epoch": 0.0025509154494959886, "grad_norm": 2.458939366346722, "learning_rate": 1.6986301369863014e-06, "loss": 1.281, "step": 31 }, { "epoch": 0.0026332030446410202, "grad_norm": 2.535030337573037, "learning_rate": 1.7534246575342468e-06, "loss": 1.2708, "step": 32 }, { "epoch": 0.0027154906397860524, "grad_norm": 1.239317382781359, "learning_rate": 1.808219178082192e-06, "loss": 0.6648, "step": 33 }, { "epoch": 0.002797778234931084, "grad_norm": 1.1180854196130607, "learning_rate": 1.8630136986301372e-06, "loss": 0.6646, "step": 34 }, { "epoch": 0.002880065830076116, "grad_norm": 2.1450564270921646, "learning_rate": 1.9178082191780823e-06, "loss": 1.2447, "step": 35 }, { "epoch": 0.002962353425221148, "grad_norm": 1.8049145439148968, "learning_rate": 1.9726027397260274e-06, "loss": 1.1815, "step": 36 }, { "epoch": 0.00304464102036618, "grad_norm": 0.795375753210199, "learning_rate": 2.027397260273973e-06, "loss": 0.6292, "step": 37 }, { "epoch": 0.0031269286155112116, "grad_norm": 0.7439259016336192, "learning_rate": 2.082191780821918e-06, "loss": 0.6468, "step": 38 }, { "epoch": 0.0032092162106562437, "grad_norm": 2.102073236832498, "learning_rate": 2.1369863013698635e-06, "loss": 1.1965, "step": 39 }, { "epoch": 0.0032915038058012754, "grad_norm": 1.7507482751861791, "learning_rate": 2.191780821917808e-06, "loss": 1.147, "step": 40 }, { "epoch": 0.0033737914009463075, "grad_norm": 2.115499646494852, "learning_rate": 2.2465753424657537e-06, "loss": 1.2079, "step": 41 }, { "epoch": 0.003456078996091339, "grad_norm": 1.5822724466961147, "learning_rate": 2.301369863013699e-06, "loss": 1.213, "step": 42 }, { "epoch": 0.0035383665912363713, "grad_norm": 0.6843357265370693, "learning_rate": 2.356164383561644e-06, "loss": 0.624, "step": 43 }, { "epoch": 0.003620654186381403, "grad_norm": 1.9669305292499641, "learning_rate": 2.4109589041095894e-06, "loss": 1.1691, "step": 44 }, { "epoch": 0.003702941781526435, "grad_norm": 4.293989393639943, "learning_rate": 2.4657534246575345e-06, "loss": 1.1484, "step": 45 }, { "epoch": 0.003785229376671467, "grad_norm": 1.3873591085798673, "learning_rate": 2.52054794520548e-06, "loss": 1.177, "step": 46 }, { "epoch": 0.0038675169718164985, "grad_norm": 3.6561002665760807, "learning_rate": 2.5753424657534247e-06, "loss": 1.1469, "step": 47 }, { "epoch": 0.00394980456696153, "grad_norm": 1.5450365482515196, "learning_rate": 2.63013698630137e-06, "loss": 1.1521, "step": 48 }, { "epoch": 0.004032092162106563, "grad_norm": 1.5565124011894804, "learning_rate": 2.6849315068493153e-06, "loss": 1.1589, "step": 49 }, { "epoch": 0.004114379757251594, "grad_norm": 0.6675144755255817, "learning_rate": 2.7397260273972604e-06, "loss": 0.6406, "step": 50 }, { "epoch": 0.004196667352396626, "grad_norm": 1.5292143908928457, "learning_rate": 2.794520547945206e-06, "loss": 1.1297, "step": 51 }, { "epoch": 0.004278954947541658, "grad_norm": 0.6502938857874467, "learning_rate": 2.849315068493151e-06, "loss": 0.6186, "step": 52 }, { "epoch": 0.00436124254268669, "grad_norm": 1.4333837148693778, "learning_rate": 2.9041095890410957e-06, "loss": 1.1303, "step": 53 }, { "epoch": 0.004443530137831722, "grad_norm": 1.4749791593345467, "learning_rate": 2.9589041095890413e-06, "loss": 1.1387, "step": 54 }, { "epoch": 0.004525817732976754, "grad_norm": 1.4998339630977238, "learning_rate": 3.0136986301369864e-06, "loss": 1.1857, "step": 55 }, { "epoch": 0.004608105328121785, "grad_norm": 1.5507431529256293, "learning_rate": 3.068493150684932e-06, "loss": 1.1487, "step": 56 }, { "epoch": 0.004690392923266818, "grad_norm": 1.6348282836598194, "learning_rate": 3.123287671232877e-06, "loss": 1.1641, "step": 57 }, { "epoch": 0.0047726805184118495, "grad_norm": 0.5752534532225031, "learning_rate": 3.1780821917808225e-06, "loss": 0.5701, "step": 58 }, { "epoch": 0.004854968113556881, "grad_norm": 1.6099812024773308, "learning_rate": 3.2328767123287676e-06, "loss": 1.1721, "step": 59 }, { "epoch": 0.004937255708701913, "grad_norm": 0.6408161226805661, "learning_rate": 3.2876712328767123e-06, "loss": 0.5998, "step": 60 }, { "epoch": 0.0050195433038469454, "grad_norm": 0.5617271278467075, "learning_rate": 3.342465753424658e-06, "loss": 0.6265, "step": 61 }, { "epoch": 0.005101830898991977, "grad_norm": 1.9160395609787255, "learning_rate": 3.397260273972603e-06, "loss": 1.1687, "step": 62 }, { "epoch": 0.005184118494137009, "grad_norm": 1.7944962743686514, "learning_rate": 3.4520547945205484e-06, "loss": 1.0999, "step": 63 }, { "epoch": 0.0052664060892820405, "grad_norm": 1.6550254402978586, "learning_rate": 3.5068493150684935e-06, "loss": 1.1283, "step": 64 }, { "epoch": 0.005348693684427073, "grad_norm": 2.06701106889446, "learning_rate": 3.5616438356164386e-06, "loss": 1.1449, "step": 65 }, { "epoch": 0.005430981279572105, "grad_norm": 1.334891505276627, "learning_rate": 3.616438356164384e-06, "loss": 1.0978, "step": 66 }, { "epoch": 0.005513268874717136, "grad_norm": 1.809032539584058, "learning_rate": 3.671232876712329e-06, "loss": 1.1172, "step": 67 }, { "epoch": 0.005595556469862168, "grad_norm": 0.5631162064075181, "learning_rate": 3.7260273972602743e-06, "loss": 0.5793, "step": 68 }, { "epoch": 0.0056778440650072, "grad_norm": 1.6486487445332147, "learning_rate": 3.7808219178082194e-06, "loss": 1.0659, "step": 69 }, { "epoch": 0.005760131660152232, "grad_norm": 1.7514518974861626, "learning_rate": 3.8356164383561645e-06, "loss": 1.1786, "step": 70 }, { "epoch": 0.005842419255297264, "grad_norm": 2.6958756773092887, "learning_rate": 3.89041095890411e-06, "loss": 1.1019, "step": 71 }, { "epoch": 0.005924706850442296, "grad_norm": 1.7803679070531404, "learning_rate": 3.945205479452055e-06, "loss": 1.0859, "step": 72 }, { "epoch": 0.006006994445587327, "grad_norm": 1.5059878641321802, "learning_rate": 4.000000000000001e-06, "loss": 1.0788, "step": 73 }, { "epoch": 0.00608928204073236, "grad_norm": 1.8716327109844846, "learning_rate": 4.054794520547946e-06, "loss": 1.1095, "step": 74 }, { "epoch": 0.0061715696358773916, "grad_norm": 1.5616475319286818, "learning_rate": 4.109589041095891e-06, "loss": 1.1278, "step": 75 }, { "epoch": 0.006253857231022423, "grad_norm": 1.493898527453622, "learning_rate": 4.164383561643836e-06, "loss": 1.104, "step": 76 }, { "epoch": 0.006336144826167455, "grad_norm": 1.8452837120263397, "learning_rate": 4.219178082191781e-06, "loss": 1.1095, "step": 77 }, { "epoch": 0.0064184324213124875, "grad_norm": 1.784319898693149, "learning_rate": 4.273972602739727e-06, "loss": 1.0949, "step": 78 }, { "epoch": 0.006500720016457519, "grad_norm": 2.137737098454538, "learning_rate": 4.328767123287671e-06, "loss": 1.1302, "step": 79 }, { "epoch": 0.006583007611602551, "grad_norm": 1.5914074135685312, "learning_rate": 4.383561643835616e-06, "loss": 1.0916, "step": 80 }, { "epoch": 0.0066652952067475825, "grad_norm": 2.3489068213528266, "learning_rate": 4.438356164383562e-06, "loss": 1.0729, "step": 81 }, { "epoch": 0.006747582801892615, "grad_norm": 2.073369039063705, "learning_rate": 4.493150684931507e-06, "loss": 1.0892, "step": 82 }, { "epoch": 0.006829870397037647, "grad_norm": 1.8770075428367665, "learning_rate": 4.5479452054794525e-06, "loss": 1.1187, "step": 83 }, { "epoch": 0.006912157992182678, "grad_norm": 4.506883747948483, "learning_rate": 4.602739726027398e-06, "loss": 1.0762, "step": 84 }, { "epoch": 0.00699444558732771, "grad_norm": 1.7209663187813125, "learning_rate": 4.657534246575343e-06, "loss": 1.1226, "step": 85 }, { "epoch": 0.007076733182472743, "grad_norm": 0.6052191270162426, "learning_rate": 4.712328767123288e-06, "loss": 0.6055, "step": 86 }, { "epoch": 0.007159020777617774, "grad_norm": 1.7994312730778819, "learning_rate": 4.767123287671233e-06, "loss": 1.0967, "step": 87 }, { "epoch": 0.007241308372762806, "grad_norm": 1.9304702595282108, "learning_rate": 4.821917808219179e-06, "loss": 1.1492, "step": 88 }, { "epoch": 0.007323595967907838, "grad_norm": 2.088564652992412, "learning_rate": 4.876712328767124e-06, "loss": 1.0985, "step": 89 }, { "epoch": 0.00740588356305287, "grad_norm": 1.8604994381662585, "learning_rate": 4.931506849315069e-06, "loss": 1.0923, "step": 90 }, { "epoch": 0.007488171158197902, "grad_norm": 0.5594391183994828, "learning_rate": 4.986301369863014e-06, "loss": 0.6021, "step": 91 }, { "epoch": 0.007570458753342934, "grad_norm": 1.7905925850647735, "learning_rate": 5.04109589041096e-06, "loss": 1.1047, "step": 92 }, { "epoch": 0.007652746348487965, "grad_norm": 2.5829004230758055, "learning_rate": 5.095890410958904e-06, "loss": 1.0856, "step": 93 }, { "epoch": 0.007735033943632997, "grad_norm": 2.8109366679812817, "learning_rate": 5.1506849315068494e-06, "loss": 1.0906, "step": 94 }, { "epoch": 0.00781732153877803, "grad_norm": 1.9488333893087777, "learning_rate": 5.2054794520547945e-06, "loss": 1.1174, "step": 95 }, { "epoch": 0.00789960913392306, "grad_norm": 1.8898489727850725, "learning_rate": 5.26027397260274e-06, "loss": 1.0764, "step": 96 }, { "epoch": 0.007981896729068093, "grad_norm": 1.9662220110655733, "learning_rate": 5.3150684931506856e-06, "loss": 1.0687, "step": 97 }, { "epoch": 0.008064184324213125, "grad_norm": 2.012210892740288, "learning_rate": 5.369863013698631e-06, "loss": 1.0688, "step": 98 }, { "epoch": 0.008146471919358156, "grad_norm": 2.0256582980555145, "learning_rate": 5.424657534246576e-06, "loss": 1.0435, "step": 99 }, { "epoch": 0.008228759514503189, "grad_norm": 2.3161294458478228, "learning_rate": 5.479452054794521e-06, "loss": 1.1027, "step": 100 }, { "epoch": 0.008311047109648221, "grad_norm": 2.159842764055281, "learning_rate": 5.534246575342466e-06, "loss": 1.0223, "step": 101 }, { "epoch": 0.008393334704793252, "grad_norm": 2.7342793057170964, "learning_rate": 5.589041095890412e-06, "loss": 1.0485, "step": 102 }, { "epoch": 0.008475622299938285, "grad_norm": 0.6133807544248717, "learning_rate": 5.643835616438357e-06, "loss": 0.5933, "step": 103 }, { "epoch": 0.008557909895083315, "grad_norm": 2.0957817610708593, "learning_rate": 5.698630136986302e-06, "loss": 1.084, "step": 104 }, { "epoch": 0.008640197490228348, "grad_norm": 3.0607800999765105, "learning_rate": 5.753424657534246e-06, "loss": 1.0369, "step": 105 }, { "epoch": 0.00872248508537338, "grad_norm": 2.3550652220766404, "learning_rate": 5.8082191780821915e-06, "loss": 1.0785, "step": 106 }, { "epoch": 0.008804772680518411, "grad_norm": 2.885362070393249, "learning_rate": 5.863013698630137e-06, "loss": 1.1143, "step": 107 }, { "epoch": 0.008887060275663444, "grad_norm": 2.726344088292101, "learning_rate": 5.9178082191780825e-06, "loss": 1.0423, "step": 108 }, { "epoch": 0.008969347870808476, "grad_norm": 2.720421039977678, "learning_rate": 5.972602739726028e-06, "loss": 1.0424, "step": 109 }, { "epoch": 0.009051635465953507, "grad_norm": 2.7737084246092043, "learning_rate": 6.027397260273973e-06, "loss": 1.0669, "step": 110 }, { "epoch": 0.00913392306109854, "grad_norm": 2.4862795852431696, "learning_rate": 6.082191780821919e-06, "loss": 1.0798, "step": 111 }, { "epoch": 0.00921621065624357, "grad_norm": 1.9953691894673529, "learning_rate": 6.136986301369864e-06, "loss": 1.0337, "step": 112 }, { "epoch": 0.009298498251388603, "grad_norm": 2.1734409375655908, "learning_rate": 6.191780821917809e-06, "loss": 1.0769, "step": 113 }, { "epoch": 0.009380785846533636, "grad_norm": 2.4691052918090457, "learning_rate": 6.246575342465754e-06, "loss": 1.0758, "step": 114 }, { "epoch": 0.009463073441678667, "grad_norm": 2.51765809469206, "learning_rate": 6.301369863013699e-06, "loss": 1.1065, "step": 115 }, { "epoch": 0.009545361036823699, "grad_norm": 2.3976820917439916, "learning_rate": 6.356164383561645e-06, "loss": 1.0454, "step": 116 }, { "epoch": 0.00962764863196873, "grad_norm": 0.5713752667519881, "learning_rate": 6.41095890410959e-06, "loss": 0.5767, "step": 117 }, { "epoch": 0.009709936227113762, "grad_norm": 2.9303587471653385, "learning_rate": 6.465753424657535e-06, "loss": 1.0596, "step": 118 }, { "epoch": 0.009792223822258795, "grad_norm": 2.625385971373383, "learning_rate": 6.5205479452054794e-06, "loss": 1.0694, "step": 119 }, { "epoch": 0.009874511417403826, "grad_norm": 2.6850490082257368, "learning_rate": 6.5753424657534245e-06, "loss": 1.0629, "step": 120 }, { "epoch": 0.009956799012548858, "grad_norm": 2.8941680627630575, "learning_rate": 6.630136986301371e-06, "loss": 1.0797, "step": 121 }, { "epoch": 0.010039086607693891, "grad_norm": 2.437227451528501, "learning_rate": 6.684931506849316e-06, "loss": 1.0446, "step": 122 }, { "epoch": 0.010121374202838922, "grad_norm": 4.2330170384868655, "learning_rate": 6.739726027397261e-06, "loss": 1.077, "step": 123 }, { "epoch": 0.010203661797983954, "grad_norm": 3.742681446646284, "learning_rate": 6.794520547945206e-06, "loss": 1.0578, "step": 124 }, { "epoch": 0.010285949393128985, "grad_norm": 2.905751102486295, "learning_rate": 6.849315068493151e-06, "loss": 1.0397, "step": 125 }, { "epoch": 0.010368236988274018, "grad_norm": 2.248809486049495, "learning_rate": 6.904109589041097e-06, "loss": 1.0057, "step": 126 }, { "epoch": 0.01045052458341905, "grad_norm": 2.793469113179832, "learning_rate": 6.958904109589042e-06, "loss": 1.0423, "step": 127 }, { "epoch": 0.010532812178564081, "grad_norm": 3.044433211099124, "learning_rate": 7.013698630136987e-06, "loss": 1.0519, "step": 128 }, { "epoch": 0.010615099773709114, "grad_norm": 3.453404138683163, "learning_rate": 7.068493150684932e-06, "loss": 1.0492, "step": 129 }, { "epoch": 0.010697387368854146, "grad_norm": 3.294896819292345, "learning_rate": 7.123287671232877e-06, "loss": 1.0186, "step": 130 }, { "epoch": 0.010779674963999177, "grad_norm": 2.652529510878711, "learning_rate": 7.178082191780823e-06, "loss": 1.0481, "step": 131 }, { "epoch": 0.01086196255914421, "grad_norm": 2.5635334133873835, "learning_rate": 7.232876712328768e-06, "loss": 1.0189, "step": 132 }, { "epoch": 0.01094425015428924, "grad_norm": 2.310822969570939, "learning_rate": 7.287671232876713e-06, "loss": 1.0804, "step": 133 }, { "epoch": 0.011026537749434273, "grad_norm": 2.7939745420750532, "learning_rate": 7.342465753424658e-06, "loss": 1.0731, "step": 134 }, { "epoch": 0.011108825344579305, "grad_norm": 10.159052417359996, "learning_rate": 7.397260273972603e-06, "loss": 1.0013, "step": 135 }, { "epoch": 0.011191112939724336, "grad_norm": 2.492104076947929, "learning_rate": 7.452054794520549e-06, "loss": 1.058, "step": 136 }, { "epoch": 0.011273400534869369, "grad_norm": 2.7323610574219512, "learning_rate": 7.506849315068494e-06, "loss": 1.0503, "step": 137 }, { "epoch": 0.0113556881300144, "grad_norm": 2.94667222448598, "learning_rate": 7.561643835616439e-06, "loss": 1.0283, "step": 138 }, { "epoch": 0.011437975725159432, "grad_norm": 4.017422542900321, "learning_rate": 7.616438356164384e-06, "loss": 1.0883, "step": 139 }, { "epoch": 0.011520263320304465, "grad_norm": 3.6715275879486633, "learning_rate": 7.671232876712329e-06, "loss": 1.0536, "step": 140 }, { "epoch": 0.011602550915449495, "grad_norm": 3.0172048685106603, "learning_rate": 7.726027397260276e-06, "loss": 1.055, "step": 141 }, { "epoch": 0.011684838510594528, "grad_norm": 3.077620329335805, "learning_rate": 7.78082191780822e-06, "loss": 1.0195, "step": 142 }, { "epoch": 0.01176712610573956, "grad_norm": 2.959594926294125, "learning_rate": 7.835616438356164e-06, "loss": 1.0369, "step": 143 }, { "epoch": 0.011849413700884591, "grad_norm": 5.2531338908420055, "learning_rate": 7.89041095890411e-06, "loss": 1.0524, "step": 144 }, { "epoch": 0.011931701296029624, "grad_norm": 2.9462988063147755, "learning_rate": 7.945205479452055e-06, "loss": 1.0258, "step": 145 }, { "epoch": 0.012013988891174655, "grad_norm": 2.835501864556677, "learning_rate": 8.000000000000001e-06, "loss": 1.0035, "step": 146 }, { "epoch": 0.012096276486319687, "grad_norm": 3.1002864915340798, "learning_rate": 8.054794520547946e-06, "loss": 1.0379, "step": 147 }, { "epoch": 0.01217856408146472, "grad_norm": 2.7184860323108464, "learning_rate": 8.109589041095892e-06, "loss": 1.0373, "step": 148 }, { "epoch": 0.01226085167660975, "grad_norm": 3.093424317685046, "learning_rate": 8.164383561643837e-06, "loss": 1.0559, "step": 149 }, { "epoch": 0.012343139271754783, "grad_norm": 2.9403313251924064, "learning_rate": 8.219178082191782e-06, "loss": 1.0312, "step": 150 }, { "epoch": 0.012425426866899816, "grad_norm": 3.334710236004298, "learning_rate": 8.273972602739727e-06, "loss": 1.032, "step": 151 }, { "epoch": 0.012507714462044846, "grad_norm": 3.754339855053731, "learning_rate": 8.328767123287672e-06, "loss": 1.007, "step": 152 }, { "epoch": 0.012590002057189879, "grad_norm": 3.468367068790295, "learning_rate": 8.383561643835617e-06, "loss": 1.0352, "step": 153 }, { "epoch": 0.01267228965233491, "grad_norm": 3.08946479512089, "learning_rate": 8.438356164383562e-06, "loss": 1.0285, "step": 154 }, { "epoch": 0.012754577247479942, "grad_norm": 2.7171722187405463, "learning_rate": 8.493150684931507e-06, "loss": 1.0355, "step": 155 }, { "epoch": 0.012836864842624975, "grad_norm": 2.9125857783989955, "learning_rate": 8.547945205479454e-06, "loss": 1.0383, "step": 156 }, { "epoch": 0.012919152437770006, "grad_norm": 3.431055558365553, "learning_rate": 8.602739726027397e-06, "loss": 0.9858, "step": 157 }, { "epoch": 0.013001440032915038, "grad_norm": 2.5695243675652906, "learning_rate": 8.657534246575343e-06, "loss": 1.0257, "step": 158 }, { "epoch": 0.013083727628060069, "grad_norm": 3.1403965108405645, "learning_rate": 8.712328767123288e-06, "loss": 1.0161, "step": 159 }, { "epoch": 0.013166015223205102, "grad_norm": 3.0914617102513535, "learning_rate": 8.767123287671233e-06, "loss": 1.0126, "step": 160 }, { "epoch": 0.013248302818350134, "grad_norm": 2.974266261740425, "learning_rate": 8.82191780821918e-06, "loss": 1.0146, "step": 161 }, { "epoch": 0.013330590413495165, "grad_norm": 4.453619610906972, "learning_rate": 8.876712328767125e-06, "loss": 1.01, "step": 162 }, { "epoch": 0.013412878008640198, "grad_norm": 3.3339134633525203, "learning_rate": 8.93150684931507e-06, "loss": 1.0164, "step": 163 }, { "epoch": 0.01349516560378523, "grad_norm": 3.096524915506246, "learning_rate": 8.986301369863015e-06, "loss": 1.0436, "step": 164 }, { "epoch": 0.013577453198930261, "grad_norm": 0.5714699105064062, "learning_rate": 9.04109589041096e-06, "loss": 0.5844, "step": 165 }, { "epoch": 0.013659740794075293, "grad_norm": 3.3053733088978294, "learning_rate": 9.095890410958905e-06, "loss": 1.01, "step": 166 }, { "epoch": 0.013742028389220324, "grad_norm": 3.042487650681917, "learning_rate": 9.15068493150685e-06, "loss": 1.0258, "step": 167 }, { "epoch": 0.013824315984365357, "grad_norm": 3.0826602321214267, "learning_rate": 9.205479452054795e-06, "loss": 1.0152, "step": 168 }, { "epoch": 0.01390660357951039, "grad_norm": 4.049305212778963, "learning_rate": 9.26027397260274e-06, "loss": 1.0344, "step": 169 }, { "epoch": 0.01398889117465542, "grad_norm": 2.262878129775452, "learning_rate": 9.315068493150685e-06, "loss": 0.9903, "step": 170 }, { "epoch": 0.014071178769800453, "grad_norm": 2.5478144837312904, "learning_rate": 9.36986301369863e-06, "loss": 1.0255, "step": 171 }, { "epoch": 0.014153466364945485, "grad_norm": 0.5963923221726043, "learning_rate": 9.424657534246576e-06, "loss": 0.5835, "step": 172 }, { "epoch": 0.014235753960090516, "grad_norm": 2.4229291883624775, "learning_rate": 9.47945205479452e-06, "loss": 0.9969, "step": 173 }, { "epoch": 0.014318041555235549, "grad_norm": 2.5861485778295563, "learning_rate": 9.534246575342466e-06, "loss": 1.0321, "step": 174 }, { "epoch": 0.01440032915038058, "grad_norm": 3.0535728376170868, "learning_rate": 9.589041095890411e-06, "loss": 1.0545, "step": 175 }, { "epoch": 0.014482616745525612, "grad_norm": 3.167624134264756, "learning_rate": 9.643835616438358e-06, "loss": 1.0212, "step": 176 }, { "epoch": 0.014564904340670645, "grad_norm": 2.532407359117499, "learning_rate": 9.698630136986303e-06, "loss": 1.0395, "step": 177 }, { "epoch": 0.014647191935815675, "grad_norm": 3.335905765902237, "learning_rate": 9.753424657534248e-06, "loss": 1.0444, "step": 178 }, { "epoch": 0.014729479530960708, "grad_norm": 2.6694368517880376, "learning_rate": 9.808219178082193e-06, "loss": 1.0609, "step": 179 }, { "epoch": 0.01481176712610574, "grad_norm": 2.4432476499205946, "learning_rate": 9.863013698630138e-06, "loss": 1.028, "step": 180 }, { "epoch": 0.014894054721250771, "grad_norm": 3.074867289580692, "learning_rate": 9.917808219178083e-06, "loss": 1.0277, "step": 181 }, { "epoch": 0.014976342316395804, "grad_norm": 2.8234239360995548, "learning_rate": 9.972602739726028e-06, "loss": 1.0145, "step": 182 }, { "epoch": 0.015058629911540835, "grad_norm": 2.7243533214462636, "learning_rate": 1.0027397260273975e-05, "loss": 0.9962, "step": 183 }, { "epoch": 0.015140917506685867, "grad_norm": 9.268831121545867, "learning_rate": 1.008219178082192e-05, "loss": 1.0202, "step": 184 }, { "epoch": 0.0152232051018309, "grad_norm": 0.6032487906705319, "learning_rate": 1.0136986301369864e-05, "loss": 0.5914, "step": 185 }, { "epoch": 0.01530549269697593, "grad_norm": 2.446903956621448, "learning_rate": 1.0191780821917809e-05, "loss": 1.0332, "step": 186 }, { "epoch": 0.015387780292120963, "grad_norm": 2.9898530283159857, "learning_rate": 1.0246575342465754e-05, "loss": 1.0058, "step": 187 }, { "epoch": 0.015470067887265994, "grad_norm": 3.1462756197093147, "learning_rate": 1.0301369863013699e-05, "loss": 0.9956, "step": 188 }, { "epoch": 0.015552355482411026, "grad_norm": 2.603677254795289, "learning_rate": 1.0356164383561644e-05, "loss": 1.0567, "step": 189 }, { "epoch": 0.01563464307755606, "grad_norm": 2.888609337531178, "learning_rate": 1.0410958904109589e-05, "loss": 1.0117, "step": 190 }, { "epoch": 0.01571693067270109, "grad_norm": 3.4481892347405694, "learning_rate": 1.0465753424657534e-05, "loss": 1.0312, "step": 191 }, { "epoch": 0.01579921826784612, "grad_norm": 2.723259220748936, "learning_rate": 1.052054794520548e-05, "loss": 1.0011, "step": 192 }, { "epoch": 0.015881505862991155, "grad_norm": 2.400388335266181, "learning_rate": 1.0575342465753426e-05, "loss": 1.0397, "step": 193 }, { "epoch": 0.015963793458136186, "grad_norm": 2.459799194471057, "learning_rate": 1.0630136986301371e-05, "loss": 1.0051, "step": 194 }, { "epoch": 0.016046081053281216, "grad_norm": 2.493367813709158, "learning_rate": 1.0684931506849316e-05, "loss": 0.9877, "step": 195 }, { "epoch": 0.01612836864842625, "grad_norm": 2.997365023733453, "learning_rate": 1.0739726027397261e-05, "loss": 0.9991, "step": 196 }, { "epoch": 0.01621065624357128, "grad_norm": 3.1534988892754927, "learning_rate": 1.0794520547945206e-05, "loss": 1.0088, "step": 197 }, { "epoch": 0.016292943838716312, "grad_norm": 0.7839570400001313, "learning_rate": 1.0849315068493152e-05, "loss": 0.5796, "step": 198 }, { "epoch": 0.016375231433861347, "grad_norm": 2.968831135340441, "learning_rate": 1.0904109589041097e-05, "loss": 1.0169, "step": 199 }, { "epoch": 0.016457519029006377, "grad_norm": 3.1769343467774736, "learning_rate": 1.0958904109589042e-05, "loss": 1.0097, "step": 200 }, { "epoch": 0.01653980662415141, "grad_norm": 2.941876345769733, "learning_rate": 1.1013698630136987e-05, "loss": 1.0021, "step": 201 }, { "epoch": 0.016622094219296443, "grad_norm": 3.3680817014108353, "learning_rate": 1.1068493150684932e-05, "loss": 1.0218, "step": 202 }, { "epoch": 0.016704381814441473, "grad_norm": 2.908397865551594, "learning_rate": 1.1123287671232879e-05, "loss": 0.9939, "step": 203 }, { "epoch": 0.016786669409586504, "grad_norm": 2.822395296594326, "learning_rate": 1.1178082191780824e-05, "loss": 1.0172, "step": 204 }, { "epoch": 0.016868957004731535, "grad_norm": 2.758365809402905, "learning_rate": 1.1232876712328769e-05, "loss": 1.05, "step": 205 }, { "epoch": 0.01695124459987657, "grad_norm": 2.9222144058188984, "learning_rate": 1.1287671232876714e-05, "loss": 1.0073, "step": 206 }, { "epoch": 0.0170335321950216, "grad_norm": 2.7763083571649547, "learning_rate": 1.1342465753424659e-05, "loss": 0.9958, "step": 207 }, { "epoch": 0.01711581979016663, "grad_norm": 0.9573751817349475, "learning_rate": 1.1397260273972604e-05, "loss": 0.6336, "step": 208 }, { "epoch": 0.017198107385311665, "grad_norm": 3.6768856466236857, "learning_rate": 1.1452054794520548e-05, "loss": 0.9839, "step": 209 }, { "epoch": 0.017280394980456696, "grad_norm": 0.6002615125347783, "learning_rate": 1.1506849315068493e-05, "loss": 0.5964, "step": 210 }, { "epoch": 0.017362682575601727, "grad_norm": 3.003839522918383, "learning_rate": 1.1561643835616438e-05, "loss": 1.0106, "step": 211 }, { "epoch": 0.01744497017074676, "grad_norm": 3.0141237654512305, "learning_rate": 1.1616438356164383e-05, "loss": 1.005, "step": 212 }, { "epoch": 0.017527257765891792, "grad_norm": 2.3380796106197583, "learning_rate": 1.1671232876712331e-05, "loss": 1.0025, "step": 213 }, { "epoch": 0.017609545361036823, "grad_norm": 2.749317750470713, "learning_rate": 1.1726027397260275e-05, "loss": 1.0208, "step": 214 }, { "epoch": 0.017691832956181857, "grad_norm": 2.5174324368341363, "learning_rate": 1.178082191780822e-05, "loss": 1.0225, "step": 215 }, { "epoch": 0.017774120551326888, "grad_norm": 2.6939469770631206, "learning_rate": 1.1835616438356165e-05, "loss": 1.0181, "step": 216 }, { "epoch": 0.01785640814647192, "grad_norm": 2.7969043874385218, "learning_rate": 1.189041095890411e-05, "loss": 1.0321, "step": 217 }, { "epoch": 0.017938695741616953, "grad_norm": 2.130515743950604, "learning_rate": 1.1945205479452055e-05, "loss": 0.9939, "step": 218 }, { "epoch": 0.018020983336761984, "grad_norm": 2.8848097718992296, "learning_rate": 1.2e-05, "loss": 1.0064, "step": 219 }, { "epoch": 0.018103270931907015, "grad_norm": 1.496463088281579, "learning_rate": 1.2054794520547945e-05, "loss": 0.6077, "step": 220 }, { "epoch": 0.018185558527052045, "grad_norm": 3.6292481030110935, "learning_rate": 1.210958904109589e-05, "loss": 1.0446, "step": 221 }, { "epoch": 0.01826784612219708, "grad_norm": 2.252792644024641, "learning_rate": 1.2164383561643837e-05, "loss": 0.9739, "step": 222 }, { "epoch": 0.01835013371734211, "grad_norm": 2.4478822538483755, "learning_rate": 1.2219178082191782e-05, "loss": 1.0131, "step": 223 }, { "epoch": 0.01843242131248714, "grad_norm": 2.559717897830331, "learning_rate": 1.2273972602739727e-05, "loss": 1.0394, "step": 224 }, { "epoch": 0.018514708907632176, "grad_norm": 2.869935242686829, "learning_rate": 1.2328767123287673e-05, "loss": 0.982, "step": 225 }, { "epoch": 0.018596996502777206, "grad_norm": 2.5009663006221974, "learning_rate": 1.2383561643835618e-05, "loss": 1.0108, "step": 226 }, { "epoch": 0.018679284097922237, "grad_norm": 2.9956405565150654, "learning_rate": 1.2438356164383563e-05, "loss": 0.9902, "step": 227 }, { "epoch": 0.01876157169306727, "grad_norm": 2.674322004514903, "learning_rate": 1.2493150684931508e-05, "loss": 0.9927, "step": 228 }, { "epoch": 0.018843859288212302, "grad_norm": 2.8674094236769583, "learning_rate": 1.2547945205479453e-05, "loss": 1.003, "step": 229 }, { "epoch": 0.018926146883357333, "grad_norm": 2.9710081363188703, "learning_rate": 1.2602739726027398e-05, "loss": 0.9844, "step": 230 }, { "epoch": 0.019008434478502367, "grad_norm": 2.98201549226896, "learning_rate": 1.2657534246575343e-05, "loss": 0.967, "step": 231 }, { "epoch": 0.019090722073647398, "grad_norm": 2.903452559676373, "learning_rate": 1.271232876712329e-05, "loss": 1.0102, "step": 232 }, { "epoch": 0.01917300966879243, "grad_norm": 2.5049333400477813, "learning_rate": 1.2767123287671235e-05, "loss": 1.0096, "step": 233 }, { "epoch": 0.01925529726393746, "grad_norm": 2.6342420325330522, "learning_rate": 1.282191780821918e-05, "loss": 0.9718, "step": 234 }, { "epoch": 0.019337584859082494, "grad_norm": 2.616314817819011, "learning_rate": 1.2876712328767125e-05, "loss": 0.9977, "step": 235 }, { "epoch": 0.019419872454227525, "grad_norm": 2.420031810864845, "learning_rate": 1.293150684931507e-05, "loss": 1.0117, "step": 236 }, { "epoch": 0.019502160049372556, "grad_norm": 2.9412487319960126, "learning_rate": 1.2986301369863015e-05, "loss": 1.0471, "step": 237 }, { "epoch": 0.01958444764451759, "grad_norm": 2.7984406162708906, "learning_rate": 1.3041095890410959e-05, "loss": 0.9501, "step": 238 }, { "epoch": 0.01966673523966262, "grad_norm": 4.841561737416111, "learning_rate": 1.3095890410958904e-05, "loss": 1.0138, "step": 239 }, { "epoch": 0.01974902283480765, "grad_norm": 2.1778156992905577, "learning_rate": 1.3150684931506849e-05, "loss": 1.0101, "step": 240 }, { "epoch": 0.019831310429952686, "grad_norm": 2.67809296527932, "learning_rate": 1.3205479452054794e-05, "loss": 0.982, "step": 241 }, { "epoch": 0.019913598025097717, "grad_norm": 2.738306662356033, "learning_rate": 1.3260273972602743e-05, "loss": 0.9953, "step": 242 }, { "epoch": 0.019995885620242747, "grad_norm": 3.69258760845872, "learning_rate": 1.3315068493150686e-05, "loss": 0.9933, "step": 243 }, { "epoch": 0.020078173215387782, "grad_norm": 3.4285570541743096, "learning_rate": 1.3369863013698631e-05, "loss": 0.9891, "step": 244 }, { "epoch": 0.020160460810532813, "grad_norm": 2.1884703037736175, "learning_rate": 1.3424657534246576e-05, "loss": 0.9615, "step": 245 }, { "epoch": 0.020242748405677843, "grad_norm": 2.278997433805173, "learning_rate": 1.3479452054794521e-05, "loss": 0.9984, "step": 246 }, { "epoch": 0.020325036000822878, "grad_norm": 0.9732502137516167, "learning_rate": 1.3534246575342466e-05, "loss": 0.5964, "step": 247 }, { "epoch": 0.02040732359596791, "grad_norm": 4.111007905694721, "learning_rate": 1.3589041095890412e-05, "loss": 1.03, "step": 248 }, { "epoch": 0.02048961119111294, "grad_norm": 2.104309544659177, "learning_rate": 1.3643835616438357e-05, "loss": 0.9696, "step": 249 }, { "epoch": 0.02057189878625797, "grad_norm": 2.5670779853119665, "learning_rate": 1.3698630136986302e-05, "loss": 0.9589, "step": 250 }, { "epoch": 0.020654186381403004, "grad_norm": 2.7898261074191777, "learning_rate": 1.3753424657534247e-05, "loss": 1.0084, "step": 251 }, { "epoch": 0.020736473976548035, "grad_norm": 3.2009246830375204, "learning_rate": 1.3808219178082194e-05, "loss": 0.9911, "step": 252 }, { "epoch": 0.020818761571693066, "grad_norm": 3.1563797863262777, "learning_rate": 1.3863013698630139e-05, "loss": 0.9947, "step": 253 }, { "epoch": 0.0209010491668381, "grad_norm": 3.193090081286074, "learning_rate": 1.3917808219178084e-05, "loss": 1.0069, "step": 254 }, { "epoch": 0.02098333676198313, "grad_norm": 5.521797116199944, "learning_rate": 1.3972602739726029e-05, "loss": 0.9842, "step": 255 }, { "epoch": 0.021065624357128162, "grad_norm": 1.243014761274919, "learning_rate": 1.4027397260273974e-05, "loss": 0.6147, "step": 256 }, { "epoch": 0.021147911952273196, "grad_norm": 3.191364616862045, "learning_rate": 1.4082191780821919e-05, "loss": 0.974, "step": 257 }, { "epoch": 0.021230199547418227, "grad_norm": 2.93570172220106, "learning_rate": 1.4136986301369864e-05, "loss": 0.9719, "step": 258 }, { "epoch": 0.021312487142563258, "grad_norm": 4.468162617805659, "learning_rate": 1.419178082191781e-05, "loss": 0.9904, "step": 259 }, { "epoch": 0.021394774737708292, "grad_norm": 2.2571244653960862, "learning_rate": 1.4246575342465754e-05, "loss": 0.9613, "step": 260 }, { "epoch": 0.021477062332853323, "grad_norm": 4.467563699694284, "learning_rate": 1.43013698630137e-05, "loss": 0.9944, "step": 261 }, { "epoch": 0.021559349927998354, "grad_norm": 0.68889362412214, "learning_rate": 1.4356164383561646e-05, "loss": 0.5789, "step": 262 }, { "epoch": 0.021641637523143385, "grad_norm": 0.6373164384054985, "learning_rate": 1.4410958904109591e-05, "loss": 0.5688, "step": 263 }, { "epoch": 0.02172392511828842, "grad_norm": 3.597782460566262, "learning_rate": 1.4465753424657537e-05, "loss": 0.9776, "step": 264 }, { "epoch": 0.02180621271343345, "grad_norm": 2.7541673143111347, "learning_rate": 1.4520547945205482e-05, "loss": 0.9927, "step": 265 }, { "epoch": 0.02188850030857848, "grad_norm": 0.6805788182804722, "learning_rate": 1.4575342465753427e-05, "loss": 0.5971, "step": 266 }, { "epoch": 0.021970787903723515, "grad_norm": 2.725379141853366, "learning_rate": 1.463013698630137e-05, "loss": 0.9675, "step": 267 }, { "epoch": 0.022053075498868546, "grad_norm": 4.08013853272879, "learning_rate": 1.4684931506849315e-05, "loss": 0.9786, "step": 268 }, { "epoch": 0.022135363094013576, "grad_norm": 2.5492247984913483, "learning_rate": 1.473972602739726e-05, "loss": 0.9988, "step": 269 }, { "epoch": 0.02221765068915861, "grad_norm": 3.8860413387854327, "learning_rate": 1.4794520547945205e-05, "loss": 0.9697, "step": 270 }, { "epoch": 0.02229993828430364, "grad_norm": 3.0719505820425925, "learning_rate": 1.484931506849315e-05, "loss": 0.9778, "step": 271 }, { "epoch": 0.022382225879448672, "grad_norm": 3.065813452275364, "learning_rate": 1.4904109589041097e-05, "loss": 1.0114, "step": 272 }, { "epoch": 0.022464513474593707, "grad_norm": 3.119520514603019, "learning_rate": 1.4958904109589042e-05, "loss": 1.0143, "step": 273 }, { "epoch": 0.022546801069738737, "grad_norm": 2.8059490672957823, "learning_rate": 1.5013698630136988e-05, "loss": 0.9815, "step": 274 }, { "epoch": 0.022629088664883768, "grad_norm": 2.6271007340037706, "learning_rate": 1.5068493150684933e-05, "loss": 1.0251, "step": 275 }, { "epoch": 0.0227113762600288, "grad_norm": 3.114887825941429, "learning_rate": 1.5123287671232878e-05, "loss": 0.9722, "step": 276 }, { "epoch": 0.022793663855173833, "grad_norm": 3.222134871844559, "learning_rate": 1.5178082191780823e-05, "loss": 0.9895, "step": 277 }, { "epoch": 0.022875951450318864, "grad_norm": 0.8596732284566506, "learning_rate": 1.5232876712328768e-05, "loss": 0.6421, "step": 278 }, { "epoch": 0.022958239045463895, "grad_norm": 2.688881192050172, "learning_rate": 1.5287671232876713e-05, "loss": 0.9709, "step": 279 }, { "epoch": 0.02304052664060893, "grad_norm": 0.5908184070761948, "learning_rate": 1.5342465753424658e-05, "loss": 0.5813, "step": 280 }, { "epoch": 0.02312281423575396, "grad_norm": 2.5626042733441565, "learning_rate": 1.5397260273972603e-05, "loss": 1.0054, "step": 281 }, { "epoch": 0.02320510183089899, "grad_norm": 0.6319032426639426, "learning_rate": 1.545205479452055e-05, "loss": 0.569, "step": 282 }, { "epoch": 0.023287389426044025, "grad_norm": 3.381429029921771, "learning_rate": 1.5506849315068497e-05, "loss": 0.9924, "step": 283 }, { "epoch": 0.023369677021189056, "grad_norm": 0.6893518849945868, "learning_rate": 1.556164383561644e-05, "loss": 0.5947, "step": 284 }, { "epoch": 0.023451964616334087, "grad_norm": 0.6030322287256665, "learning_rate": 1.5616438356164384e-05, "loss": 0.5849, "step": 285 }, { "epoch": 0.02353425221147912, "grad_norm": 2.584371231162671, "learning_rate": 1.567123287671233e-05, "loss": 1.0113, "step": 286 }, { "epoch": 0.023616539806624152, "grad_norm": 2.617374246670965, "learning_rate": 1.5726027397260274e-05, "loss": 0.9952, "step": 287 }, { "epoch": 0.023698827401769183, "grad_norm": 3.131756380862052, "learning_rate": 1.578082191780822e-05, "loss": 0.9978, "step": 288 }, { "epoch": 0.023781114996914217, "grad_norm": 0.7149086621817794, "learning_rate": 1.5835616438356164e-05, "loss": 0.6005, "step": 289 }, { "epoch": 0.023863402592059248, "grad_norm": 2.8572031223595804, "learning_rate": 1.589041095890411e-05, "loss": 0.9764, "step": 290 }, { "epoch": 0.02394569018720428, "grad_norm": 3.0067656548078525, "learning_rate": 1.5945205479452054e-05, "loss": 0.9931, "step": 291 }, { "epoch": 0.02402797778234931, "grad_norm": 2.9396448545767067, "learning_rate": 1.6000000000000003e-05, "loss": 1.0167, "step": 292 }, { "epoch": 0.024110265377494344, "grad_norm": 2.551576593689318, "learning_rate": 1.6054794520547948e-05, "loss": 0.9652, "step": 293 }, { "epoch": 0.024192552972639374, "grad_norm": 3.4929495312083376, "learning_rate": 1.6109589041095893e-05, "loss": 0.9741, "step": 294 }, { "epoch": 0.024274840567784405, "grad_norm": 0.5986861672946895, "learning_rate": 1.6164383561643838e-05, "loss": 0.5967, "step": 295 }, { "epoch": 0.02435712816292944, "grad_norm": 2.3369563375899163, "learning_rate": 1.6219178082191783e-05, "loss": 0.9541, "step": 296 }, { "epoch": 0.02443941575807447, "grad_norm": 3.115001072277964, "learning_rate": 1.6273972602739728e-05, "loss": 1.002, "step": 297 }, { "epoch": 0.0245217033532195, "grad_norm": 3.594307440216849, "learning_rate": 1.6328767123287673e-05, "loss": 0.9483, "step": 298 }, { "epoch": 0.024603990948364535, "grad_norm": 2.4315114201324977, "learning_rate": 1.638356164383562e-05, "loss": 0.9844, "step": 299 }, { "epoch": 0.024686278543509566, "grad_norm": 3.3312431748162528, "learning_rate": 1.6438356164383563e-05, "loss": 1.0031, "step": 300 }, { "epoch": 0.024768566138654597, "grad_norm": 2.7478721222497695, "learning_rate": 1.649315068493151e-05, "loss": 0.9942, "step": 301 }, { "epoch": 0.02485085373379963, "grad_norm": 2.7443057694383097, "learning_rate": 1.6547945205479454e-05, "loss": 0.9841, "step": 302 }, { "epoch": 0.024933141328944662, "grad_norm": 2.5333469665657797, "learning_rate": 1.66027397260274e-05, "loss": 0.9751, "step": 303 }, { "epoch": 0.025015428924089693, "grad_norm": 3.161735273370277, "learning_rate": 1.6657534246575344e-05, "loss": 0.9687, "step": 304 }, { "epoch": 0.025097716519234724, "grad_norm": 2.6737823247108183, "learning_rate": 1.671232876712329e-05, "loss": 0.9787, "step": 305 }, { "epoch": 0.025180004114379758, "grad_norm": 0.6510425400067263, "learning_rate": 1.6767123287671234e-05, "loss": 0.5622, "step": 306 }, { "epoch": 0.02526229170952479, "grad_norm": 4.574909987598007, "learning_rate": 1.682191780821918e-05, "loss": 0.9643, "step": 307 }, { "epoch": 0.02534457930466982, "grad_norm": 3.4438804774031935, "learning_rate": 1.6876712328767124e-05, "loss": 0.9615, "step": 308 }, { "epoch": 0.025426866899814854, "grad_norm": 2.9285136796976015, "learning_rate": 1.693150684931507e-05, "loss": 0.9527, "step": 309 }, { "epoch": 0.025509154494959885, "grad_norm": 2.779888649016243, "learning_rate": 1.6986301369863014e-05, "loss": 0.9544, "step": 310 }, { "epoch": 0.025591442090104916, "grad_norm": 2.7248520567063848, "learning_rate": 1.7041095890410963e-05, "loss": 0.9473, "step": 311 }, { "epoch": 0.02567372968524995, "grad_norm": 3.5709762174348954, "learning_rate": 1.7095890410958908e-05, "loss": 0.9575, "step": 312 }, { "epoch": 0.02575601728039498, "grad_norm": 3.0856327234258827, "learning_rate": 1.715068493150685e-05, "loss": 0.9652, "step": 313 }, { "epoch": 0.02583830487554001, "grad_norm": 2.2692448164089343, "learning_rate": 1.7205479452054795e-05, "loss": 0.9735, "step": 314 }, { "epoch": 0.025920592470685046, "grad_norm": 5.769054110868784, "learning_rate": 1.726027397260274e-05, "loss": 0.9703, "step": 315 }, { "epoch": 0.026002880065830077, "grad_norm": 2.508893910476298, "learning_rate": 1.7315068493150685e-05, "loss": 0.944, "step": 316 }, { "epoch": 0.026085167660975107, "grad_norm": 2.8832916992173767, "learning_rate": 1.736986301369863e-05, "loss": 0.9646, "step": 317 }, { "epoch": 0.026167455256120138, "grad_norm": 2.919174367177141, "learning_rate": 1.7424657534246575e-05, "loss": 0.9642, "step": 318 }, { "epoch": 0.026249742851265172, "grad_norm": 2.3758292544134068, "learning_rate": 1.747945205479452e-05, "loss": 0.9819, "step": 319 }, { "epoch": 0.026332030446410203, "grad_norm": 2.8844662683768822, "learning_rate": 1.7534246575342465e-05, "loss": 0.9757, "step": 320 }, { "epoch": 0.026414318041555234, "grad_norm": 2.2651505276443964, "learning_rate": 1.7589041095890414e-05, "loss": 0.9461, "step": 321 }, { "epoch": 0.02649660563670027, "grad_norm": 3.148064595511082, "learning_rate": 1.764383561643836e-05, "loss": 0.9457, "step": 322 }, { "epoch": 0.0265788932318453, "grad_norm": 2.593793697550568, "learning_rate": 1.7698630136986304e-05, "loss": 0.9564, "step": 323 }, { "epoch": 0.02666118082699033, "grad_norm": 3.5777764577994637, "learning_rate": 1.775342465753425e-05, "loss": 0.9585, "step": 324 }, { "epoch": 0.026743468422135364, "grad_norm": 2.5200344733829434, "learning_rate": 1.7808219178082194e-05, "loss": 0.9429, "step": 325 }, { "epoch": 0.026825756017280395, "grad_norm": 0.7344214528472546, "learning_rate": 1.786301369863014e-05, "loss": 0.6191, "step": 326 }, { "epoch": 0.026908043612425426, "grad_norm": 3.3825851018048962, "learning_rate": 1.7917808219178085e-05, "loss": 0.9739, "step": 327 }, { "epoch": 0.02699033120757046, "grad_norm": 2.4626600175420212, "learning_rate": 1.797260273972603e-05, "loss": 0.9813, "step": 328 }, { "epoch": 0.02707261880271549, "grad_norm": 2.604744324101538, "learning_rate": 1.8027397260273975e-05, "loss": 0.9605, "step": 329 }, { "epoch": 0.027154906397860522, "grad_norm": 2.3443898191922408, "learning_rate": 1.808219178082192e-05, "loss": 0.968, "step": 330 }, { "epoch": 0.027237193993005556, "grad_norm": 2.2972121260527274, "learning_rate": 1.8136986301369865e-05, "loss": 0.9636, "step": 331 }, { "epoch": 0.027319481588150587, "grad_norm": 0.6704215743863139, "learning_rate": 1.819178082191781e-05, "loss": 0.5832, "step": 332 }, { "epoch": 0.027401769183295618, "grad_norm": 2.5588332490587806, "learning_rate": 1.8246575342465755e-05, "loss": 0.967, "step": 333 }, { "epoch": 0.02748405677844065, "grad_norm": 0.5729720504764441, "learning_rate": 1.83013698630137e-05, "loss": 0.5796, "step": 334 }, { "epoch": 0.027566344373585683, "grad_norm": 0.536934165288964, "learning_rate": 1.8356164383561645e-05, "loss": 0.586, "step": 335 }, { "epoch": 0.027648631968730714, "grad_norm": 2.729927929300927, "learning_rate": 1.841095890410959e-05, "loss": 1.0006, "step": 336 }, { "epoch": 0.027730919563875744, "grad_norm": 2.9380300033617193, "learning_rate": 1.8465753424657535e-05, "loss": 0.9806, "step": 337 }, { "epoch": 0.02781320715902078, "grad_norm": 3.1871007449922595, "learning_rate": 1.852054794520548e-05, "loss": 1.0205, "step": 338 }, { "epoch": 0.02789549475416581, "grad_norm": 2.7551362648970454, "learning_rate": 1.8575342465753426e-05, "loss": 0.9843, "step": 339 }, { "epoch": 0.02797778234931084, "grad_norm": 2.341899316621362, "learning_rate": 1.863013698630137e-05, "loss": 0.9828, "step": 340 }, { "epoch": 0.028060069944455875, "grad_norm": 3.0041315739517143, "learning_rate": 1.8684931506849316e-05, "loss": 0.9599, "step": 341 }, { "epoch": 0.028142357539600905, "grad_norm": 1.098290342373438, "learning_rate": 1.873972602739726e-05, "loss": 0.5762, "step": 342 }, { "epoch": 0.028224645134745936, "grad_norm": 2.793401629061216, "learning_rate": 1.8794520547945206e-05, "loss": 0.9599, "step": 343 }, { "epoch": 0.02830693272989097, "grad_norm": 3.381992225466734, "learning_rate": 1.884931506849315e-05, "loss": 1.0128, "step": 344 }, { "epoch": 0.028389220325036, "grad_norm": 3.0552921674313107, "learning_rate": 1.8904109589041096e-05, "loss": 0.9683, "step": 345 }, { "epoch": 0.028471507920181032, "grad_norm": 2.59026883064129, "learning_rate": 1.895890410958904e-05, "loss": 0.9361, "step": 346 }, { "epoch": 0.028553795515326063, "grad_norm": 3.0842540515307473, "learning_rate": 1.9013698630136986e-05, "loss": 0.9697, "step": 347 }, { "epoch": 0.028636083110471097, "grad_norm": 2.443425049236279, "learning_rate": 1.906849315068493e-05, "loss": 0.9183, "step": 348 }, { "epoch": 0.028718370705616128, "grad_norm": 3.127867492745528, "learning_rate": 1.9123287671232877e-05, "loss": 0.9601, "step": 349 }, { "epoch": 0.02880065830076116, "grad_norm": 4.402570399866093, "learning_rate": 1.9178082191780822e-05, "loss": 0.9303, "step": 350 }, { "epoch": 0.028882945895906193, "grad_norm": 0.8543818428159927, "learning_rate": 1.923287671232877e-05, "loss": 0.5988, "step": 351 }, { "epoch": 0.028965233491051224, "grad_norm": 0.7093532126289934, "learning_rate": 1.9287671232876715e-05, "loss": 0.5831, "step": 352 }, { "epoch": 0.029047521086196255, "grad_norm": 0.6407564149823172, "learning_rate": 1.934246575342466e-05, "loss": 0.577, "step": 353 }, { "epoch": 0.02912980868134129, "grad_norm": 3.390283574742443, "learning_rate": 1.9397260273972606e-05, "loss": 0.9609, "step": 354 }, { "epoch": 0.02921209627648632, "grad_norm": 2.53734497566345, "learning_rate": 1.945205479452055e-05, "loss": 0.9909, "step": 355 }, { "epoch": 0.02929438387163135, "grad_norm": 1.0115473868573372, "learning_rate": 1.9506849315068496e-05, "loss": 0.6035, "step": 356 }, { "epoch": 0.029376671466776385, "grad_norm": 0.8686466035185451, "learning_rate": 1.956164383561644e-05, "loss": 0.5971, "step": 357 }, { "epoch": 0.029458959061921416, "grad_norm": 3.039718625814903, "learning_rate": 1.9616438356164386e-05, "loss": 0.9912, "step": 358 }, { "epoch": 0.029541246657066447, "grad_norm": 3.1175114788948473, "learning_rate": 1.967123287671233e-05, "loss": 0.9866, "step": 359 }, { "epoch": 0.02962353425221148, "grad_norm": 6.758106134116968, "learning_rate": 1.9726027397260276e-05, "loss": 0.9847, "step": 360 }, { "epoch": 0.02970582184735651, "grad_norm": 2.589972092841794, "learning_rate": 1.978082191780822e-05, "loss": 0.9565, "step": 361 }, { "epoch": 0.029788109442501542, "grad_norm": 1.073769179644345, "learning_rate": 1.9835616438356166e-05, "loss": 0.6201, "step": 362 }, { "epoch": 0.029870397037646573, "grad_norm": 2.620541255700163, "learning_rate": 1.989041095890411e-05, "loss": 0.9694, "step": 363 }, { "epoch": 0.029952684632791608, "grad_norm": 2.9983273469412, "learning_rate": 1.9945205479452057e-05, "loss": 0.9517, "step": 364 }, { "epoch": 0.03003497222793664, "grad_norm": 3.1705127831701176, "learning_rate": 2e-05, "loss": 0.9757, "step": 365 }, { "epoch": 0.03011725982308167, "grad_norm": 3.0769206086851493, "learning_rate": 1.9999999644807997e-05, "loss": 0.9725, "step": 366 }, { "epoch": 0.030199547418226703, "grad_norm": 2.6381794624352346, "learning_rate": 1.999999857923201e-05, "loss": 0.9579, "step": 367 }, { "epoch": 0.030281835013371734, "grad_norm": 2.524417719057271, "learning_rate": 1.999999680327212e-05, "loss": 0.9491, "step": 368 }, { "epoch": 0.030364122608516765, "grad_norm": 2.0772737485337958, "learning_rate": 1.9999994316928445e-05, "loss": 0.9802, "step": 369 }, { "epoch": 0.0304464102036618, "grad_norm": 0.695305872906948, "learning_rate": 1.9999991120201172e-05, "loss": 0.6179, "step": 370 }, { "epoch": 0.03052869779880683, "grad_norm": 2.034367122214282, "learning_rate": 1.999998721309052e-05, "loss": 0.9365, "step": 371 }, { "epoch": 0.03061098539395186, "grad_norm": 2.5094859416224096, "learning_rate": 1.999998259559677e-05, "loss": 0.9806, "step": 372 }, { "epoch": 0.030693272989096895, "grad_norm": 2.037387180631793, "learning_rate": 1.9999977267720245e-05, "loss": 0.9625, "step": 373 }, { "epoch": 0.030775560584241926, "grad_norm": 1.9827245047395246, "learning_rate": 1.999997122946133e-05, "loss": 0.996, "step": 374 }, { "epoch": 0.030857848179386957, "grad_norm": 2.000201005705768, "learning_rate": 1.9999964480820448e-05, "loss": 0.9247, "step": 375 }, { "epoch": 0.030940135774531988, "grad_norm": 2.237696098262905, "learning_rate": 1.999995702179809e-05, "loss": 0.9432, "step": 376 }, { "epoch": 0.031022423369677022, "grad_norm": 2.1572992959011668, "learning_rate": 1.999994885239477e-05, "loss": 0.9567, "step": 377 }, { "epoch": 0.031104710964822053, "grad_norm": 2.5949178993773656, "learning_rate": 1.999993997261108e-05, "loss": 0.9523, "step": 378 }, { "epoch": 0.031186998559967084, "grad_norm": 4.412522046641788, "learning_rate": 1.9999930382447644e-05, "loss": 0.9463, "step": 379 }, { "epoch": 0.03126928615511212, "grad_norm": 4.095975078147534, "learning_rate": 1.9999920081905148e-05, "loss": 0.9562, "step": 380 }, { "epoch": 0.03135157375025715, "grad_norm": 0.7238222599759508, "learning_rate": 1.999990907098432e-05, "loss": 0.6367, "step": 381 }, { "epoch": 0.03143386134540218, "grad_norm": 2.051737393292375, "learning_rate": 1.9999897349685948e-05, "loss": 0.9396, "step": 382 }, { "epoch": 0.03151614894054721, "grad_norm": 3.608873989338571, "learning_rate": 1.999988491801086e-05, "loss": 0.9427, "step": 383 }, { "epoch": 0.03159843653569224, "grad_norm": 0.5731166749659096, "learning_rate": 1.999987177595994e-05, "loss": 0.6066, "step": 384 }, { "epoch": 0.03168072413083728, "grad_norm": 2.7911800909686244, "learning_rate": 1.9999857923534117e-05, "loss": 0.9553, "step": 385 }, { "epoch": 0.03176301172598231, "grad_norm": 0.5640032520210956, "learning_rate": 1.9999843360734384e-05, "loss": 0.6089, "step": 386 }, { "epoch": 0.03184529932112734, "grad_norm": 3.218289339029279, "learning_rate": 1.999982808756177e-05, "loss": 1.002, "step": 387 }, { "epoch": 0.03192758691627237, "grad_norm": 0.5298496199217386, "learning_rate": 1.999981210401736e-05, "loss": 0.6014, "step": 388 }, { "epoch": 0.0320098745114174, "grad_norm": 2.1651032679205544, "learning_rate": 1.9999795410102288e-05, "loss": 0.977, "step": 389 }, { "epoch": 0.03209216210656243, "grad_norm": 3.0876660454466336, "learning_rate": 1.999977800581775e-05, "loss": 0.954, "step": 390 }, { "epoch": 0.03217444970170747, "grad_norm": 2.8016809296721186, "learning_rate": 1.999975989116497e-05, "loss": 0.9773, "step": 391 }, { "epoch": 0.0322567372968525, "grad_norm": 2.2686954346227584, "learning_rate": 1.999974106614524e-05, "loss": 0.9284, "step": 392 }, { "epoch": 0.03233902489199753, "grad_norm": 2.848599719139828, "learning_rate": 1.9999721530759896e-05, "loss": 0.9666, "step": 393 }, { "epoch": 0.03242131248714256, "grad_norm": 2.5480580332195792, "learning_rate": 1.9999701285010327e-05, "loss": 0.9748, "step": 394 }, { "epoch": 0.032503600082287594, "grad_norm": 3.0659568674712587, "learning_rate": 1.999968032889797e-05, "loss": 0.9773, "step": 395 }, { "epoch": 0.032585887677432625, "grad_norm": 3.2486686691126607, "learning_rate": 1.9999658662424318e-05, "loss": 0.9378, "step": 396 }, { "epoch": 0.032668175272577656, "grad_norm": 2.231555735516029, "learning_rate": 1.9999636285590903e-05, "loss": 0.9402, "step": 397 }, { "epoch": 0.03275046286772269, "grad_norm": 7.750954267677904, "learning_rate": 1.999961319839932e-05, "loss": 0.9212, "step": 398 }, { "epoch": 0.032832750462867724, "grad_norm": 3.9379616174216747, "learning_rate": 1.9999589400851208e-05, "loss": 0.957, "step": 399 }, { "epoch": 0.032915038058012755, "grad_norm": 3.09592161673104, "learning_rate": 1.9999564892948254e-05, "loss": 0.9644, "step": 400 }, { "epoch": 0.032997325653157786, "grad_norm": 0.6258510816084707, "learning_rate": 1.9999539674692206e-05, "loss": 0.6, "step": 401 }, { "epoch": 0.03307961324830282, "grad_norm": 2.757532242911201, "learning_rate": 1.9999513746084848e-05, "loss": 0.9627, "step": 402 }, { "epoch": 0.03316190084344785, "grad_norm": 0.518069489983011, "learning_rate": 1.999948710712803e-05, "loss": 0.5736, "step": 403 }, { "epoch": 0.033244188438592885, "grad_norm": 2.7302377830347293, "learning_rate": 1.9999459757823632e-05, "loss": 0.9452, "step": 404 }, { "epoch": 0.033326476033737916, "grad_norm": 3.8829507326351678, "learning_rate": 1.9999431698173614e-05, "loss": 0.9501, "step": 405 }, { "epoch": 0.03340876362888295, "grad_norm": 3.030860642634053, "learning_rate": 1.9999402928179953e-05, "loss": 0.935, "step": 406 }, { "epoch": 0.03349105122402798, "grad_norm": 2.7297517789446735, "learning_rate": 1.99993734478447e-05, "loss": 0.9816, "step": 407 }, { "epoch": 0.03357333881917301, "grad_norm": 2.9131211283428864, "learning_rate": 1.999934325716995e-05, "loss": 0.953, "step": 408 }, { "epoch": 0.03365562641431804, "grad_norm": 2.8724758175032457, "learning_rate": 1.999931235615785e-05, "loss": 0.9543, "step": 409 }, { "epoch": 0.03373791400946307, "grad_norm": 3.8558067751787894, "learning_rate": 1.999928074481059e-05, "loss": 0.9024, "step": 410 }, { "epoch": 0.03382020160460811, "grad_norm": 4.890426251595657, "learning_rate": 1.9999248423130414e-05, "loss": 0.9557, "step": 411 }, { "epoch": 0.03390248919975314, "grad_norm": 3.9224502088816307, "learning_rate": 1.9999215391119623e-05, "loss": 0.9625, "step": 412 }, { "epoch": 0.03398477679489817, "grad_norm": 4.121169405356662, "learning_rate": 1.9999181648780564e-05, "loss": 0.9836, "step": 413 }, { "epoch": 0.0340670643900432, "grad_norm": 3.2570143865225365, "learning_rate": 1.999914719611563e-05, "loss": 0.9548, "step": 414 }, { "epoch": 0.03414935198518823, "grad_norm": 0.8551591188426197, "learning_rate": 1.999911203312727e-05, "loss": 0.6257, "step": 415 }, { "epoch": 0.03423163958033326, "grad_norm": 2.282348243685617, "learning_rate": 1.9999076159817984e-05, "loss": 0.9534, "step": 416 }, { "epoch": 0.0343139271754783, "grad_norm": 3.1849388817078417, "learning_rate": 1.999903957619032e-05, "loss": 0.9559, "step": 417 }, { "epoch": 0.03439621477062333, "grad_norm": 3.0160267374462744, "learning_rate": 1.9999002282246877e-05, "loss": 0.9414, "step": 418 }, { "epoch": 0.03447850236576836, "grad_norm": 2.8630460192439484, "learning_rate": 1.99989642779903e-05, "loss": 0.97, "step": 419 }, { "epoch": 0.03456078996091339, "grad_norm": 0.6092993503428186, "learning_rate": 1.999892556342329e-05, "loss": 0.5762, "step": 420 }, { "epoch": 0.03464307755605842, "grad_norm": 3.558089457861364, "learning_rate": 1.9998886138548597e-05, "loss": 0.9674, "step": 421 }, { "epoch": 0.034725365151203454, "grad_norm": 0.5392883644170888, "learning_rate": 1.9998846003369028e-05, "loss": 0.6002, "step": 422 }, { "epoch": 0.03480765274634849, "grad_norm": 2.4265611825364175, "learning_rate": 1.9998805157887432e-05, "loss": 0.9469, "step": 423 }, { "epoch": 0.03488994034149352, "grad_norm": 2.5084390180607508, "learning_rate": 1.9998763602106704e-05, "loss": 0.9547, "step": 424 }, { "epoch": 0.03497222793663855, "grad_norm": 3.0592802155387284, "learning_rate": 1.99987213360298e-05, "loss": 0.9549, "step": 425 }, { "epoch": 0.035054515531783584, "grad_norm": 3.0606106243138353, "learning_rate": 1.9998678359659726e-05, "loss": 0.925, "step": 426 }, { "epoch": 0.035136803126928615, "grad_norm": 0.5614840770252022, "learning_rate": 1.999863467299953e-05, "loss": 0.6226, "step": 427 }, { "epoch": 0.035219090722073645, "grad_norm": 2.3274481514972636, "learning_rate": 1.9998590276052318e-05, "loss": 0.9627, "step": 428 }, { "epoch": 0.035301378317218676, "grad_norm": 0.5247325522573751, "learning_rate": 1.999854516882124e-05, "loss": 0.5626, "step": 429 }, { "epoch": 0.035383665912363714, "grad_norm": 2.4963541117374635, "learning_rate": 1.999849935130951e-05, "loss": 0.9198, "step": 430 }, { "epoch": 0.035465953507508745, "grad_norm": 2.470517097187284, "learning_rate": 1.999845282352037e-05, "loss": 0.9433, "step": 431 }, { "epoch": 0.035548241102653776, "grad_norm": 2.7560008424762183, "learning_rate": 1.9998405585457134e-05, "loss": 0.9428, "step": 432 }, { "epoch": 0.035630528697798806, "grad_norm": 2.7637029961336226, "learning_rate": 1.9998357637123157e-05, "loss": 0.942, "step": 433 }, { "epoch": 0.03571281629294384, "grad_norm": 2.9100289752309045, "learning_rate": 1.9998308978521842e-05, "loss": 0.9457, "step": 434 }, { "epoch": 0.03579510388808887, "grad_norm": 4.313071561196342, "learning_rate": 1.9998259609656645e-05, "loss": 0.9367, "step": 435 }, { "epoch": 0.035877391483233906, "grad_norm": 2.9430306639688384, "learning_rate": 1.999820953053108e-05, "loss": 0.9292, "step": 436 }, { "epoch": 0.03595967907837894, "grad_norm": 3.336500502830984, "learning_rate": 1.9998158741148695e-05, "loss": 0.9517, "step": 437 }, { "epoch": 0.03604196667352397, "grad_norm": 2.830315148432978, "learning_rate": 1.99981072415131e-05, "loss": 0.9619, "step": 438 }, { "epoch": 0.036124254268669, "grad_norm": 2.9628110908182506, "learning_rate": 1.9998055031627964e-05, "loss": 0.9342, "step": 439 }, { "epoch": 0.03620654186381403, "grad_norm": 5.046468138436623, "learning_rate": 1.9998002111496986e-05, "loss": 0.9577, "step": 440 }, { "epoch": 0.03628882945895906, "grad_norm": 3.1781915402537324, "learning_rate": 1.9997948481123925e-05, "loss": 0.9275, "step": 441 }, { "epoch": 0.03637111705410409, "grad_norm": 3.291481831836819, "learning_rate": 1.9997894140512595e-05, "loss": 0.9504, "step": 442 }, { "epoch": 0.03645340464924913, "grad_norm": 3.1084220240196254, "learning_rate": 1.9997839089666854e-05, "loss": 0.9236, "step": 443 }, { "epoch": 0.03653569224439416, "grad_norm": 3.1887037749162093, "learning_rate": 1.9997783328590613e-05, "loss": 0.8855, "step": 444 }, { "epoch": 0.03661797983953919, "grad_norm": 3.305256714504642, "learning_rate": 1.9997726857287834e-05, "loss": 0.9552, "step": 445 }, { "epoch": 0.03670026743468422, "grad_norm": 4.754531864085289, "learning_rate": 1.9997669675762528e-05, "loss": 0.9504, "step": 446 }, { "epoch": 0.03678255502982925, "grad_norm": 2.474649426046985, "learning_rate": 1.9997611784018754e-05, "loss": 0.9518, "step": 447 }, { "epoch": 0.03686484262497428, "grad_norm": 2.880288649426941, "learning_rate": 1.9997553182060633e-05, "loss": 0.8702, "step": 448 }, { "epoch": 0.03694713022011932, "grad_norm": 2.9619541365703976, "learning_rate": 1.999749386989232e-05, "loss": 0.948, "step": 449 }, { "epoch": 0.03702941781526435, "grad_norm": 3.0040457692945552, "learning_rate": 1.999743384751803e-05, "loss": 0.9161, "step": 450 }, { "epoch": 0.03711170541040938, "grad_norm": 0.6917840645754628, "learning_rate": 1.999737311494203e-05, "loss": 0.5999, "step": 451 }, { "epoch": 0.03719399300555441, "grad_norm": 2.500969399378362, "learning_rate": 1.9997311672168632e-05, "loss": 0.9321, "step": 452 }, { "epoch": 0.037276280600699443, "grad_norm": 3.4756867592830076, "learning_rate": 1.99972495192022e-05, "loss": 0.9468, "step": 453 }, { "epoch": 0.037358568195844474, "grad_norm": 2.4507954914499974, "learning_rate": 1.9997186656047154e-05, "loss": 0.9367, "step": 454 }, { "epoch": 0.037440855790989505, "grad_norm": 2.3319357748120066, "learning_rate": 1.9997123082707954e-05, "loss": 0.9506, "step": 455 }, { "epoch": 0.03752314338613454, "grad_norm": 2.4614553831803896, "learning_rate": 1.999705879918912e-05, "loss": 0.9812, "step": 456 }, { "epoch": 0.037605430981279574, "grad_norm": 2.7421103733102665, "learning_rate": 1.999699380549521e-05, "loss": 0.975, "step": 457 }, { "epoch": 0.037687718576424604, "grad_norm": 3.193134683800622, "learning_rate": 1.9996928101630853e-05, "loss": 0.9462, "step": 458 }, { "epoch": 0.037770006171569635, "grad_norm": 2.4788434065823353, "learning_rate": 1.999686168760071e-05, "loss": 0.9442, "step": 459 }, { "epoch": 0.037852293766714666, "grad_norm": 2.67715161966991, "learning_rate": 1.99967945634095e-05, "loss": 0.9497, "step": 460 }, { "epoch": 0.0379345813618597, "grad_norm": 2.8286753306256234, "learning_rate": 1.9996726729061995e-05, "loss": 0.9371, "step": 461 }, { "epoch": 0.038016868957004735, "grad_norm": 2.494636914608068, "learning_rate": 1.999665818456301e-05, "loss": 0.9369, "step": 462 }, { "epoch": 0.038099156552149765, "grad_norm": 3.3684641604813312, "learning_rate": 1.9996588929917413e-05, "loss": 0.9167, "step": 463 }, { "epoch": 0.038181444147294796, "grad_norm": 2.8300347810651836, "learning_rate": 1.9996518965130126e-05, "loss": 0.96, "step": 464 }, { "epoch": 0.03826373174243983, "grad_norm": 2.7216914732590634, "learning_rate": 1.9996448290206117e-05, "loss": 0.9587, "step": 465 }, { "epoch": 0.03834601933758486, "grad_norm": 2.8897584926398223, "learning_rate": 1.999637690515041e-05, "loss": 0.9424, "step": 466 }, { "epoch": 0.03842830693272989, "grad_norm": 2.6782745713753364, "learning_rate": 1.9996304809968074e-05, "loss": 0.9421, "step": 467 }, { "epoch": 0.03851059452787492, "grad_norm": 0.8391702922649521, "learning_rate": 1.9996232004664232e-05, "loss": 0.6291, "step": 468 }, { "epoch": 0.03859288212301996, "grad_norm": 2.9110538284406213, "learning_rate": 1.9996158489244054e-05, "loss": 0.9548, "step": 469 }, { "epoch": 0.03867516971816499, "grad_norm": 2.9735024191976813, "learning_rate": 1.9996084263712764e-05, "loss": 0.9397, "step": 470 }, { "epoch": 0.03875745731331002, "grad_norm": 2.459802449779267, "learning_rate": 1.9996009328075635e-05, "loss": 0.9516, "step": 471 }, { "epoch": 0.03883974490845505, "grad_norm": 1.4795476906818943, "learning_rate": 1.999593368233799e-05, "loss": 0.6175, "step": 472 }, { "epoch": 0.03892203250360008, "grad_norm": 2.7329559825050844, "learning_rate": 1.9995857326505202e-05, "loss": 0.9279, "step": 473 }, { "epoch": 0.03900432009874511, "grad_norm": 2.7310837617231307, "learning_rate": 1.999578026058269e-05, "loss": 0.9325, "step": 474 }, { "epoch": 0.03908660769389015, "grad_norm": 3.580150174543716, "learning_rate": 1.999570248457594e-05, "loss": 0.9403, "step": 475 }, { "epoch": 0.03916889528903518, "grad_norm": 3.518367412394758, "learning_rate": 1.9995623998490473e-05, "loss": 0.9346, "step": 476 }, { "epoch": 0.03925118288418021, "grad_norm": 2.1655004063703167, "learning_rate": 1.999554480233186e-05, "loss": 0.9294, "step": 477 }, { "epoch": 0.03933347047932524, "grad_norm": 2.857429287491222, "learning_rate": 1.9995464896105727e-05, "loss": 0.9201, "step": 478 }, { "epoch": 0.03941575807447027, "grad_norm": 2.3230944603500094, "learning_rate": 1.999538427981776e-05, "loss": 0.9172, "step": 479 }, { "epoch": 0.0394980456696153, "grad_norm": 2.686091492583088, "learning_rate": 1.9995302953473673e-05, "loss": 0.7009, "step": 480 }, { "epoch": 0.039580333264760334, "grad_norm": 2.5370139223659445, "learning_rate": 1.999522091707925e-05, "loss": 0.9547, "step": 481 }, { "epoch": 0.03966262085990537, "grad_norm": 2.9114624346952787, "learning_rate": 1.9995138170640322e-05, "loss": 0.9309, "step": 482 }, { "epoch": 0.0397449084550504, "grad_norm": 2.636772148383987, "learning_rate": 1.9995054714162757e-05, "loss": 0.9224, "step": 483 }, { "epoch": 0.03982719605019543, "grad_norm": 2.3887969483327005, "learning_rate": 1.9994970547652495e-05, "loss": 0.9509, "step": 484 }, { "epoch": 0.039909483645340464, "grad_norm": 2.9497130431080256, "learning_rate": 1.9994885671115506e-05, "loss": 0.9693, "step": 485 }, { "epoch": 0.039991771240485495, "grad_norm": 2.225873777913106, "learning_rate": 1.9994800084557826e-05, "loss": 0.9382, "step": 486 }, { "epoch": 0.040074058835630526, "grad_norm": 3.015548118510522, "learning_rate": 1.9994713787985534e-05, "loss": 0.9084, "step": 487 }, { "epoch": 0.040156346430775564, "grad_norm": 3.2147762822609787, "learning_rate": 1.9994626781404754e-05, "loss": 0.9432, "step": 488 }, { "epoch": 0.040238634025920594, "grad_norm": 2.732749831828487, "learning_rate": 1.9994539064821676e-05, "loss": 0.9493, "step": 489 }, { "epoch": 0.040320921621065625, "grad_norm": 2.718095114325169, "learning_rate": 1.9994450638242524e-05, "loss": 0.6999, "step": 490 }, { "epoch": 0.040403209216210656, "grad_norm": 1.192110613853859, "learning_rate": 1.9994361501673586e-05, "loss": 0.606, "step": 491 }, { "epoch": 0.04048549681135569, "grad_norm": 2.6545275290481523, "learning_rate": 1.9994271655121187e-05, "loss": 0.9562, "step": 492 }, { "epoch": 0.04056778440650072, "grad_norm": 2.6306786770452217, "learning_rate": 1.999418109859171e-05, "loss": 0.932, "step": 493 }, { "epoch": 0.040650072001645755, "grad_norm": 0.7723300623794189, "learning_rate": 1.99940898320916e-05, "loss": 0.6167, "step": 494 }, { "epoch": 0.040732359596790786, "grad_norm": 3.4539680548732075, "learning_rate": 1.9993997855627323e-05, "loss": 0.9547, "step": 495 }, { "epoch": 0.04081464719193582, "grad_norm": 8.174151834055909, "learning_rate": 1.9993905169205425e-05, "loss": 0.9532, "step": 496 }, { "epoch": 0.04089693478708085, "grad_norm": 2.4333462034983517, "learning_rate": 1.9993811772832487e-05, "loss": 0.9201, "step": 497 }, { "epoch": 0.04097922238222588, "grad_norm": 2.621241890180304, "learning_rate": 1.9993717666515143e-05, "loss": 0.9336, "step": 498 }, { "epoch": 0.04106150997737091, "grad_norm": 2.8830815398438308, "learning_rate": 1.999362285026008e-05, "loss": 0.9254, "step": 499 }, { "epoch": 0.04114379757251594, "grad_norm": 3.0315366250694136, "learning_rate": 1.9993527324074028e-05, "loss": 0.9272, "step": 500 }, { "epoch": 0.04122608516766098, "grad_norm": 2.657554413096405, "learning_rate": 1.999343108796378e-05, "loss": 0.9462, "step": 501 }, { "epoch": 0.04130837276280601, "grad_norm": 2.905472644448609, "learning_rate": 1.999333414193617e-05, "loss": 0.9034, "step": 502 }, { "epoch": 0.04139066035795104, "grad_norm": 3.925086807406567, "learning_rate": 1.9993236485998085e-05, "loss": 0.9315, "step": 503 }, { "epoch": 0.04147294795309607, "grad_norm": 3.0313048521155146, "learning_rate": 1.999313812015646e-05, "loss": 0.9535, "step": 504 }, { "epoch": 0.0415552355482411, "grad_norm": 2.962993951360446, "learning_rate": 1.9993039044418286e-05, "loss": 0.9309, "step": 505 }, { "epoch": 0.04163752314338613, "grad_norm": 0.6779011051688715, "learning_rate": 1.99929392587906e-05, "loss": 0.5869, "step": 506 }, { "epoch": 0.04171981073853117, "grad_norm": 2.579639640184937, "learning_rate": 1.9992838763280488e-05, "loss": 0.9118, "step": 507 }, { "epoch": 0.0418020983336762, "grad_norm": 2.1450772300859655, "learning_rate": 1.9992737557895093e-05, "loss": 0.932, "step": 508 }, { "epoch": 0.04188438592882123, "grad_norm": 2.4058977622816977, "learning_rate": 1.9992635642641605e-05, "loss": 0.9301, "step": 509 }, { "epoch": 0.04196667352396626, "grad_norm": 2.4723871593300584, "learning_rate": 1.999253301752726e-05, "loss": 0.9362, "step": 510 }, { "epoch": 0.04204896111911129, "grad_norm": 2.7787980954607616, "learning_rate": 1.999242968255935e-05, "loss": 0.949, "step": 511 }, { "epoch": 0.042131248714256324, "grad_norm": 2.7091957078534783, "learning_rate": 1.9992325637745214e-05, "loss": 0.8939, "step": 512 }, { "epoch": 0.042213536309401355, "grad_norm": 3.104398485557938, "learning_rate": 1.9992220883092247e-05, "loss": 0.9201, "step": 513 }, { "epoch": 0.04229582390454639, "grad_norm": 2.688893801232366, "learning_rate": 1.9992115418607886e-05, "loss": 0.9314, "step": 514 }, { "epoch": 0.04237811149969142, "grad_norm": 0.6175757936794599, "learning_rate": 1.999200924429963e-05, "loss": 0.5823, "step": 515 }, { "epoch": 0.042460399094836454, "grad_norm": 2.134638530502557, "learning_rate": 1.9991902360175017e-05, "loss": 0.8988, "step": 516 }, { "epoch": 0.042542686689981485, "grad_norm": 2.660777130272323, "learning_rate": 1.9991794766241638e-05, "loss": 0.9058, "step": 517 }, { "epoch": 0.042624974285126516, "grad_norm": 2.519959303045957, "learning_rate": 1.9991686462507137e-05, "loss": 0.9157, "step": 518 }, { "epoch": 0.042707261880271546, "grad_norm": 0.5033254525320345, "learning_rate": 1.9991577448979213e-05, "loss": 0.5637, "step": 519 }, { "epoch": 0.042789549475416584, "grad_norm": 2.3638963921206777, "learning_rate": 1.9991467725665604e-05, "loss": 0.9532, "step": 520 }, { "epoch": 0.042871837070561615, "grad_norm": 2.760667379358993, "learning_rate": 1.9991357292574106e-05, "loss": 0.9194, "step": 521 }, { "epoch": 0.042954124665706646, "grad_norm": 2.285449190484726, "learning_rate": 1.9991246149712564e-05, "loss": 0.854, "step": 522 }, { "epoch": 0.04303641226085168, "grad_norm": 2.9222709070685315, "learning_rate": 1.9991134297088877e-05, "loss": 0.9534, "step": 523 }, { "epoch": 0.04311869985599671, "grad_norm": 3.1630611007009355, "learning_rate": 1.9991021734710988e-05, "loss": 0.9505, "step": 524 }, { "epoch": 0.04320098745114174, "grad_norm": 3.174869013367673, "learning_rate": 1.999090846258689e-05, "loss": 0.964, "step": 525 }, { "epoch": 0.04328327504628677, "grad_norm": 2.4328576962151693, "learning_rate": 1.9990794480724634e-05, "loss": 0.9084, "step": 526 }, { "epoch": 0.04336556264143181, "grad_norm": 0.5700103881605539, "learning_rate": 1.9990679789132317e-05, "loss": 0.5734, "step": 527 }, { "epoch": 0.04344785023657684, "grad_norm": 2.392627489613796, "learning_rate": 1.9990564387818087e-05, "loss": 0.916, "step": 528 }, { "epoch": 0.04353013783172187, "grad_norm": 3.2074775648239453, "learning_rate": 1.999044827679014e-05, "loss": 0.9095, "step": 529 }, { "epoch": 0.0436124254268669, "grad_norm": 3.140601191667111, "learning_rate": 1.999033145605672e-05, "loss": 0.904, "step": 530 }, { "epoch": 0.04369471302201193, "grad_norm": 2.3743918081273505, "learning_rate": 1.9990213925626135e-05, "loss": 0.9173, "step": 531 }, { "epoch": 0.04377700061715696, "grad_norm": 2.803625633325397, "learning_rate": 1.999009568550673e-05, "loss": 0.9425, "step": 532 }, { "epoch": 0.043859288212302, "grad_norm": 2.624304052527756, "learning_rate": 1.9989976735706903e-05, "loss": 0.8778, "step": 533 }, { "epoch": 0.04394157580744703, "grad_norm": 3.611007788459353, "learning_rate": 1.9989857076235105e-05, "loss": 0.9454, "step": 534 }, { "epoch": 0.04402386340259206, "grad_norm": 3.0477796789876885, "learning_rate": 1.9989736707099836e-05, "loss": 0.9301, "step": 535 }, { "epoch": 0.04410615099773709, "grad_norm": 3.661229035903915, "learning_rate": 1.998961562830965e-05, "loss": 0.9234, "step": 536 }, { "epoch": 0.04418843859288212, "grad_norm": 3.014314493078093, "learning_rate": 1.9989493839873144e-05, "loss": 0.9205, "step": 537 }, { "epoch": 0.04427072618802715, "grad_norm": 3.1607667446866348, "learning_rate": 1.998937134179897e-05, "loss": 0.9184, "step": 538 }, { "epoch": 0.044353013783172184, "grad_norm": 0.5679302245778807, "learning_rate": 1.9989248134095835e-05, "loss": 0.5808, "step": 539 }, { "epoch": 0.04443530137831722, "grad_norm": 3.4927267069905827, "learning_rate": 1.9989124216772486e-05, "loss": 0.9068, "step": 540 }, { "epoch": 0.04451758897346225, "grad_norm": 3.2792902354283524, "learning_rate": 1.9988999589837727e-05, "loss": 0.9441, "step": 541 }, { "epoch": 0.04459987656860728, "grad_norm": 3.2813608886269465, "learning_rate": 1.9988874253300415e-05, "loss": 0.9135, "step": 542 }, { "epoch": 0.044682164163752314, "grad_norm": 3.6532563430030387, "learning_rate": 1.9988748207169448e-05, "loss": 0.9124, "step": 543 }, { "epoch": 0.044764451758897345, "grad_norm": 3.0411510483789708, "learning_rate": 1.9988621451453783e-05, "loss": 0.9437, "step": 544 }, { "epoch": 0.044846739354042375, "grad_norm": 2.947067350806481, "learning_rate": 1.9988493986162426e-05, "loss": 0.9377, "step": 545 }, { "epoch": 0.04492902694918741, "grad_norm": 3.733984375480931, "learning_rate": 1.9988365811304434e-05, "loss": 0.9302, "step": 546 }, { "epoch": 0.045011314544332444, "grad_norm": 0.5973399530190582, "learning_rate": 1.99882369268889e-05, "loss": 0.5985, "step": 547 }, { "epoch": 0.045093602139477475, "grad_norm": 3.1946558451893483, "learning_rate": 1.9988107332924997e-05, "loss": 0.9306, "step": 548 }, { "epoch": 0.045175889734622506, "grad_norm": 3.0518182224655184, "learning_rate": 1.998797702942192e-05, "loss": 0.9238, "step": 549 }, { "epoch": 0.045258177329767536, "grad_norm": 0.5186994011171457, "learning_rate": 1.9987846016388927e-05, "loss": 0.5534, "step": 550 }, { "epoch": 0.04534046492491257, "grad_norm": 2.9538180602678072, "learning_rate": 1.9987714293835326e-05, "loss": 0.9131, "step": 551 }, { "epoch": 0.0454227525200576, "grad_norm": 3.583039419798021, "learning_rate": 1.9987581861770476e-05, "loss": 0.931, "step": 552 }, { "epoch": 0.045505040115202636, "grad_norm": 3.872167117824797, "learning_rate": 1.9987448720203783e-05, "loss": 0.9149, "step": 553 }, { "epoch": 0.045587327710347667, "grad_norm": 0.5153323660807152, "learning_rate": 1.9987314869144704e-05, "loss": 0.5707, "step": 554 }, { "epoch": 0.0456696153054927, "grad_norm": 3.2458016621373162, "learning_rate": 1.9987180308602752e-05, "loss": 0.9481, "step": 555 }, { "epoch": 0.04575190290063773, "grad_norm": 0.5131089745749331, "learning_rate": 1.998704503858748e-05, "loss": 0.6107, "step": 556 }, { "epoch": 0.04583419049578276, "grad_norm": 3.826718669936501, "learning_rate": 1.99869090591085e-05, "loss": 0.9334, "step": 557 }, { "epoch": 0.04591647809092779, "grad_norm": 2.808877894852513, "learning_rate": 1.9986772370175475e-05, "loss": 0.9313, "step": 558 }, { "epoch": 0.04599876568607283, "grad_norm": 3.429756806838896, "learning_rate": 1.998663497179811e-05, "loss": 0.9041, "step": 559 }, { "epoch": 0.04608105328121786, "grad_norm": 3.927553685701978, "learning_rate": 1.998649686398617e-05, "loss": 0.9229, "step": 560 }, { "epoch": 0.04616334087636289, "grad_norm": 4.358404357254217, "learning_rate": 1.9986358046749463e-05, "loss": 0.9453, "step": 561 }, { "epoch": 0.04624562847150792, "grad_norm": 0.6974205247527027, "learning_rate": 1.998621852009785e-05, "loss": 0.582, "step": 562 }, { "epoch": 0.04632791606665295, "grad_norm": 2.8790199811794213, "learning_rate": 1.9986078284041245e-05, "loss": 0.9073, "step": 563 }, { "epoch": 0.04641020366179798, "grad_norm": 3.1507198941552343, "learning_rate": 1.998593733858961e-05, "loss": 0.9285, "step": 564 }, { "epoch": 0.04649249125694301, "grad_norm": 3.3010925203438757, "learning_rate": 1.9985795683752955e-05, "loss": 0.8975, "step": 565 }, { "epoch": 0.04657477885208805, "grad_norm": 2.4173724120050277, "learning_rate": 1.9985653319541345e-05, "loss": 0.9211, "step": 566 }, { "epoch": 0.04665706644723308, "grad_norm": 3.219239778661617, "learning_rate": 1.9985510245964894e-05, "loss": 0.9414, "step": 567 }, { "epoch": 0.04673935404237811, "grad_norm": 4.702680418398121, "learning_rate": 1.9985366463033763e-05, "loss": 0.8886, "step": 568 }, { "epoch": 0.04682164163752314, "grad_norm": 2.946137626961066, "learning_rate": 1.9985221970758166e-05, "loss": 0.907, "step": 569 }, { "epoch": 0.04690392923266817, "grad_norm": 3.1637086789258224, "learning_rate": 1.9985076769148373e-05, "loss": 0.9063, "step": 570 }, { "epoch": 0.046986216827813204, "grad_norm": 2.7457117180469286, "learning_rate": 1.9984930858214695e-05, "loss": 0.9163, "step": 571 }, { "epoch": 0.04706850442295824, "grad_norm": 2.8795617581547597, "learning_rate": 1.9984784237967495e-05, "loss": 0.9272, "step": 572 }, { "epoch": 0.04715079201810327, "grad_norm": 3.539552457926088, "learning_rate": 1.998463690841719e-05, "loss": 0.9254, "step": 573 }, { "epoch": 0.047233079613248304, "grad_norm": 2.590893854876316, "learning_rate": 1.998448886957425e-05, "loss": 0.9135, "step": 574 }, { "epoch": 0.047315367208393334, "grad_norm": 3.385121747004568, "learning_rate": 1.9984340121449187e-05, "loss": 0.898, "step": 575 }, { "epoch": 0.047397654803538365, "grad_norm": 2.8668381053066248, "learning_rate": 1.998419066405257e-05, "loss": 0.9111, "step": 576 }, { "epoch": 0.047479942398683396, "grad_norm": 0.5561294337589316, "learning_rate": 1.9984040497395016e-05, "loss": 0.6026, "step": 577 }, { "epoch": 0.047562229993828434, "grad_norm": 2.7790207529975683, "learning_rate": 1.9983889621487193e-05, "loss": 0.8813, "step": 578 }, { "epoch": 0.047644517588973465, "grad_norm": 2.929493346002011, "learning_rate": 1.9983738036339818e-05, "loss": 0.934, "step": 579 }, { "epoch": 0.047726805184118495, "grad_norm": 2.6432622003873294, "learning_rate": 1.9983585741963655e-05, "loss": 0.935, "step": 580 }, { "epoch": 0.047809092779263526, "grad_norm": 2.343596103466015, "learning_rate": 1.998343273836953e-05, "loss": 0.8885, "step": 581 }, { "epoch": 0.04789138037440856, "grad_norm": 2.6377392327317355, "learning_rate": 1.998327902556831e-05, "loss": 0.9195, "step": 582 }, { "epoch": 0.04797366796955359, "grad_norm": 0.5734849677326599, "learning_rate": 1.9983124603570915e-05, "loss": 0.5804, "step": 583 }, { "epoch": 0.04805595556469862, "grad_norm": 2.359098397716237, "learning_rate": 1.9982969472388313e-05, "loss": 0.9154, "step": 584 }, { "epoch": 0.048138243159843656, "grad_norm": 3.07285660000184, "learning_rate": 1.9982813632031526e-05, "loss": 0.9293, "step": 585 }, { "epoch": 0.04822053075498869, "grad_norm": 3.145177565014435, "learning_rate": 1.9982657082511624e-05, "loss": 0.909, "step": 586 }, { "epoch": 0.04830281835013372, "grad_norm": 2.4460324686547, "learning_rate": 1.9982499823839726e-05, "loss": 0.9172, "step": 587 }, { "epoch": 0.04838510594527875, "grad_norm": 2.7860695223687335, "learning_rate": 1.9982341856027006e-05, "loss": 0.8962, "step": 588 }, { "epoch": 0.04846739354042378, "grad_norm": 2.5003193611135126, "learning_rate": 1.9982183179084683e-05, "loss": 0.9523, "step": 589 }, { "epoch": 0.04854968113556881, "grad_norm": 0.5728078039718163, "learning_rate": 1.998202379302403e-05, "loss": 0.5939, "step": 590 }, { "epoch": 0.04863196873071385, "grad_norm": 2.513890686672487, "learning_rate": 1.9981863697856376e-05, "loss": 0.9027, "step": 591 }, { "epoch": 0.04871425632585888, "grad_norm": 6.401109317568734, "learning_rate": 1.9981702893593086e-05, "loss": 0.9041, "step": 592 }, { "epoch": 0.04879654392100391, "grad_norm": 0.526955304818451, "learning_rate": 1.9981541380245586e-05, "loss": 0.6109, "step": 593 }, { "epoch": 0.04887883151614894, "grad_norm": 0.5280472746795982, "learning_rate": 1.9981379157825346e-05, "loss": 0.5801, "step": 594 }, { "epoch": 0.04896111911129397, "grad_norm": 2.831289529507686, "learning_rate": 1.99812162263439e-05, "loss": 0.9296, "step": 595 }, { "epoch": 0.049043406706439, "grad_norm": 2.5183731275746637, "learning_rate": 1.998105258581281e-05, "loss": 0.9373, "step": 596 }, { "epoch": 0.04912569430158403, "grad_norm": 2.290556291606923, "learning_rate": 1.998088823624371e-05, "loss": 0.9339, "step": 597 }, { "epoch": 0.04920798189672907, "grad_norm": 2.9827790643550065, "learning_rate": 1.998072317764827e-05, "loss": 0.9341, "step": 598 }, { "epoch": 0.0492902694918741, "grad_norm": 3.9980040686222535, "learning_rate": 1.998055741003822e-05, "loss": 0.9428, "step": 599 }, { "epoch": 0.04937255708701913, "grad_norm": 2.9421068715344125, "learning_rate": 1.998039093342533e-05, "loss": 0.9183, "step": 600 }, { "epoch": 0.04945484468216416, "grad_norm": 2.3512621164999654, "learning_rate": 1.998022374782143e-05, "loss": 0.9139, "step": 601 }, { "epoch": 0.049537132277309194, "grad_norm": 2.8922341692853863, "learning_rate": 1.9980055853238394e-05, "loss": 0.8847, "step": 602 }, { "epoch": 0.049619419872454225, "grad_norm": 2.5544870335833916, "learning_rate": 1.9979887249688158e-05, "loss": 0.9322, "step": 603 }, { "epoch": 0.04970170746759926, "grad_norm": 2.3713588179833427, "learning_rate": 1.9979717937182685e-05, "loss": 0.8953, "step": 604 }, { "epoch": 0.04978399506274429, "grad_norm": 2.567195793905517, "learning_rate": 1.9979547915734014e-05, "loss": 0.9287, "step": 605 }, { "epoch": 0.049866282657889324, "grad_norm": 2.116439796262553, "learning_rate": 1.997937718535422e-05, "loss": 0.9122, "step": 606 }, { "epoch": 0.049948570253034355, "grad_norm": 2.6728583449200967, "learning_rate": 1.9979205746055426e-05, "loss": 0.9409, "step": 607 }, { "epoch": 0.050030857848179386, "grad_norm": 2.9303321533796147, "learning_rate": 1.9979033597849817e-05, "loss": 0.877, "step": 608 }, { "epoch": 0.05011314544332442, "grad_norm": 2.6453736009345103, "learning_rate": 1.9978860740749618e-05, "loss": 0.9264, "step": 609 }, { "epoch": 0.05019543303846945, "grad_norm": 0.6463475109604742, "learning_rate": 1.9978687174767115e-05, "loss": 0.6037, "step": 610 }, { "epoch": 0.050277720633614485, "grad_norm": 2.1568723876857514, "learning_rate": 1.9978512899914632e-05, "loss": 0.9291, "step": 611 }, { "epoch": 0.050360008228759516, "grad_norm": 2.779974581309181, "learning_rate": 1.997833791620455e-05, "loss": 0.9487, "step": 612 }, { "epoch": 0.05044229582390455, "grad_norm": 2.6541794961423726, "learning_rate": 1.9978162223649303e-05, "loss": 0.9314, "step": 613 }, { "epoch": 0.05052458341904958, "grad_norm": 2.204822617972563, "learning_rate": 1.9977985822261367e-05, "loss": 0.9195, "step": 614 }, { "epoch": 0.05060687101419461, "grad_norm": 2.528877153941993, "learning_rate": 1.9977808712053276e-05, "loss": 0.925, "step": 615 }, { "epoch": 0.05068915860933964, "grad_norm": 2.89407673046398, "learning_rate": 1.9977630893037613e-05, "loss": 0.9164, "step": 616 }, { "epoch": 0.05077144620448468, "grad_norm": 2.8147196835709924, "learning_rate": 1.9977452365227005e-05, "loss": 0.9109, "step": 617 }, { "epoch": 0.05085373379962971, "grad_norm": 2.8624190313017697, "learning_rate": 1.997727312863414e-05, "loss": 0.9227, "step": 618 }, { "epoch": 0.05093602139477474, "grad_norm": 2.6853591545801243, "learning_rate": 1.9977093183271746e-05, "loss": 0.9043, "step": 619 }, { "epoch": 0.05101830898991977, "grad_norm": 2.847809177384018, "learning_rate": 1.997691252915261e-05, "loss": 0.8797, "step": 620 }, { "epoch": 0.0511005965850648, "grad_norm": 2.5413962256979477, "learning_rate": 1.9976731166289565e-05, "loss": 0.888, "step": 621 }, { "epoch": 0.05118288418020983, "grad_norm": 2.4434297876428768, "learning_rate": 1.997654909469549e-05, "loss": 0.9193, "step": 622 }, { "epoch": 0.05126517177535486, "grad_norm": 2.554334961124947, "learning_rate": 1.9976366314383323e-05, "loss": 0.945, "step": 623 }, { "epoch": 0.0513474593704999, "grad_norm": 3.0606359366025155, "learning_rate": 1.9976182825366052e-05, "loss": 0.9018, "step": 624 }, { "epoch": 0.05142974696564493, "grad_norm": 2.7602463387503877, "learning_rate": 1.9975998627656704e-05, "loss": 0.9572, "step": 625 }, { "epoch": 0.05151203456078996, "grad_norm": 2.645779738054759, "learning_rate": 1.997581372126837e-05, "loss": 0.8986, "step": 626 }, { "epoch": 0.05159432215593499, "grad_norm": 2.3004786981907808, "learning_rate": 1.997562810621418e-05, "loss": 0.9378, "step": 627 }, { "epoch": 0.05167660975108002, "grad_norm": 3.0529134410232954, "learning_rate": 1.9975441782507327e-05, "loss": 0.9374, "step": 628 }, { "epoch": 0.051758897346225054, "grad_norm": 6.366982443959264, "learning_rate": 1.997525475016104e-05, "loss": 0.9572, "step": 629 }, { "epoch": 0.05184118494137009, "grad_norm": 7.143057307651942, "learning_rate": 1.9975067009188608e-05, "loss": 0.9368, "step": 630 }, { "epoch": 0.05192347253651512, "grad_norm": 2.486114121904295, "learning_rate": 1.997487855960337e-05, "loss": 0.8618, "step": 631 }, { "epoch": 0.05200576013166015, "grad_norm": 2.909503733964849, "learning_rate": 1.9974689401418712e-05, "loss": 0.8998, "step": 632 }, { "epoch": 0.052088047726805184, "grad_norm": 2.506345699862428, "learning_rate": 1.9974499534648068e-05, "loss": 0.9119, "step": 633 }, { "epoch": 0.052170335321950215, "grad_norm": 0.5966023669088316, "learning_rate": 1.9974308959304933e-05, "loss": 0.5656, "step": 634 }, { "epoch": 0.052252622917095246, "grad_norm": 2.9205909740125784, "learning_rate": 1.997411767540284e-05, "loss": 0.9109, "step": 635 }, { "epoch": 0.052334910512240276, "grad_norm": 2.2641759973862534, "learning_rate": 1.9973925682955378e-05, "loss": 0.9023, "step": 636 }, { "epoch": 0.052417198107385314, "grad_norm": 2.4641130571954086, "learning_rate": 1.9973732981976188e-05, "loss": 0.909, "step": 637 }, { "epoch": 0.052499485702530345, "grad_norm": 2.2247912270982195, "learning_rate": 1.9973539572478955e-05, "loss": 0.9111, "step": 638 }, { "epoch": 0.052581773297675376, "grad_norm": 2.182850954981328, "learning_rate": 1.9973345454477422e-05, "loss": 0.885, "step": 639 }, { "epoch": 0.05266406089282041, "grad_norm": 0.5616279149900174, "learning_rate": 1.997315062798538e-05, "loss": 0.5634, "step": 640 }, { "epoch": 0.05274634848796544, "grad_norm": 2.1709200144119287, "learning_rate": 1.9972955093016662e-05, "loss": 0.9021, "step": 641 }, { "epoch": 0.05282863608311047, "grad_norm": 3.0243470611887853, "learning_rate": 1.9972758849585167e-05, "loss": 0.923, "step": 642 }, { "epoch": 0.052910923678255506, "grad_norm": 0.5181983481216014, "learning_rate": 1.9972561897704832e-05, "loss": 0.589, "step": 643 }, { "epoch": 0.05299321127340054, "grad_norm": 2.3618384003718904, "learning_rate": 1.997236423738965e-05, "loss": 0.8893, "step": 644 }, { "epoch": 0.05307549886854557, "grad_norm": 2.83302899205139, "learning_rate": 1.997216586865366e-05, "loss": 0.9056, "step": 645 }, { "epoch": 0.0531577864636906, "grad_norm": 2.1524435897397756, "learning_rate": 1.9971966791510952e-05, "loss": 0.8875, "step": 646 }, { "epoch": 0.05324007405883563, "grad_norm": 0.5403616002875096, "learning_rate": 1.9971767005975676e-05, "loss": 0.5864, "step": 647 }, { "epoch": 0.05332236165398066, "grad_norm": 3.032727501630103, "learning_rate": 1.9971566512062016e-05, "loss": 0.9269, "step": 648 }, { "epoch": 0.0534046492491257, "grad_norm": 2.677613120586094, "learning_rate": 1.9971365309784222e-05, "loss": 0.9319, "step": 649 }, { "epoch": 0.05348693684427073, "grad_norm": 2.7527601762070626, "learning_rate": 1.9971163399156577e-05, "loss": 0.911, "step": 650 }, { "epoch": 0.05356922443941576, "grad_norm": 2.456807133771137, "learning_rate": 1.9970960780193435e-05, "loss": 0.9274, "step": 651 }, { "epoch": 0.05365151203456079, "grad_norm": 0.5512339745238304, "learning_rate": 1.9970757452909185e-05, "loss": 0.5999, "step": 652 }, { "epoch": 0.05373379962970582, "grad_norm": 3.3078302086877454, "learning_rate": 1.997055341731827e-05, "loss": 0.9161, "step": 653 }, { "epoch": 0.05381608722485085, "grad_norm": 1.9567891820560834, "learning_rate": 1.9970348673435187e-05, "loss": 0.8954, "step": 654 }, { "epoch": 0.05389837481999588, "grad_norm": 2.4558167849951027, "learning_rate": 1.9970143221274477e-05, "loss": 0.9041, "step": 655 }, { "epoch": 0.05398066241514092, "grad_norm": 2.6700615275845214, "learning_rate": 1.996993706085074e-05, "loss": 0.9406, "step": 656 }, { "epoch": 0.05406295001028595, "grad_norm": 2.47054592661293, "learning_rate": 1.9969730192178618e-05, "loss": 0.9075, "step": 657 }, { "epoch": 0.05414523760543098, "grad_norm": 2.527986443897195, "learning_rate": 1.9969522615272806e-05, "loss": 0.9012, "step": 658 }, { "epoch": 0.05422752520057601, "grad_norm": 0.5565334590513972, "learning_rate": 1.9969314330148056e-05, "loss": 0.5587, "step": 659 }, { "epoch": 0.054309812795721044, "grad_norm": 1.8601076711624556, "learning_rate": 1.9969105336819154e-05, "loss": 0.8991, "step": 660 }, { "epoch": 0.054392100390866074, "grad_norm": 2.0210809868042356, "learning_rate": 1.9968895635300956e-05, "loss": 0.9302, "step": 661 }, { "epoch": 0.05447438798601111, "grad_norm": 2.1871429796039363, "learning_rate": 1.9968685225608353e-05, "loss": 0.8719, "step": 662 }, { "epoch": 0.05455667558115614, "grad_norm": 2.699275991596056, "learning_rate": 1.9968474107756295e-05, "loss": 0.9107, "step": 663 }, { "epoch": 0.054638963176301174, "grad_norm": 2.921814293546767, "learning_rate": 1.996826228175978e-05, "loss": 0.9124, "step": 664 }, { "epoch": 0.054721250771446205, "grad_norm": 2.9121454433336917, "learning_rate": 1.9968049747633848e-05, "loss": 0.8872, "step": 665 }, { "epoch": 0.054803538366591235, "grad_norm": 4.665109966003875, "learning_rate": 1.996783650539361e-05, "loss": 0.9337, "step": 666 }, { "epoch": 0.054885825961736266, "grad_norm": 2.2334882062761814, "learning_rate": 1.9967622555054204e-05, "loss": 0.9249, "step": 667 }, { "epoch": 0.0549681135568813, "grad_norm": 1.8093225226331142, "learning_rate": 1.9967407896630837e-05, "loss": 0.8666, "step": 668 }, { "epoch": 0.055050401152026335, "grad_norm": 0.5652676807003993, "learning_rate": 1.996719253013875e-05, "loss": 0.5961, "step": 669 }, { "epoch": 0.055132688747171366, "grad_norm": 0.5100457321950321, "learning_rate": 1.9966976455593247e-05, "loss": 0.5618, "step": 670 }, { "epoch": 0.055214976342316396, "grad_norm": 2.773850609378529, "learning_rate": 1.9966759673009677e-05, "loss": 0.9275, "step": 671 }, { "epoch": 0.05529726393746143, "grad_norm": 2.5443256480658296, "learning_rate": 1.9966542182403437e-05, "loss": 0.9077, "step": 672 }, { "epoch": 0.05537955153260646, "grad_norm": 3.282011580384134, "learning_rate": 1.9966323983789983e-05, "loss": 0.921, "step": 673 }, { "epoch": 0.05546183912775149, "grad_norm": 2.2203588190464885, "learning_rate": 1.996610507718481e-05, "loss": 0.8988, "step": 674 }, { "epoch": 0.05554412672289653, "grad_norm": 4.790143157081725, "learning_rate": 1.996588546260347e-05, "loss": 0.9526, "step": 675 }, { "epoch": 0.05562641431804156, "grad_norm": 2.092143807841506, "learning_rate": 1.9965665140061565e-05, "loss": 0.915, "step": 676 }, { "epoch": 0.05570870191318659, "grad_norm": 1.9784649465852888, "learning_rate": 1.9965444109574744e-05, "loss": 0.905, "step": 677 }, { "epoch": 0.05579098950833162, "grad_norm": 2.7843501048163217, "learning_rate": 1.9965222371158718e-05, "loss": 0.8951, "step": 678 }, { "epoch": 0.05587327710347665, "grad_norm": 2.6331805589786383, "learning_rate": 1.9964999924829224e-05, "loss": 0.8614, "step": 679 }, { "epoch": 0.05595556469862168, "grad_norm": 0.7467735870885243, "learning_rate": 1.9964776770602078e-05, "loss": 0.6063, "step": 680 }, { "epoch": 0.05603785229376671, "grad_norm": 2.680536053721946, "learning_rate": 1.9964552908493123e-05, "loss": 0.8782, "step": 681 }, { "epoch": 0.05612013988891175, "grad_norm": 3.49552823109986, "learning_rate": 1.9964328338518264e-05, "loss": 0.902, "step": 682 }, { "epoch": 0.05620242748405678, "grad_norm": 2.120123047682193, "learning_rate": 1.996410306069346e-05, "loss": 0.9496, "step": 683 }, { "epoch": 0.05628471507920181, "grad_norm": 1.937156037107827, "learning_rate": 1.9963877075034706e-05, "loss": 0.8875, "step": 684 }, { "epoch": 0.05636700267434684, "grad_norm": 2.4742509534066754, "learning_rate": 1.9963650381558063e-05, "loss": 0.9192, "step": 685 }, { "epoch": 0.05644929026949187, "grad_norm": 2.3426169694208903, "learning_rate": 1.996342298027963e-05, "loss": 0.9481, "step": 686 }, { "epoch": 0.0565315778646369, "grad_norm": 2.1543307158741434, "learning_rate": 1.9963194871215557e-05, "loss": 0.8948, "step": 687 }, { "epoch": 0.05661386545978194, "grad_norm": 1.7721734117310426, "learning_rate": 1.9962966054382062e-05, "loss": 0.8769, "step": 688 }, { "epoch": 0.05669615305492697, "grad_norm": 2.637184520870366, "learning_rate": 1.9962736529795388e-05, "loss": 0.9305, "step": 689 }, { "epoch": 0.056778440650072, "grad_norm": 2.5552424968357306, "learning_rate": 1.9962506297471846e-05, "loss": 0.9011, "step": 690 }, { "epoch": 0.05686072824521703, "grad_norm": 2.1091093097631797, "learning_rate": 1.9962275357427787e-05, "loss": 0.9153, "step": 691 }, { "epoch": 0.056943015840362064, "grad_norm": 3.8893843496883775, "learning_rate": 1.996204370967962e-05, "loss": 0.9516, "step": 692 }, { "epoch": 0.057025303435507095, "grad_norm": 0.6989567675386245, "learning_rate": 1.9961811354243798e-05, "loss": 0.6088, "step": 693 }, { "epoch": 0.057107591030652126, "grad_norm": 3.0703220705587326, "learning_rate": 1.9961578291136834e-05, "loss": 0.9468, "step": 694 }, { "epoch": 0.057189878625797164, "grad_norm": 0.5452905698296876, "learning_rate": 1.9961344520375276e-05, "loss": 0.5795, "step": 695 }, { "epoch": 0.057272166220942194, "grad_norm": 3.477621910759164, "learning_rate": 1.9961110041975732e-05, "loss": 0.9586, "step": 696 }, { "epoch": 0.057354453816087225, "grad_norm": 3.5385882928206454, "learning_rate": 1.9960874855954863e-05, "loss": 0.9508, "step": 697 }, { "epoch": 0.057436741411232256, "grad_norm": 2.6972731084205437, "learning_rate": 1.996063896232938e-05, "loss": 0.9313, "step": 698 }, { "epoch": 0.05751902900637729, "grad_norm": 0.6344603977192381, "learning_rate": 1.9960402361116026e-05, "loss": 0.6044, "step": 699 }, { "epoch": 0.05760131660152232, "grad_norm": 5.571545453742246, "learning_rate": 1.996016505233162e-05, "loss": 0.92, "step": 700 }, { "epoch": 0.057683604196667355, "grad_norm": 2.859612009759652, "learning_rate": 1.9959927035993017e-05, "loss": 0.897, "step": 701 }, { "epoch": 0.057765891791812386, "grad_norm": 2.426187536557682, "learning_rate": 1.9959688312117128e-05, "loss": 0.9305, "step": 702 }, { "epoch": 0.05784817938695742, "grad_norm": 2.7388965530788, "learning_rate": 1.995944888072091e-05, "loss": 0.9145, "step": 703 }, { "epoch": 0.05793046698210245, "grad_norm": 2.776291815110774, "learning_rate": 1.995920874182137e-05, "loss": 0.9075, "step": 704 }, { "epoch": 0.05801275457724748, "grad_norm": 2.575679639237728, "learning_rate": 1.995896789543557e-05, "loss": 0.9045, "step": 705 }, { "epoch": 0.05809504217239251, "grad_norm": 3.5403132152741263, "learning_rate": 1.9958726341580615e-05, "loss": 0.913, "step": 706 }, { "epoch": 0.05817732976753754, "grad_norm": 2.58072580176139, "learning_rate": 1.995848408027367e-05, "loss": 0.9229, "step": 707 }, { "epoch": 0.05825961736268258, "grad_norm": 2.5124996774654473, "learning_rate": 1.9958241111531942e-05, "loss": 0.9126, "step": 708 }, { "epoch": 0.05834190495782761, "grad_norm": 2.36119565147592, "learning_rate": 1.995799743537269e-05, "loss": 0.9066, "step": 709 }, { "epoch": 0.05842419255297264, "grad_norm": 3.2376572469679847, "learning_rate": 1.9957753051813228e-05, "loss": 0.9107, "step": 710 }, { "epoch": 0.05850648014811767, "grad_norm": 0.5718002254539629, "learning_rate": 1.9957507960870908e-05, "loss": 0.5838, "step": 711 }, { "epoch": 0.0585887677432627, "grad_norm": 2.9835296928097765, "learning_rate": 1.9957262162563155e-05, "loss": 0.9062, "step": 712 }, { "epoch": 0.05867105533840773, "grad_norm": 2.312335655498833, "learning_rate": 1.9957015656907417e-05, "loss": 0.9331, "step": 713 }, { "epoch": 0.05875334293355277, "grad_norm": 2.3792417930038168, "learning_rate": 1.9956768443921214e-05, "loss": 0.9371, "step": 714 }, { "epoch": 0.0588356305286978, "grad_norm": 3.0747711781753955, "learning_rate": 1.99565205236221e-05, "loss": 0.9245, "step": 715 }, { "epoch": 0.05891791812384283, "grad_norm": 2.469147337654409, "learning_rate": 1.9956271896027696e-05, "loss": 0.9053, "step": 716 }, { "epoch": 0.05900020571898786, "grad_norm": 4.677348829502867, "learning_rate": 1.9956022561155655e-05, "loss": 0.9316, "step": 717 }, { "epoch": 0.05908249331413289, "grad_norm": 2.574073344258724, "learning_rate": 1.9955772519023694e-05, "loss": 0.9144, "step": 718 }, { "epoch": 0.059164780909277924, "grad_norm": 0.6010291838312377, "learning_rate": 1.995552176964958e-05, "loss": 0.5969, "step": 719 }, { "epoch": 0.05924706850442296, "grad_norm": 0.48362592184616704, "learning_rate": 1.9955270313051115e-05, "loss": 0.6105, "step": 720 }, { "epoch": 0.05932935609956799, "grad_norm": 4.6846130266410935, "learning_rate": 1.995501814924617e-05, "loss": 0.9146, "step": 721 }, { "epoch": 0.05941164369471302, "grad_norm": 2.577204170673208, "learning_rate": 1.9954765278252656e-05, "loss": 0.9073, "step": 722 }, { "epoch": 0.059493931289858054, "grad_norm": 4.7923802267754985, "learning_rate": 1.995451170008854e-05, "loss": 0.9192, "step": 723 }, { "epoch": 0.059576218885003085, "grad_norm": 3.637556402050712, "learning_rate": 1.995425741477183e-05, "loss": 0.8916, "step": 724 }, { "epoch": 0.059658506480148116, "grad_norm": 3.318312481516906, "learning_rate": 1.9954002422320593e-05, "loss": 0.8979, "step": 725 }, { "epoch": 0.05974079407529315, "grad_norm": 2.2896767162285476, "learning_rate": 1.9953746722752944e-05, "loss": 0.9078, "step": 726 }, { "epoch": 0.059823081670438184, "grad_norm": 2.4261610228532433, "learning_rate": 1.9953490316087045e-05, "loss": 0.9094, "step": 727 }, { "epoch": 0.059905369265583215, "grad_norm": 3.5742603087267533, "learning_rate": 1.9953233202341115e-05, "loss": 0.9668, "step": 728 }, { "epoch": 0.059987656860728246, "grad_norm": 3.646866686252275, "learning_rate": 1.995297538153341e-05, "loss": 0.9081, "step": 729 }, { "epoch": 0.06006994445587328, "grad_norm": 3.5756298093016134, "learning_rate": 1.9952716853682258e-05, "loss": 0.932, "step": 730 }, { "epoch": 0.06015223205101831, "grad_norm": 2.461737210935374, "learning_rate": 1.9952457618806016e-05, "loss": 0.9161, "step": 731 }, { "epoch": 0.06023451964616334, "grad_norm": 2.9435688364135038, "learning_rate": 1.99521976769231e-05, "loss": 0.8791, "step": 732 }, { "epoch": 0.060316807241308376, "grad_norm": 3.752079579941048, "learning_rate": 1.995193702805198e-05, "loss": 0.8864, "step": 733 }, { "epoch": 0.06039909483645341, "grad_norm": 4.53396790098707, "learning_rate": 1.9951675672211163e-05, "loss": 0.8929, "step": 734 }, { "epoch": 0.06048138243159844, "grad_norm": 4.961620647630342, "learning_rate": 1.9951413609419225e-05, "loss": 0.8536, "step": 735 }, { "epoch": 0.06056367002674347, "grad_norm": 3.891304133200799, "learning_rate": 1.995115083969478e-05, "loss": 0.8944, "step": 736 }, { "epoch": 0.0606459576218885, "grad_norm": 2.712319861053012, "learning_rate": 1.9950887363056495e-05, "loss": 0.9206, "step": 737 }, { "epoch": 0.06072824521703353, "grad_norm": 4.223019111124196, "learning_rate": 1.9950623179523085e-05, "loss": 0.9025, "step": 738 }, { "epoch": 0.06081053281217856, "grad_norm": 5.016232013409377, "learning_rate": 1.9950358289113317e-05, "loss": 0.8815, "step": 739 }, { "epoch": 0.0608928204073236, "grad_norm": 2.6897434242049694, "learning_rate": 1.995009269184601e-05, "loss": 0.8836, "step": 740 }, { "epoch": 0.06097510800246863, "grad_norm": 0.7568433896575619, "learning_rate": 1.994982638774003e-05, "loss": 0.5993, "step": 741 }, { "epoch": 0.06105739559761366, "grad_norm": 2.553452324246678, "learning_rate": 1.9949559376814296e-05, "loss": 0.8986, "step": 742 }, { "epoch": 0.06113968319275869, "grad_norm": 0.5018812785768227, "learning_rate": 1.9949291659087776e-05, "loss": 0.5597, "step": 743 }, { "epoch": 0.06122197078790372, "grad_norm": 2.4064235706469, "learning_rate": 1.994902323457949e-05, "loss": 0.8943, "step": 744 }, { "epoch": 0.06130425838304875, "grad_norm": 2.295948111702661, "learning_rate": 1.9948754103308504e-05, "loss": 0.8668, "step": 745 }, { "epoch": 0.06138654597819379, "grad_norm": 0.6531820015601002, "learning_rate": 1.9948484265293934e-05, "loss": 0.5944, "step": 746 }, { "epoch": 0.06146883357333882, "grad_norm": 2.488686897667554, "learning_rate": 1.9948213720554955e-05, "loss": 0.8939, "step": 747 }, { "epoch": 0.06155112116848385, "grad_norm": 2.2478829073807867, "learning_rate": 1.994794246911078e-05, "loss": 0.878, "step": 748 }, { "epoch": 0.06163340876362888, "grad_norm": 3.21297658438237, "learning_rate": 1.9947670510980686e-05, "loss": 0.9367, "step": 749 }, { "epoch": 0.061715696358773914, "grad_norm": 2.5032219143064296, "learning_rate": 1.9947397846183986e-05, "loss": 0.909, "step": 750 }, { "epoch": 0.061797983953918945, "grad_norm": 2.3821398027611367, "learning_rate": 1.9947124474740052e-05, "loss": 0.8767, "step": 751 }, { "epoch": 0.061880271549063975, "grad_norm": 4.029427101966951, "learning_rate": 1.99468503966683e-05, "loss": 0.8618, "step": 752 }, { "epoch": 0.06196255914420901, "grad_norm": 2.404778806152705, "learning_rate": 1.9946575611988207e-05, "loss": 0.9047, "step": 753 }, { "epoch": 0.062044846739354044, "grad_norm": 2.962612526189809, "learning_rate": 1.9946300120719287e-05, "loss": 0.889, "step": 754 }, { "epoch": 0.062127134334499075, "grad_norm": 2.5437765511188695, "learning_rate": 1.994602392288112e-05, "loss": 0.9399, "step": 755 }, { "epoch": 0.062209421929644106, "grad_norm": 0.5539735241167393, "learning_rate": 1.9945747018493314e-05, "loss": 0.5963, "step": 756 }, { "epoch": 0.062291709524789136, "grad_norm": 3.1779858985642817, "learning_rate": 1.9945469407575543e-05, "loss": 0.876, "step": 757 }, { "epoch": 0.06237399711993417, "grad_norm": 2.687485842671492, "learning_rate": 1.9945191090147537e-05, "loss": 0.9022, "step": 758 }, { "epoch": 0.062456284715079205, "grad_norm": 2.9422463927653766, "learning_rate": 1.9944912066229058e-05, "loss": 0.8956, "step": 759 }, { "epoch": 0.06253857231022424, "grad_norm": 4.157936413648122, "learning_rate": 1.9944632335839927e-05, "loss": 0.9138, "step": 760 }, { "epoch": 0.06262085990536927, "grad_norm": 0.48567249965915693, "learning_rate": 1.9944351899000026e-05, "loss": 0.5563, "step": 761 }, { "epoch": 0.0627031475005143, "grad_norm": 2.7821820465506, "learning_rate": 1.9944070755729266e-05, "loss": 0.9122, "step": 762 }, { "epoch": 0.06278543509565933, "grad_norm": 2.65823773191475, "learning_rate": 1.9943788906047624e-05, "loss": 0.9009, "step": 763 }, { "epoch": 0.06286772269080436, "grad_norm": 0.4745158162176376, "learning_rate": 1.9943506349975118e-05, "loss": 0.5845, "step": 764 }, { "epoch": 0.06295001028594939, "grad_norm": 4.304541123505603, "learning_rate": 1.9943223087531824e-05, "loss": 0.911, "step": 765 }, { "epoch": 0.06303229788109442, "grad_norm": 2.599121308286042, "learning_rate": 1.9942939118737866e-05, "loss": 0.9082, "step": 766 }, { "epoch": 0.06311458547623945, "grad_norm": 2.661380985142305, "learning_rate": 1.9942654443613413e-05, "loss": 0.889, "step": 767 }, { "epoch": 0.06319687307138448, "grad_norm": 2.7289869422777406, "learning_rate": 1.994236906217869e-05, "loss": 0.8807, "step": 768 }, { "epoch": 0.06327916066652953, "grad_norm": 3.552184676009908, "learning_rate": 1.9942082974453968e-05, "loss": 0.8869, "step": 769 }, { "epoch": 0.06336144826167456, "grad_norm": 3.3116779659066222, "learning_rate": 1.994179618045957e-05, "loss": 0.886, "step": 770 }, { "epoch": 0.06344373585681959, "grad_norm": 2.733151926112565, "learning_rate": 1.9941508680215874e-05, "loss": 0.878, "step": 771 }, { "epoch": 0.06352602345196462, "grad_norm": 3.689575278866226, "learning_rate": 1.9941220473743297e-05, "loss": 0.9012, "step": 772 }, { "epoch": 0.06360831104710965, "grad_norm": 3.6509278934675344, "learning_rate": 1.994093156106232e-05, "loss": 0.8859, "step": 773 }, { "epoch": 0.06369059864225468, "grad_norm": 3.4408763078150373, "learning_rate": 1.9940641942193462e-05, "loss": 0.9895, "step": 774 }, { "epoch": 0.06377288623739971, "grad_norm": 3.356367722166113, "learning_rate": 1.9940351617157298e-05, "loss": 0.9321, "step": 775 }, { "epoch": 0.06385517383254474, "grad_norm": 2.6685489053310905, "learning_rate": 1.994006058597445e-05, "loss": 0.871, "step": 776 }, { "epoch": 0.06393746142768977, "grad_norm": 2.1000398415565447, "learning_rate": 1.99397688486656e-05, "loss": 0.8799, "step": 777 }, { "epoch": 0.0640197490228348, "grad_norm": 2.1292877692214462, "learning_rate": 1.9939476405251464e-05, "loss": 0.8955, "step": 778 }, { "epoch": 0.06410203661797984, "grad_norm": 3.4132241841166073, "learning_rate": 1.9939183255752817e-05, "loss": 0.8757, "step": 779 }, { "epoch": 0.06418432421312487, "grad_norm": 2.62487277122737, "learning_rate": 1.9938889400190494e-05, "loss": 0.8884, "step": 780 }, { "epoch": 0.0642666118082699, "grad_norm": 2.044302329571613, "learning_rate": 1.993859483858536e-05, "loss": 0.9023, "step": 781 }, { "epoch": 0.06434889940341494, "grad_norm": 0.5567547220538414, "learning_rate": 1.993829957095834e-05, "loss": 0.5694, "step": 782 }, { "epoch": 0.06443118699855997, "grad_norm": 0.48731474493235843, "learning_rate": 1.9938003597330415e-05, "loss": 0.5764, "step": 783 }, { "epoch": 0.064513474593705, "grad_norm": 2.335128235917664, "learning_rate": 1.9937706917722607e-05, "loss": 0.9091, "step": 784 }, { "epoch": 0.06459576218885003, "grad_norm": 2.6840226763995383, "learning_rate": 1.9937409532155992e-05, "loss": 0.8881, "step": 785 }, { "epoch": 0.06467804978399506, "grad_norm": 2.3949102024541653, "learning_rate": 1.99371114406517e-05, "loss": 0.9183, "step": 786 }, { "epoch": 0.0647603373791401, "grad_norm": 2.6216703824274488, "learning_rate": 1.99368126432309e-05, "loss": 0.9207, "step": 787 }, { "epoch": 0.06484262497428513, "grad_norm": 2.614435269135524, "learning_rate": 1.993651313991482e-05, "loss": 0.9145, "step": 788 }, { "epoch": 0.06492491256943016, "grad_norm": 1.9122678315195296, "learning_rate": 1.9936212930724742e-05, "loss": 0.8829, "step": 789 }, { "epoch": 0.06500720016457519, "grad_norm": 0.5913835221535177, "learning_rate": 1.9935912015681984e-05, "loss": 0.6145, "step": 790 }, { "epoch": 0.06508948775972022, "grad_norm": 2.528199419410872, "learning_rate": 1.993561039480793e-05, "loss": 0.8655, "step": 791 }, { "epoch": 0.06517177535486525, "grad_norm": 3.3798538121747326, "learning_rate": 1.9935308068124e-05, "loss": 0.9251, "step": 792 }, { "epoch": 0.06525406295001028, "grad_norm": 2.6588327121370194, "learning_rate": 1.9935005035651676e-05, "loss": 0.8983, "step": 793 }, { "epoch": 0.06533635054515531, "grad_norm": 0.5232567113259947, "learning_rate": 1.9934701297412482e-05, "loss": 0.578, "step": 794 }, { "epoch": 0.06541863814030036, "grad_norm": 4.752300485944965, "learning_rate": 1.9934396853427998e-05, "loss": 0.8953, "step": 795 }, { "epoch": 0.06550092573544539, "grad_norm": 2.2269507955655987, "learning_rate": 1.9934091703719846e-05, "loss": 0.9245, "step": 796 }, { "epoch": 0.06558321333059042, "grad_norm": 3.122445969674065, "learning_rate": 1.9933785848309708e-05, "loss": 0.8914, "step": 797 }, { "epoch": 0.06566550092573545, "grad_norm": 3.1204724551293426, "learning_rate": 1.9933479287219312e-05, "loss": 0.9287, "step": 798 }, { "epoch": 0.06574778852088048, "grad_norm": 14.479758337139925, "learning_rate": 1.9933172020470433e-05, "loss": 0.8677, "step": 799 }, { "epoch": 0.06583007611602551, "grad_norm": 2.1224285416282953, "learning_rate": 1.99328640480849e-05, "loss": 0.8755, "step": 800 }, { "epoch": 0.06591236371117054, "grad_norm": 2.487164087508179, "learning_rate": 1.9932555370084588e-05, "loss": 0.8775, "step": 801 }, { "epoch": 0.06599465130631557, "grad_norm": 0.5728404010402629, "learning_rate": 1.9932245986491425e-05, "loss": 0.5477, "step": 802 }, { "epoch": 0.0660769389014606, "grad_norm": 3.245446623126787, "learning_rate": 1.9931935897327396e-05, "loss": 0.9005, "step": 803 }, { "epoch": 0.06615922649660563, "grad_norm": 2.5198170754823237, "learning_rate": 1.9931625102614524e-05, "loss": 0.9251, "step": 804 }, { "epoch": 0.06624151409175066, "grad_norm": 2.7124091417439447, "learning_rate": 1.9931313602374886e-05, "loss": 0.9043, "step": 805 }, { "epoch": 0.0663238016868957, "grad_norm": 2.295917945326921, "learning_rate": 1.9931001396630613e-05, "loss": 0.9037, "step": 806 }, { "epoch": 0.06640608928204073, "grad_norm": 2.5595180677086176, "learning_rate": 1.9930688485403885e-05, "loss": 0.8916, "step": 807 }, { "epoch": 0.06648837687718577, "grad_norm": 2.54401264532517, "learning_rate": 1.993037486871693e-05, "loss": 0.8865, "step": 808 }, { "epoch": 0.0665706644723308, "grad_norm": 2.7644346282703567, "learning_rate": 1.993006054659202e-05, "loss": 0.875, "step": 809 }, { "epoch": 0.06665295206747583, "grad_norm": 2.145314542653547, "learning_rate": 1.9929745519051497e-05, "loss": 0.9358, "step": 810 }, { "epoch": 0.06673523966262086, "grad_norm": 3.2713117109960583, "learning_rate": 1.9929429786117724e-05, "loss": 0.8777, "step": 811 }, { "epoch": 0.0668175272577659, "grad_norm": 0.5829653015669467, "learning_rate": 1.9929113347813145e-05, "loss": 0.5366, "step": 812 }, { "epoch": 0.06689981485291092, "grad_norm": 2.4233464969419516, "learning_rate": 1.992879620416023e-05, "loss": 0.9099, "step": 813 }, { "epoch": 0.06698210244805596, "grad_norm": 2.7021068296091624, "learning_rate": 1.9928478355181512e-05, "loss": 0.9092, "step": 814 }, { "epoch": 0.06706439004320099, "grad_norm": 2.522776219516862, "learning_rate": 1.992815980089957e-05, "loss": 0.9024, "step": 815 }, { "epoch": 0.06714667763834602, "grad_norm": 2.232284370603574, "learning_rate": 1.9927840541337037e-05, "loss": 0.9233, "step": 816 }, { "epoch": 0.06722896523349105, "grad_norm": 2.9343145896014255, "learning_rate": 1.9927520576516587e-05, "loss": 0.9312, "step": 817 }, { "epoch": 0.06731125282863608, "grad_norm": 3.3222486630048764, "learning_rate": 1.9927199906460947e-05, "loss": 0.8681, "step": 818 }, { "epoch": 0.06739354042378111, "grad_norm": 2.1225744897957153, "learning_rate": 1.9926878531192908e-05, "loss": 0.8916, "step": 819 }, { "epoch": 0.06747582801892614, "grad_norm": 5.166258547080567, "learning_rate": 1.992655645073529e-05, "loss": 0.9153, "step": 820 }, { "epoch": 0.06755811561407118, "grad_norm": 3.2639889220707077, "learning_rate": 1.992623366511098e-05, "loss": 0.8715, "step": 821 }, { "epoch": 0.06764040320921622, "grad_norm": 4.714497016717951, "learning_rate": 1.9925910174342907e-05, "loss": 0.8723, "step": 822 }, { "epoch": 0.06772269080436125, "grad_norm": 2.5352280280058315, "learning_rate": 1.9925585978454043e-05, "loss": 0.9045, "step": 823 }, { "epoch": 0.06780497839950628, "grad_norm": 3.485579632575649, "learning_rate": 1.992526107746743e-05, "loss": 0.8797, "step": 824 }, { "epoch": 0.06788726599465131, "grad_norm": 12.454695730191421, "learning_rate": 1.992493547140614e-05, "loss": 0.8755, "step": 825 }, { "epoch": 0.06796955358979634, "grad_norm": 0.5679287848373274, "learning_rate": 1.9924609160293308e-05, "loss": 0.5737, "step": 826 }, { "epoch": 0.06805184118494137, "grad_norm": 6.733588252523935, "learning_rate": 1.9924282144152115e-05, "loss": 0.8607, "step": 827 }, { "epoch": 0.0681341287800864, "grad_norm": 2.8353728427421965, "learning_rate": 1.9923954423005786e-05, "loss": 0.8658, "step": 828 }, { "epoch": 0.06821641637523143, "grad_norm": 2.226675047912921, "learning_rate": 1.9923625996877607e-05, "loss": 0.8908, "step": 829 }, { "epoch": 0.06829870397037646, "grad_norm": 2.090011013197403, "learning_rate": 1.9923296865790907e-05, "loss": 0.9027, "step": 830 }, { "epoch": 0.06838099156552149, "grad_norm": 2.4269097740027687, "learning_rate": 1.992296702976907e-05, "loss": 0.8743, "step": 831 }, { "epoch": 0.06846327916066652, "grad_norm": 2.4454075613373174, "learning_rate": 1.9922636488835528e-05, "loss": 0.9188, "step": 832 }, { "epoch": 0.06854556675581157, "grad_norm": 2.708156376904729, "learning_rate": 1.992230524301375e-05, "loss": 0.8753, "step": 833 }, { "epoch": 0.0686278543509566, "grad_norm": 6.9289687760917955, "learning_rate": 1.9921973292327285e-05, "loss": 0.8714, "step": 834 }, { "epoch": 0.06871014194610163, "grad_norm": 2.833475838520833, "learning_rate": 1.9921640636799697e-05, "loss": 0.878, "step": 835 }, { "epoch": 0.06879242954124666, "grad_norm": 0.6390100760660502, "learning_rate": 1.992130727645463e-05, "loss": 0.5892, "step": 836 }, { "epoch": 0.06887471713639169, "grad_norm": 3.503075844449775, "learning_rate": 1.992097321131576e-05, "loss": 0.9134, "step": 837 }, { "epoch": 0.06895700473153672, "grad_norm": 2.928003367939948, "learning_rate": 1.992063844140682e-05, "loss": 0.916, "step": 838 }, { "epoch": 0.06903929232668175, "grad_norm": 2.79325002366026, "learning_rate": 1.992030296675159e-05, "loss": 0.8767, "step": 839 }, { "epoch": 0.06912157992182678, "grad_norm": 2.312184411585912, "learning_rate": 1.9919966787373902e-05, "loss": 0.9053, "step": 840 }, { "epoch": 0.06920386751697181, "grad_norm": 2.9138317208293594, "learning_rate": 1.991962990329764e-05, "loss": 0.9005, "step": 841 }, { "epoch": 0.06928615511211685, "grad_norm": 2.418947503313838, "learning_rate": 1.991929231454673e-05, "loss": 0.8876, "step": 842 }, { "epoch": 0.06936844270726188, "grad_norm": 2.746227734046784, "learning_rate": 1.9918954021145162e-05, "loss": 0.9174, "step": 843 }, { "epoch": 0.06945073030240691, "grad_norm": 4.054877897574317, "learning_rate": 1.991861502311696e-05, "loss": 0.8785, "step": 844 }, { "epoch": 0.06953301789755194, "grad_norm": 3.3645447414769856, "learning_rate": 1.9918275320486212e-05, "loss": 0.8885, "step": 845 }, { "epoch": 0.06961530549269698, "grad_norm": 0.6257651466469342, "learning_rate": 1.9917934913277047e-05, "loss": 0.5679, "step": 846 }, { "epoch": 0.06969759308784201, "grad_norm": 2.9579632903454987, "learning_rate": 1.9917593801513645e-05, "loss": 0.8892, "step": 847 }, { "epoch": 0.06977988068298704, "grad_norm": 2.3255674692633703, "learning_rate": 1.991725198522024e-05, "loss": 0.8969, "step": 848 }, { "epoch": 0.06986216827813208, "grad_norm": 1.8812338541653777, "learning_rate": 1.9916909464421118e-05, "loss": 0.84, "step": 849 }, { "epoch": 0.0699444558732771, "grad_norm": 4.348093261520783, "learning_rate": 1.9916566239140605e-05, "loss": 0.9035, "step": 850 }, { "epoch": 0.07002674346842214, "grad_norm": 2.2375985456191003, "learning_rate": 1.9916222309403085e-05, "loss": 0.8754, "step": 851 }, { "epoch": 0.07010903106356717, "grad_norm": 3.613200403801302, "learning_rate": 1.9915877675232992e-05, "loss": 0.8815, "step": 852 }, { "epoch": 0.0701913186587122, "grad_norm": 3.839543987455212, "learning_rate": 1.9915532336654807e-05, "loss": 0.9072, "step": 853 }, { "epoch": 0.07027360625385723, "grad_norm": 2.105567560984786, "learning_rate": 1.991518629369306e-05, "loss": 0.896, "step": 854 }, { "epoch": 0.07035589384900226, "grad_norm": 2.267537355899574, "learning_rate": 1.9914839546372336e-05, "loss": 0.9158, "step": 855 }, { "epoch": 0.07043818144414729, "grad_norm": 3.589047414435187, "learning_rate": 1.991449209471727e-05, "loss": 0.8734, "step": 856 }, { "epoch": 0.07052046903929232, "grad_norm": 3.1819343869570536, "learning_rate": 1.991414393875254e-05, "loss": 0.9089, "step": 857 }, { "epoch": 0.07060275663443735, "grad_norm": 2.5055069972264503, "learning_rate": 1.991379507850288e-05, "loss": 0.8681, "step": 858 }, { "epoch": 0.0706850442295824, "grad_norm": 2.545062208600291, "learning_rate": 1.991344551399307e-05, "loss": 0.8835, "step": 859 }, { "epoch": 0.07076733182472743, "grad_norm": 2.8423181256983487, "learning_rate": 1.9913095245247948e-05, "loss": 0.8855, "step": 860 }, { "epoch": 0.07084961941987246, "grad_norm": 2.623939420394984, "learning_rate": 1.9912744272292392e-05, "loss": 0.8912, "step": 861 }, { "epoch": 0.07093190701501749, "grad_norm": 2.456776383887346, "learning_rate": 1.9912392595151336e-05, "loss": 0.9026, "step": 862 }, { "epoch": 0.07101419461016252, "grad_norm": 2.7531225878969177, "learning_rate": 1.9912040213849762e-05, "loss": 0.8875, "step": 863 }, { "epoch": 0.07109648220530755, "grad_norm": 4.481796954208249, "learning_rate": 1.9911687128412708e-05, "loss": 0.8636, "step": 864 }, { "epoch": 0.07117876980045258, "grad_norm": 2.545397332779262, "learning_rate": 1.9911333338865245e-05, "loss": 0.8803, "step": 865 }, { "epoch": 0.07126105739559761, "grad_norm": 3.045980428767302, "learning_rate": 1.9910978845232517e-05, "loss": 0.9035, "step": 866 }, { "epoch": 0.07134334499074264, "grad_norm": 3.6871914250355715, "learning_rate": 1.9910623647539702e-05, "loss": 0.8666, "step": 867 }, { "epoch": 0.07142563258588767, "grad_norm": 2.116550202268351, "learning_rate": 1.991026774581203e-05, "loss": 0.9031, "step": 868 }, { "epoch": 0.0715079201810327, "grad_norm": 2.532009330642646, "learning_rate": 1.9909911140074788e-05, "loss": 0.8661, "step": 869 }, { "epoch": 0.07159020777617774, "grad_norm": 3.33485917673071, "learning_rate": 1.9909553830353308e-05, "loss": 0.8776, "step": 870 }, { "epoch": 0.07167249537132277, "grad_norm": 2.3439342371747167, "learning_rate": 1.990919581667297e-05, "loss": 0.9151, "step": 871 }, { "epoch": 0.07175478296646781, "grad_norm": 2.488600787006511, "learning_rate": 1.9908837099059212e-05, "loss": 0.9165, "step": 872 }, { "epoch": 0.07183707056161284, "grad_norm": 3.95670742389146, "learning_rate": 1.990847767753751e-05, "loss": 0.8659, "step": 873 }, { "epoch": 0.07191935815675787, "grad_norm": 0.5947750160477462, "learning_rate": 1.99081175521334e-05, "loss": 0.5886, "step": 874 }, { "epoch": 0.0720016457519029, "grad_norm": 2.033586754058639, "learning_rate": 1.9907756722872465e-05, "loss": 0.8897, "step": 875 }, { "epoch": 0.07208393334704793, "grad_norm": 3.346298659721499, "learning_rate": 1.9907395189780335e-05, "loss": 0.902, "step": 876 }, { "epoch": 0.07216622094219297, "grad_norm": 3.004056249927372, "learning_rate": 1.9907032952882703e-05, "loss": 0.8715, "step": 877 }, { "epoch": 0.072248508537338, "grad_norm": 5.4098932917643285, "learning_rate": 1.9906670012205286e-05, "loss": 0.8866, "step": 878 }, { "epoch": 0.07233079613248303, "grad_norm": 6.828654192266096, "learning_rate": 1.990630636777388e-05, "loss": 0.8689, "step": 879 }, { "epoch": 0.07241308372762806, "grad_norm": 2.6337207605941737, "learning_rate": 1.9905942019614312e-05, "loss": 0.8647, "step": 880 }, { "epoch": 0.07249537132277309, "grad_norm": 0.5235737963953581, "learning_rate": 1.990557696775246e-05, "loss": 0.5661, "step": 881 }, { "epoch": 0.07257765891791812, "grad_norm": 11.548238836629363, "learning_rate": 1.9905211212214266e-05, "loss": 0.9294, "step": 882 }, { "epoch": 0.07265994651306315, "grad_norm": 5.489164212385315, "learning_rate": 1.990484475302571e-05, "loss": 0.8685, "step": 883 }, { "epoch": 0.07274223410820818, "grad_norm": 7.88390924258145, "learning_rate": 1.990447759021282e-05, "loss": 0.874, "step": 884 }, { "epoch": 0.07282452170335323, "grad_norm": 4.299200684634295, "learning_rate": 1.9904109723801684e-05, "loss": 0.9146, "step": 885 }, { "epoch": 0.07290680929849826, "grad_norm": 6.21170690266594, "learning_rate": 1.990374115381843e-05, "loss": 0.8728, "step": 886 }, { "epoch": 0.07298909689364329, "grad_norm": 4.563438990093578, "learning_rate": 1.9903371880289247e-05, "loss": 0.8747, "step": 887 }, { "epoch": 0.07307138448878832, "grad_norm": 3.6273703961737187, "learning_rate": 1.990300190324036e-05, "loss": 0.9008, "step": 888 }, { "epoch": 0.07315367208393335, "grad_norm": 7.441233530871766, "learning_rate": 1.9902631222698057e-05, "loss": 0.9141, "step": 889 }, { "epoch": 0.07323595967907838, "grad_norm": 4.82833921873659, "learning_rate": 1.990225983868867e-05, "loss": 0.9339, "step": 890 }, { "epoch": 0.07331824727422341, "grad_norm": 5.887738980648113, "learning_rate": 1.9901887751238577e-05, "loss": 0.8799, "step": 891 }, { "epoch": 0.07340053486936844, "grad_norm": 2.5245499693701072, "learning_rate": 1.9901514960374217e-05, "loss": 0.8835, "step": 892 }, { "epoch": 0.07348282246451347, "grad_norm": 6.763974106441189, "learning_rate": 1.990114146612207e-05, "loss": 0.891, "step": 893 }, { "epoch": 0.0735651100596585, "grad_norm": 2.8844071869365835, "learning_rate": 1.9900767268508666e-05, "loss": 0.9097, "step": 894 }, { "epoch": 0.07364739765480353, "grad_norm": 5.440132687337712, "learning_rate": 1.9900392367560588e-05, "loss": 0.8831, "step": 895 }, { "epoch": 0.07372968524994856, "grad_norm": 3.745407109325051, "learning_rate": 1.9900016763304472e-05, "loss": 0.8805, "step": 896 }, { "epoch": 0.0738119728450936, "grad_norm": 4.288740968099518, "learning_rate": 1.9899640455766997e-05, "loss": 0.8891, "step": 897 }, { "epoch": 0.07389426044023864, "grad_norm": 2.755838421562454, "learning_rate": 1.9899263444974894e-05, "loss": 0.8973, "step": 898 }, { "epoch": 0.07397654803538367, "grad_norm": 2.63866374184814, "learning_rate": 1.9898885730954948e-05, "loss": 0.8418, "step": 899 }, { "epoch": 0.0740588356305287, "grad_norm": 3.0901321494386598, "learning_rate": 1.9898507313733995e-05, "loss": 0.8614, "step": 900 }, { "epoch": 0.07414112322567373, "grad_norm": 2.754917360078824, "learning_rate": 1.9898128193338907e-05, "loss": 0.8964, "step": 901 }, { "epoch": 0.07422341082081876, "grad_norm": 2.4717700343085163, "learning_rate": 1.9897748369796627e-05, "loss": 0.8793, "step": 902 }, { "epoch": 0.0743056984159638, "grad_norm": 2.2819538240312585, "learning_rate": 1.989736784313413e-05, "loss": 0.9086, "step": 903 }, { "epoch": 0.07438798601110883, "grad_norm": 2.7031870546344385, "learning_rate": 1.989698661337845e-05, "loss": 0.8601, "step": 904 }, { "epoch": 0.07447027360625386, "grad_norm": 2.2788277737039757, "learning_rate": 1.9896604680556664e-05, "loss": 0.8464, "step": 905 }, { "epoch": 0.07455256120139889, "grad_norm": 2.0567769102378954, "learning_rate": 1.9896222044695914e-05, "loss": 0.8807, "step": 906 }, { "epoch": 0.07463484879654392, "grad_norm": 2.384203325674513, "learning_rate": 1.9895838705823377e-05, "loss": 0.8923, "step": 907 }, { "epoch": 0.07471713639168895, "grad_norm": 2.0967277384590535, "learning_rate": 1.989545466396628e-05, "loss": 0.8793, "step": 908 }, { "epoch": 0.07479942398683398, "grad_norm": 9.442852725541027, "learning_rate": 1.9895069919151915e-05, "loss": 0.8965, "step": 909 }, { "epoch": 0.07488171158197901, "grad_norm": 5.109761027664979, "learning_rate": 1.9894684471407605e-05, "loss": 0.8983, "step": 910 }, { "epoch": 0.07496399917712405, "grad_norm": 2.2367018687313185, "learning_rate": 1.9894298320760733e-05, "loss": 0.8879, "step": 911 }, { "epoch": 0.07504628677226909, "grad_norm": 2.6873708972425656, "learning_rate": 1.989391146723873e-05, "loss": 0.8975, "step": 912 }, { "epoch": 0.07512857436741412, "grad_norm": 0.5656242706848698, "learning_rate": 1.9893523910869085e-05, "loss": 0.617, "step": 913 }, { "epoch": 0.07521086196255915, "grad_norm": 3.9316911134297814, "learning_rate": 1.989313565167932e-05, "loss": 0.9385, "step": 914 }, { "epoch": 0.07529314955770418, "grad_norm": 2.783913423475105, "learning_rate": 1.9892746689697024e-05, "loss": 0.898, "step": 915 }, { "epoch": 0.07537543715284921, "grad_norm": 4.235687618463353, "learning_rate": 1.989235702494982e-05, "loss": 0.8539, "step": 916 }, { "epoch": 0.07545772474799424, "grad_norm": 2.387819568149409, "learning_rate": 1.9891966657465397e-05, "loss": 0.8369, "step": 917 }, { "epoch": 0.07554001234313927, "grad_norm": 3.6947231383398424, "learning_rate": 1.989157558727148e-05, "loss": 0.8834, "step": 918 }, { "epoch": 0.0756222999382843, "grad_norm": 2.604963394831731, "learning_rate": 1.989118381439585e-05, "loss": 0.9019, "step": 919 }, { "epoch": 0.07570458753342933, "grad_norm": 0.5332477363950743, "learning_rate": 1.9890791338866344e-05, "loss": 0.5771, "step": 920 }, { "epoch": 0.07578687512857436, "grad_norm": 3.2104258542562953, "learning_rate": 1.9890398160710837e-05, "loss": 0.9337, "step": 921 }, { "epoch": 0.0758691627237194, "grad_norm": 0.48633325822320617, "learning_rate": 1.9890004279957266e-05, "loss": 0.5602, "step": 922 }, { "epoch": 0.07595145031886442, "grad_norm": 12.835475358323716, "learning_rate": 1.9889609696633606e-05, "loss": 0.8553, "step": 923 }, { "epoch": 0.07603373791400947, "grad_norm": 3.2124511867282037, "learning_rate": 1.9889214410767887e-05, "loss": 0.8674, "step": 924 }, { "epoch": 0.0761160255091545, "grad_norm": 2.904116877033008, "learning_rate": 1.9888818422388193e-05, "loss": 0.8747, "step": 925 }, { "epoch": 0.07619831310429953, "grad_norm": 3.157871788078832, "learning_rate": 1.9888421731522656e-05, "loss": 0.8891, "step": 926 }, { "epoch": 0.07628060069944456, "grad_norm": 2.3718730999123547, "learning_rate": 1.9888024338199448e-05, "loss": 0.8993, "step": 927 }, { "epoch": 0.07636288829458959, "grad_norm": 2.4565769064213723, "learning_rate": 1.988762624244681e-05, "loss": 0.9013, "step": 928 }, { "epoch": 0.07644517588973462, "grad_norm": 2.540968098318489, "learning_rate": 1.988722744429301e-05, "loss": 0.8633, "step": 929 }, { "epoch": 0.07652746348487965, "grad_norm": 3.56518007003656, "learning_rate": 1.988682794376639e-05, "loss": 0.8882, "step": 930 }, { "epoch": 0.07660975108002469, "grad_norm": 2.176182910474906, "learning_rate": 1.9886427740895325e-05, "loss": 0.9149, "step": 931 }, { "epoch": 0.07669203867516972, "grad_norm": 0.5807290241092793, "learning_rate": 1.9886026835708242e-05, "loss": 0.5897, "step": 932 }, { "epoch": 0.07677432627031475, "grad_norm": 0.5568253540494434, "learning_rate": 1.9885625228233624e-05, "loss": 0.5944, "step": 933 }, { "epoch": 0.07685661386545978, "grad_norm": 0.46307351633355415, "learning_rate": 1.9885222918499998e-05, "loss": 0.5687, "step": 934 }, { "epoch": 0.07693890146060481, "grad_norm": 2.21686936101954, "learning_rate": 1.9884819906535946e-05, "loss": 0.899, "step": 935 }, { "epoch": 0.07702118905574984, "grad_norm": 2.7051990886793758, "learning_rate": 1.9884416192370096e-05, "loss": 0.9015, "step": 936 }, { "epoch": 0.07710347665089488, "grad_norm": 2.1375647901334385, "learning_rate": 1.988401177603113e-05, "loss": 0.9001, "step": 937 }, { "epoch": 0.07718576424603991, "grad_norm": 4.132265546672556, "learning_rate": 1.988360665754777e-05, "loss": 0.8908, "step": 938 }, { "epoch": 0.07726805184118495, "grad_norm": 2.1359019957192533, "learning_rate": 1.9883200836948803e-05, "loss": 0.8717, "step": 939 }, { "epoch": 0.07735033943632998, "grad_norm": 3.9513646854514386, "learning_rate": 1.9882794314263053e-05, "loss": 0.8718, "step": 940 }, { "epoch": 0.07743262703147501, "grad_norm": 2.321609974282721, "learning_rate": 1.9882387089519398e-05, "loss": 0.869, "step": 941 }, { "epoch": 0.07751491462662004, "grad_norm": 3.70309268916697, "learning_rate": 1.9881979162746772e-05, "loss": 0.8649, "step": 942 }, { "epoch": 0.07759720222176507, "grad_norm": 3.361767416529052, "learning_rate": 1.9881570533974148e-05, "loss": 0.8683, "step": 943 }, { "epoch": 0.0776794898169101, "grad_norm": 3.4179325921845036, "learning_rate": 1.988116120323056e-05, "loss": 0.8963, "step": 944 }, { "epoch": 0.07776177741205513, "grad_norm": 3.021751145368183, "learning_rate": 1.988075117054508e-05, "loss": 0.8746, "step": 945 }, { "epoch": 0.07784406500720016, "grad_norm": 3.5878829514900974, "learning_rate": 1.9880340435946837e-05, "loss": 0.8516, "step": 946 }, { "epoch": 0.07792635260234519, "grad_norm": 1.920072678794743, "learning_rate": 1.9879928999465016e-05, "loss": 0.8937, "step": 947 }, { "epoch": 0.07800864019749022, "grad_norm": 2.2091268186489796, "learning_rate": 1.9879516861128835e-05, "loss": 0.8475, "step": 948 }, { "epoch": 0.07809092779263525, "grad_norm": 2.2168445139505644, "learning_rate": 1.9879104020967577e-05, "loss": 0.8633, "step": 949 }, { "epoch": 0.0781732153877803, "grad_norm": 1.0323698606460356, "learning_rate": 1.9878690479010568e-05, "loss": 0.6111, "step": 950 }, { "epoch": 0.07825550298292533, "grad_norm": 2.682420816107399, "learning_rate": 1.987827623528719e-05, "loss": 0.9341, "step": 951 }, { "epoch": 0.07833779057807036, "grad_norm": 0.6240540448167275, "learning_rate": 1.987786128982686e-05, "loss": 0.5523, "step": 952 }, { "epoch": 0.07842007817321539, "grad_norm": 3.6752862094905905, "learning_rate": 1.9877445642659066e-05, "loss": 0.9273, "step": 953 }, { "epoch": 0.07850236576836042, "grad_norm": 2.3734201750601858, "learning_rate": 1.987702929381333e-05, "loss": 0.8919, "step": 954 }, { "epoch": 0.07858465336350545, "grad_norm": 0.7387548503010232, "learning_rate": 1.9876612243319228e-05, "loss": 0.5746, "step": 955 }, { "epoch": 0.07866694095865048, "grad_norm": 0.6959735516945202, "learning_rate": 1.9876194491206388e-05, "loss": 0.5751, "step": 956 }, { "epoch": 0.07874922855379551, "grad_norm": 2.1882974936345394, "learning_rate": 1.9875776037504482e-05, "loss": 0.9006, "step": 957 }, { "epoch": 0.07883151614894054, "grad_norm": 2.341847998608011, "learning_rate": 1.9875356882243245e-05, "loss": 0.9041, "step": 958 }, { "epoch": 0.07891380374408558, "grad_norm": 2.1628210206575433, "learning_rate": 1.9874937025452445e-05, "loss": 0.8883, "step": 959 }, { "epoch": 0.0789960913392306, "grad_norm": 2.8510221399462483, "learning_rate": 1.9874516467161914e-05, "loss": 0.9231, "step": 960 }, { "epoch": 0.07907837893437564, "grad_norm": 4.694838855869676, "learning_rate": 1.9874095207401526e-05, "loss": 0.9156, "step": 961 }, { "epoch": 0.07916066652952067, "grad_norm": 2.877307386668155, "learning_rate": 1.98736732462012e-05, "loss": 0.8686, "step": 962 }, { "epoch": 0.07924295412466571, "grad_norm": 2.581259841624273, "learning_rate": 1.9873250583590923e-05, "loss": 0.9125, "step": 963 }, { "epoch": 0.07932524171981074, "grad_norm": 2.3158798477006037, "learning_rate": 1.9872827219600716e-05, "loss": 0.8926, "step": 964 }, { "epoch": 0.07940752931495577, "grad_norm": 3.0098712265326784, "learning_rate": 1.987240315426065e-05, "loss": 0.8758, "step": 965 }, { "epoch": 0.0794898169101008, "grad_norm": 3.1422180864323233, "learning_rate": 1.987197838760085e-05, "loss": 0.8908, "step": 966 }, { "epoch": 0.07957210450524584, "grad_norm": 0.9645131727703571, "learning_rate": 1.9871552919651494e-05, "loss": 0.6045, "step": 967 }, { "epoch": 0.07965439210039087, "grad_norm": 3.56520313826412, "learning_rate": 1.9871126750442807e-05, "loss": 0.8696, "step": 968 }, { "epoch": 0.0797366796955359, "grad_norm": 2.0059409411059113, "learning_rate": 1.9870699880005063e-05, "loss": 0.8799, "step": 969 }, { "epoch": 0.07981896729068093, "grad_norm": 4.983123742682501, "learning_rate": 1.9870272308368584e-05, "loss": 0.8693, "step": 970 }, { "epoch": 0.07990125488582596, "grad_norm": 2.1182309366583474, "learning_rate": 1.9869844035563747e-05, "loss": 0.8649, "step": 971 }, { "epoch": 0.07998354248097099, "grad_norm": 2.157976641839583, "learning_rate": 1.986941506162097e-05, "loss": 0.8844, "step": 972 }, { "epoch": 0.08006583007611602, "grad_norm": 3.1179516322271117, "learning_rate": 1.9868985386570734e-05, "loss": 0.8702, "step": 973 }, { "epoch": 0.08014811767126105, "grad_norm": 2.1804704549093246, "learning_rate": 1.986855501044356e-05, "loss": 0.8963, "step": 974 }, { "epoch": 0.08023040526640608, "grad_norm": 2.825665735780858, "learning_rate": 1.986812393327002e-05, "loss": 0.9028, "step": 975 }, { "epoch": 0.08031269286155113, "grad_norm": 2.7064578154820276, "learning_rate": 1.9867692155080736e-05, "loss": 0.8922, "step": 976 }, { "epoch": 0.08039498045669616, "grad_norm": 4.940848988099329, "learning_rate": 1.9867259675906383e-05, "loss": 0.9096, "step": 977 }, { "epoch": 0.08047726805184119, "grad_norm": 3.7159663449631943, "learning_rate": 1.9866826495777683e-05, "loss": 0.8946, "step": 978 }, { "epoch": 0.08055955564698622, "grad_norm": 4.235722900766384, "learning_rate": 1.9866392614725408e-05, "loss": 0.8844, "step": 979 }, { "epoch": 0.08064184324213125, "grad_norm": 2.5725805077545796, "learning_rate": 1.9865958032780383e-05, "loss": 0.8849, "step": 980 }, { "epoch": 0.08072413083727628, "grad_norm": 3.2900229009140367, "learning_rate": 1.986552274997348e-05, "loss": 0.8712, "step": 981 }, { "epoch": 0.08080641843242131, "grad_norm": 2.7018112393037206, "learning_rate": 1.986508676633561e-05, "loss": 0.881, "step": 982 }, { "epoch": 0.08088870602756634, "grad_norm": 3.2565064868257356, "learning_rate": 1.986465008189776e-05, "loss": 0.8741, "step": 983 }, { "epoch": 0.08097099362271137, "grad_norm": 2.977427479800942, "learning_rate": 1.986421269669094e-05, "loss": 0.864, "step": 984 }, { "epoch": 0.0810532812178564, "grad_norm": 2.8391838913702734, "learning_rate": 1.986377461074623e-05, "loss": 0.8777, "step": 985 }, { "epoch": 0.08113556881300144, "grad_norm": 2.228144074432828, "learning_rate": 1.9863335824094742e-05, "loss": 0.8873, "step": 986 }, { "epoch": 0.08121785640814647, "grad_norm": 2.6153835393886444, "learning_rate": 1.9862896336767654e-05, "loss": 0.8565, "step": 987 }, { "epoch": 0.08130014400329151, "grad_norm": 2.469488378896095, "learning_rate": 1.9862456148796182e-05, "loss": 0.9062, "step": 988 }, { "epoch": 0.08138243159843654, "grad_norm": 0.9008951474609029, "learning_rate": 1.98620152602116e-05, "loss": 0.5855, "step": 989 }, { "epoch": 0.08146471919358157, "grad_norm": 3.1010964992276335, "learning_rate": 1.986157367104522e-05, "loss": 0.8901, "step": 990 }, { "epoch": 0.0815470067887266, "grad_norm": 2.745575020455269, "learning_rate": 1.9861131381328422e-05, "loss": 0.8992, "step": 991 }, { "epoch": 0.08162929438387163, "grad_norm": 2.319333762749616, "learning_rate": 1.9860688391092623e-05, "loss": 0.8489, "step": 992 }, { "epoch": 0.08171158197901666, "grad_norm": 1.8701951574677815, "learning_rate": 1.9860244700369288e-05, "loss": 0.8895, "step": 993 }, { "epoch": 0.0817938695741617, "grad_norm": 2.4973895580746928, "learning_rate": 1.985980030918994e-05, "loss": 0.8414, "step": 994 }, { "epoch": 0.08187615716930673, "grad_norm": 2.542292639884159, "learning_rate": 1.9859355217586144e-05, "loss": 0.8865, "step": 995 }, { "epoch": 0.08195844476445176, "grad_norm": 0.5992255264191748, "learning_rate": 1.9858909425589524e-05, "loss": 0.5575, "step": 996 }, { "epoch": 0.08204073235959679, "grad_norm": 2.143472686925439, "learning_rate": 1.9858462933231742e-05, "loss": 0.8543, "step": 997 }, { "epoch": 0.08212301995474182, "grad_norm": 2.49083696229216, "learning_rate": 1.9858015740544524e-05, "loss": 0.8961, "step": 998 }, { "epoch": 0.08220530754988685, "grad_norm": 5.032363107017064, "learning_rate": 1.985756784755963e-05, "loss": 0.869, "step": 999 }, { "epoch": 0.08228759514503188, "grad_norm": 3.456646347683982, "learning_rate": 1.9857119254308885e-05, "loss": 0.868, "step": 1000 }, { "epoch": 0.08236988274017693, "grad_norm": 3.7630419410589755, "learning_rate": 1.9856669960824147e-05, "loss": 0.9249, "step": 1001 }, { "epoch": 0.08245217033532196, "grad_norm": 3.1625549709552994, "learning_rate": 1.985621996713734e-05, "loss": 0.8869, "step": 1002 }, { "epoch": 0.08253445793046699, "grad_norm": 3.881507636381793, "learning_rate": 1.985576927328043e-05, "loss": 0.888, "step": 1003 }, { "epoch": 0.08261674552561202, "grad_norm": 2.544247409259161, "learning_rate": 1.9855317879285434e-05, "loss": 0.8715, "step": 1004 }, { "epoch": 0.08269903312075705, "grad_norm": 2.5279916413903583, "learning_rate": 1.9854865785184417e-05, "loss": 0.8849, "step": 1005 }, { "epoch": 0.08278132071590208, "grad_norm": 3.4196695037594576, "learning_rate": 1.9854412991009494e-05, "loss": 0.8364, "step": 1006 }, { "epoch": 0.08286360831104711, "grad_norm": 2.759961086631554, "learning_rate": 1.985395949679283e-05, "loss": 0.854, "step": 1007 }, { "epoch": 0.08294589590619214, "grad_norm": 0.5731316878529051, "learning_rate": 1.9853505302566646e-05, "loss": 0.6152, "step": 1008 }, { "epoch": 0.08302818350133717, "grad_norm": 2.9549671685361525, "learning_rate": 1.98530504083632e-05, "loss": 0.861, "step": 1009 }, { "epoch": 0.0831104710964822, "grad_norm": 2.3193711696281025, "learning_rate": 1.9852594814214812e-05, "loss": 0.865, "step": 1010 }, { "epoch": 0.08319275869162723, "grad_norm": 3.0076758009209636, "learning_rate": 1.9852138520153846e-05, "loss": 0.8852, "step": 1011 }, { "epoch": 0.08327504628677226, "grad_norm": 2.732008977686221, "learning_rate": 1.9851681526212716e-05, "loss": 0.8928, "step": 1012 }, { "epoch": 0.0833573338819173, "grad_norm": 2.37950207279815, "learning_rate": 1.9851223832423886e-05, "loss": 0.8617, "step": 1013 }, { "epoch": 0.08343962147706234, "grad_norm": 2.464424002675186, "learning_rate": 1.985076543881987e-05, "loss": 0.8625, "step": 1014 }, { "epoch": 0.08352190907220737, "grad_norm": 2.9080302916718015, "learning_rate": 1.985030634543323e-05, "loss": 0.8832, "step": 1015 }, { "epoch": 0.0836041966673524, "grad_norm": 2.6287476224799655, "learning_rate": 1.984984655229658e-05, "loss": 0.8728, "step": 1016 }, { "epoch": 0.08368648426249743, "grad_norm": 2.5936175763493052, "learning_rate": 1.9849386059442585e-05, "loss": 0.8678, "step": 1017 }, { "epoch": 0.08376877185764246, "grad_norm": 2.3604963235792904, "learning_rate": 1.9848924866903955e-05, "loss": 0.8783, "step": 1018 }, { "epoch": 0.0838510594527875, "grad_norm": 0.5341112663835049, "learning_rate": 1.984846297471345e-05, "loss": 0.605, "step": 1019 }, { "epoch": 0.08393334704793252, "grad_norm": 2.9860218730439057, "learning_rate": 1.984800038290389e-05, "loss": 0.8525, "step": 1020 }, { "epoch": 0.08401563464307756, "grad_norm": 2.4630212214875025, "learning_rate": 1.9847537091508134e-05, "loss": 0.8825, "step": 1021 }, { "epoch": 0.08409792223822259, "grad_norm": 2.424908485494412, "learning_rate": 1.984707310055909e-05, "loss": 0.891, "step": 1022 }, { "epoch": 0.08418020983336762, "grad_norm": 2.886480910540036, "learning_rate": 1.984660841008972e-05, "loss": 0.8935, "step": 1023 }, { "epoch": 0.08426249742851265, "grad_norm": 2.4246756718684384, "learning_rate": 1.9846143020133035e-05, "loss": 0.8679, "step": 1024 }, { "epoch": 0.08434478502365768, "grad_norm": 4.020038177987053, "learning_rate": 1.98456769307221e-05, "loss": 0.8191, "step": 1025 }, { "epoch": 0.08442707261880271, "grad_norm": 2.6823999549769795, "learning_rate": 1.9845210141890018e-05, "loss": 0.8618, "step": 1026 }, { "epoch": 0.08450936021394775, "grad_norm": 2.2350487266641035, "learning_rate": 1.9844742653669953e-05, "loss": 0.8595, "step": 1027 }, { "epoch": 0.08459164780909278, "grad_norm": 4.977761117586025, "learning_rate": 1.9844274466095117e-05, "loss": 0.8516, "step": 1028 }, { "epoch": 0.08467393540423782, "grad_norm": 3.31805191100729, "learning_rate": 1.9843805579198766e-05, "loss": 0.8636, "step": 1029 }, { "epoch": 0.08475622299938285, "grad_norm": 2.5881873279624648, "learning_rate": 1.9843335993014206e-05, "loss": 0.8667, "step": 1030 }, { "epoch": 0.08483851059452788, "grad_norm": 3.9560157884462, "learning_rate": 1.98428657075748e-05, "loss": 0.8799, "step": 1031 }, { "epoch": 0.08492079818967291, "grad_norm": 2.5965271671259753, "learning_rate": 1.984239472291396e-05, "loss": 0.8714, "step": 1032 }, { "epoch": 0.08500308578481794, "grad_norm": 2.9384162786300094, "learning_rate": 1.9841923039065136e-05, "loss": 0.8784, "step": 1033 }, { "epoch": 0.08508537337996297, "grad_norm": 4.575841979886102, "learning_rate": 1.984145065606184e-05, "loss": 0.871, "step": 1034 }, { "epoch": 0.085167660975108, "grad_norm": 2.6762798398130205, "learning_rate": 1.984097757393763e-05, "loss": 0.8884, "step": 1035 }, { "epoch": 0.08524994857025303, "grad_norm": 2.3317749715867757, "learning_rate": 1.9840503792726107e-05, "loss": 0.8582, "step": 1036 }, { "epoch": 0.08533223616539806, "grad_norm": 2.5192408862448925, "learning_rate": 1.9840029312460936e-05, "loss": 0.8987, "step": 1037 }, { "epoch": 0.08541452376054309, "grad_norm": 3.0314447963476954, "learning_rate": 1.9839554133175815e-05, "loss": 0.9115, "step": 1038 }, { "epoch": 0.08549681135568812, "grad_norm": 2.718611923577393, "learning_rate": 1.983907825490451e-05, "loss": 0.8768, "step": 1039 }, { "epoch": 0.08557909895083317, "grad_norm": 3.2506331598038063, "learning_rate": 1.9838601677680818e-05, "loss": 0.8892, "step": 1040 }, { "epoch": 0.0856613865459782, "grad_norm": 2.8785960552339844, "learning_rate": 1.9838124401538596e-05, "loss": 0.8762, "step": 1041 }, { "epoch": 0.08574367414112323, "grad_norm": 3.255205364224761, "learning_rate": 1.9837646426511755e-05, "loss": 0.8878, "step": 1042 }, { "epoch": 0.08582596173626826, "grad_norm": 2.152447959926313, "learning_rate": 1.9837167752634243e-05, "loss": 0.8939, "step": 1043 }, { "epoch": 0.08590824933141329, "grad_norm": 6.038167525170103, "learning_rate": 1.983668837994006e-05, "loss": 0.854, "step": 1044 }, { "epoch": 0.08599053692655832, "grad_norm": 2.4872882270608296, "learning_rate": 1.983620830846327e-05, "loss": 0.865, "step": 1045 }, { "epoch": 0.08607282452170335, "grad_norm": 5.0878964623293905, "learning_rate": 1.9835727538237977e-05, "loss": 0.8848, "step": 1046 }, { "epoch": 0.08615511211684838, "grad_norm": 0.5466809522376739, "learning_rate": 1.9835246069298325e-05, "loss": 0.5879, "step": 1047 }, { "epoch": 0.08623739971199341, "grad_norm": 2.8930059060138134, "learning_rate": 1.9834763901678523e-05, "loss": 0.9032, "step": 1048 }, { "epoch": 0.08631968730713845, "grad_norm": 3.481150201855255, "learning_rate": 1.983428103541282e-05, "loss": 0.895, "step": 1049 }, { "epoch": 0.08640197490228348, "grad_norm": 2.2668611618771806, "learning_rate": 1.983379747053552e-05, "loss": 0.8841, "step": 1050 }, { "epoch": 0.08648426249742851, "grad_norm": 0.5012767267519984, "learning_rate": 1.9833313207080976e-05, "loss": 0.5584, "step": 1051 }, { "epoch": 0.08656655009257354, "grad_norm": 4.03230401593853, "learning_rate": 1.983282824508359e-05, "loss": 0.8722, "step": 1052 }, { "epoch": 0.08664883768771858, "grad_norm": 3.2238027639613662, "learning_rate": 1.9832342584577808e-05, "loss": 0.9061, "step": 1053 }, { "epoch": 0.08673112528286361, "grad_norm": 2.5875473888993827, "learning_rate": 1.9831856225598134e-05, "loss": 0.8655, "step": 1054 }, { "epoch": 0.08681341287800864, "grad_norm": 2.9531227295823435, "learning_rate": 1.9831369168179116e-05, "loss": 0.9014, "step": 1055 }, { "epoch": 0.08689570047315368, "grad_norm": 3.2403950768604273, "learning_rate": 1.9830881412355356e-05, "loss": 0.8802, "step": 1056 }, { "epoch": 0.0869779880682987, "grad_norm": 2.6421330385224406, "learning_rate": 1.9830392958161505e-05, "loss": 0.8624, "step": 1057 }, { "epoch": 0.08706027566344374, "grad_norm": 2.796247945415367, "learning_rate": 1.9829903805632257e-05, "loss": 0.8465, "step": 1058 }, { "epoch": 0.08714256325858877, "grad_norm": 0.5356691167104551, "learning_rate": 1.982941395480236e-05, "loss": 0.5749, "step": 1059 }, { "epoch": 0.0872248508537338, "grad_norm": 2.543782162970702, "learning_rate": 1.9828923405706622e-05, "loss": 0.8651, "step": 1060 }, { "epoch": 0.08730713844887883, "grad_norm": 5.052374438346327, "learning_rate": 1.982843215837988e-05, "loss": 0.8556, "step": 1061 }, { "epoch": 0.08738942604402386, "grad_norm": 2.709282429422679, "learning_rate": 1.9827940212857038e-05, "loss": 0.8739, "step": 1062 }, { "epoch": 0.08747171363916889, "grad_norm": 12.014153200069254, "learning_rate": 1.982744756917304e-05, "loss": 0.8685, "step": 1063 }, { "epoch": 0.08755400123431392, "grad_norm": 4.7874082941622875, "learning_rate": 1.9826954227362883e-05, "loss": 0.8968, "step": 1064 }, { "epoch": 0.08763628882945895, "grad_norm": 3.094799934600602, "learning_rate": 1.9826460187461616e-05, "loss": 0.8678, "step": 1065 }, { "epoch": 0.087718576424604, "grad_norm": 2.2422659009449664, "learning_rate": 1.982596544950433e-05, "loss": 0.8764, "step": 1066 }, { "epoch": 0.08780086401974903, "grad_norm": 3.436687255418153, "learning_rate": 1.982547001352617e-05, "loss": 0.8516, "step": 1067 }, { "epoch": 0.08788315161489406, "grad_norm": 0.4947838359746663, "learning_rate": 1.982497387956234e-05, "loss": 0.5591, "step": 1068 }, { "epoch": 0.08796543921003909, "grad_norm": 2.6289534390817098, "learning_rate": 1.9824477047648073e-05, "loss": 0.8481, "step": 1069 }, { "epoch": 0.08804772680518412, "grad_norm": 0.4837575812403313, "learning_rate": 1.9823979517818672e-05, "loss": 0.5778, "step": 1070 }, { "epoch": 0.08813001440032915, "grad_norm": 3.538024856422455, "learning_rate": 1.9823481290109478e-05, "loss": 0.8619, "step": 1071 }, { "epoch": 0.08821230199547418, "grad_norm": 4.321407175482124, "learning_rate": 1.982298236455588e-05, "loss": 0.8846, "step": 1072 }, { "epoch": 0.08829458959061921, "grad_norm": 3.616450253072054, "learning_rate": 1.9822482741193324e-05, "loss": 0.8856, "step": 1073 }, { "epoch": 0.08837687718576424, "grad_norm": 4.473435045577941, "learning_rate": 1.9821982420057308e-05, "loss": 0.8608, "step": 1074 }, { "epoch": 0.08845916478090927, "grad_norm": 0.5344599795616546, "learning_rate": 1.9821481401183364e-05, "loss": 0.5741, "step": 1075 }, { "epoch": 0.0885414523760543, "grad_norm": 3.608389298386541, "learning_rate": 1.982097968460709e-05, "loss": 0.8832, "step": 1076 }, { "epoch": 0.08862373997119934, "grad_norm": 4.223422665021111, "learning_rate": 1.9820477270364123e-05, "loss": 0.8854, "step": 1077 }, { "epoch": 0.08870602756634437, "grad_norm": 3.236757188788279, "learning_rate": 1.981997415849016e-05, "loss": 0.8727, "step": 1078 }, { "epoch": 0.08878831516148941, "grad_norm": 0.5297374533084104, "learning_rate": 1.9819470349020936e-05, "loss": 0.5883, "step": 1079 }, { "epoch": 0.08887060275663444, "grad_norm": 2.8725890412006656, "learning_rate": 1.9818965841992243e-05, "loss": 0.8719, "step": 1080 }, { "epoch": 0.08895289035177947, "grad_norm": 0.4917914943060142, "learning_rate": 1.9818460637439917e-05, "loss": 0.5497, "step": 1081 }, { "epoch": 0.0890351779469245, "grad_norm": 3.666129989863918, "learning_rate": 1.9817954735399853e-05, "loss": 0.855, "step": 1082 }, { "epoch": 0.08911746554206953, "grad_norm": 3.667558282780085, "learning_rate": 1.9817448135907984e-05, "loss": 0.8618, "step": 1083 }, { "epoch": 0.08919975313721457, "grad_norm": 2.8134358753083597, "learning_rate": 1.9816940839000303e-05, "loss": 0.8639, "step": 1084 }, { "epoch": 0.0892820407323596, "grad_norm": 3.8554001706730907, "learning_rate": 1.981643284471284e-05, "loss": 0.8449, "step": 1085 }, { "epoch": 0.08936432832750463, "grad_norm": 3.767364747903415, "learning_rate": 1.981592415308169e-05, "loss": 0.8549, "step": 1086 }, { "epoch": 0.08944661592264966, "grad_norm": 2.8398571302805453, "learning_rate": 1.9815414764142986e-05, "loss": 0.8735, "step": 1087 }, { "epoch": 0.08952890351779469, "grad_norm": 2.980261363247237, "learning_rate": 1.9814904677932912e-05, "loss": 0.8725, "step": 1088 }, { "epoch": 0.08961119111293972, "grad_norm": 3.7219107197197916, "learning_rate": 1.9814393894487713e-05, "loss": 0.9151, "step": 1089 }, { "epoch": 0.08969347870808475, "grad_norm": 4.035211371174713, "learning_rate": 1.981388241384366e-05, "loss": 0.8825, "step": 1090 }, { "epoch": 0.08977576630322978, "grad_norm": 3.053085785512212, "learning_rate": 1.9813370236037098e-05, "loss": 0.8497, "step": 1091 }, { "epoch": 0.08985805389837483, "grad_norm": 0.5368604454434628, "learning_rate": 1.981285736110441e-05, "loss": 0.5812, "step": 1092 }, { "epoch": 0.08994034149351986, "grad_norm": 4.355844807027429, "learning_rate": 1.981234378908203e-05, "loss": 0.8887, "step": 1093 }, { "epoch": 0.09002262908866489, "grad_norm": 2.649968557975437, "learning_rate": 1.9811829520006433e-05, "loss": 0.8415, "step": 1094 }, { "epoch": 0.09010491668380992, "grad_norm": 3.4417587859008214, "learning_rate": 1.9811314553914166e-05, "loss": 0.8685, "step": 1095 }, { "epoch": 0.09018720427895495, "grad_norm": 0.48295286929932113, "learning_rate": 1.98107988908418e-05, "loss": 0.5608, "step": 1096 }, { "epoch": 0.09026949187409998, "grad_norm": 4.948234702126818, "learning_rate": 1.981028253082597e-05, "loss": 0.8638, "step": 1097 }, { "epoch": 0.09035177946924501, "grad_norm": 2.8257336957776733, "learning_rate": 1.9809765473903362e-05, "loss": 0.8402, "step": 1098 }, { "epoch": 0.09043406706439004, "grad_norm": 0.48328014205289604, "learning_rate": 1.98092477201107e-05, "loss": 0.5797, "step": 1099 }, { "epoch": 0.09051635465953507, "grad_norm": 3.1346349138814418, "learning_rate": 1.980872926948477e-05, "loss": 0.8675, "step": 1100 }, { "epoch": 0.0905986422546801, "grad_norm": 2.707381646623277, "learning_rate": 1.9808210122062396e-05, "loss": 0.8588, "step": 1101 }, { "epoch": 0.09068092984982513, "grad_norm": 0.4754150829561111, "learning_rate": 1.9807690277880464e-05, "loss": 0.5962, "step": 1102 }, { "epoch": 0.09076321744497017, "grad_norm": 3.2149488041323946, "learning_rate": 1.98071697369759e-05, "loss": 0.849, "step": 1103 }, { "epoch": 0.0908455050401152, "grad_norm": 3.1468421046064887, "learning_rate": 1.9806648499385678e-05, "loss": 0.8525, "step": 1104 }, { "epoch": 0.09092779263526024, "grad_norm": 3.011551334891878, "learning_rate": 1.9806126565146835e-05, "loss": 0.862, "step": 1105 }, { "epoch": 0.09101008023040527, "grad_norm": 3.7542041127163235, "learning_rate": 1.980560393429644e-05, "loss": 0.878, "step": 1106 }, { "epoch": 0.0910923678255503, "grad_norm": 3.924675309445745, "learning_rate": 1.9805080606871625e-05, "loss": 0.8932, "step": 1107 }, { "epoch": 0.09117465542069533, "grad_norm": 3.149434195229172, "learning_rate": 1.980455658290956e-05, "loss": 0.8968, "step": 1108 }, { "epoch": 0.09125694301584036, "grad_norm": 0.4528941005660691, "learning_rate": 1.9804031862447483e-05, "loss": 0.5658, "step": 1109 }, { "epoch": 0.0913392306109854, "grad_norm": 3.2710296854560688, "learning_rate": 1.9803506445522658e-05, "loss": 0.8739, "step": 1110 }, { "epoch": 0.09142151820613043, "grad_norm": 0.48322757491755364, "learning_rate": 1.9802980332172415e-05, "loss": 0.592, "step": 1111 }, { "epoch": 0.09150380580127546, "grad_norm": 3.600092282955291, "learning_rate": 1.9802453522434123e-05, "loss": 0.8524, "step": 1112 }, { "epoch": 0.09158609339642049, "grad_norm": 3.7142303319750773, "learning_rate": 1.980192601634521e-05, "loss": 0.8811, "step": 1113 }, { "epoch": 0.09166838099156552, "grad_norm": 3.133621188104266, "learning_rate": 1.9801397813943156e-05, "loss": 0.8937, "step": 1114 }, { "epoch": 0.09175066858671055, "grad_norm": 5.265940334189566, "learning_rate": 1.980086891526547e-05, "loss": 0.8761, "step": 1115 }, { "epoch": 0.09183295618185558, "grad_norm": 0.5062751751465183, "learning_rate": 1.9800339320349732e-05, "loss": 0.5516, "step": 1116 }, { "epoch": 0.09191524377700061, "grad_norm": 3.772473804543901, "learning_rate": 1.9799809029233558e-05, "loss": 0.8375, "step": 1117 }, { "epoch": 0.09199753137214566, "grad_norm": 3.8490743801526803, "learning_rate": 1.9799278041954628e-05, "loss": 0.877, "step": 1118 }, { "epoch": 0.09207981896729069, "grad_norm": 3.5820410192444174, "learning_rate": 1.9798746358550656e-05, "loss": 0.8833, "step": 1119 }, { "epoch": 0.09216210656243572, "grad_norm": 8.839295550642253, "learning_rate": 1.9798213979059412e-05, "loss": 0.8553, "step": 1120 }, { "epoch": 0.09224439415758075, "grad_norm": 3.7706882959014205, "learning_rate": 1.979768090351872e-05, "loss": 0.8564, "step": 1121 }, { "epoch": 0.09232668175272578, "grad_norm": 4.312690219016083, "learning_rate": 1.9797147131966445e-05, "loss": 0.8605, "step": 1122 }, { "epoch": 0.09240896934787081, "grad_norm": 6.342821693734463, "learning_rate": 1.9796612664440503e-05, "loss": 0.8863, "step": 1123 }, { "epoch": 0.09249125694301584, "grad_norm": 3.480039566309057, "learning_rate": 1.979607750097887e-05, "loss": 0.8676, "step": 1124 }, { "epoch": 0.09257354453816087, "grad_norm": 0.5209974485249531, "learning_rate": 1.9795541641619552e-05, "loss": 0.6128, "step": 1125 }, { "epoch": 0.0926558321333059, "grad_norm": 3.0644541451290106, "learning_rate": 1.9795005086400623e-05, "loss": 0.8596, "step": 1126 }, { "epoch": 0.09273811972845093, "grad_norm": 4.0339545836639585, "learning_rate": 1.9794467835360198e-05, "loss": 0.8956, "step": 1127 }, { "epoch": 0.09282040732359596, "grad_norm": 3.606396064787203, "learning_rate": 1.9793929888536443e-05, "loss": 0.8446, "step": 1128 }, { "epoch": 0.092902694918741, "grad_norm": 3.266963278351553, "learning_rate": 1.979339124596757e-05, "loss": 0.8804, "step": 1129 }, { "epoch": 0.09298498251388602, "grad_norm": 4.171351560316691, "learning_rate": 1.9792851907691847e-05, "loss": 0.8764, "step": 1130 }, { "epoch": 0.09306727010903107, "grad_norm": 3.1333885189366066, "learning_rate": 1.9792311873747584e-05, "loss": 0.8882, "step": 1131 }, { "epoch": 0.0931495577041761, "grad_norm": 4.115748009743592, "learning_rate": 1.9791771144173146e-05, "loss": 0.8693, "step": 1132 }, { "epoch": 0.09323184529932113, "grad_norm": 4.248749716560056, "learning_rate": 1.9791229719006947e-05, "loss": 0.866, "step": 1133 }, { "epoch": 0.09331413289446616, "grad_norm": 0.5602770220421947, "learning_rate": 1.979068759828745e-05, "loss": 0.5729, "step": 1134 }, { "epoch": 0.09339642048961119, "grad_norm": 3.208526975104471, "learning_rate": 1.979014478205316e-05, "loss": 0.8447, "step": 1135 }, { "epoch": 0.09347870808475622, "grad_norm": 3.837179354794119, "learning_rate": 1.978960127034264e-05, "loss": 0.8395, "step": 1136 }, { "epoch": 0.09356099567990125, "grad_norm": 4.22608442690413, "learning_rate": 1.9789057063194505e-05, "loss": 0.8345, "step": 1137 }, { "epoch": 0.09364328327504629, "grad_norm": 4.512917248957414, "learning_rate": 1.978851216064741e-05, "loss": 0.8755, "step": 1138 }, { "epoch": 0.09372557087019132, "grad_norm": 4.485181370046995, "learning_rate": 1.978796656274007e-05, "loss": 0.9001, "step": 1139 }, { "epoch": 0.09380785846533635, "grad_norm": 4.311526149543538, "learning_rate": 1.978742026951123e-05, "loss": 0.8147, "step": 1140 }, { "epoch": 0.09389014606048138, "grad_norm": 3.400869370992463, "learning_rate": 1.9786873280999716e-05, "loss": 0.8458, "step": 1141 }, { "epoch": 0.09397243365562641, "grad_norm": 3.484007931145798, "learning_rate": 1.978632559724437e-05, "loss": 0.8396, "step": 1142 }, { "epoch": 0.09405472125077145, "grad_norm": 5.974225023368629, "learning_rate": 1.9785777218284107e-05, "loss": 0.8544, "step": 1143 }, { "epoch": 0.09413700884591648, "grad_norm": 4.758176933846711, "learning_rate": 1.978522814415788e-05, "loss": 0.8738, "step": 1144 }, { "epoch": 0.09421929644106151, "grad_norm": 4.054376339470337, "learning_rate": 1.9784678374904694e-05, "loss": 0.8647, "step": 1145 }, { "epoch": 0.09430158403620655, "grad_norm": 3.254256033254886, "learning_rate": 1.9784127910563606e-05, "loss": 0.8353, "step": 1146 }, { "epoch": 0.09438387163135158, "grad_norm": 0.5816738083728531, "learning_rate": 1.978357675117372e-05, "loss": 0.5812, "step": 1147 }, { "epoch": 0.09446615922649661, "grad_norm": 0.49793035339456754, "learning_rate": 1.9783024896774187e-05, "loss": 0.5791, "step": 1148 }, { "epoch": 0.09454844682164164, "grad_norm": 4.179537892792988, "learning_rate": 1.9782472347404206e-05, "loss": 0.8907, "step": 1149 }, { "epoch": 0.09463073441678667, "grad_norm": 4.067029184300302, "learning_rate": 1.978191910310304e-05, "loss": 0.8541, "step": 1150 }, { "epoch": 0.0947130220119317, "grad_norm": 4.248345665782451, "learning_rate": 1.9781365163909984e-05, "loss": 0.8632, "step": 1151 }, { "epoch": 0.09479530960707673, "grad_norm": 6.439138971096778, "learning_rate": 1.978081052986439e-05, "loss": 0.8629, "step": 1152 }, { "epoch": 0.09487759720222176, "grad_norm": 6.71298685938902, "learning_rate": 1.9780255201005656e-05, "loss": 0.8549, "step": 1153 }, { "epoch": 0.09495988479736679, "grad_norm": 3.967437431624442, "learning_rate": 1.9779699177373236e-05, "loss": 0.8732, "step": 1154 }, { "epoch": 0.09504217239251182, "grad_norm": 0.8392360999561069, "learning_rate": 1.9779142459006626e-05, "loss": 0.5872, "step": 1155 }, { "epoch": 0.09512445998765687, "grad_norm": 4.657178845971167, "learning_rate": 1.9778585045945374e-05, "loss": 0.8495, "step": 1156 }, { "epoch": 0.0952067475828019, "grad_norm": 4.123727952348605, "learning_rate": 1.977802693822908e-05, "loss": 0.9142, "step": 1157 }, { "epoch": 0.09528903517794693, "grad_norm": 0.5860758553236142, "learning_rate": 1.9777468135897387e-05, "loss": 0.5549, "step": 1158 }, { "epoch": 0.09537132277309196, "grad_norm": 0.5401053295003246, "learning_rate": 1.9776908638989996e-05, "loss": 0.5801, "step": 1159 }, { "epoch": 0.09545361036823699, "grad_norm": 0.5496816005625466, "learning_rate": 1.9776348447546653e-05, "loss": 0.5839, "step": 1160 }, { "epoch": 0.09553589796338202, "grad_norm": 6.020685438337091, "learning_rate": 1.977578756160715e-05, "loss": 0.866, "step": 1161 }, { "epoch": 0.09561818555852705, "grad_norm": 2.792057637957128, "learning_rate": 1.9775225981211333e-05, "loss": 0.8638, "step": 1162 }, { "epoch": 0.09570047315367208, "grad_norm": 0.5553177375677683, "learning_rate": 1.9774663706399092e-05, "loss": 0.5612, "step": 1163 }, { "epoch": 0.09578276074881711, "grad_norm": 5.245834669495098, "learning_rate": 1.9774100737210376e-05, "loss": 0.8688, "step": 1164 }, { "epoch": 0.09586504834396214, "grad_norm": 3.5768926302294344, "learning_rate": 1.977353707368518e-05, "loss": 0.897, "step": 1165 }, { "epoch": 0.09594733593910718, "grad_norm": 3.381007087662086, "learning_rate": 1.9772972715863534e-05, "loss": 0.8956, "step": 1166 }, { "epoch": 0.0960296235342522, "grad_norm": 4.24711216964703, "learning_rate": 1.9772407663785538e-05, "loss": 0.8546, "step": 1167 }, { "epoch": 0.09611191112939724, "grad_norm": 0.5978826180005935, "learning_rate": 1.977184191749133e-05, "loss": 0.5658, "step": 1168 }, { "epoch": 0.09619419872454228, "grad_norm": 5.6864731543708285, "learning_rate": 1.9771275477021102e-05, "loss": 0.8573, "step": 1169 }, { "epoch": 0.09627648631968731, "grad_norm": 0.5306016735606011, "learning_rate": 1.9770708342415087e-05, "loss": 0.5443, "step": 1170 }, { "epoch": 0.09635877391483234, "grad_norm": 3.4108513712835733, "learning_rate": 1.9770140513713582e-05, "loss": 0.9162, "step": 1171 }, { "epoch": 0.09644106150997737, "grad_norm": 3.0240876250486775, "learning_rate": 1.976957199095692e-05, "loss": 0.8959, "step": 1172 }, { "epoch": 0.0965233491051224, "grad_norm": 4.329264160111276, "learning_rate": 1.9769002774185483e-05, "loss": 0.8581, "step": 1173 }, { "epoch": 0.09660563670026744, "grad_norm": 2.8538371301611045, "learning_rate": 1.9768432863439714e-05, "loss": 0.8472, "step": 1174 }, { "epoch": 0.09668792429541247, "grad_norm": 4.192529144078922, "learning_rate": 1.97678622587601e-05, "loss": 0.8697, "step": 1175 }, { "epoch": 0.0967702118905575, "grad_norm": 3.729038589656874, "learning_rate": 1.976729096018717e-05, "loss": 0.8319, "step": 1176 }, { "epoch": 0.09685249948570253, "grad_norm": 0.6437788103093597, "learning_rate": 1.976671896776151e-05, "loss": 0.5736, "step": 1177 }, { "epoch": 0.09693478708084756, "grad_norm": 3.9035454070115017, "learning_rate": 1.9766146281523753e-05, "loss": 0.8874, "step": 1178 }, { "epoch": 0.09701707467599259, "grad_norm": 3.819713897204886, "learning_rate": 1.9765572901514583e-05, "loss": 0.8422, "step": 1179 }, { "epoch": 0.09709936227113762, "grad_norm": 5.277006488684462, "learning_rate": 1.9764998827774734e-05, "loss": 0.8849, "step": 1180 }, { "epoch": 0.09718164986628265, "grad_norm": 5.189466257849834, "learning_rate": 1.9764424060344988e-05, "loss": 0.8612, "step": 1181 }, { "epoch": 0.0972639374614277, "grad_norm": 3.4415909778873743, "learning_rate": 1.9763848599266168e-05, "loss": 0.8649, "step": 1182 }, { "epoch": 0.09734622505657273, "grad_norm": 3.5762421871051, "learning_rate": 1.976327244457916e-05, "loss": 0.8643, "step": 1183 }, { "epoch": 0.09742851265171776, "grad_norm": 2.9475630534612116, "learning_rate": 1.976269559632489e-05, "loss": 0.8756, "step": 1184 }, { "epoch": 0.09751080024686279, "grad_norm": 2.865959286407617, "learning_rate": 1.976211805454434e-05, "loss": 0.8317, "step": 1185 }, { "epoch": 0.09759308784200782, "grad_norm": 0.5278838170529865, "learning_rate": 1.976153981927853e-05, "loss": 0.5707, "step": 1186 }, { "epoch": 0.09767537543715285, "grad_norm": 0.5151202226322995, "learning_rate": 1.976096089056855e-05, "loss": 0.5589, "step": 1187 }, { "epoch": 0.09775766303229788, "grad_norm": 5.474549135950859, "learning_rate": 1.9760381268455515e-05, "loss": 0.8707, "step": 1188 }, { "epoch": 0.09783995062744291, "grad_norm": 2.886942130305931, "learning_rate": 1.9759800952980604e-05, "loss": 0.8764, "step": 1189 }, { "epoch": 0.09792223822258794, "grad_norm": 3.5448856849038015, "learning_rate": 1.9759219944185045e-05, "loss": 0.8546, "step": 1190 }, { "epoch": 0.09800452581773297, "grad_norm": 2.3163053463145022, "learning_rate": 1.9758638242110105e-05, "loss": 0.827, "step": 1191 }, { "epoch": 0.098086813412878, "grad_norm": 3.2678753876711903, "learning_rate": 1.9758055846797113e-05, "loss": 0.8456, "step": 1192 }, { "epoch": 0.09816910100802304, "grad_norm": 4.046087494412628, "learning_rate": 1.9757472758287437e-05, "loss": 0.8565, "step": 1193 }, { "epoch": 0.09825138860316807, "grad_norm": 5.312871548189173, "learning_rate": 1.9756888976622504e-05, "loss": 0.8316, "step": 1194 }, { "epoch": 0.09833367619831311, "grad_norm": 3.5965506794172035, "learning_rate": 1.9756304501843782e-05, "loss": 0.8479, "step": 1195 }, { "epoch": 0.09841596379345814, "grad_norm": 4.869038156703397, "learning_rate": 1.975571933399279e-05, "loss": 0.8957, "step": 1196 }, { "epoch": 0.09849825138860317, "grad_norm": 5.073504198475643, "learning_rate": 1.9755133473111097e-05, "loss": 0.8748, "step": 1197 }, { "epoch": 0.0985805389837482, "grad_norm": 4.129896753535656, "learning_rate": 1.9754546919240325e-05, "loss": 0.8624, "step": 1198 }, { "epoch": 0.09866282657889323, "grad_norm": 0.75499109894716, "learning_rate": 1.975395967242214e-05, "loss": 0.5753, "step": 1199 }, { "epoch": 0.09874511417403826, "grad_norm": 4.926214741317277, "learning_rate": 1.9753371732698255e-05, "loss": 0.8514, "step": 1200 }, { "epoch": 0.0988274017691833, "grad_norm": 4.113995566064139, "learning_rate": 1.9752783100110443e-05, "loss": 0.8735, "step": 1201 }, { "epoch": 0.09890968936432833, "grad_norm": 0.5883860438611207, "learning_rate": 1.975219377470052e-05, "loss": 0.6035, "step": 1202 }, { "epoch": 0.09899197695947336, "grad_norm": 3.3466076308514863, "learning_rate": 1.9751603756510344e-05, "loss": 0.8769, "step": 1203 }, { "epoch": 0.09907426455461839, "grad_norm": 0.47595350765066086, "learning_rate": 1.9751013045581835e-05, "loss": 0.5663, "step": 1204 }, { "epoch": 0.09915655214976342, "grad_norm": 3.4049170080353615, "learning_rate": 1.975042164195695e-05, "loss": 0.8363, "step": 1205 }, { "epoch": 0.09923883974490845, "grad_norm": 3.7661200169302327, "learning_rate": 1.974982954567771e-05, "loss": 0.8437, "step": 1206 }, { "epoch": 0.09932112734005348, "grad_norm": 3.6094210284619286, "learning_rate": 1.9749236756786167e-05, "loss": 0.861, "step": 1207 }, { "epoch": 0.09940341493519853, "grad_norm": 3.145969814243711, "learning_rate": 1.9748643275324438e-05, "loss": 0.8454, "step": 1208 }, { "epoch": 0.09948570253034356, "grad_norm": 3.6067880218861568, "learning_rate": 1.9748049101334684e-05, "loss": 0.8682, "step": 1209 }, { "epoch": 0.09956799012548859, "grad_norm": 3.0185050449291984, "learning_rate": 1.974745423485911e-05, "loss": 0.8708, "step": 1210 }, { "epoch": 0.09965027772063362, "grad_norm": 3.128449103884966, "learning_rate": 1.9746858675939974e-05, "loss": 0.8594, "step": 1211 }, { "epoch": 0.09973256531577865, "grad_norm": 0.6028578588325906, "learning_rate": 1.9746262424619585e-05, "loss": 0.6006, "step": 1212 }, { "epoch": 0.09981485291092368, "grad_norm": 0.5378805528352323, "learning_rate": 1.9745665480940304e-05, "loss": 0.5702, "step": 1213 }, { "epoch": 0.09989714050606871, "grad_norm": 2.9709104250769025, "learning_rate": 1.974506784494453e-05, "loss": 0.8769, "step": 1214 }, { "epoch": 0.09997942810121374, "grad_norm": 3.5710834059738983, "learning_rate": 1.974446951667472e-05, "loss": 0.8524, "step": 1215 }, { "epoch": 0.10006171569635877, "grad_norm": 3.564453597862319, "learning_rate": 1.9743870496173385e-05, "loss": 0.8602, "step": 1216 }, { "epoch": 0.1001440032915038, "grad_norm": 3.7485777754801415, "learning_rate": 1.974327078348307e-05, "loss": 0.8478, "step": 1217 }, { "epoch": 0.10022629088664883, "grad_norm": 0.6391149383767559, "learning_rate": 1.974267037864638e-05, "loss": 0.5585, "step": 1218 }, { "epoch": 0.10030857848179386, "grad_norm": 3.9853421053234044, "learning_rate": 1.9742069281705967e-05, "loss": 0.8742, "step": 1219 }, { "epoch": 0.1003908660769389, "grad_norm": 7.216394178355804, "learning_rate": 1.974146749270453e-05, "loss": 0.8459, "step": 1220 }, { "epoch": 0.10047315367208394, "grad_norm": 2.582703369923991, "learning_rate": 1.9740865011684827e-05, "loss": 0.8772, "step": 1221 }, { "epoch": 0.10055544126722897, "grad_norm": 4.096893921176322, "learning_rate": 1.974026183868965e-05, "loss": 0.8564, "step": 1222 }, { "epoch": 0.100637728862374, "grad_norm": 3.625029367682308, "learning_rate": 1.973965797376185e-05, "loss": 0.8505, "step": 1223 }, { "epoch": 0.10072001645751903, "grad_norm": 3.42182935905832, "learning_rate": 1.973905341694432e-05, "loss": 0.8314, "step": 1224 }, { "epoch": 0.10080230405266406, "grad_norm": 2.8684151430131664, "learning_rate": 1.9738448168280014e-05, "loss": 0.8524, "step": 1225 }, { "epoch": 0.1008845916478091, "grad_norm": 4.2068547384992545, "learning_rate": 1.9737842227811924e-05, "loss": 0.8525, "step": 1226 }, { "epoch": 0.10096687924295412, "grad_norm": 3.637604906458846, "learning_rate": 1.9737235595583093e-05, "loss": 0.8927, "step": 1227 }, { "epoch": 0.10104916683809916, "grad_norm": 3.986554301688107, "learning_rate": 1.973662827163662e-05, "loss": 0.9003, "step": 1228 }, { "epoch": 0.10113145443324419, "grad_norm": 0.6119674016964393, "learning_rate": 1.9736020256015647e-05, "loss": 0.5653, "step": 1229 }, { "epoch": 0.10121374202838922, "grad_norm": 3.947009339846442, "learning_rate": 1.9735411548763364e-05, "loss": 0.8614, "step": 1230 }, { "epoch": 0.10129602962353425, "grad_norm": 3.8850893245972666, "learning_rate": 1.9734802149923014e-05, "loss": 0.8663, "step": 1231 }, { "epoch": 0.10137831721867928, "grad_norm": 3.260028438383931, "learning_rate": 1.9734192059537888e-05, "loss": 0.864, "step": 1232 }, { "epoch": 0.10146060481382431, "grad_norm": 3.3728492367289795, "learning_rate": 1.9733581277651327e-05, "loss": 0.8524, "step": 1233 }, { "epoch": 0.10154289240896935, "grad_norm": 3.2625677444712946, "learning_rate": 1.9732969804306716e-05, "loss": 0.8299, "step": 1234 }, { "epoch": 0.10162518000411438, "grad_norm": 0.5270258088317135, "learning_rate": 1.9732357639547497e-05, "loss": 0.5695, "step": 1235 }, { "epoch": 0.10170746759925942, "grad_norm": 4.034862594266343, "learning_rate": 1.9731744783417154e-05, "loss": 0.9067, "step": 1236 }, { "epoch": 0.10178975519440445, "grad_norm": 3.368163010498083, "learning_rate": 1.9731131235959228e-05, "loss": 0.8785, "step": 1237 }, { "epoch": 0.10187204278954948, "grad_norm": 4.268507894834593, "learning_rate": 1.97305169972173e-05, "loss": 0.8497, "step": 1238 }, { "epoch": 0.10195433038469451, "grad_norm": 4.262009151943327, "learning_rate": 1.9729902067235006e-05, "loss": 0.8528, "step": 1239 }, { "epoch": 0.10203661797983954, "grad_norm": 3.7072453125521734, "learning_rate": 1.9729286446056033e-05, "loss": 0.837, "step": 1240 }, { "epoch": 0.10211890557498457, "grad_norm": 0.5042716296341209, "learning_rate": 1.9728670133724108e-05, "loss": 0.5718, "step": 1241 }, { "epoch": 0.1022011931701296, "grad_norm": 3.5004783261501466, "learning_rate": 1.9728053130283015e-05, "loss": 0.8695, "step": 1242 }, { "epoch": 0.10228348076527463, "grad_norm": 3.269137481777619, "learning_rate": 1.9727435435776584e-05, "loss": 0.8456, "step": 1243 }, { "epoch": 0.10236576836041966, "grad_norm": 4.183726994796829, "learning_rate": 1.97268170502487e-05, "loss": 0.8246, "step": 1244 }, { "epoch": 0.10244805595556469, "grad_norm": 3.447500278075762, "learning_rate": 1.9726197973743285e-05, "loss": 0.8538, "step": 1245 }, { "epoch": 0.10253034355070972, "grad_norm": 6.2832374035907606, "learning_rate": 1.9725578206304323e-05, "loss": 0.8363, "step": 1246 }, { "epoch": 0.10261263114585477, "grad_norm": 3.3223666951374327, "learning_rate": 1.972495774797584e-05, "loss": 0.8068, "step": 1247 }, { "epoch": 0.1026949187409998, "grad_norm": 4.527729681936454, "learning_rate": 1.972433659880191e-05, "loss": 0.8515, "step": 1248 }, { "epoch": 0.10277720633614483, "grad_norm": 3.219592992240681, "learning_rate": 1.9723714758826657e-05, "loss": 0.8491, "step": 1249 }, { "epoch": 0.10285949393128986, "grad_norm": 14.881817686003856, "learning_rate": 1.9723092228094262e-05, "loss": 0.8535, "step": 1250 }, { "epoch": 0.10294178152643489, "grad_norm": 7.751162451175856, "learning_rate": 1.9722469006648946e-05, "loss": 0.8366, "step": 1251 }, { "epoch": 0.10302406912157992, "grad_norm": 3.6264058447910785, "learning_rate": 1.9721845094534977e-05, "loss": 0.8544, "step": 1252 }, { "epoch": 0.10310635671672495, "grad_norm": 3.422457414693753, "learning_rate": 1.9721220491796682e-05, "loss": 0.8615, "step": 1253 }, { "epoch": 0.10318864431186998, "grad_norm": 6.929960642374395, "learning_rate": 1.972059519847843e-05, "loss": 0.8971, "step": 1254 }, { "epoch": 0.10327093190701502, "grad_norm": 3.4436829246073937, "learning_rate": 1.971996921462464e-05, "loss": 0.859, "step": 1255 }, { "epoch": 0.10335321950216005, "grad_norm": 0.6287334412236155, "learning_rate": 1.9719342540279783e-05, "loss": 0.5832, "step": 1256 }, { "epoch": 0.10343550709730508, "grad_norm": 4.455529227081377, "learning_rate": 1.9718715175488373e-05, "loss": 0.8551, "step": 1257 }, { "epoch": 0.10351779469245011, "grad_norm": 0.49625825545453955, "learning_rate": 1.9718087120294983e-05, "loss": 0.5907, "step": 1258 }, { "epoch": 0.10360008228759514, "grad_norm": 7.610855562933589, "learning_rate": 1.9717458374744226e-05, "loss": 0.8518, "step": 1259 }, { "epoch": 0.10368236988274018, "grad_norm": 6.814266905432093, "learning_rate": 1.9716828938880766e-05, "loss": 0.892, "step": 1260 }, { "epoch": 0.10376465747788521, "grad_norm": 5.503964342264624, "learning_rate": 1.9716198812749316e-05, "loss": 0.8575, "step": 1261 }, { "epoch": 0.10384694507303024, "grad_norm": 6.63227200743735, "learning_rate": 1.9715567996394642e-05, "loss": 0.899, "step": 1262 }, { "epoch": 0.10392923266817528, "grad_norm": 5.730417088676314, "learning_rate": 1.9714936489861557e-05, "loss": 0.8747, "step": 1263 }, { "epoch": 0.1040115202633203, "grad_norm": 0.48714044685236985, "learning_rate": 1.9714304293194918e-05, "loss": 0.5698, "step": 1264 }, { "epoch": 0.10409380785846534, "grad_norm": 4.811813633190729, "learning_rate": 1.971367140643964e-05, "loss": 0.8528, "step": 1265 }, { "epoch": 0.10417609545361037, "grad_norm": 0.45476138424455886, "learning_rate": 1.971303782964068e-05, "loss": 0.5733, "step": 1266 }, { "epoch": 0.1042583830487554, "grad_norm": 4.144890858016052, "learning_rate": 1.9712403562843045e-05, "loss": 0.8308, "step": 1267 }, { "epoch": 0.10434067064390043, "grad_norm": 5.275387836703206, "learning_rate": 1.9711768606091795e-05, "loss": 0.8931, "step": 1268 }, { "epoch": 0.10442295823904546, "grad_norm": 5.053640055345281, "learning_rate": 1.9711132959432033e-05, "loss": 0.84, "step": 1269 }, { "epoch": 0.10450524583419049, "grad_norm": 19.786582333651765, "learning_rate": 1.9710496622908917e-05, "loss": 0.8148, "step": 1270 }, { "epoch": 0.10458753342933552, "grad_norm": 3.9891501660738253, "learning_rate": 1.970985959656765e-05, "loss": 0.8575, "step": 1271 }, { "epoch": 0.10466982102448055, "grad_norm": 4.510634946553714, "learning_rate": 1.9709221880453488e-05, "loss": 0.865, "step": 1272 }, { "epoch": 0.1047521086196256, "grad_norm": 3.734578158484028, "learning_rate": 1.970858347461173e-05, "loss": 0.8837, "step": 1273 }, { "epoch": 0.10483439621477063, "grad_norm": 5.721110074673601, "learning_rate": 1.9707944379087727e-05, "loss": 0.8538, "step": 1274 }, { "epoch": 0.10491668380991566, "grad_norm": 4.410682194182307, "learning_rate": 1.9707304593926883e-05, "loss": 0.8515, "step": 1275 }, { "epoch": 0.10499897140506069, "grad_norm": 6.312032763782244, "learning_rate": 1.9706664119174643e-05, "loss": 0.8473, "step": 1276 }, { "epoch": 0.10508125900020572, "grad_norm": 4.691303195768097, "learning_rate": 1.970602295487651e-05, "loss": 0.8725, "step": 1277 }, { "epoch": 0.10516354659535075, "grad_norm": 4.316070261641844, "learning_rate": 1.9705381101078028e-05, "loss": 0.821, "step": 1278 }, { "epoch": 0.10524583419049578, "grad_norm": 4.997740394972133, "learning_rate": 1.9704738557824795e-05, "loss": 0.8647, "step": 1279 }, { "epoch": 0.10532812178564081, "grad_norm": 9.863962397731285, "learning_rate": 1.970409532516245e-05, "loss": 0.8627, "step": 1280 }, { "epoch": 0.10541040938078584, "grad_norm": 4.11123640524789, "learning_rate": 1.9703451403136696e-05, "loss": 0.8407, "step": 1281 }, { "epoch": 0.10549269697593087, "grad_norm": 0.5916809195753411, "learning_rate": 1.9702806791793277e-05, "loss": 0.5848, "step": 1282 }, { "epoch": 0.1055749845710759, "grad_norm": 6.69582821146116, "learning_rate": 1.9702161491177976e-05, "loss": 0.8853, "step": 1283 }, { "epoch": 0.10565727216622094, "grad_norm": 0.4953187105131954, "learning_rate": 1.9701515501336642e-05, "loss": 0.5822, "step": 1284 }, { "epoch": 0.10573955976136598, "grad_norm": 8.936946054345633, "learning_rate": 1.970086882231516e-05, "loss": 0.851, "step": 1285 }, { "epoch": 0.10582184735651101, "grad_norm": 6.8488454890517705, "learning_rate": 1.970022145415947e-05, "loss": 0.8961, "step": 1286 }, { "epoch": 0.10590413495165604, "grad_norm": 4.237563628391966, "learning_rate": 1.9699573396915563e-05, "loss": 0.8378, "step": 1287 }, { "epoch": 0.10598642254680107, "grad_norm": 51.42004889155801, "learning_rate": 1.969892465062947e-05, "loss": 0.8389, "step": 1288 }, { "epoch": 0.1060687101419461, "grad_norm": 6.786819850265654, "learning_rate": 1.9698275215347287e-05, "loss": 0.887, "step": 1289 }, { "epoch": 0.10615099773709114, "grad_norm": 22.715342269516267, "learning_rate": 1.969762509111514e-05, "loss": 0.8792, "step": 1290 }, { "epoch": 0.10623328533223617, "grad_norm": 4.6055650003906194, "learning_rate": 1.969697427797922e-05, "loss": 0.8886, "step": 1291 }, { "epoch": 0.1063155729273812, "grad_norm": 0.6622955664034255, "learning_rate": 1.9696322775985748e-05, "loss": 0.5781, "step": 1292 }, { "epoch": 0.10639786052252623, "grad_norm": 0.5456666190829798, "learning_rate": 1.9695670585181016e-05, "loss": 0.5594, "step": 1293 }, { "epoch": 0.10648014811767126, "grad_norm": 0.5361303752940896, "learning_rate": 1.969501770561135e-05, "loss": 0.6009, "step": 1294 }, { "epoch": 0.10656243571281629, "grad_norm": 7.635441041173641, "learning_rate": 1.9694364137323133e-05, "loss": 0.8371, "step": 1295 }, { "epoch": 0.10664472330796132, "grad_norm": 12.228827021078185, "learning_rate": 1.969370988036279e-05, "loss": 0.8745, "step": 1296 }, { "epoch": 0.10672701090310635, "grad_norm": 0.6908564482765909, "learning_rate": 1.9693054934776803e-05, "loss": 0.5781, "step": 1297 }, { "epoch": 0.1068092984982514, "grad_norm": 4.800629808465259, "learning_rate": 1.9692399300611693e-05, "loss": 0.8426, "step": 1298 }, { "epoch": 0.10689158609339643, "grad_norm": 6.024553599030264, "learning_rate": 1.969174297791404e-05, "loss": 0.8792, "step": 1299 }, { "epoch": 0.10697387368854146, "grad_norm": 11.239833398928637, "learning_rate": 1.969108596673046e-05, "loss": 0.8752, "step": 1300 }, { "epoch": 0.10705616128368649, "grad_norm": 6.631855999332642, "learning_rate": 1.9690428267107636e-05, "loss": 0.864, "step": 1301 }, { "epoch": 0.10713844887883152, "grad_norm": 8.509042674536547, "learning_rate": 1.9689769879092285e-05, "loss": 0.8539, "step": 1302 }, { "epoch": 0.10722073647397655, "grad_norm": 6.153625548192156, "learning_rate": 1.9689110802731174e-05, "loss": 0.8872, "step": 1303 }, { "epoch": 0.10730302406912158, "grad_norm": 0.7071137613706345, "learning_rate": 1.968845103807113e-05, "loss": 0.6264, "step": 1304 }, { "epoch": 0.10738531166426661, "grad_norm": 4.709892779228607, "learning_rate": 1.968779058515902e-05, "loss": 0.8795, "step": 1305 }, { "epoch": 0.10746759925941164, "grad_norm": 18.75805597847832, "learning_rate": 1.968712944404176e-05, "loss": 0.8674, "step": 1306 }, { "epoch": 0.10754988685455667, "grad_norm": 0.5225280033189051, "learning_rate": 1.9686467614766317e-05, "loss": 0.576, "step": 1307 }, { "epoch": 0.1076321744497017, "grad_norm": 11.926670592194546, "learning_rate": 1.9685805097379706e-05, "loss": 0.8787, "step": 1308 }, { "epoch": 0.10771446204484673, "grad_norm": 7.4263466600964625, "learning_rate": 1.9685141891928988e-05, "loss": 0.8328, "step": 1309 }, { "epoch": 0.10779674963999177, "grad_norm": 0.5357602955430172, "learning_rate": 1.968447799846128e-05, "loss": 0.5619, "step": 1310 }, { "epoch": 0.10787903723513681, "grad_norm": 6.813630602811481, "learning_rate": 1.9683813417023744e-05, "loss": 0.8788, "step": 1311 }, { "epoch": 0.10796132483028184, "grad_norm": 10.657671843850299, "learning_rate": 1.968314814766359e-05, "loss": 0.8245, "step": 1312 }, { "epoch": 0.10804361242542687, "grad_norm": 6.152874752366795, "learning_rate": 1.9682482190428078e-05, "loss": 0.8491, "step": 1313 }, { "epoch": 0.1081259000205719, "grad_norm": 8.128402041044515, "learning_rate": 1.9681815545364514e-05, "loss": 0.8548, "step": 1314 }, { "epoch": 0.10820818761571693, "grad_norm": 9.164034587713218, "learning_rate": 1.968114821252026e-05, "loss": 0.8648, "step": 1315 }, { "epoch": 0.10829047521086196, "grad_norm": 6.8988719562955785, "learning_rate": 1.9680480191942715e-05, "loss": 0.8515, "step": 1316 }, { "epoch": 0.108372762806007, "grad_norm": 5.77279873046973, "learning_rate": 1.9679811483679344e-05, "loss": 0.8743, "step": 1317 }, { "epoch": 0.10845505040115203, "grad_norm": 0.6678932721554799, "learning_rate": 1.9679142087777646e-05, "loss": 0.5631, "step": 1318 }, { "epoch": 0.10853733799629706, "grad_norm": 5.142613068462937, "learning_rate": 1.9678472004285168e-05, "loss": 0.8364, "step": 1319 }, { "epoch": 0.10861962559144209, "grad_norm": 4.36636440759262, "learning_rate": 1.9677801233249522e-05, "loss": 0.8776, "step": 1320 }, { "epoch": 0.10870191318658712, "grad_norm": 0.515905767911221, "learning_rate": 1.9677129774718354e-05, "loss": 0.5542, "step": 1321 }, { "epoch": 0.10878420078173215, "grad_norm": 0.5052244867766346, "learning_rate": 1.967645762873936e-05, "loss": 0.5827, "step": 1322 }, { "epoch": 0.10886648837687718, "grad_norm": 5.083386834635681, "learning_rate": 1.9675784795360294e-05, "loss": 0.8736, "step": 1323 }, { "epoch": 0.10894877597202222, "grad_norm": 5.440391953229143, "learning_rate": 1.967511127462895e-05, "loss": 0.8759, "step": 1324 }, { "epoch": 0.10903106356716726, "grad_norm": 15.884511720887403, "learning_rate": 1.9674437066593172e-05, "loss": 0.8322, "step": 1325 }, { "epoch": 0.10911335116231229, "grad_norm": 7.211730831323343, "learning_rate": 1.9673762171300857e-05, "loss": 0.8625, "step": 1326 }, { "epoch": 0.10919563875745732, "grad_norm": 0.5669850108936473, "learning_rate": 1.967308658879995e-05, "loss": 0.5535, "step": 1327 }, { "epoch": 0.10927792635260235, "grad_norm": 4.598126891030946, "learning_rate": 1.9672410319138442e-05, "loss": 0.8267, "step": 1328 }, { "epoch": 0.10936021394774738, "grad_norm": 5.389091584276692, "learning_rate": 1.967173336236437e-05, "loss": 0.8526, "step": 1329 }, { "epoch": 0.10944250154289241, "grad_norm": 5.396740224845529, "learning_rate": 1.967105571852583e-05, "loss": 0.8583, "step": 1330 }, { "epoch": 0.10952478913803744, "grad_norm": 15.756743051056382, "learning_rate": 1.9670377387670962e-05, "loss": 0.8352, "step": 1331 }, { "epoch": 0.10960707673318247, "grad_norm": 45.36041441825305, "learning_rate": 1.966969836984794e-05, "loss": 0.8664, "step": 1332 }, { "epoch": 0.1096893643283275, "grad_norm": 0.6091498990566993, "learning_rate": 1.9669018665105022e-05, "loss": 0.5725, "step": 1333 }, { "epoch": 0.10977165192347253, "grad_norm": 4.9099814047052766, "learning_rate": 1.9668338273490476e-05, "loss": 0.8601, "step": 1334 }, { "epoch": 0.10985393951861756, "grad_norm": 5.695516010249542, "learning_rate": 1.966765719505264e-05, "loss": 0.8469, "step": 1335 }, { "epoch": 0.1099362271137626, "grad_norm": 0.5018115432883228, "learning_rate": 1.9666975429839898e-05, "loss": 0.5708, "step": 1336 }, { "epoch": 0.11001851470890764, "grad_norm": 5.03543554455548, "learning_rate": 1.9666292977900683e-05, "loss": 0.8538, "step": 1337 }, { "epoch": 0.11010080230405267, "grad_norm": 6.0730970796747785, "learning_rate": 1.966560983928347e-05, "loss": 0.8436, "step": 1338 }, { "epoch": 0.1101830898991977, "grad_norm": 0.5251518856290188, "learning_rate": 1.96649260140368e-05, "loss": 0.5561, "step": 1339 }, { "epoch": 0.11026537749434273, "grad_norm": 4.080683188875858, "learning_rate": 1.9664241502209235e-05, "loss": 0.8623, "step": 1340 }, { "epoch": 0.11034766508948776, "grad_norm": 5.393952148873809, "learning_rate": 1.9663556303849413e-05, "loss": 0.8716, "step": 1341 }, { "epoch": 0.11042995268463279, "grad_norm": 5.475630634902594, "learning_rate": 1.9662870419006005e-05, "loss": 0.864, "step": 1342 }, { "epoch": 0.11051224027977782, "grad_norm": 5.368155159963875, "learning_rate": 1.9662183847727738e-05, "loss": 0.85, "step": 1343 }, { "epoch": 0.11059452787492285, "grad_norm": 5.333748381981491, "learning_rate": 1.966149659006338e-05, "loss": 0.9021, "step": 1344 }, { "epoch": 0.11067681547006789, "grad_norm": 5.152529335147286, "learning_rate": 1.9660808646061755e-05, "loss": 0.8265, "step": 1345 }, { "epoch": 0.11075910306521292, "grad_norm": 5.014489867671769, "learning_rate": 1.9660120015771736e-05, "loss": 0.8614, "step": 1346 }, { "epoch": 0.11084139066035795, "grad_norm": 6.878459736348937, "learning_rate": 1.965943069924224e-05, "loss": 0.8413, "step": 1347 }, { "epoch": 0.11092367825550298, "grad_norm": 1.309054235475711, "learning_rate": 1.9658740696522235e-05, "loss": 0.5864, "step": 1348 }, { "epoch": 0.11100596585064801, "grad_norm": 4.10092938202275, "learning_rate": 1.9658050007660736e-05, "loss": 0.8697, "step": 1349 }, { "epoch": 0.11108825344579305, "grad_norm": 3.718338112475671, "learning_rate": 1.9657358632706812e-05, "loss": 0.8629, "step": 1350 }, { "epoch": 0.11117054104093808, "grad_norm": 5.648391310440999, "learning_rate": 1.9656666571709575e-05, "loss": 0.8538, "step": 1351 }, { "epoch": 0.11125282863608311, "grad_norm": 6.739012308284066, "learning_rate": 1.965597382471819e-05, "loss": 0.8593, "step": 1352 }, { "epoch": 0.11133511623122815, "grad_norm": 4.490203845995398, "learning_rate": 1.9655280391781862e-05, "loss": 0.884, "step": 1353 }, { "epoch": 0.11141740382637318, "grad_norm": 7.535420440316182, "learning_rate": 1.965458627294986e-05, "loss": 0.8226, "step": 1354 }, { "epoch": 0.11149969142151821, "grad_norm": 4.304043216970036, "learning_rate": 1.965389146827149e-05, "loss": 0.8435, "step": 1355 }, { "epoch": 0.11158197901666324, "grad_norm": 0.46979177459935967, "learning_rate": 1.9653195977796108e-05, "loss": 0.546, "step": 1356 }, { "epoch": 0.11166426661180827, "grad_norm": 3.244913645135368, "learning_rate": 1.9652499801573122e-05, "loss": 0.8431, "step": 1357 }, { "epoch": 0.1117465542069533, "grad_norm": 4.113074523021319, "learning_rate": 1.9651802939651988e-05, "loss": 0.8569, "step": 1358 }, { "epoch": 0.11182884180209833, "grad_norm": 3.8188152990212383, "learning_rate": 1.9651105392082206e-05, "loss": 0.8706, "step": 1359 }, { "epoch": 0.11191112939724336, "grad_norm": 5.332042480420847, "learning_rate": 1.9650407158913335e-05, "loss": 0.8566, "step": 1360 }, { "epoch": 0.11199341699238839, "grad_norm": 5.354305156748114, "learning_rate": 1.964970824019497e-05, "loss": 0.8499, "step": 1361 }, { "epoch": 0.11207570458753342, "grad_norm": 5.591727704877664, "learning_rate": 1.9649008635976765e-05, "loss": 0.8842, "step": 1362 }, { "epoch": 0.11215799218267847, "grad_norm": 0.5192326170528965, "learning_rate": 1.964830834630842e-05, "loss": 0.5876, "step": 1363 }, { "epoch": 0.1122402797778235, "grad_norm": 0.4703501221850813, "learning_rate": 1.9647607371239678e-05, "loss": 0.5843, "step": 1364 }, { "epoch": 0.11232256737296853, "grad_norm": 4.252531567672639, "learning_rate": 1.964690571082034e-05, "loss": 0.8273, "step": 1365 }, { "epoch": 0.11240485496811356, "grad_norm": 5.191619286934386, "learning_rate": 1.9646203365100243e-05, "loss": 0.8585, "step": 1366 }, { "epoch": 0.11248714256325859, "grad_norm": 4.322950576422303, "learning_rate": 1.964550033412929e-05, "loss": 0.8582, "step": 1367 }, { "epoch": 0.11256943015840362, "grad_norm": 4.108989491540691, "learning_rate": 1.9644796617957418e-05, "loss": 0.8742, "step": 1368 }, { "epoch": 0.11265171775354865, "grad_norm": 3.002075018468358, "learning_rate": 1.9644092216634618e-05, "loss": 0.8378, "step": 1369 }, { "epoch": 0.11273400534869368, "grad_norm": 3.84643687208559, "learning_rate": 1.9643387130210933e-05, "loss": 0.8217, "step": 1370 }, { "epoch": 0.11281629294383871, "grad_norm": 3.504634608794414, "learning_rate": 1.9642681358736446e-05, "loss": 0.8462, "step": 1371 }, { "epoch": 0.11289858053898374, "grad_norm": 8.219632510725962, "learning_rate": 1.9641974902261296e-05, "loss": 0.8589, "step": 1372 }, { "epoch": 0.11298086813412878, "grad_norm": 4.787889769332161, "learning_rate": 1.964126776083567e-05, "loss": 0.8478, "step": 1373 }, { "epoch": 0.1130631557292738, "grad_norm": 3.431865259864041, "learning_rate": 1.96405599345098e-05, "loss": 0.8936, "step": 1374 }, { "epoch": 0.11314544332441884, "grad_norm": 4.790101098374644, "learning_rate": 1.9639851423333973e-05, "loss": 0.8771, "step": 1375 }, { "epoch": 0.11322773091956388, "grad_norm": 30.15921587978486, "learning_rate": 1.9639142227358515e-05, "loss": 0.8205, "step": 1376 }, { "epoch": 0.11331001851470891, "grad_norm": 5.235814790178753, "learning_rate": 1.9638432346633813e-05, "loss": 0.8403, "step": 1377 }, { "epoch": 0.11339230610985394, "grad_norm": 3.8559194687305203, "learning_rate": 1.9637721781210285e-05, "loss": 0.8873, "step": 1378 }, { "epoch": 0.11347459370499897, "grad_norm": 4.562934487291937, "learning_rate": 1.963701053113842e-05, "loss": 0.8147, "step": 1379 }, { "epoch": 0.113556881300144, "grad_norm": 4.762200299495142, "learning_rate": 1.9636298596468734e-05, "loss": 0.8363, "step": 1380 }, { "epoch": 0.11363916889528904, "grad_norm": 3.958688391159092, "learning_rate": 1.9635585977251813e-05, "loss": 0.8677, "step": 1381 }, { "epoch": 0.11372145649043407, "grad_norm": 5.206608356672264, "learning_rate": 1.963487267353827e-05, "loss": 0.8687, "step": 1382 }, { "epoch": 0.1138037440855791, "grad_norm": 0.5547846036648372, "learning_rate": 1.963415868537878e-05, "loss": 0.5501, "step": 1383 }, { "epoch": 0.11388603168072413, "grad_norm": 0.5226904808147554, "learning_rate": 1.9633444012824066e-05, "loss": 0.5864, "step": 1384 }, { "epoch": 0.11396831927586916, "grad_norm": 7.417593071882848, "learning_rate": 1.96327286559249e-05, "loss": 0.8479, "step": 1385 }, { "epoch": 0.11405060687101419, "grad_norm": 4.63099678124493, "learning_rate": 1.963201261473209e-05, "loss": 0.8265, "step": 1386 }, { "epoch": 0.11413289446615922, "grad_norm": 4.481887681345341, "learning_rate": 1.963129588929651e-05, "loss": 0.8559, "step": 1387 }, { "epoch": 0.11421518206130425, "grad_norm": 7.2760595228059, "learning_rate": 1.963057847966907e-05, "loss": 0.8443, "step": 1388 }, { "epoch": 0.1142974696564493, "grad_norm": 4.414691898420469, "learning_rate": 1.962986038590074e-05, "loss": 0.8377, "step": 1389 }, { "epoch": 0.11437975725159433, "grad_norm": 3.410257185708971, "learning_rate": 1.9629141608042527e-05, "loss": 0.8198, "step": 1390 }, { "epoch": 0.11446204484673936, "grad_norm": 5.969142940247697, "learning_rate": 1.9628422146145496e-05, "loss": 0.842, "step": 1391 }, { "epoch": 0.11454433244188439, "grad_norm": 4.102136287601403, "learning_rate": 1.9627702000260755e-05, "loss": 0.8504, "step": 1392 }, { "epoch": 0.11462662003702942, "grad_norm": 4.63289281295388, "learning_rate": 1.962698117043946e-05, "loss": 0.8669, "step": 1393 }, { "epoch": 0.11470890763217445, "grad_norm": 4.412255314793883, "learning_rate": 1.9626259656732816e-05, "loss": 0.8385, "step": 1394 }, { "epoch": 0.11479119522731948, "grad_norm": 3.424254537799306, "learning_rate": 1.962553745919208e-05, "loss": 0.8332, "step": 1395 }, { "epoch": 0.11487348282246451, "grad_norm": 0.7843996001653674, "learning_rate": 1.962481457786856e-05, "loss": 0.5559, "step": 1396 }, { "epoch": 0.11495577041760954, "grad_norm": 6.392616540047419, "learning_rate": 1.9624091012813606e-05, "loss": 0.861, "step": 1397 }, { "epoch": 0.11503805801275457, "grad_norm": 4.375124486829069, "learning_rate": 1.9623366764078616e-05, "loss": 0.8593, "step": 1398 }, { "epoch": 0.1151203456078996, "grad_norm": 4.818446405657229, "learning_rate": 1.962264183171504e-05, "loss": 0.8331, "step": 1399 }, { "epoch": 0.11520263320304464, "grad_norm": 4.703078440427859, "learning_rate": 1.9621916215774382e-05, "loss": 0.8457, "step": 1400 }, { "epoch": 0.11528492079818967, "grad_norm": 4.037026903799907, "learning_rate": 1.9621189916308178e-05, "loss": 0.816, "step": 1401 }, { "epoch": 0.11536720839333471, "grad_norm": 5.119130138779516, "learning_rate": 1.9620462933368033e-05, "loss": 0.8436, "step": 1402 }, { "epoch": 0.11544949598847974, "grad_norm": 4.852298451053651, "learning_rate": 1.961973526700559e-05, "loss": 0.8485, "step": 1403 }, { "epoch": 0.11553178358362477, "grad_norm": 4.92540174605456, "learning_rate": 1.961900691727253e-05, "loss": 0.8549, "step": 1404 }, { "epoch": 0.1156140711787698, "grad_norm": 4.282680446115329, "learning_rate": 1.9618277884220606e-05, "loss": 0.8503, "step": 1405 }, { "epoch": 0.11569635877391483, "grad_norm": 6.16103079429699, "learning_rate": 1.9617548167901606e-05, "loss": 0.8613, "step": 1406 }, { "epoch": 0.11577864636905986, "grad_norm": 0.6928414088371742, "learning_rate": 1.9616817768367362e-05, "loss": 0.579, "step": 1407 }, { "epoch": 0.1158609339642049, "grad_norm": 6.34405553140399, "learning_rate": 1.9616086685669764e-05, "loss": 0.839, "step": 1408 }, { "epoch": 0.11594322155934993, "grad_norm": 4.315757032376313, "learning_rate": 1.9615354919860748e-05, "loss": 0.8458, "step": 1409 }, { "epoch": 0.11602550915449496, "grad_norm": 0.5455750851135943, "learning_rate": 1.961462247099229e-05, "loss": 0.5573, "step": 1410 }, { "epoch": 0.11610779674963999, "grad_norm": 6.034656447100853, "learning_rate": 1.9613889339116436e-05, "loss": 0.8626, "step": 1411 }, { "epoch": 0.11619008434478502, "grad_norm": 7.09081398025819, "learning_rate": 1.9613155524285257e-05, "loss": 0.8381, "step": 1412 }, { "epoch": 0.11627237193993005, "grad_norm": 6.863304280435548, "learning_rate": 1.961242102655088e-05, "loss": 0.8502, "step": 1413 }, { "epoch": 0.11635465953507508, "grad_norm": 10.784533484864859, "learning_rate": 1.961168584596549e-05, "loss": 0.8679, "step": 1414 }, { "epoch": 0.11643694713022013, "grad_norm": 4.470623239984508, "learning_rate": 1.9610949982581305e-05, "loss": 0.8402, "step": 1415 }, { "epoch": 0.11651923472536516, "grad_norm": 5.269519406473394, "learning_rate": 1.9610213436450605e-05, "loss": 0.8472, "step": 1416 }, { "epoch": 0.11660152232051019, "grad_norm": 5.275728737736502, "learning_rate": 1.9609476207625712e-05, "loss": 0.8542, "step": 1417 }, { "epoch": 0.11668380991565522, "grad_norm": 5.631423086351522, "learning_rate": 1.9608738296158997e-05, "loss": 0.8693, "step": 1418 }, { "epoch": 0.11676609751080025, "grad_norm": 4.4364700256265195, "learning_rate": 1.9607999702102882e-05, "loss": 0.852, "step": 1419 }, { "epoch": 0.11684838510594528, "grad_norm": 0.5703836069388057, "learning_rate": 1.9607260425509832e-05, "loss": 0.5766, "step": 1420 }, { "epoch": 0.11693067270109031, "grad_norm": 4.058718090767148, "learning_rate": 1.9606520466432368e-05, "loss": 0.8632, "step": 1421 }, { "epoch": 0.11701296029623534, "grad_norm": 0.5038706574336734, "learning_rate": 1.9605779824923053e-05, "loss": 0.579, "step": 1422 }, { "epoch": 0.11709524789138037, "grad_norm": 0.4777953546804379, "learning_rate": 1.96050385010345e-05, "loss": 0.5404, "step": 1423 }, { "epoch": 0.1171775354865254, "grad_norm": 4.103894856069992, "learning_rate": 1.9604296494819372e-05, "loss": 0.8169, "step": 1424 }, { "epoch": 0.11725982308167043, "grad_norm": 5.3210038004322335, "learning_rate": 1.9603553806330383e-05, "loss": 0.8412, "step": 1425 }, { "epoch": 0.11734211067681546, "grad_norm": 5.333076136990717, "learning_rate": 1.960281043562029e-05, "loss": 0.8566, "step": 1426 }, { "epoch": 0.1174243982719605, "grad_norm": 4.6932345814293965, "learning_rate": 1.96020663827419e-05, "loss": 0.8448, "step": 1427 }, { "epoch": 0.11750668586710554, "grad_norm": 8.408659107060835, "learning_rate": 1.960132164774807e-05, "loss": 0.8552, "step": 1428 }, { "epoch": 0.11758897346225057, "grad_norm": 5.633908689330064, "learning_rate": 1.9600576230691704e-05, "loss": 0.8782, "step": 1429 }, { "epoch": 0.1176712610573956, "grad_norm": 12.574732342527673, "learning_rate": 1.9599830131625763e-05, "loss": 0.8689, "step": 1430 }, { "epoch": 0.11775354865254063, "grad_norm": 0.6380068292188105, "learning_rate": 1.9599083350603237e-05, "loss": 0.5682, "step": 1431 }, { "epoch": 0.11783583624768566, "grad_norm": 5.858612887393994, "learning_rate": 1.959833588767718e-05, "loss": 0.8578, "step": 1432 }, { "epoch": 0.1179181238428307, "grad_norm": 5.579945680423859, "learning_rate": 1.9597587742900693e-05, "loss": 0.8282, "step": 1433 }, { "epoch": 0.11800041143797572, "grad_norm": 6.137557946832193, "learning_rate": 1.9596838916326923e-05, "loss": 0.8397, "step": 1434 }, { "epoch": 0.11808269903312076, "grad_norm": 19.398037303752524, "learning_rate": 1.9596089408009066e-05, "loss": 0.8243, "step": 1435 }, { "epoch": 0.11816498662826579, "grad_norm": 6.767037667437653, "learning_rate": 1.959533921800036e-05, "loss": 0.8431, "step": 1436 }, { "epoch": 0.11824727422341082, "grad_norm": 7.424308817399147, "learning_rate": 1.9594588346354104e-05, "loss": 0.8434, "step": 1437 }, { "epoch": 0.11832956181855585, "grad_norm": 4.930241166882705, "learning_rate": 1.9593836793123637e-05, "loss": 0.8736, "step": 1438 }, { "epoch": 0.11841184941370088, "grad_norm": 11.449117099938325, "learning_rate": 1.9593084558362347e-05, "loss": 0.8572, "step": 1439 }, { "epoch": 0.11849413700884592, "grad_norm": 5.001807158753299, "learning_rate": 1.9592331642123667e-05, "loss": 0.8825, "step": 1440 }, { "epoch": 0.11857642460399095, "grad_norm": 0.5722309140048856, "learning_rate": 1.9591578044461092e-05, "loss": 0.595, "step": 1441 }, { "epoch": 0.11865871219913598, "grad_norm": 4.519860069953814, "learning_rate": 1.959082376542815e-05, "loss": 0.8882, "step": 1442 }, { "epoch": 0.11874099979428102, "grad_norm": 6.76893243360037, "learning_rate": 1.959006880507843e-05, "loss": 0.8414, "step": 1443 }, { "epoch": 0.11882328738942605, "grad_norm": 9.155966381696617, "learning_rate": 1.958931316346556e-05, "loss": 0.8287, "step": 1444 }, { "epoch": 0.11890557498457108, "grad_norm": 5.19033125874358, "learning_rate": 1.9588556840643212e-05, "loss": 0.8754, "step": 1445 }, { "epoch": 0.11898786257971611, "grad_norm": 5.356560891356975, "learning_rate": 1.9587799836665125e-05, "loss": 0.8372, "step": 1446 }, { "epoch": 0.11907015017486114, "grad_norm": 18.341970904215607, "learning_rate": 1.958704215158507e-05, "loss": 0.8482, "step": 1447 }, { "epoch": 0.11915243777000617, "grad_norm": 8.160225573294031, "learning_rate": 1.9586283785456873e-05, "loss": 0.8293, "step": 1448 }, { "epoch": 0.1192347253651512, "grad_norm": 9.809055722956309, "learning_rate": 1.9585524738334408e-05, "loss": 0.8323, "step": 1449 }, { "epoch": 0.11931701296029623, "grad_norm": 6.1116327045703445, "learning_rate": 1.9584765010271593e-05, "loss": 0.8255, "step": 1450 }, { "epoch": 0.11939930055544126, "grad_norm": 5.676397508769702, "learning_rate": 1.9584004601322403e-05, "loss": 0.8729, "step": 1451 }, { "epoch": 0.1194815881505863, "grad_norm": 11.731682837110242, "learning_rate": 1.958324351154085e-05, "loss": 0.8485, "step": 1452 }, { "epoch": 0.11956387574573134, "grad_norm": 5.524439480046927, "learning_rate": 1.9582481740981006e-05, "loss": 0.8268, "step": 1453 }, { "epoch": 0.11964616334087637, "grad_norm": 4.631670905958911, "learning_rate": 1.9581719289696982e-05, "loss": 0.852, "step": 1454 }, { "epoch": 0.1197284509360214, "grad_norm": 8.965374291575179, "learning_rate": 1.9580956157742946e-05, "loss": 0.8404, "step": 1455 }, { "epoch": 0.11981073853116643, "grad_norm": 0.6261324902081594, "learning_rate": 1.958019234517311e-05, "loss": 0.5977, "step": 1456 }, { "epoch": 0.11989302612631146, "grad_norm": 11.465995096047832, "learning_rate": 1.9579427852041726e-05, "loss": 0.8321, "step": 1457 }, { "epoch": 0.11997531372145649, "grad_norm": 6.000409897850183, "learning_rate": 1.957866267840311e-05, "loss": 0.8627, "step": 1458 }, { "epoch": 0.12005760131660152, "grad_norm": 18.843059181510863, "learning_rate": 1.9577896824311614e-05, "loss": 0.8605, "step": 1459 }, { "epoch": 0.12013988891174655, "grad_norm": 0.557334104917724, "learning_rate": 1.9577130289821645e-05, "loss": 0.5944, "step": 1460 }, { "epoch": 0.12022217650689158, "grad_norm": 6.479047097320461, "learning_rate": 1.9576363074987657e-05, "loss": 0.8217, "step": 1461 }, { "epoch": 0.12030446410203662, "grad_norm": 7.966997648213342, "learning_rate": 1.9575595179864152e-05, "loss": 0.8549, "step": 1462 }, { "epoch": 0.12038675169718165, "grad_norm": 5.241930410780214, "learning_rate": 1.957482660450568e-05, "loss": 0.846, "step": 1463 }, { "epoch": 0.12046903929232668, "grad_norm": 4.581892002978914, "learning_rate": 1.9574057348966836e-05, "loss": 0.8455, "step": 1464 }, { "epoch": 0.12055132688747171, "grad_norm": 6.431741301592298, "learning_rate": 1.957328741330227e-05, "loss": 0.8713, "step": 1465 }, { "epoch": 0.12063361448261675, "grad_norm": 6.792937329047407, "learning_rate": 1.9572516797566684e-05, "loss": 0.8235, "step": 1466 }, { "epoch": 0.12071590207776178, "grad_norm": 5.865092979344445, "learning_rate": 1.9571745501814804e-05, "loss": 0.8509, "step": 1467 }, { "epoch": 0.12079818967290681, "grad_norm": 6.617024495241083, "learning_rate": 1.9570973526101436e-05, "loss": 0.8415, "step": 1468 }, { "epoch": 0.12088047726805184, "grad_norm": 6.63634081135842, "learning_rate": 1.9570200870481412e-05, "loss": 0.831, "step": 1469 }, { "epoch": 0.12096276486319688, "grad_norm": 12.04891396249779, "learning_rate": 1.9569427535009628e-05, "loss": 0.8267, "step": 1470 }, { "epoch": 0.1210450524583419, "grad_norm": 10.237297997250556, "learning_rate": 1.956865351974101e-05, "loss": 0.8459, "step": 1471 }, { "epoch": 0.12112734005348694, "grad_norm": 7.236361914133526, "learning_rate": 1.9567878824730555e-05, "loss": 0.8306, "step": 1472 }, { "epoch": 0.12120962764863197, "grad_norm": 10.608567363064175, "learning_rate": 1.9567103450033287e-05, "loss": 0.8419, "step": 1473 }, { "epoch": 0.121291915243777, "grad_norm": 6.2236993124342455, "learning_rate": 1.956632739570429e-05, "loss": 0.8488, "step": 1474 }, { "epoch": 0.12137420283892203, "grad_norm": 6.201916448058059, "learning_rate": 1.9565550661798694e-05, "loss": 0.8447, "step": 1475 }, { "epoch": 0.12145649043406706, "grad_norm": 14.813294651870368, "learning_rate": 1.9564773248371675e-05, "loss": 0.8574, "step": 1476 }, { "epoch": 0.12153877802921209, "grad_norm": 6.638992018546782, "learning_rate": 1.9563995155478465e-05, "loss": 0.8426, "step": 1477 }, { "epoch": 0.12162106562435712, "grad_norm": 9.428113607773538, "learning_rate": 1.9563216383174334e-05, "loss": 0.8247, "step": 1478 }, { "epoch": 0.12170335321950217, "grad_norm": 8.214290390573499, "learning_rate": 1.95624369315146e-05, "loss": 0.8271, "step": 1479 }, { "epoch": 0.1217856408146472, "grad_norm": 8.920576079471006, "learning_rate": 1.9561656800554646e-05, "loss": 0.8576, "step": 1480 }, { "epoch": 0.12186792840979223, "grad_norm": 8.787444339921239, "learning_rate": 1.956087599034988e-05, "loss": 0.8277, "step": 1481 }, { "epoch": 0.12195021600493726, "grad_norm": 0.5502297806463167, "learning_rate": 1.9560094500955776e-05, "loss": 0.6048, "step": 1482 }, { "epoch": 0.12203250360008229, "grad_norm": 11.694419053216938, "learning_rate": 1.9559312332427845e-05, "loss": 0.8305, "step": 1483 }, { "epoch": 0.12211479119522732, "grad_norm": 12.873042218198869, "learning_rate": 1.9558529484821657e-05, "loss": 0.8183, "step": 1484 }, { "epoch": 0.12219707879037235, "grad_norm": 0.46111310227130453, "learning_rate": 1.955774595819282e-05, "loss": 0.5531, "step": 1485 }, { "epoch": 0.12227936638551738, "grad_norm": 0.45642862343281065, "learning_rate": 1.9556961752596996e-05, "loss": 0.5605, "step": 1486 }, { "epoch": 0.12236165398066241, "grad_norm": 0.4551291025310518, "learning_rate": 1.955617686808989e-05, "loss": 0.5594, "step": 1487 }, { "epoch": 0.12244394157580744, "grad_norm": 9.604778819022467, "learning_rate": 1.955539130472727e-05, "loss": 0.8574, "step": 1488 }, { "epoch": 0.12252622917095247, "grad_norm": 7.949165431488096, "learning_rate": 1.9554605062564924e-05, "loss": 0.8115, "step": 1489 }, { "epoch": 0.1226085167660975, "grad_norm": 0.552813848862915, "learning_rate": 1.955381814165872e-05, "loss": 0.5678, "step": 1490 }, { "epoch": 0.12269080436124254, "grad_norm": 25.858842385856658, "learning_rate": 1.955303054206455e-05, "loss": 0.8248, "step": 1491 }, { "epoch": 0.12277309195638758, "grad_norm": 21.311435668873827, "learning_rate": 1.9552242263838373e-05, "loss": 0.8456, "step": 1492 }, { "epoch": 0.12285537955153261, "grad_norm": 12.870319025289076, "learning_rate": 1.9551453307036184e-05, "loss": 0.8608, "step": 1493 }, { "epoch": 0.12293766714667764, "grad_norm": 23.32006316043952, "learning_rate": 1.955066367171402e-05, "loss": 0.8373, "step": 1494 }, { "epoch": 0.12301995474182267, "grad_norm": 9.485367656306368, "learning_rate": 1.954987335792799e-05, "loss": 0.8557, "step": 1495 }, { "epoch": 0.1231022423369677, "grad_norm": 13.860805709310752, "learning_rate": 1.9549082365734223e-05, "loss": 0.8742, "step": 1496 }, { "epoch": 0.12318452993211274, "grad_norm": 0.6621327410930654, "learning_rate": 1.9548290695188922e-05, "loss": 0.5893, "step": 1497 }, { "epoch": 0.12326681752725777, "grad_norm": 0.49494709664955483, "learning_rate": 1.9547498346348316e-05, "loss": 0.5697, "step": 1498 }, { "epoch": 0.1233491051224028, "grad_norm": 11.047657969799024, "learning_rate": 1.9546705319268697e-05, "loss": 0.8373, "step": 1499 }, { "epoch": 0.12343139271754783, "grad_norm": 10.749614937500533, "learning_rate": 1.95459116140064e-05, "loss": 0.8224, "step": 1500 }, { "epoch": 0.12351368031269286, "grad_norm": 9.894337094098182, "learning_rate": 1.954511723061781e-05, "loss": 0.8733, "step": 1501 }, { "epoch": 0.12359596790783789, "grad_norm": 7.20174895173815, "learning_rate": 1.9544322169159356e-05, "loss": 0.8386, "step": 1502 }, { "epoch": 0.12367825550298292, "grad_norm": 17.273897223584264, "learning_rate": 1.954352642968752e-05, "loss": 0.8412, "step": 1503 }, { "epoch": 0.12376054309812795, "grad_norm": 12.15000397023379, "learning_rate": 1.9542730012258827e-05, "loss": 0.8449, "step": 1504 }, { "epoch": 0.123842830693273, "grad_norm": 0.8275745786613676, "learning_rate": 1.9541932916929856e-05, "loss": 0.5836, "step": 1505 }, { "epoch": 0.12392511828841803, "grad_norm": 15.852588331242886, "learning_rate": 1.954113514375723e-05, "loss": 0.8527, "step": 1506 }, { "epoch": 0.12400740588356306, "grad_norm": 13.570552755618733, "learning_rate": 1.9540336692797624e-05, "loss": 0.8706, "step": 1507 }, { "epoch": 0.12408969347870809, "grad_norm": 10.231318756335888, "learning_rate": 1.9539537564107757e-05, "loss": 0.8975, "step": 1508 }, { "epoch": 0.12417198107385312, "grad_norm": 20.140957454242425, "learning_rate": 1.9538737757744397e-05, "loss": 0.8995, "step": 1509 }, { "epoch": 0.12425426866899815, "grad_norm": 16.907927618041068, "learning_rate": 1.953793727376436e-05, "loss": 0.8719, "step": 1510 }, { "epoch": 0.12433655626414318, "grad_norm": 9.640180209086472, "learning_rate": 1.9537136112224515e-05, "loss": 0.8524, "step": 1511 }, { "epoch": 0.12441884385928821, "grad_norm": 1.0609910785257508, "learning_rate": 1.9536334273181774e-05, "loss": 0.5868, "step": 1512 }, { "epoch": 0.12450113145443324, "grad_norm": 0.5945416770474524, "learning_rate": 1.9535531756693093e-05, "loss": 0.5585, "step": 1513 }, { "epoch": 0.12458341904957827, "grad_norm": 14.737935841719995, "learning_rate": 1.953472856281549e-05, "loss": 0.8653, "step": 1514 }, { "epoch": 0.1246657066447233, "grad_norm": 18.226124842762093, "learning_rate": 1.9533924691606015e-05, "loss": 0.8444, "step": 1515 }, { "epoch": 0.12474799423986833, "grad_norm": 11.981069540475804, "learning_rate": 1.953312014312178e-05, "loss": 0.8725, "step": 1516 }, { "epoch": 0.12483028183501337, "grad_norm": 17.168765421546752, "learning_rate": 1.9532314917419936e-05, "loss": 0.9122, "step": 1517 }, { "epoch": 0.12491256943015841, "grad_norm": 10.036564112856656, "learning_rate": 1.9531509014557683e-05, "loss": 0.9176, "step": 1518 }, { "epoch": 0.12499485702530344, "grad_norm": 20.07946819855691, "learning_rate": 1.9530702434592274e-05, "loss": 0.8879, "step": 1519 }, { "epoch": 0.12507714462044847, "grad_norm": 29.057862151469628, "learning_rate": 1.9529895177581007e-05, "loss": 0.8805, "step": 1520 }, { "epoch": 0.1251594322155935, "grad_norm": 21.321561085033913, "learning_rate": 1.9529087243581228e-05, "loss": 0.8783, "step": 1521 }, { "epoch": 0.12524171981073853, "grad_norm": 10.669113063471517, "learning_rate": 1.9528278632650325e-05, "loss": 0.8843, "step": 1522 }, { "epoch": 0.12532400740588356, "grad_norm": 15.487682058293332, "learning_rate": 1.9527469344845752e-05, "loss": 0.8621, "step": 1523 }, { "epoch": 0.1254062950010286, "grad_norm": 22.91067214478477, "learning_rate": 1.9526659380224994e-05, "loss": 0.871, "step": 1524 }, { "epoch": 0.12548858259617363, "grad_norm": 18.78756629418322, "learning_rate": 1.9525848738845586e-05, "loss": 0.8383, "step": 1525 }, { "epoch": 0.12557087019131866, "grad_norm": 1.826156850833794, "learning_rate": 1.952503742076512e-05, "loss": 0.6692, "step": 1526 }, { "epoch": 0.1256531577864637, "grad_norm": 10.01852490231935, "learning_rate": 1.9524225426041225e-05, "loss": 0.8774, "step": 1527 }, { "epoch": 0.12573544538160872, "grad_norm": 9.799823932923529, "learning_rate": 1.9523412754731594e-05, "loss": 0.8553, "step": 1528 }, { "epoch": 0.12581773297675375, "grad_norm": 10.243780274009985, "learning_rate": 1.9522599406893946e-05, "loss": 0.8825, "step": 1529 }, { "epoch": 0.12590002057189878, "grad_norm": 0.5629974857686041, "learning_rate": 1.952178538258607e-05, "loss": 0.582, "step": 1530 }, { "epoch": 0.1259823081670438, "grad_norm": 11.36979045705732, "learning_rate": 1.9520970681865784e-05, "loss": 0.8768, "step": 1531 }, { "epoch": 0.12606459576218884, "grad_norm": 10.058423558398582, "learning_rate": 1.9520155304790966e-05, "loss": 0.8548, "step": 1532 }, { "epoch": 0.12614688335733387, "grad_norm": 8.76973647154349, "learning_rate": 1.9519339251419546e-05, "loss": 0.8628, "step": 1533 }, { "epoch": 0.1262291709524789, "grad_norm": 19.24178771102118, "learning_rate": 1.9518522521809483e-05, "loss": 0.8974, "step": 1534 }, { "epoch": 0.12631145854762393, "grad_norm": 0.8687538872056417, "learning_rate": 1.951770511601881e-05, "loss": 0.6081, "step": 1535 }, { "epoch": 0.12639374614276896, "grad_norm": 18.208746524258896, "learning_rate": 1.9516887034105582e-05, "loss": 0.8705, "step": 1536 }, { "epoch": 0.12647603373791402, "grad_norm": 0.5991121850056094, "learning_rate": 1.951606827612792e-05, "loss": 0.5812, "step": 1537 }, { "epoch": 0.12655832133305905, "grad_norm": 11.188990497417041, "learning_rate": 1.9515248842143985e-05, "loss": 0.8493, "step": 1538 }, { "epoch": 0.12664060892820408, "grad_norm": 12.13754210039802, "learning_rate": 1.951442873221199e-05, "loss": 0.8539, "step": 1539 }, { "epoch": 0.12672289652334912, "grad_norm": 0.6371445875057375, "learning_rate": 1.9513607946390198e-05, "loss": 0.5771, "step": 1540 }, { "epoch": 0.12680518411849415, "grad_norm": 38.38383515449128, "learning_rate": 1.9512786484736907e-05, "loss": 0.8685, "step": 1541 }, { "epoch": 0.12688747171363918, "grad_norm": 12.647804021527618, "learning_rate": 1.951196434731048e-05, "loss": 0.8547, "step": 1542 }, { "epoch": 0.1269697593087842, "grad_norm": 11.663684157984594, "learning_rate": 1.951114153416932e-05, "loss": 0.8518, "step": 1543 }, { "epoch": 0.12705204690392924, "grad_norm": 14.698290238642006, "learning_rate": 1.9510318045371873e-05, "loss": 0.8299, "step": 1544 }, { "epoch": 0.12713433449907427, "grad_norm": 18.836062665500368, "learning_rate": 1.9509493880976645e-05, "loss": 0.8649, "step": 1545 }, { "epoch": 0.1272166220942193, "grad_norm": 37.604495524678796, "learning_rate": 1.9508669041042175e-05, "loss": 0.8771, "step": 1546 }, { "epoch": 0.12729890968936433, "grad_norm": 16.242906985077767, "learning_rate": 1.950784352562707e-05, "loss": 0.8559, "step": 1547 }, { "epoch": 0.12738119728450936, "grad_norm": 12.303960320937623, "learning_rate": 1.950701733478996e-05, "loss": 0.8269, "step": 1548 }, { "epoch": 0.1274634848796544, "grad_norm": 15.582543893350989, "learning_rate": 1.9506190468589542e-05, "loss": 0.8547, "step": 1549 }, { "epoch": 0.12754577247479942, "grad_norm": 12.36757962681498, "learning_rate": 1.950536292708456e-05, "loss": 0.854, "step": 1550 }, { "epoch": 0.12762806006994445, "grad_norm": 11.80476370196413, "learning_rate": 1.9504534710333795e-05, "loss": 0.8952, "step": 1551 }, { "epoch": 0.12771034766508949, "grad_norm": 30.303720277928623, "learning_rate": 1.950370581839609e-05, "loss": 0.8396, "step": 1552 }, { "epoch": 0.12779263526023452, "grad_norm": 9.097984679042607, "learning_rate": 1.9502876251330315e-05, "loss": 0.8424, "step": 1553 }, { "epoch": 0.12787492285537955, "grad_norm": 16.918561440988324, "learning_rate": 1.9502046009195413e-05, "loss": 0.8429, "step": 1554 }, { "epoch": 0.12795721045052458, "grad_norm": 13.858699886134135, "learning_rate": 1.9501215092050357e-05, "loss": 0.8222, "step": 1555 }, { "epoch": 0.1280394980456696, "grad_norm": 0.7950557504894921, "learning_rate": 1.9500383499954178e-05, "loss": 0.5902, "step": 1556 }, { "epoch": 0.12812178564081464, "grad_norm": 24.803535791656607, "learning_rate": 1.9499551232965948e-05, "loss": 0.8334, "step": 1557 }, { "epoch": 0.12820407323595967, "grad_norm": 11.828695261044466, "learning_rate": 1.949871829114479e-05, "loss": 0.8758, "step": 1558 }, { "epoch": 0.1282863608311047, "grad_norm": 12.441875233794036, "learning_rate": 1.9497884674549875e-05, "loss": 0.8732, "step": 1559 }, { "epoch": 0.12836864842624973, "grad_norm": 28.468422713163175, "learning_rate": 1.9497050383240423e-05, "loss": 0.8573, "step": 1560 }, { "epoch": 0.12845093602139476, "grad_norm": 17.260133565508898, "learning_rate": 1.94962154172757e-05, "loss": 0.8569, "step": 1561 }, { "epoch": 0.1285332236165398, "grad_norm": 17.78196970160732, "learning_rate": 1.949537977671502e-05, "loss": 0.8392, "step": 1562 }, { "epoch": 0.12861551121168485, "grad_norm": 11.158957006384213, "learning_rate": 1.949454346161775e-05, "loss": 0.8347, "step": 1563 }, { "epoch": 0.12869779880682988, "grad_norm": 17.498878506507015, "learning_rate": 1.949370647204329e-05, "loss": 0.8809, "step": 1564 }, { "epoch": 0.1287800864019749, "grad_norm": 7.9176045197846205, "learning_rate": 1.9492868808051112e-05, "loss": 0.8456, "step": 1565 }, { "epoch": 0.12886237399711994, "grad_norm": 20.166846846916215, "learning_rate": 1.9492030469700712e-05, "loss": 0.832, "step": 1566 }, { "epoch": 0.12894466159226498, "grad_norm": 19.174742273691734, "learning_rate": 1.9491191457051646e-05, "loss": 0.8443, "step": 1567 }, { "epoch": 0.12902694918741, "grad_norm": 21.32891181756379, "learning_rate": 1.9490351770163523e-05, "loss": 0.8464, "step": 1568 }, { "epoch": 0.12910923678255504, "grad_norm": 25.5931955657056, "learning_rate": 1.9489511409095982e-05, "loss": 0.8524, "step": 1569 }, { "epoch": 0.12919152437770007, "grad_norm": 6.885350368161633, "learning_rate": 1.9488670373908732e-05, "loss": 0.8566, "step": 1570 }, { "epoch": 0.1292738119728451, "grad_norm": 0.8064509572258421, "learning_rate": 1.948782866466151e-05, "loss": 0.5833, "step": 1571 }, { "epoch": 0.12935609956799013, "grad_norm": 0.6437332857165583, "learning_rate": 1.9486986281414113e-05, "loss": 0.591, "step": 1572 }, { "epoch": 0.12943838716313516, "grad_norm": 8.783260476561136, "learning_rate": 1.9486143224226386e-05, "loss": 0.8232, "step": 1573 }, { "epoch": 0.1295206747582802, "grad_norm": 27.307179915869348, "learning_rate": 1.9485299493158213e-05, "loss": 0.8362, "step": 1574 }, { "epoch": 0.12960296235342522, "grad_norm": 14.887699977158707, "learning_rate": 1.948445508826953e-05, "loss": 0.8774, "step": 1575 }, { "epoch": 0.12968524994857025, "grad_norm": 8.94111905597464, "learning_rate": 1.948361000962033e-05, "loss": 0.8493, "step": 1576 }, { "epoch": 0.12976753754371528, "grad_norm": 11.638077472882795, "learning_rate": 1.9482764257270643e-05, "loss": 0.8597, "step": 1577 }, { "epoch": 0.12984982513886031, "grad_norm": 9.045639740647525, "learning_rate": 1.9481917831280547e-05, "loss": 0.8523, "step": 1578 }, { "epoch": 0.12993211273400535, "grad_norm": 13.046344860495239, "learning_rate": 1.948107073171017e-05, "loss": 0.8627, "step": 1579 }, { "epoch": 0.13001440032915038, "grad_norm": 14.174127420393626, "learning_rate": 1.9480222958619696e-05, "loss": 0.8322, "step": 1580 }, { "epoch": 0.1300966879242954, "grad_norm": 7.026950552196628, "learning_rate": 1.947937451206934e-05, "loss": 0.86, "step": 1581 }, { "epoch": 0.13017897551944044, "grad_norm": 1.335200372111077, "learning_rate": 1.947852539211938e-05, "loss": 0.6371, "step": 1582 }, { "epoch": 0.13026126311458547, "grad_norm": 12.298240422318646, "learning_rate": 1.9477675598830135e-05, "loss": 0.8637, "step": 1583 }, { "epoch": 0.1303435507097305, "grad_norm": 11.297299148156007, "learning_rate": 1.947682513226197e-05, "loss": 0.8403, "step": 1584 }, { "epoch": 0.13042583830487553, "grad_norm": 11.722864354186749, "learning_rate": 1.947597399247531e-05, "loss": 0.8774, "step": 1585 }, { "epoch": 0.13050812590002056, "grad_norm": 0.6703589211407367, "learning_rate": 1.9475122179530608e-05, "loss": 0.5872, "step": 1586 }, { "epoch": 0.1305904134951656, "grad_norm": 7.201572575314816, "learning_rate": 1.947426969348838e-05, "loss": 0.8333, "step": 1587 }, { "epoch": 0.13067270109031062, "grad_norm": 8.51219810707983, "learning_rate": 1.9473416534409183e-05, "loss": 0.8692, "step": 1588 }, { "epoch": 0.13075498868545568, "grad_norm": 12.72665901538907, "learning_rate": 1.9472562702353628e-05, "loss": 0.8353, "step": 1589 }, { "epoch": 0.1308372762806007, "grad_norm": 8.440240359174169, "learning_rate": 1.9471708197382367e-05, "loss": 0.8447, "step": 1590 }, { "epoch": 0.13091956387574574, "grad_norm": 18.794459104702618, "learning_rate": 1.9470853019556105e-05, "loss": 0.8514, "step": 1591 }, { "epoch": 0.13100185147089077, "grad_norm": 15.204805304916004, "learning_rate": 1.946999716893559e-05, "loss": 0.8918, "step": 1592 }, { "epoch": 0.1310841390660358, "grad_norm": 20.447557635429625, "learning_rate": 1.946914064558162e-05, "loss": 0.8291, "step": 1593 }, { "epoch": 0.13116642666118083, "grad_norm": 0.8902410259266276, "learning_rate": 1.9468283449555044e-05, "loss": 0.5855, "step": 1594 }, { "epoch": 0.13124871425632587, "grad_norm": 16.32691590599941, "learning_rate": 1.946742558091675e-05, "loss": 0.8729, "step": 1595 }, { "epoch": 0.1313310018514709, "grad_norm": 12.629234690471154, "learning_rate": 1.946656703972769e-05, "loss": 0.8502, "step": 1596 }, { "epoch": 0.13141328944661593, "grad_norm": 11.942150001738323, "learning_rate": 1.946570782604884e-05, "loss": 0.8488, "step": 1597 }, { "epoch": 0.13149557704176096, "grad_norm": 18.66420798538323, "learning_rate": 1.9464847939941253e-05, "loss": 0.8415, "step": 1598 }, { "epoch": 0.131577864636906, "grad_norm": 0.5487160534091807, "learning_rate": 1.9463987381465997e-05, "loss": 0.5807, "step": 1599 }, { "epoch": 0.13166015223205102, "grad_norm": 0.47378948261483483, "learning_rate": 1.9463126150684215e-05, "loss": 0.5611, "step": 1600 }, { "epoch": 0.13174243982719605, "grad_norm": 11.826166345708458, "learning_rate": 1.946226424765709e-05, "loss": 0.8621, "step": 1601 }, { "epoch": 0.13182472742234108, "grad_norm": 15.67907329393183, "learning_rate": 1.946140167244584e-05, "loss": 0.8326, "step": 1602 }, { "epoch": 0.1319070150174861, "grad_norm": 13.265461124752493, "learning_rate": 1.9460538425111747e-05, "loss": 0.8491, "step": 1603 }, { "epoch": 0.13198930261263114, "grad_norm": 21.302704279466422, "learning_rate": 1.9459674505716134e-05, "loss": 0.8538, "step": 1604 }, { "epoch": 0.13207159020777617, "grad_norm": 0.8219538162066992, "learning_rate": 1.9458809914320376e-05, "loss": 0.6139, "step": 1605 }, { "epoch": 0.1321538778029212, "grad_norm": 14.037933923062655, "learning_rate": 1.9457944650985883e-05, "loss": 0.825, "step": 1606 }, { "epoch": 0.13223616539806624, "grad_norm": 16.51723934307185, "learning_rate": 1.9457078715774137e-05, "loss": 0.8398, "step": 1607 }, { "epoch": 0.13231845299321127, "grad_norm": 14.88358516004508, "learning_rate": 1.9456212108746638e-05, "loss": 0.8464, "step": 1608 }, { "epoch": 0.1324007405883563, "grad_norm": 30.189032243968434, "learning_rate": 1.9455344829964952e-05, "loss": 0.8403, "step": 1609 }, { "epoch": 0.13248302818350133, "grad_norm": 12.53490946521001, "learning_rate": 1.945447687949069e-05, "loss": 0.8593, "step": 1610 }, { "epoch": 0.13256531577864636, "grad_norm": 15.356236880805202, "learning_rate": 1.9453608257385515e-05, "loss": 0.8752, "step": 1611 }, { "epoch": 0.1326476033737914, "grad_norm": 30.490886482179693, "learning_rate": 1.9452738963711127e-05, "loss": 0.8433, "step": 1612 }, { "epoch": 0.13272989096893642, "grad_norm": 13.432846768419989, "learning_rate": 1.945186899852928e-05, "loss": 0.8632, "step": 1613 }, { "epoch": 0.13281217856408145, "grad_norm": 15.210857748618107, "learning_rate": 1.9450998361901778e-05, "loss": 0.8299, "step": 1614 }, { "epoch": 0.1328944661592265, "grad_norm": 9.248268507271018, "learning_rate": 1.945012705389046e-05, "loss": 0.8795, "step": 1615 }, { "epoch": 0.13297675375437154, "grad_norm": 13.312884263474453, "learning_rate": 1.9449255074557233e-05, "loss": 0.8447, "step": 1616 }, { "epoch": 0.13305904134951657, "grad_norm": 0.6387822773814731, "learning_rate": 1.9448382423964038e-05, "loss": 0.5803, "step": 1617 }, { "epoch": 0.1331413289446616, "grad_norm": 8.987025451840744, "learning_rate": 1.944750910217287e-05, "loss": 0.8486, "step": 1618 }, { "epoch": 0.13322361653980663, "grad_norm": 15.860555119958025, "learning_rate": 1.944663510924576e-05, "loss": 0.8429, "step": 1619 }, { "epoch": 0.13330590413495166, "grad_norm": 15.755917461958926, "learning_rate": 1.94457604452448e-05, "loss": 0.8648, "step": 1620 }, { "epoch": 0.1333881917300967, "grad_norm": 9.495081106148424, "learning_rate": 1.9444885110232122e-05, "loss": 0.8511, "step": 1621 }, { "epoch": 0.13347047932524173, "grad_norm": 9.228948535480242, "learning_rate": 1.9444009104269912e-05, "loss": 0.8903, "step": 1622 }, { "epoch": 0.13355276692038676, "grad_norm": 0.6835320058663733, "learning_rate": 1.9443132427420402e-05, "loss": 0.5648, "step": 1623 }, { "epoch": 0.1336350545155318, "grad_norm": 7.512489044480011, "learning_rate": 1.944225507974586e-05, "loss": 0.8648, "step": 1624 }, { "epoch": 0.13371734211067682, "grad_norm": 10.385149112147042, "learning_rate": 1.9441377061308625e-05, "loss": 0.8467, "step": 1625 }, { "epoch": 0.13379962970582185, "grad_norm": 0.628760629783465, "learning_rate": 1.9440498372171057e-05, "loss": 0.5519, "step": 1626 }, { "epoch": 0.13388191730096688, "grad_norm": 8.0862924915863, "learning_rate": 1.9439619012395587e-05, "loss": 0.8742, "step": 1627 }, { "epoch": 0.1339642048961119, "grad_norm": 9.007382090603102, "learning_rate": 1.9438738982044678e-05, "loss": 0.8313, "step": 1628 }, { "epoch": 0.13404649249125694, "grad_norm": 15.188130778601375, "learning_rate": 1.9437858281180845e-05, "loss": 0.8616, "step": 1629 }, { "epoch": 0.13412878008640197, "grad_norm": 8.695715161522436, "learning_rate": 1.9436976909866652e-05, "loss": 0.8798, "step": 1630 }, { "epoch": 0.134211067681547, "grad_norm": 8.332586114090097, "learning_rate": 1.9436094868164714e-05, "loss": 0.8304, "step": 1631 }, { "epoch": 0.13429335527669203, "grad_norm": 6.131546976942165, "learning_rate": 1.943521215613769e-05, "loss": 0.8323, "step": 1632 }, { "epoch": 0.13437564287183706, "grad_norm": 7.75079357067481, "learning_rate": 1.9434328773848275e-05, "loss": 0.8761, "step": 1633 }, { "epoch": 0.1344579304669821, "grad_norm": 9.591056911031714, "learning_rate": 1.943344472135924e-05, "loss": 0.8391, "step": 1634 }, { "epoch": 0.13454021806212713, "grad_norm": 0.5571523026882707, "learning_rate": 1.943255999873338e-05, "loss": 0.5626, "step": 1635 }, { "epoch": 0.13462250565727216, "grad_norm": 7.180341879101182, "learning_rate": 1.9431674606033535e-05, "loss": 0.8573, "step": 1636 }, { "epoch": 0.1347047932524172, "grad_norm": 6.297703126019438, "learning_rate": 1.9430788543322614e-05, "loss": 0.8645, "step": 1637 }, { "epoch": 0.13478708084756222, "grad_norm": 13.328252581658347, "learning_rate": 1.942990181066356e-05, "loss": 0.841, "step": 1638 }, { "epoch": 0.13486936844270725, "grad_norm": 9.478995566997952, "learning_rate": 1.9429014408119354e-05, "loss": 0.843, "step": 1639 }, { "epoch": 0.13495165603785228, "grad_norm": 10.30497161424214, "learning_rate": 1.942812633575305e-05, "loss": 0.8561, "step": 1640 }, { "epoch": 0.13503394363299734, "grad_norm": 10.783364375169487, "learning_rate": 1.9427237593627727e-05, "loss": 0.8702, "step": 1641 }, { "epoch": 0.13511623122814237, "grad_norm": 5.936021952203429, "learning_rate": 1.9426348181806527e-05, "loss": 0.8651, "step": 1642 }, { "epoch": 0.1351985188232874, "grad_norm": 6.865865248520576, "learning_rate": 1.9425458100352622e-05, "loss": 0.8369, "step": 1643 }, { "epoch": 0.13528080641843243, "grad_norm": 9.25539230502882, "learning_rate": 1.942456734932925e-05, "loss": 0.8614, "step": 1644 }, { "epoch": 0.13536309401357746, "grad_norm": 6.449494273747695, "learning_rate": 1.9423675928799684e-05, "loss": 0.8495, "step": 1645 }, { "epoch": 0.1354453816087225, "grad_norm": 9.981428791114054, "learning_rate": 1.942278383882725e-05, "loss": 0.8203, "step": 1646 }, { "epoch": 0.13552766920386752, "grad_norm": 11.905054324962366, "learning_rate": 1.9421891079475323e-05, "loss": 0.8565, "step": 1647 }, { "epoch": 0.13560995679901255, "grad_norm": 5.099205899457276, "learning_rate": 1.9420997650807324e-05, "loss": 0.8462, "step": 1648 }, { "epoch": 0.13569224439415759, "grad_norm": 7.551077756646788, "learning_rate": 1.9420103552886718e-05, "loss": 0.8278, "step": 1649 }, { "epoch": 0.13577453198930262, "grad_norm": 9.34742160721662, "learning_rate": 1.941920878577702e-05, "loss": 0.855, "step": 1650 }, { "epoch": 0.13585681958444765, "grad_norm": 14.953328811805543, "learning_rate": 1.9418313349541792e-05, "loss": 0.8555, "step": 1651 }, { "epoch": 0.13593910717959268, "grad_norm": 19.65415961612349, "learning_rate": 1.9417417244244645e-05, "loss": 0.8416, "step": 1652 }, { "epoch": 0.1360213947747377, "grad_norm": 6.723089668465277, "learning_rate": 1.9416520469949242e-05, "loss": 0.8485, "step": 1653 }, { "epoch": 0.13610368236988274, "grad_norm": 9.685387477166035, "learning_rate": 1.9415623026719282e-05, "loss": 0.8333, "step": 1654 }, { "epoch": 0.13618596996502777, "grad_norm": 6.442031893344583, "learning_rate": 1.941472491461852e-05, "loss": 0.8313, "step": 1655 }, { "epoch": 0.1362682575601728, "grad_norm": 11.978462205062288, "learning_rate": 1.941382613371076e-05, "loss": 0.8356, "step": 1656 }, { "epoch": 0.13635054515531783, "grad_norm": 8.959450516597315, "learning_rate": 1.9412926684059844e-05, "loss": 0.8299, "step": 1657 }, { "epoch": 0.13643283275046286, "grad_norm": 8.564749166275782, "learning_rate": 1.9412026565729668e-05, "loss": 0.8062, "step": 1658 }, { "epoch": 0.1365151203456079, "grad_norm": 0.48110518320018675, "learning_rate": 1.941112577878418e-05, "loss": 0.5745, "step": 1659 }, { "epoch": 0.13659740794075292, "grad_norm": 11.492434132872301, "learning_rate": 1.9410224323287368e-05, "loss": 0.8564, "step": 1660 }, { "epoch": 0.13667969553589795, "grad_norm": 7.0096165345110455, "learning_rate": 1.9409322199303265e-05, "loss": 0.8245, "step": 1661 }, { "epoch": 0.13676198313104299, "grad_norm": 7.244034742758887, "learning_rate": 1.9408419406895963e-05, "loss": 0.8423, "step": 1662 }, { "epoch": 0.13684427072618802, "grad_norm": 8.758950640082848, "learning_rate": 1.9407515946129596e-05, "loss": 0.8159, "step": 1663 }, { "epoch": 0.13692655832133305, "grad_norm": 11.355355331778188, "learning_rate": 1.9406611817068342e-05, "loss": 0.8395, "step": 1664 }, { "epoch": 0.13700884591647808, "grad_norm": 21.457605657742324, "learning_rate": 1.9405707019776426e-05, "loss": 0.8284, "step": 1665 }, { "epoch": 0.13709113351162314, "grad_norm": 9.208046052928568, "learning_rate": 1.9404801554318124e-05, "loss": 0.8354, "step": 1666 }, { "epoch": 0.13717342110676817, "grad_norm": 11.164511842285489, "learning_rate": 1.940389542075776e-05, "loss": 0.8446, "step": 1667 }, { "epoch": 0.1372557087019132, "grad_norm": 7.698644554716207, "learning_rate": 1.9402988619159706e-05, "loss": 0.8295, "step": 1668 }, { "epoch": 0.13733799629705823, "grad_norm": 0.49519598486863875, "learning_rate": 1.940208114958838e-05, "loss": 0.5877, "step": 1669 }, { "epoch": 0.13742028389220326, "grad_norm": 10.405215695876064, "learning_rate": 1.9401173012108244e-05, "loss": 0.858, "step": 1670 }, { "epoch": 0.1375025714873483, "grad_norm": 6.274166537890226, "learning_rate": 1.940026420678381e-05, "loss": 0.8495, "step": 1671 }, { "epoch": 0.13758485908249332, "grad_norm": 8.247076915562847, "learning_rate": 1.9399354733679644e-05, "loss": 0.8241, "step": 1672 }, { "epoch": 0.13766714667763835, "grad_norm": 9.44348381745056, "learning_rate": 1.9398444592860346e-05, "loss": 0.8807, "step": 1673 }, { "epoch": 0.13774943427278338, "grad_norm": 0.46561126197328223, "learning_rate": 1.9397533784390577e-05, "loss": 0.5663, "step": 1674 }, { "epoch": 0.13783172186792841, "grad_norm": 0.444368069087187, "learning_rate": 1.939662230833504e-05, "loss": 0.5139, "step": 1675 }, { "epoch": 0.13791400946307344, "grad_norm": 7.357820789657159, "learning_rate": 1.9395710164758478e-05, "loss": 0.7818, "step": 1676 }, { "epoch": 0.13799629705821848, "grad_norm": 6.9252347936815974, "learning_rate": 1.9394797353725693e-05, "loss": 0.8313, "step": 1677 }, { "epoch": 0.1380785846533635, "grad_norm": 6.812431247075556, "learning_rate": 1.9393883875301528e-05, "loss": 0.7983, "step": 1678 }, { "epoch": 0.13816087224850854, "grad_norm": 15.635575837691977, "learning_rate": 1.9392969729550874e-05, "loss": 0.8385, "step": 1679 }, { "epoch": 0.13824315984365357, "grad_norm": 7.966855868898477, "learning_rate": 1.9392054916538676e-05, "loss": 0.8513, "step": 1680 }, { "epoch": 0.1383254474387986, "grad_norm": 0.5191616103239319, "learning_rate": 1.939113943632992e-05, "loss": 0.5545, "step": 1681 }, { "epoch": 0.13840773503394363, "grad_norm": 9.268850841092327, "learning_rate": 1.939022328898963e-05, "loss": 0.8638, "step": 1682 }, { "epoch": 0.13849002262908866, "grad_norm": 6.2611125201139055, "learning_rate": 1.9389306474582898e-05, "loss": 0.8328, "step": 1683 }, { "epoch": 0.1385723102242337, "grad_norm": 18.750042480887604, "learning_rate": 1.938838899317485e-05, "loss": 0.8319, "step": 1684 }, { "epoch": 0.13865459781937872, "grad_norm": 9.045445107488831, "learning_rate": 1.9387470844830663e-05, "loss": 0.8632, "step": 1685 }, { "epoch": 0.13873688541452375, "grad_norm": 10.10333245390392, "learning_rate": 1.938655202961556e-05, "loss": 0.8402, "step": 1686 }, { "epoch": 0.13881917300966878, "grad_norm": 14.37009496922347, "learning_rate": 1.938563254759481e-05, "loss": 0.842, "step": 1687 }, { "epoch": 0.13890146060481381, "grad_norm": 4.62456351377046, "learning_rate": 1.9384712398833737e-05, "loss": 0.8379, "step": 1688 }, { "epoch": 0.13898374819995885, "grad_norm": 5.48808023393408, "learning_rate": 1.9383791583397704e-05, "loss": 0.8401, "step": 1689 }, { "epoch": 0.13906603579510388, "grad_norm": 0.508054869823019, "learning_rate": 1.9382870101352122e-05, "loss": 0.5855, "step": 1690 }, { "epoch": 0.1391483233902489, "grad_norm": 0.45467031682961434, "learning_rate": 1.9381947952762456e-05, "loss": 0.5582, "step": 1691 }, { "epoch": 0.13923061098539397, "grad_norm": 4.698170093241064, "learning_rate": 1.9381025137694213e-05, "loss": 0.8488, "step": 1692 }, { "epoch": 0.139312898580539, "grad_norm": 17.925934835917072, "learning_rate": 1.9380101656212942e-05, "loss": 0.8374, "step": 1693 }, { "epoch": 0.13939518617568403, "grad_norm": 21.74128240936288, "learning_rate": 1.937917750838425e-05, "loss": 0.835, "step": 1694 }, { "epoch": 0.13947747377082906, "grad_norm": 0.5455463949858089, "learning_rate": 1.9378252694273793e-05, "loss": 0.5776, "step": 1695 }, { "epoch": 0.1395597613659741, "grad_norm": 7.021491632670848, "learning_rate": 1.937732721394726e-05, "loss": 0.8338, "step": 1696 }, { "epoch": 0.13964204896111912, "grad_norm": 0.47906268792032236, "learning_rate": 1.93764010674704e-05, "loss": 0.5718, "step": 1697 }, { "epoch": 0.13972433655626415, "grad_norm": 9.18921205759664, "learning_rate": 1.9375474254909002e-05, "loss": 0.8374, "step": 1698 }, { "epoch": 0.13980662415140918, "grad_norm": 6.4706598091478424, "learning_rate": 1.9374546776328906e-05, "loss": 0.8371, "step": 1699 }, { "epoch": 0.1398889117465542, "grad_norm": 11.019615591253086, "learning_rate": 1.9373618631796e-05, "loss": 0.8192, "step": 1700 }, { "epoch": 0.13997119934169924, "grad_norm": 6.695489764575939, "learning_rate": 1.937268982137622e-05, "loss": 0.7922, "step": 1701 }, { "epoch": 0.14005348693684427, "grad_norm": 7.916432098380521, "learning_rate": 1.937176034513554e-05, "loss": 0.8493, "step": 1702 }, { "epoch": 0.1401357745319893, "grad_norm": 5.689792227478553, "learning_rate": 1.9370830203139998e-05, "loss": 0.8239, "step": 1703 }, { "epoch": 0.14021806212713434, "grad_norm": 4.746618108561176, "learning_rate": 1.936989939545566e-05, "loss": 0.8285, "step": 1704 }, { "epoch": 0.14030034972227937, "grad_norm": 4.051544179948662, "learning_rate": 1.936896792214866e-05, "loss": 0.8394, "step": 1705 }, { "epoch": 0.1403826373174244, "grad_norm": 5.5482681923761525, "learning_rate": 1.9368035783285157e-05, "loss": 0.8484, "step": 1706 }, { "epoch": 0.14046492491256943, "grad_norm": 11.365762748018584, "learning_rate": 1.9367102978931375e-05, "loss": 0.8805, "step": 1707 }, { "epoch": 0.14054721250771446, "grad_norm": 0.5814669329358698, "learning_rate": 1.9366169509153578e-05, "loss": 0.5721, "step": 1708 }, { "epoch": 0.1406295001028595, "grad_norm": 8.761055022275535, "learning_rate": 1.936523537401808e-05, "loss": 0.8048, "step": 1709 }, { "epoch": 0.14071178769800452, "grad_norm": 4.94096986820974, "learning_rate": 1.9364300573591234e-05, "loss": 0.8751, "step": 1710 }, { "epoch": 0.14079407529314955, "grad_norm": 8.95653472365129, "learning_rate": 1.9363365107939454e-05, "loss": 0.8411, "step": 1711 }, { "epoch": 0.14087636288829458, "grad_norm": 5.431161332386783, "learning_rate": 1.936242897712919e-05, "loss": 0.8469, "step": 1712 }, { "epoch": 0.1409586504834396, "grad_norm": 9.867753201582625, "learning_rate": 1.9361492181226947e-05, "loss": 0.8505, "step": 1713 }, { "epoch": 0.14104093807858464, "grad_norm": 5.136338927889941, "learning_rate": 1.936055472029927e-05, "loss": 0.8538, "step": 1714 }, { "epoch": 0.14112322567372967, "grad_norm": 8.355395875659436, "learning_rate": 1.9359616594412754e-05, "loss": 0.8068, "step": 1715 }, { "epoch": 0.1412055132688747, "grad_norm": 5.819359443189252, "learning_rate": 1.9358677803634044e-05, "loss": 0.8652, "step": 1716 }, { "epoch": 0.14128780086401974, "grad_norm": 6.059353532101893, "learning_rate": 1.9357738348029832e-05, "loss": 0.834, "step": 1717 }, { "epoch": 0.1413700884591648, "grad_norm": 0.5260853686785989, "learning_rate": 1.9356798227666852e-05, "loss": 0.5886, "step": 1718 }, { "epoch": 0.14145237605430983, "grad_norm": 5.154728517737795, "learning_rate": 1.935585744261189e-05, "loss": 0.8592, "step": 1719 }, { "epoch": 0.14153466364945486, "grad_norm": 4.60543857905063, "learning_rate": 1.9354915992931778e-05, "loss": 0.8158, "step": 1720 }, { "epoch": 0.1416169512445999, "grad_norm": 6.439591937901596, "learning_rate": 1.9353973878693393e-05, "loss": 0.8429, "step": 1721 }, { "epoch": 0.14169923883974492, "grad_norm": 5.191853855244775, "learning_rate": 1.9353031099963665e-05, "loss": 0.8704, "step": 1722 }, { "epoch": 0.14178152643488995, "grad_norm": 7.89752048115663, "learning_rate": 1.9352087656809563e-05, "loss": 0.8092, "step": 1723 }, { "epoch": 0.14186381403003498, "grad_norm": 5.331792359032669, "learning_rate": 1.9351143549298115e-05, "loss": 0.8468, "step": 1724 }, { "epoch": 0.14194610162518, "grad_norm": 0.48989316809380234, "learning_rate": 1.935019877749638e-05, "loss": 0.563, "step": 1725 }, { "epoch": 0.14202838922032504, "grad_norm": 5.256869924215583, "learning_rate": 1.9349253341471483e-05, "loss": 0.8459, "step": 1726 }, { "epoch": 0.14211067681547007, "grad_norm": 4.772772230009674, "learning_rate": 1.9348307241290574e-05, "loss": 0.8424, "step": 1727 }, { "epoch": 0.1421929644106151, "grad_norm": 5.139167308693871, "learning_rate": 1.9347360477020873e-05, "loss": 0.8294, "step": 1728 }, { "epoch": 0.14227525200576013, "grad_norm": 8.427018162829064, "learning_rate": 1.934641304872963e-05, "loss": 0.8984, "step": 1729 }, { "epoch": 0.14235753960090516, "grad_norm": 5.195894888253091, "learning_rate": 1.934546495648415e-05, "loss": 0.8575, "step": 1730 }, { "epoch": 0.1424398271960502, "grad_norm": 6.03048911441289, "learning_rate": 1.934451620035179e-05, "loss": 0.8305, "step": 1731 }, { "epoch": 0.14252211479119523, "grad_norm": 5.3710703103909765, "learning_rate": 1.934356678039994e-05, "loss": 0.8413, "step": 1732 }, { "epoch": 0.14260440238634026, "grad_norm": 0.45884838090967267, "learning_rate": 1.934261669669605e-05, "loss": 0.5654, "step": 1733 }, { "epoch": 0.1426866899814853, "grad_norm": 5.871990326718468, "learning_rate": 1.934166594930761e-05, "loss": 0.8636, "step": 1734 }, { "epoch": 0.14276897757663032, "grad_norm": 6.040345693572168, "learning_rate": 1.9340714538302165e-05, "loss": 0.8436, "step": 1735 }, { "epoch": 0.14285126517177535, "grad_norm": 16.452741842652962, "learning_rate": 1.9339762463747293e-05, "loss": 0.8507, "step": 1736 }, { "epoch": 0.14293355276692038, "grad_norm": 8.023246371031073, "learning_rate": 1.9338809725710636e-05, "loss": 0.795, "step": 1737 }, { "epoch": 0.1430158403620654, "grad_norm": 9.169126419921785, "learning_rate": 1.933785632425987e-05, "loss": 0.8365, "step": 1738 }, { "epoch": 0.14309812795721044, "grad_norm": 0.4843788693491957, "learning_rate": 1.933690225946272e-05, "loss": 0.5824, "step": 1739 }, { "epoch": 0.14318041555235547, "grad_norm": 6.87259085555059, "learning_rate": 1.933594753138697e-05, "loss": 0.85, "step": 1740 }, { "epoch": 0.1432627031475005, "grad_norm": 10.501019350819892, "learning_rate": 1.9334992140100437e-05, "loss": 0.859, "step": 1741 }, { "epoch": 0.14334499074264553, "grad_norm": 5.621762683180854, "learning_rate": 1.9334036085670993e-05, "loss": 0.8416, "step": 1742 }, { "epoch": 0.14342727833779056, "grad_norm": 12.795324591638737, "learning_rate": 1.933307936816655e-05, "loss": 0.8138, "step": 1743 }, { "epoch": 0.14350956593293562, "grad_norm": 6.250772565375806, "learning_rate": 1.933212198765508e-05, "loss": 0.8431, "step": 1744 }, { "epoch": 0.14359185352808065, "grad_norm": 6.649923452980046, "learning_rate": 1.933116394420458e-05, "loss": 0.8683, "step": 1745 }, { "epoch": 0.14367414112322568, "grad_norm": 0.4783295138092446, "learning_rate": 1.9330205237883125e-05, "loss": 0.5472, "step": 1746 }, { "epoch": 0.14375642871837072, "grad_norm": 6.450808259182912, "learning_rate": 1.9329245868758805e-05, "loss": 0.8689, "step": 1747 }, { "epoch": 0.14383871631351575, "grad_norm": 8.144945238397163, "learning_rate": 1.9328285836899782e-05, "loss": 0.8409, "step": 1748 }, { "epoch": 0.14392100390866078, "grad_norm": 8.41829303690563, "learning_rate": 1.932732514237425e-05, "loss": 0.7911, "step": 1749 }, { "epoch": 0.1440032915038058, "grad_norm": 7.756732625129085, "learning_rate": 1.9326363785250456e-05, "loss": 0.8231, "step": 1750 }, { "epoch": 0.14408557909895084, "grad_norm": 6.8419012862661965, "learning_rate": 1.9325401765596695e-05, "loss": 0.7966, "step": 1751 }, { "epoch": 0.14416786669409587, "grad_norm": 5.118896582942905, "learning_rate": 1.9324439083481308e-05, "loss": 0.8469, "step": 1752 }, { "epoch": 0.1442501542892409, "grad_norm": 0.46273923634227443, "learning_rate": 1.9323475738972682e-05, "loss": 0.5401, "step": 1753 }, { "epoch": 0.14433244188438593, "grad_norm": 5.933555491033483, "learning_rate": 1.9322511732139247e-05, "loss": 0.8232, "step": 1754 }, { "epoch": 0.14441472947953096, "grad_norm": 19.74022464785281, "learning_rate": 1.9321547063049487e-05, "loss": 0.8377, "step": 1755 }, { "epoch": 0.144497017074676, "grad_norm": 0.47208399084192787, "learning_rate": 1.9320581731771933e-05, "loss": 0.5788, "step": 1756 }, { "epoch": 0.14457930466982102, "grad_norm": 6.155109939821394, "learning_rate": 1.9319615738375156e-05, "loss": 0.8477, "step": 1757 }, { "epoch": 0.14466159226496605, "grad_norm": 4.5236917326626855, "learning_rate": 1.9318649082927784e-05, "loss": 0.8241, "step": 1758 }, { "epoch": 0.14474387986011109, "grad_norm": 6.68261711206281, "learning_rate": 1.9317681765498485e-05, "loss": 0.8389, "step": 1759 }, { "epoch": 0.14482616745525612, "grad_norm": 8.82307043815515, "learning_rate": 1.9316713786155974e-05, "loss": 0.8591, "step": 1760 }, { "epoch": 0.14490845505040115, "grad_norm": 0.5330655449901153, "learning_rate": 1.9315745144969017e-05, "loss": 0.595, "step": 1761 }, { "epoch": 0.14499074264554618, "grad_norm": 4.782447942910814, "learning_rate": 1.9314775842006422e-05, "loss": 0.8231, "step": 1762 }, { "epoch": 0.1450730302406912, "grad_norm": 6.461784587179363, "learning_rate": 1.931380587733705e-05, "loss": 0.8385, "step": 1763 }, { "epoch": 0.14515531783583624, "grad_norm": 6.211444562729787, "learning_rate": 1.93128352510298e-05, "loss": 0.8268, "step": 1764 }, { "epoch": 0.14523760543098127, "grad_norm": 10.531808114746536, "learning_rate": 1.931186396315363e-05, "loss": 0.8177, "step": 1765 }, { "epoch": 0.1453198930261263, "grad_norm": 19.19707142844517, "learning_rate": 1.9310892013777533e-05, "loss": 0.831, "step": 1766 }, { "epoch": 0.14540218062127133, "grad_norm": 0.5389341914459922, "learning_rate": 1.930991940297056e-05, "loss": 0.6002, "step": 1767 }, { "epoch": 0.14548446821641636, "grad_norm": 5.89170072392646, "learning_rate": 1.93089461308018e-05, "loss": 0.8448, "step": 1768 }, { "epoch": 0.1455667558115614, "grad_norm": 5.109916223855104, "learning_rate": 1.9307972197340397e-05, "loss": 0.8092, "step": 1769 }, { "epoch": 0.14564904340670645, "grad_norm": 0.48671468331497286, "learning_rate": 1.9306997602655534e-05, "loss": 0.5869, "step": 1770 }, { "epoch": 0.14573133100185148, "grad_norm": 0.4650224096015914, "learning_rate": 1.9306022346816446e-05, "loss": 0.5473, "step": 1771 }, { "epoch": 0.1458136185969965, "grad_norm": 10.109459292829206, "learning_rate": 1.930504642989241e-05, "loss": 0.8677, "step": 1772 }, { "epoch": 0.14589590619214154, "grad_norm": 9.295217402252955, "learning_rate": 1.930406985195276e-05, "loss": 0.8392, "step": 1773 }, { "epoch": 0.14597819378728658, "grad_norm": 0.4680780702597178, "learning_rate": 1.9303092613066868e-05, "loss": 0.5466, "step": 1774 }, { "epoch": 0.1460604813824316, "grad_norm": 8.171994410417643, "learning_rate": 1.9302114713304156e-05, "loss": 0.857, "step": 1775 }, { "epoch": 0.14614276897757664, "grad_norm": 5.044836964241432, "learning_rate": 1.9301136152734087e-05, "loss": 0.852, "step": 1776 }, { "epoch": 0.14622505657272167, "grad_norm": 6.317438834212792, "learning_rate": 1.9300156931426182e-05, "loss": 0.8407, "step": 1777 }, { "epoch": 0.1463073441678667, "grad_norm": 5.174942827941844, "learning_rate": 1.9299177049450004e-05, "loss": 0.8338, "step": 1778 }, { "epoch": 0.14638963176301173, "grad_norm": 0.4723884851141025, "learning_rate": 1.9298196506875158e-05, "loss": 0.5827, "step": 1779 }, { "epoch": 0.14647191935815676, "grad_norm": 13.596767621441385, "learning_rate": 1.9297215303771304e-05, "loss": 0.8679, "step": 1780 }, { "epoch": 0.1465542069533018, "grad_norm": 5.205013158256939, "learning_rate": 1.9296233440208142e-05, "loss": 0.8362, "step": 1781 }, { "epoch": 0.14663649454844682, "grad_norm": 5.587153299584088, "learning_rate": 1.9295250916255425e-05, "loss": 0.8351, "step": 1782 }, { "epoch": 0.14671878214359185, "grad_norm": 0.47266552949506424, "learning_rate": 1.9294267731982948e-05, "loss": 0.5719, "step": 1783 }, { "epoch": 0.14680106973873688, "grad_norm": 5.473479336109091, "learning_rate": 1.9293283887460553e-05, "loss": 0.8347, "step": 1784 }, { "epoch": 0.14688335733388191, "grad_norm": 7.213933436839228, "learning_rate": 1.9292299382758138e-05, "loss": 0.8379, "step": 1785 }, { "epoch": 0.14696564492902695, "grad_norm": 18.805904514994236, "learning_rate": 1.9291314217945634e-05, "loss": 0.8388, "step": 1786 }, { "epoch": 0.14704793252417198, "grad_norm": 4.844653756754676, "learning_rate": 1.9290328393093026e-05, "loss": 0.8472, "step": 1787 }, { "epoch": 0.147130220119317, "grad_norm": 4.296135787825302, "learning_rate": 1.9289341908270347e-05, "loss": 0.8629, "step": 1788 }, { "epoch": 0.14721250771446204, "grad_norm": 11.658303520335162, "learning_rate": 1.9288354763547673e-05, "loss": 0.8029, "step": 1789 }, { "epoch": 0.14729479530960707, "grad_norm": 4.837349799431046, "learning_rate": 1.9287366958995136e-05, "loss": 0.8772, "step": 1790 }, { "epoch": 0.1473770829047521, "grad_norm": 0.48305002203280695, "learning_rate": 1.9286378494682896e-05, "loss": 0.5614, "step": 1791 }, { "epoch": 0.14745937049989713, "grad_norm": 4.7653711011756075, "learning_rate": 1.9285389370681184e-05, "loss": 0.8333, "step": 1792 }, { "epoch": 0.14754165809504216, "grad_norm": 6.453771640410041, "learning_rate": 1.9284399587060262e-05, "loss": 0.8446, "step": 1793 }, { "epoch": 0.1476239456901872, "grad_norm": 10.175339873053852, "learning_rate": 1.928340914389044e-05, "loss": 0.8579, "step": 1794 }, { "epoch": 0.14770623328533222, "grad_norm": 5.858011029442454, "learning_rate": 1.9282418041242078e-05, "loss": 0.8283, "step": 1795 }, { "epoch": 0.14778852088047728, "grad_norm": 5.123269754961845, "learning_rate": 1.9281426279185586e-05, "loss": 0.8182, "step": 1796 }, { "epoch": 0.1478708084756223, "grad_norm": 6.518221006236574, "learning_rate": 1.928043385779141e-05, "loss": 0.8492, "step": 1797 }, { "epoch": 0.14795309607076734, "grad_norm": 5.832793043417557, "learning_rate": 1.9279440777130056e-05, "loss": 0.8485, "step": 1798 }, { "epoch": 0.14803538366591237, "grad_norm": 6.497382970971675, "learning_rate": 1.9278447037272072e-05, "loss": 0.8638, "step": 1799 }, { "epoch": 0.1481176712610574, "grad_norm": 9.185854302788478, "learning_rate": 1.927745263828805e-05, "loss": 0.8222, "step": 1800 }, { "epoch": 0.14819995885620244, "grad_norm": 7.203459832191322, "learning_rate": 1.9276457580248628e-05, "loss": 0.8328, "step": 1801 }, { "epoch": 0.14828224645134747, "grad_norm": 0.4659154540512444, "learning_rate": 1.9275461863224492e-05, "loss": 0.5878, "step": 1802 }, { "epoch": 0.1483645340464925, "grad_norm": 4.761079806856456, "learning_rate": 1.9274465487286383e-05, "loss": 0.8401, "step": 1803 }, { "epoch": 0.14844682164163753, "grad_norm": 5.43480146188045, "learning_rate": 1.9273468452505075e-05, "loss": 0.8504, "step": 1804 }, { "epoch": 0.14852910923678256, "grad_norm": 0.4589008559325783, "learning_rate": 1.92724707589514e-05, "loss": 0.5615, "step": 1805 }, { "epoch": 0.1486113968319276, "grad_norm": 0.4490133485736386, "learning_rate": 1.9271472406696236e-05, "loss": 0.5758, "step": 1806 }, { "epoch": 0.14869368442707262, "grad_norm": 4.8766467786430665, "learning_rate": 1.9270473395810494e-05, "loss": 0.8508, "step": 1807 }, { "epoch": 0.14877597202221765, "grad_norm": 4.857249289546154, "learning_rate": 1.9269473726365147e-05, "loss": 0.818, "step": 1808 }, { "epoch": 0.14885825961736268, "grad_norm": 4.3364511276912845, "learning_rate": 1.9268473398431217e-05, "loss": 0.807, "step": 1809 }, { "epoch": 0.1489405472125077, "grad_norm": 5.55022355695734, "learning_rate": 1.9267472412079755e-05, "loss": 0.8409, "step": 1810 }, { "epoch": 0.14902283480765274, "grad_norm": 5.009007386374566, "learning_rate": 1.9266470767381876e-05, "loss": 0.8357, "step": 1811 }, { "epoch": 0.14910512240279777, "grad_norm": 7.485805210738528, "learning_rate": 1.9265468464408734e-05, "loss": 0.8193, "step": 1812 }, { "epoch": 0.1491874099979428, "grad_norm": 0.4919132266850831, "learning_rate": 1.9264465503231526e-05, "loss": 0.5705, "step": 1813 }, { "epoch": 0.14926969759308784, "grad_norm": 7.072148553826402, "learning_rate": 1.9263461883921506e-05, "loss": 0.8298, "step": 1814 }, { "epoch": 0.14935198518823287, "grad_norm": 15.97777993374769, "learning_rate": 1.9262457606549973e-05, "loss": 0.8325, "step": 1815 }, { "epoch": 0.1494342727833779, "grad_norm": 10.369257659029978, "learning_rate": 1.9261452671188257e-05, "loss": 0.8727, "step": 1816 }, { "epoch": 0.14951656037852293, "grad_norm": 5.9281264245080925, "learning_rate": 1.926044707790776e-05, "loss": 0.828, "step": 1817 }, { "epoch": 0.14959884797366796, "grad_norm": 4.551465103131345, "learning_rate": 1.9259440826779915e-05, "loss": 0.8484, "step": 1818 }, { "epoch": 0.149681135568813, "grad_norm": 4.877909704956422, "learning_rate": 1.9258433917876197e-05, "loss": 0.8548, "step": 1819 }, { "epoch": 0.14976342316395802, "grad_norm": 0.5042068110016803, "learning_rate": 1.9257426351268145e-05, "loss": 0.5747, "step": 1820 }, { "epoch": 0.14984571075910308, "grad_norm": 0.4843633549516813, "learning_rate": 1.9256418127027325e-05, "loss": 0.5803, "step": 1821 }, { "epoch": 0.1499279983542481, "grad_norm": 6.15038334110192, "learning_rate": 1.9255409245225366e-05, "loss": 0.8112, "step": 1822 }, { "epoch": 0.15001028594939314, "grad_norm": 6.421532923241749, "learning_rate": 1.925439970593394e-05, "loss": 0.8305, "step": 1823 }, { "epoch": 0.15009257354453817, "grad_norm": 7.677045661984815, "learning_rate": 1.9253389509224754e-05, "loss": 0.8404, "step": 1824 }, { "epoch": 0.1501748611396832, "grad_norm": 4.960096719845569, "learning_rate": 1.925237865516958e-05, "loss": 0.8275, "step": 1825 }, { "epoch": 0.15025714873482823, "grad_norm": 4.836857288232429, "learning_rate": 1.9251367143840218e-05, "loss": 0.8426, "step": 1826 }, { "epoch": 0.15033943632997326, "grad_norm": 4.6523616976989866, "learning_rate": 1.9250354975308534e-05, "loss": 0.8433, "step": 1827 }, { "epoch": 0.1504217239251183, "grad_norm": 5.486424130354286, "learning_rate": 1.9249342149646426e-05, "loss": 0.83, "step": 1828 }, { "epoch": 0.15050401152026333, "grad_norm": 6.185401090574369, "learning_rate": 1.9248328666925838e-05, "loss": 0.8208, "step": 1829 }, { "epoch": 0.15058629911540836, "grad_norm": 6.335103339362094, "learning_rate": 1.9247314527218778e-05, "loss": 0.8487, "step": 1830 }, { "epoch": 0.1506685867105534, "grad_norm": 4.5706381595966015, "learning_rate": 1.9246299730597284e-05, "loss": 0.8587, "step": 1831 }, { "epoch": 0.15075087430569842, "grad_norm": 5.392989295897071, "learning_rate": 1.924528427713344e-05, "loss": 0.8276, "step": 1832 }, { "epoch": 0.15083316190084345, "grad_norm": 8.741338516555382, "learning_rate": 1.924426816689939e-05, "loss": 0.8151, "step": 1833 }, { "epoch": 0.15091544949598848, "grad_norm": 0.6685839984852447, "learning_rate": 1.9243251399967313e-05, "loss": 0.5844, "step": 1834 }, { "epoch": 0.1509977370911335, "grad_norm": 0.5107566191246578, "learning_rate": 1.9242233976409438e-05, "loss": 0.5402, "step": 1835 }, { "epoch": 0.15108002468627854, "grad_norm": 0.4613973290236187, "learning_rate": 1.9241215896298043e-05, "loss": 0.5524, "step": 1836 }, { "epoch": 0.15116231228142357, "grad_norm": 8.64624478329892, "learning_rate": 1.9240197159705448e-05, "loss": 0.8503, "step": 1837 }, { "epoch": 0.1512445998765686, "grad_norm": 5.785794004079789, "learning_rate": 1.9239177766704026e-05, "loss": 0.8447, "step": 1838 }, { "epoch": 0.15132688747171363, "grad_norm": 6.449386312407525, "learning_rate": 1.923815771736619e-05, "loss": 0.8625, "step": 1839 }, { "epoch": 0.15140917506685866, "grad_norm": 5.974603624558202, "learning_rate": 1.9237137011764404e-05, "loss": 0.8859, "step": 1840 }, { "epoch": 0.1514914626620037, "grad_norm": 6.07613604597075, "learning_rate": 1.9236115649971177e-05, "loss": 0.8288, "step": 1841 }, { "epoch": 0.15157375025714873, "grad_norm": 5.610188501747942, "learning_rate": 1.9235093632059067e-05, "loss": 0.8472, "step": 1842 }, { "epoch": 0.15165603785229376, "grad_norm": 6.513611642507301, "learning_rate": 1.9234070958100675e-05, "loss": 0.8406, "step": 1843 }, { "epoch": 0.1517383254474388, "grad_norm": 4.706538246713925, "learning_rate": 1.923304762816865e-05, "loss": 0.8313, "step": 1844 }, { "epoch": 0.15182061304258382, "grad_norm": 5.335789671949779, "learning_rate": 1.9232023642335683e-05, "loss": 0.8423, "step": 1845 }, { "epoch": 0.15190290063772885, "grad_norm": 5.374080569021992, "learning_rate": 1.9230999000674526e-05, "loss": 0.8584, "step": 1846 }, { "epoch": 0.1519851882328739, "grad_norm": 4.023515750853757, "learning_rate": 1.922997370325796e-05, "loss": 0.8269, "step": 1847 }, { "epoch": 0.15206747582801894, "grad_norm": 5.075361759924576, "learning_rate": 1.9228947750158826e-05, "loss": 0.8628, "step": 1848 }, { "epoch": 0.15214976342316397, "grad_norm": 7.288310947212566, "learning_rate": 1.922792114145e-05, "loss": 0.8284, "step": 1849 }, { "epoch": 0.152232051018309, "grad_norm": 4.911647064052488, "learning_rate": 1.9226893877204418e-05, "loss": 0.8098, "step": 1850 }, { "epoch": 0.15231433861345403, "grad_norm": 6.14158528557125, "learning_rate": 1.922586595749505e-05, "loss": 0.8586, "step": 1851 }, { "epoch": 0.15239662620859906, "grad_norm": 0.9609316724016479, "learning_rate": 1.9224837382394915e-05, "loss": 0.5865, "step": 1852 }, { "epoch": 0.1524789138037441, "grad_norm": 4.8074089673842115, "learning_rate": 1.9223808151977086e-05, "loss": 0.8574, "step": 1853 }, { "epoch": 0.15256120139888912, "grad_norm": 5.09401658512305, "learning_rate": 1.9222778266314682e-05, "loss": 0.8242, "step": 1854 }, { "epoch": 0.15264348899403415, "grad_norm": 7.660360501973494, "learning_rate": 1.9221747725480858e-05, "loss": 0.8409, "step": 1855 }, { "epoch": 0.15272577658917919, "grad_norm": 0.6386425451713716, "learning_rate": 1.922071652954882e-05, "loss": 0.565, "step": 1856 }, { "epoch": 0.15280806418432422, "grad_norm": 3.8625412908418486, "learning_rate": 1.9219684678591828e-05, "loss": 0.8589, "step": 1857 }, { "epoch": 0.15289035177946925, "grad_norm": 4.707426779826058, "learning_rate": 1.9218652172683182e-05, "loss": 0.8378, "step": 1858 }, { "epoch": 0.15297263937461428, "grad_norm": 8.068080909985033, "learning_rate": 1.9217619011896228e-05, "loss": 0.8404, "step": 1859 }, { "epoch": 0.1530549269697593, "grad_norm": 5.122827903994561, "learning_rate": 1.9216585196304362e-05, "loss": 0.8384, "step": 1860 }, { "epoch": 0.15313721456490434, "grad_norm": 4.44536713211764, "learning_rate": 1.9215550725981025e-05, "loss": 0.851, "step": 1861 }, { "epoch": 0.15321950216004937, "grad_norm": 5.1179320546544576, "learning_rate": 1.92145156009997e-05, "loss": 0.8349, "step": 1862 }, { "epoch": 0.1533017897551944, "grad_norm": 4.460759987151901, "learning_rate": 1.9213479821433922e-05, "loss": 0.88, "step": 1863 }, { "epoch": 0.15338407735033943, "grad_norm": 0.6059111588013519, "learning_rate": 1.9212443387357274e-05, "loss": 0.5736, "step": 1864 }, { "epoch": 0.15346636494548446, "grad_norm": 4.023527415266168, "learning_rate": 1.921140629884338e-05, "loss": 0.8557, "step": 1865 }, { "epoch": 0.1535486525406295, "grad_norm": 5.42493421555172, "learning_rate": 1.9210368555965915e-05, "loss": 0.8703, "step": 1866 }, { "epoch": 0.15363094013577452, "grad_norm": 5.751996259296113, "learning_rate": 1.9209330158798597e-05, "loss": 0.844, "step": 1867 }, { "epoch": 0.15371322773091955, "grad_norm": 26.285583113760325, "learning_rate": 1.920829110741519e-05, "loss": 0.8506, "step": 1868 }, { "epoch": 0.15379551532606459, "grad_norm": 3.7662288767932037, "learning_rate": 1.9207251401889514e-05, "loss": 0.8539, "step": 1869 }, { "epoch": 0.15387780292120962, "grad_norm": 6.159059075831218, "learning_rate": 1.920621104229542e-05, "loss": 0.8153, "step": 1870 }, { "epoch": 0.15396009051635465, "grad_norm": 3.529280846535288, "learning_rate": 1.920517002870682e-05, "loss": 0.8152, "step": 1871 }, { "epoch": 0.15404237811149968, "grad_norm": 3.8640076235353713, "learning_rate": 1.920412836119766e-05, "loss": 0.833, "step": 1872 }, { "epoch": 0.15412466570664474, "grad_norm": 0.5522002280100713, "learning_rate": 1.9203086039841944e-05, "loss": 0.5538, "step": 1873 }, { "epoch": 0.15420695330178977, "grad_norm": 4.57263225829868, "learning_rate": 1.9202043064713708e-05, "loss": 0.8199, "step": 1874 }, { "epoch": 0.1542892408969348, "grad_norm": 5.739782670934446, "learning_rate": 1.9200999435887053e-05, "loss": 0.8286, "step": 1875 }, { "epoch": 0.15437152849207983, "grad_norm": 4.3925079943317265, "learning_rate": 1.919995515343611e-05, "loss": 0.8358, "step": 1876 }, { "epoch": 0.15445381608722486, "grad_norm": 4.789241874040683, "learning_rate": 1.9198910217435073e-05, "loss": 0.8344, "step": 1877 }, { "epoch": 0.1545361036823699, "grad_norm": 4.400609569663084, "learning_rate": 1.919786462795816e-05, "loss": 0.8432, "step": 1878 }, { "epoch": 0.15461839127751492, "grad_norm": 2.9184772315404124, "learning_rate": 1.9196818385079655e-05, "loss": 0.8271, "step": 1879 }, { "epoch": 0.15470067887265995, "grad_norm": 4.851603050312278, "learning_rate": 1.919577148887388e-05, "loss": 0.8408, "step": 1880 }, { "epoch": 0.15478296646780498, "grad_norm": 3.7963500670198562, "learning_rate": 1.9194723939415203e-05, "loss": 0.8466, "step": 1881 }, { "epoch": 0.15486525406295001, "grad_norm": 4.176493857818385, "learning_rate": 1.9193675736778047e-05, "loss": 0.8282, "step": 1882 }, { "epoch": 0.15494754165809504, "grad_norm": 5.221962997293721, "learning_rate": 1.9192626881036866e-05, "loss": 0.86, "step": 1883 }, { "epoch": 0.15502982925324008, "grad_norm": 4.184852828323477, "learning_rate": 1.9191577372266174e-05, "loss": 0.8329, "step": 1884 }, { "epoch": 0.1551121168483851, "grad_norm": 0.5636559518533191, "learning_rate": 1.9190527210540524e-05, "loss": 0.5639, "step": 1885 }, { "epoch": 0.15519440444353014, "grad_norm": 6.193137368635008, "learning_rate": 1.918947639593452e-05, "loss": 0.8209, "step": 1886 }, { "epoch": 0.15527669203867517, "grad_norm": 4.6492158556775625, "learning_rate": 1.918842492852281e-05, "loss": 0.8231, "step": 1887 }, { "epoch": 0.1553589796338202, "grad_norm": 4.2648570505473575, "learning_rate": 1.9187372808380085e-05, "loss": 0.8558, "step": 1888 }, { "epoch": 0.15544126722896523, "grad_norm": 4.4365923177857765, "learning_rate": 1.918632003558109e-05, "loss": 0.871, "step": 1889 }, { "epoch": 0.15552355482411026, "grad_norm": 6.451657210590928, "learning_rate": 1.9185266610200612e-05, "loss": 0.8205, "step": 1890 }, { "epoch": 0.1556058424192553, "grad_norm": 5.063597696743102, "learning_rate": 1.9184212532313483e-05, "loss": 0.823, "step": 1891 }, { "epoch": 0.15568813001440032, "grad_norm": 5.33443100117654, "learning_rate": 1.9183157801994585e-05, "loss": 0.8152, "step": 1892 }, { "epoch": 0.15577041760954535, "grad_norm": 6.843779103961609, "learning_rate": 1.9182102419318842e-05, "loss": 0.8466, "step": 1893 }, { "epoch": 0.15585270520469038, "grad_norm": 0.5560946461260619, "learning_rate": 1.9181046384361228e-05, "loss": 0.5749, "step": 1894 }, { "epoch": 0.15593499279983541, "grad_norm": 5.741982504307724, "learning_rate": 1.9179989697196762e-05, "loss": 0.819, "step": 1895 }, { "epoch": 0.15601728039498045, "grad_norm": 6.531500373200525, "learning_rate": 1.9178932357900505e-05, "loss": 0.8298, "step": 1896 }, { "epoch": 0.15609956799012548, "grad_norm": 6.1546326697051, "learning_rate": 1.917787436654758e-05, "loss": 0.837, "step": 1897 }, { "epoch": 0.1561818555852705, "grad_norm": 5.185203064348844, "learning_rate": 1.9176815723213132e-05, "loss": 0.8632, "step": 1898 }, { "epoch": 0.15626414318041557, "grad_norm": 5.4084609854089365, "learning_rate": 1.9175756427972375e-05, "loss": 0.8307, "step": 1899 }, { "epoch": 0.1563464307755606, "grad_norm": 5.703538856423496, "learning_rate": 1.9174696480900554e-05, "loss": 0.8472, "step": 1900 }, { "epoch": 0.15642871837070563, "grad_norm": 5.405477693059169, "learning_rate": 1.9173635882072967e-05, "loss": 0.8721, "step": 1901 }, { "epoch": 0.15651100596585066, "grad_norm": 4.921592944151, "learning_rate": 1.9172574631564963e-05, "loss": 0.8327, "step": 1902 }, { "epoch": 0.1565932935609957, "grad_norm": 7.000417448334732, "learning_rate": 1.917151272945192e-05, "loss": 0.8074, "step": 1903 }, { "epoch": 0.15667558115614072, "grad_norm": 8.881365953302256, "learning_rate": 1.9170450175809283e-05, "loss": 0.8486, "step": 1904 }, { "epoch": 0.15675786875128575, "grad_norm": 3.557591238337821, "learning_rate": 1.9169386970712532e-05, "loss": 0.8428, "step": 1905 }, { "epoch": 0.15684015634643078, "grad_norm": 5.326825829060843, "learning_rate": 1.9168323114237193e-05, "loss": 0.8439, "step": 1906 }, { "epoch": 0.1569224439415758, "grad_norm": 5.478137574938721, "learning_rate": 1.9167258606458846e-05, "loss": 0.8479, "step": 1907 }, { "epoch": 0.15700473153672084, "grad_norm": 10.86664357588869, "learning_rate": 1.9166193447453107e-05, "loss": 0.8481, "step": 1908 }, { "epoch": 0.15708701913186587, "grad_norm": 5.939616047516853, "learning_rate": 1.916512763729564e-05, "loss": 0.8572, "step": 1909 }, { "epoch": 0.1571693067270109, "grad_norm": 3.7039344771688256, "learning_rate": 1.9164061176062166e-05, "loss": 0.8424, "step": 1910 }, { "epoch": 0.15725159432215594, "grad_norm": 0.49197443127291796, "learning_rate": 1.9162994063828445e-05, "loss": 0.5803, "step": 1911 }, { "epoch": 0.15733388191730097, "grad_norm": 0.4553081859661651, "learning_rate": 1.9161926300670277e-05, "loss": 0.5647, "step": 1912 }, { "epoch": 0.157416169512446, "grad_norm": 5.5999535437779215, "learning_rate": 1.916085788666352e-05, "loss": 0.8441, "step": 1913 }, { "epoch": 0.15749845710759103, "grad_norm": 5.132735593812985, "learning_rate": 1.9159788821884064e-05, "loss": 0.8579, "step": 1914 }, { "epoch": 0.15758074470273606, "grad_norm": 5.5625878807458005, "learning_rate": 1.9158719106407862e-05, "loss": 0.8574, "step": 1915 }, { "epoch": 0.1576630322978811, "grad_norm": 3.8672917005273706, "learning_rate": 1.9157648740310905e-05, "loss": 0.8474, "step": 1916 }, { "epoch": 0.15774531989302612, "grad_norm": 0.5195626974877546, "learning_rate": 1.915657772366922e-05, "loss": 0.5716, "step": 1917 }, { "epoch": 0.15782760748817115, "grad_norm": 0.5118379542817698, "learning_rate": 1.9155506056558903e-05, "loss": 0.5727, "step": 1918 }, { "epoch": 0.15790989508331618, "grad_norm": 0.546243421780183, "learning_rate": 1.9154433739056078e-05, "loss": 0.5735, "step": 1919 }, { "epoch": 0.1579921826784612, "grad_norm": 0.4496319616069995, "learning_rate": 1.9153360771236915e-05, "loss": 0.5672, "step": 1920 }, { "epoch": 0.15807447027360624, "grad_norm": 17.06080467505602, "learning_rate": 1.9152287153177646e-05, "loss": 0.8598, "step": 1921 }, { "epoch": 0.15815675786875127, "grad_norm": 5.0084313071310484, "learning_rate": 1.9151212884954534e-05, "loss": 0.8499, "step": 1922 }, { "epoch": 0.1582390454638963, "grad_norm": 0.6107687958291511, "learning_rate": 1.9150137966643892e-05, "loss": 0.5938, "step": 1923 }, { "epoch": 0.15832133305904134, "grad_norm": 6.509270710519911, "learning_rate": 1.9149062398322084e-05, "loss": 0.8879, "step": 1924 }, { "epoch": 0.1584036206541864, "grad_norm": 7.504768979432874, "learning_rate": 1.9147986180065515e-05, "loss": 0.8179, "step": 1925 }, { "epoch": 0.15848590824933143, "grad_norm": 8.842157642443578, "learning_rate": 1.9146909311950636e-05, "loss": 0.829, "step": 1926 }, { "epoch": 0.15856819584447646, "grad_norm": 4.932666405596189, "learning_rate": 1.914583179405395e-05, "loss": 0.8504, "step": 1927 }, { "epoch": 0.1586504834396215, "grad_norm": 6.372961990608066, "learning_rate": 1.9144753626452e-05, "loss": 0.8624, "step": 1928 }, { "epoch": 0.15873277103476652, "grad_norm": 4.522677476310382, "learning_rate": 1.9143674809221376e-05, "loss": 0.8391, "step": 1929 }, { "epoch": 0.15881505862991155, "grad_norm": 0.5270211749274826, "learning_rate": 1.914259534243872e-05, "loss": 0.5819, "step": 1930 }, { "epoch": 0.15889734622505658, "grad_norm": 0.46338890013052, "learning_rate": 1.9141515226180708e-05, "loss": 0.5434, "step": 1931 }, { "epoch": 0.1589796338202016, "grad_norm": 5.053929441164545, "learning_rate": 1.9140434460524075e-05, "loss": 0.8184, "step": 1932 }, { "epoch": 0.15906192141534664, "grad_norm": 0.46822296714643163, "learning_rate": 1.9139353045545595e-05, "loss": 0.5468, "step": 1933 }, { "epoch": 0.15914420901049167, "grad_norm": 4.818972577828098, "learning_rate": 1.9138270981322093e-05, "loss": 0.8158, "step": 1934 }, { "epoch": 0.1592264966056367, "grad_norm": 5.056437597852356, "learning_rate": 1.9137188267930434e-05, "loss": 0.8395, "step": 1935 }, { "epoch": 0.15930878420078173, "grad_norm": 5.0850642468159535, "learning_rate": 1.9136104905447533e-05, "loss": 0.861, "step": 1936 }, { "epoch": 0.15939107179592676, "grad_norm": 0.5152390077794073, "learning_rate": 1.913502089395035e-05, "loss": 0.5544, "step": 1937 }, { "epoch": 0.1594733593910718, "grad_norm": 0.49678784494055134, "learning_rate": 1.9133936233515893e-05, "loss": 0.5633, "step": 1938 }, { "epoch": 0.15955564698621683, "grad_norm": 8.081617306656465, "learning_rate": 1.9132850924221214e-05, "loss": 0.8199, "step": 1939 }, { "epoch": 0.15963793458136186, "grad_norm": 4.050834409746043, "learning_rate": 1.913176496614341e-05, "loss": 0.8565, "step": 1940 }, { "epoch": 0.1597202221765069, "grad_norm": 5.538913745420739, "learning_rate": 1.913067835935963e-05, "loss": 0.8568, "step": 1941 }, { "epoch": 0.15980250977165192, "grad_norm": 5.169208456192216, "learning_rate": 1.912959110394706e-05, "loss": 0.8502, "step": 1942 }, { "epoch": 0.15988479736679695, "grad_norm": 3.717640082018756, "learning_rate": 1.9128503199982934e-05, "loss": 0.8515, "step": 1943 }, { "epoch": 0.15996708496194198, "grad_norm": 4.289581072812726, "learning_rate": 1.9127414647544546e-05, "loss": 0.8574, "step": 1944 }, { "epoch": 0.160049372557087, "grad_norm": 7.156272072258111, "learning_rate": 1.9126325446709217e-05, "loss": 0.8228, "step": 1945 }, { "epoch": 0.16013166015223204, "grad_norm": 3.666071946569591, "learning_rate": 1.912523559755432e-05, "loss": 0.8278, "step": 1946 }, { "epoch": 0.16021394774737707, "grad_norm": 4.716701647887716, "learning_rate": 1.9124145100157284e-05, "loss": 0.8434, "step": 1947 }, { "epoch": 0.1602962353425221, "grad_norm": 4.404603024503413, "learning_rate": 1.9123053954595572e-05, "loss": 0.8568, "step": 1948 }, { "epoch": 0.16037852293766713, "grad_norm": 12.693487709300616, "learning_rate": 1.9121962160946696e-05, "loss": 0.8659, "step": 1949 }, { "epoch": 0.16046081053281216, "grad_norm": 4.994468190010449, "learning_rate": 1.9120869719288216e-05, "loss": 0.8583, "step": 1950 }, { "epoch": 0.16054309812795722, "grad_norm": 3.350272271821083, "learning_rate": 1.9119776629697738e-05, "loss": 0.844, "step": 1951 }, { "epoch": 0.16062538572310225, "grad_norm": 0.626835237381398, "learning_rate": 1.911868289225291e-05, "loss": 0.6034, "step": 1952 }, { "epoch": 0.16070767331824728, "grad_norm": 0.5174318048940684, "learning_rate": 1.911758850703144e-05, "loss": 0.5745, "step": 1953 }, { "epoch": 0.16078996091339232, "grad_norm": 0.46025061805771117, "learning_rate": 1.9116493474111056e-05, "loss": 0.5644, "step": 1954 }, { "epoch": 0.16087224850853735, "grad_norm": 3.954810471983505, "learning_rate": 1.9115397793569558e-05, "loss": 0.8655, "step": 1955 }, { "epoch": 0.16095453610368238, "grad_norm": 5.181663850622272, "learning_rate": 1.911430146548478e-05, "loss": 0.88, "step": 1956 }, { "epoch": 0.1610368236988274, "grad_norm": 5.090124727277422, "learning_rate": 1.9113204489934603e-05, "loss": 0.8526, "step": 1957 }, { "epoch": 0.16111911129397244, "grad_norm": 4.995989345346734, "learning_rate": 1.911210686699695e-05, "loss": 0.8515, "step": 1958 }, { "epoch": 0.16120139888911747, "grad_norm": 3.8664089788873537, "learning_rate": 1.91110085967498e-05, "loss": 0.8226, "step": 1959 }, { "epoch": 0.1612836864842625, "grad_norm": 3.485843985195341, "learning_rate": 1.9109909679271173e-05, "loss": 0.8226, "step": 1960 }, { "epoch": 0.16136597407940753, "grad_norm": 6.108467733707024, "learning_rate": 1.910881011463913e-05, "loss": 0.8439, "step": 1961 }, { "epoch": 0.16144826167455256, "grad_norm": 0.8281518473412987, "learning_rate": 1.910770990293178e-05, "loss": 0.6601, "step": 1962 }, { "epoch": 0.1615305492696976, "grad_norm": 4.1919775317779, "learning_rate": 1.910660904422729e-05, "loss": 0.8358, "step": 1963 }, { "epoch": 0.16161283686484262, "grad_norm": 3.6486336150800818, "learning_rate": 1.910550753860385e-05, "loss": 0.8752, "step": 1964 }, { "epoch": 0.16169512445998765, "grad_norm": 18.125327367252893, "learning_rate": 1.9104405386139722e-05, "loss": 0.8411, "step": 1965 }, { "epoch": 0.16177741205513269, "grad_norm": 0.519789005202432, "learning_rate": 1.9103302586913194e-05, "loss": 0.5852, "step": 1966 }, { "epoch": 0.16185969965027772, "grad_norm": 10.74734646569047, "learning_rate": 1.9102199141002612e-05, "loss": 0.8597, "step": 1967 }, { "epoch": 0.16194198724542275, "grad_norm": 4.997595569212722, "learning_rate": 1.9101095048486353e-05, "loss": 0.8487, "step": 1968 }, { "epoch": 0.16202427484056778, "grad_norm": 0.4693045988249685, "learning_rate": 1.9099990309442863e-05, "loss": 0.5493, "step": 1969 }, { "epoch": 0.1621065624357128, "grad_norm": 9.65821630646961, "learning_rate": 1.909888492395061e-05, "loss": 0.8503, "step": 1970 }, { "epoch": 0.16218885003085784, "grad_norm": 4.327280300731587, "learning_rate": 1.9097778892088126e-05, "loss": 0.8611, "step": 1971 }, { "epoch": 0.16227113762600287, "grad_norm": 0.4866359984742805, "learning_rate": 1.9096672213933983e-05, "loss": 0.6039, "step": 1972 }, { "epoch": 0.1623534252211479, "grad_norm": 4.267347798551128, "learning_rate": 1.9095564889566787e-05, "loss": 0.8286, "step": 1973 }, { "epoch": 0.16243571281629293, "grad_norm": 3.478809856338831, "learning_rate": 1.909445691906521e-05, "loss": 0.8343, "step": 1974 }, { "epoch": 0.16251800041143796, "grad_norm": 0.47794812337530074, "learning_rate": 1.9093348302507958e-05, "loss": 0.5616, "step": 1975 }, { "epoch": 0.16260028800658302, "grad_norm": 3.955142852276849, "learning_rate": 1.909223903997379e-05, "loss": 0.8123, "step": 1976 }, { "epoch": 0.16268257560172805, "grad_norm": 3.964513425565348, "learning_rate": 1.9091129131541496e-05, "loss": 0.8416, "step": 1977 }, { "epoch": 0.16276486319687308, "grad_norm": 4.549111628205324, "learning_rate": 1.909001857728993e-05, "loss": 0.8257, "step": 1978 }, { "epoch": 0.1628471507920181, "grad_norm": 6.970898550322873, "learning_rate": 1.9088907377297977e-05, "loss": 0.8488, "step": 1979 }, { "epoch": 0.16292943838716314, "grad_norm": 4.2908238891553445, "learning_rate": 1.9087795531644583e-05, "loss": 0.8412, "step": 1980 }, { "epoch": 0.16301172598230818, "grad_norm": 4.755754852120496, "learning_rate": 1.9086683040408728e-05, "loss": 0.8137, "step": 1981 }, { "epoch": 0.1630940135774532, "grad_norm": 4.9388049580707705, "learning_rate": 1.9085569903669444e-05, "loss": 0.8425, "step": 1982 }, { "epoch": 0.16317630117259824, "grad_norm": 3.7789696800914507, "learning_rate": 1.9084456121505802e-05, "loss": 0.8601, "step": 1983 }, { "epoch": 0.16325858876774327, "grad_norm": 0.5036875299743072, "learning_rate": 1.9083341693996926e-05, "loss": 0.5823, "step": 1984 }, { "epoch": 0.1633408763628883, "grad_norm": 0.4745416575117629, "learning_rate": 1.908222662122198e-05, "loss": 0.5523, "step": 1985 }, { "epoch": 0.16342316395803333, "grad_norm": 5.2700911406234185, "learning_rate": 1.9081110903260184e-05, "loss": 0.8406, "step": 1986 }, { "epoch": 0.16350545155317836, "grad_norm": 4.278987118782611, "learning_rate": 1.907999454019079e-05, "loss": 0.8375, "step": 1987 }, { "epoch": 0.1635877391483234, "grad_norm": 4.375545347985468, "learning_rate": 1.907887753209311e-05, "loss": 0.8304, "step": 1988 }, { "epoch": 0.16367002674346842, "grad_norm": 4.920337609057978, "learning_rate": 1.907775987904648e-05, "loss": 0.8077, "step": 1989 }, { "epoch": 0.16375231433861345, "grad_norm": 6.998763866445164, "learning_rate": 1.9076641581130313e-05, "loss": 0.8528, "step": 1990 }, { "epoch": 0.16383460193375848, "grad_norm": 5.1844149471073075, "learning_rate": 1.907552263842404e-05, "loss": 0.8272, "step": 1991 }, { "epoch": 0.16391688952890351, "grad_norm": 5.675331934385374, "learning_rate": 1.9074403051007158e-05, "loss": 0.8266, "step": 1992 }, { "epoch": 0.16399917712404855, "grad_norm": 6.85376327606783, "learning_rate": 1.9073282818959192e-05, "loss": 0.8199, "step": 1993 }, { "epoch": 0.16408146471919358, "grad_norm": 5.037618123313906, "learning_rate": 1.907216194235973e-05, "loss": 0.8397, "step": 1994 }, { "epoch": 0.1641637523143386, "grad_norm": 7.450741530103121, "learning_rate": 1.9071040421288388e-05, "loss": 0.8075, "step": 1995 }, { "epoch": 0.16424603990948364, "grad_norm": 4.939981647738216, "learning_rate": 1.906991825582484e-05, "loss": 0.8243, "step": 1996 }, { "epoch": 0.16432832750462867, "grad_norm": 4.301415531275492, "learning_rate": 1.9068795446048806e-05, "loss": 0.8604, "step": 1997 }, { "epoch": 0.1644106150997737, "grad_norm": 4.4888916464050865, "learning_rate": 1.9067671992040046e-05, "loss": 0.8721, "step": 1998 }, { "epoch": 0.16449290269491873, "grad_norm": 6.357665442149673, "learning_rate": 1.9066547893878372e-05, "loss": 0.8874, "step": 1999 }, { "epoch": 0.16457519029006376, "grad_norm": 0.5634041513601309, "learning_rate": 1.9065423151643633e-05, "loss": 0.5899, "step": 2000 }, { "epoch": 0.1646574778852088, "grad_norm": 8.209637125511108, "learning_rate": 1.906429776541573e-05, "loss": 0.8543, "step": 2001 }, { "epoch": 0.16473976548035385, "grad_norm": 4.881472200579807, "learning_rate": 1.9063171735274615e-05, "loss": 0.8203, "step": 2002 }, { "epoch": 0.16482205307549888, "grad_norm": 4.506268312516657, "learning_rate": 1.906204506130027e-05, "loss": 0.8246, "step": 2003 }, { "epoch": 0.1649043406706439, "grad_norm": 0.4482842576468044, "learning_rate": 1.906091774357274e-05, "loss": 0.5494, "step": 2004 }, { "epoch": 0.16498662826578894, "grad_norm": 0.46462466183979095, "learning_rate": 1.90597897821721e-05, "loss": 0.5828, "step": 2005 }, { "epoch": 0.16506891586093397, "grad_norm": 3.535446139587554, "learning_rate": 1.9058661177178487e-05, "loss": 0.8295, "step": 2006 }, { "epoch": 0.165151203456079, "grad_norm": 6.898430563265459, "learning_rate": 1.905753192867207e-05, "loss": 0.8122, "step": 2007 }, { "epoch": 0.16523349105122404, "grad_norm": 6.659782186420017, "learning_rate": 1.905640203673307e-05, "loss": 0.8368, "step": 2008 }, { "epoch": 0.16531577864636907, "grad_norm": 5.247470551212812, "learning_rate": 1.905527150144175e-05, "loss": 0.8493, "step": 2009 }, { "epoch": 0.1653980662415141, "grad_norm": 0.5007023360676847, "learning_rate": 1.9054140322878426e-05, "loss": 0.5794, "step": 2010 }, { "epoch": 0.16548035383665913, "grad_norm": 5.512743851784457, "learning_rate": 1.9053008501123456e-05, "loss": 0.8303, "step": 2011 }, { "epoch": 0.16556264143180416, "grad_norm": 0.47692966581809587, "learning_rate": 1.9051876036257236e-05, "loss": 0.5789, "step": 2012 }, { "epoch": 0.1656449290269492, "grad_norm": 4.661215851298964, "learning_rate": 1.905074292836022e-05, "loss": 0.8438, "step": 2013 }, { "epoch": 0.16572721662209422, "grad_norm": 4.417221172728927, "learning_rate": 1.90496091775129e-05, "loss": 0.8223, "step": 2014 }, { "epoch": 0.16580950421723925, "grad_norm": 4.542426359477869, "learning_rate": 1.904847478379582e-05, "loss": 0.8531, "step": 2015 }, { "epoch": 0.16589179181238428, "grad_norm": 8.947547537374573, "learning_rate": 1.9047339747289562e-05, "loss": 0.8367, "step": 2016 }, { "epoch": 0.1659740794075293, "grad_norm": 6.689553062284241, "learning_rate": 1.904620406807476e-05, "loss": 0.8381, "step": 2017 }, { "epoch": 0.16605636700267434, "grad_norm": 4.414270465815786, "learning_rate": 1.904506774623208e-05, "loss": 0.7952, "step": 2018 }, { "epoch": 0.16613865459781937, "grad_norm": 4.617600219967089, "learning_rate": 1.904393078184226e-05, "loss": 0.8529, "step": 2019 }, { "epoch": 0.1662209421929644, "grad_norm": 4.8939785097994, "learning_rate": 1.9042793174986057e-05, "loss": 0.818, "step": 2020 }, { "epoch": 0.16630322978810944, "grad_norm": 5.503071643587064, "learning_rate": 1.9041654925744292e-05, "loss": 0.8312, "step": 2021 }, { "epoch": 0.16638551738325447, "grad_norm": 3.8994514270947325, "learning_rate": 1.904051603419782e-05, "loss": 0.8381, "step": 2022 }, { "epoch": 0.1664678049783995, "grad_norm": 30.549691571453124, "learning_rate": 1.9039376500427543e-05, "loss": 0.8293, "step": 2023 }, { "epoch": 0.16655009257354453, "grad_norm": 6.388021109679703, "learning_rate": 1.9038236324514418e-05, "loss": 0.8559, "step": 2024 }, { "epoch": 0.16663238016868956, "grad_norm": 5.235002404805821, "learning_rate": 1.903709550653944e-05, "loss": 0.8253, "step": 2025 }, { "epoch": 0.1667146677638346, "grad_norm": 5.684263599129397, "learning_rate": 1.903595404658365e-05, "loss": 0.8288, "step": 2026 }, { "epoch": 0.16679695535897962, "grad_norm": 6.189154945456667, "learning_rate": 1.9034811944728134e-05, "loss": 0.8284, "step": 2027 }, { "epoch": 0.16687924295412468, "grad_norm": 4.925352091032951, "learning_rate": 1.903366920105403e-05, "loss": 0.8593, "step": 2028 }, { "epoch": 0.1669615305492697, "grad_norm": 8.79075541252156, "learning_rate": 1.903252581564251e-05, "loss": 0.8553, "step": 2029 }, { "epoch": 0.16704381814441474, "grad_norm": 5.720905547782337, "learning_rate": 1.9031381788574803e-05, "loss": 0.8607, "step": 2030 }, { "epoch": 0.16712610573955977, "grad_norm": 6.2820248941486545, "learning_rate": 1.9030237119932175e-05, "loss": 0.8562, "step": 2031 }, { "epoch": 0.1672083933347048, "grad_norm": 0.5465467961898273, "learning_rate": 1.9029091809795948e-05, "loss": 0.5764, "step": 2032 }, { "epoch": 0.16729068092984983, "grad_norm": 4.015285658563253, "learning_rate": 1.9027945858247475e-05, "loss": 0.8362, "step": 2033 }, { "epoch": 0.16737296852499486, "grad_norm": 4.679455678251065, "learning_rate": 1.9026799265368168e-05, "loss": 0.8772, "step": 2034 }, { "epoch": 0.1674552561201399, "grad_norm": 5.128976395208152, "learning_rate": 1.9025652031239478e-05, "loss": 0.8291, "step": 2035 }, { "epoch": 0.16753754371528493, "grad_norm": 0.46671868261974975, "learning_rate": 1.9024504155942897e-05, "loss": 0.5705, "step": 2036 }, { "epoch": 0.16761983131042996, "grad_norm": 3.9936315880019753, "learning_rate": 1.902335563955998e-05, "loss": 0.8426, "step": 2037 }, { "epoch": 0.167702118905575, "grad_norm": 9.886683613104376, "learning_rate": 1.9022206482172304e-05, "loss": 0.835, "step": 2038 }, { "epoch": 0.16778440650072002, "grad_norm": 5.57469886239202, "learning_rate": 1.9021056683861513e-05, "loss": 0.8303, "step": 2039 }, { "epoch": 0.16786669409586505, "grad_norm": 7.1891426779298335, "learning_rate": 1.9019906244709276e-05, "loss": 0.8464, "step": 2040 }, { "epoch": 0.16794898169101008, "grad_norm": 4.623477995459667, "learning_rate": 1.901875516479733e-05, "loss": 0.8101, "step": 2041 }, { "epoch": 0.1680312692861551, "grad_norm": 3.0508878772984485, "learning_rate": 1.901760344420744e-05, "loss": 0.8234, "step": 2042 }, { "epoch": 0.16811355688130014, "grad_norm": 5.360858070318203, "learning_rate": 1.9016451083021422e-05, "loss": 0.8556, "step": 2043 }, { "epoch": 0.16819584447644517, "grad_norm": 6.773934807897697, "learning_rate": 1.9015298081321138e-05, "loss": 0.865, "step": 2044 }, { "epoch": 0.1682781320715902, "grad_norm": 3.4190071351252214, "learning_rate": 1.90141444391885e-05, "loss": 0.8337, "step": 2045 }, { "epoch": 0.16836041966673523, "grad_norm": 3.96545776358378, "learning_rate": 1.9012990156705447e-05, "loss": 0.8042, "step": 2046 }, { "epoch": 0.16844270726188026, "grad_norm": 0.5430641590195081, "learning_rate": 1.9011835233953995e-05, "loss": 0.5773, "step": 2047 }, { "epoch": 0.1685249948570253, "grad_norm": 6.5560663968018815, "learning_rate": 1.901067967101618e-05, "loss": 0.802, "step": 2048 }, { "epoch": 0.16860728245217033, "grad_norm": 5.259779927167468, "learning_rate": 1.9009523467974093e-05, "loss": 0.8522, "step": 2049 }, { "epoch": 0.16868957004731536, "grad_norm": 5.439013570375262, "learning_rate": 1.9008366624909866e-05, "loss": 0.8457, "step": 2050 }, { "epoch": 0.1687718576424604, "grad_norm": 4.656209247100697, "learning_rate": 1.900720914190568e-05, "loss": 0.8294, "step": 2051 }, { "epoch": 0.16885414523760542, "grad_norm": 0.46861758893659844, "learning_rate": 1.900605101904376e-05, "loss": 0.5416, "step": 2052 }, { "epoch": 0.16893643283275045, "grad_norm": 0.4447180872394492, "learning_rate": 1.9004892256406383e-05, "loss": 0.5309, "step": 2053 }, { "epoch": 0.1690187204278955, "grad_norm": 3.5959277617933796, "learning_rate": 1.9003732854075857e-05, "loss": 0.8213, "step": 2054 }, { "epoch": 0.16910100802304054, "grad_norm": 7.590801550911286, "learning_rate": 1.900257281213455e-05, "loss": 0.8512, "step": 2055 }, { "epoch": 0.16918329561818557, "grad_norm": 6.650467541507469, "learning_rate": 1.9001412130664868e-05, "loss": 0.8201, "step": 2056 }, { "epoch": 0.1692655832133306, "grad_norm": 4.544222679328591, "learning_rate": 1.9000250809749262e-05, "loss": 0.8474, "step": 2057 }, { "epoch": 0.16934787080847563, "grad_norm": 4.950922151070744, "learning_rate": 1.8999088849470237e-05, "loss": 0.8212, "step": 2058 }, { "epoch": 0.16943015840362066, "grad_norm": 3.541665949963335, "learning_rate": 1.8997926249910326e-05, "loss": 0.8193, "step": 2059 }, { "epoch": 0.1695124459987657, "grad_norm": 3.4851879291005834, "learning_rate": 1.8996763011152127e-05, "loss": 0.8621, "step": 2060 }, { "epoch": 0.16959473359391072, "grad_norm": 4.630559669844992, "learning_rate": 1.899559913327827e-05, "loss": 0.791, "step": 2061 }, { "epoch": 0.16967702118905575, "grad_norm": 6.316492323089438, "learning_rate": 1.899443461637144e-05, "loss": 0.8477, "step": 2062 }, { "epoch": 0.16975930878420079, "grad_norm": 3.8105295270536765, "learning_rate": 1.899326946051436e-05, "loss": 0.8205, "step": 2063 }, { "epoch": 0.16984159637934582, "grad_norm": 3.5873582141439075, "learning_rate": 1.89921036657898e-05, "loss": 0.852, "step": 2064 }, { "epoch": 0.16992388397449085, "grad_norm": 3.4961803242089005, "learning_rate": 1.8990937232280574e-05, "loss": 0.8069, "step": 2065 }, { "epoch": 0.17000617156963588, "grad_norm": 4.709922748386927, "learning_rate": 1.8989770160069546e-05, "loss": 0.7968, "step": 2066 }, { "epoch": 0.1700884591647809, "grad_norm": 4.419602258456932, "learning_rate": 1.8988602449239626e-05, "loss": 0.8233, "step": 2067 }, { "epoch": 0.17017074675992594, "grad_norm": 4.165559426153498, "learning_rate": 1.8987434099873757e-05, "loss": 0.8783, "step": 2068 }, { "epoch": 0.17025303435507097, "grad_norm": 5.665039435657368, "learning_rate": 1.898626511205495e-05, "loss": 0.8422, "step": 2069 }, { "epoch": 0.170335321950216, "grad_norm": 3.261634336133814, "learning_rate": 1.8985095485866235e-05, "loss": 0.8308, "step": 2070 }, { "epoch": 0.17041760954536103, "grad_norm": 5.4431406561359355, "learning_rate": 1.898392522139071e-05, "loss": 0.8437, "step": 2071 }, { "epoch": 0.17049989714050606, "grad_norm": 3.5106150268111196, "learning_rate": 1.8982754318711506e-05, "loss": 0.8896, "step": 2072 }, { "epoch": 0.1705821847356511, "grad_norm": 0.5309314180484399, "learning_rate": 1.8981582777911795e-05, "loss": 0.5644, "step": 2073 }, { "epoch": 0.17066447233079612, "grad_norm": 0.49354945222966823, "learning_rate": 1.8980410599074812e-05, "loss": 0.5708, "step": 2074 }, { "epoch": 0.17074675992594116, "grad_norm": 3.537172251389112, "learning_rate": 1.897923778228382e-05, "loss": 0.8529, "step": 2075 }, { "epoch": 0.17082904752108619, "grad_norm": 3.6627295874426844, "learning_rate": 1.8978064327622138e-05, "loss": 0.8605, "step": 2076 }, { "epoch": 0.17091133511623122, "grad_norm": 4.426810425328736, "learning_rate": 1.8976890235173125e-05, "loss": 0.8576, "step": 2077 }, { "epoch": 0.17099362271137625, "grad_norm": 0.5453550393689479, "learning_rate": 1.8975715505020186e-05, "loss": 0.5629, "step": 2078 }, { "epoch": 0.17107591030652128, "grad_norm": 3.1760197055119965, "learning_rate": 1.897454013724677e-05, "loss": 0.8448, "step": 2079 }, { "epoch": 0.17115819790166634, "grad_norm": 3.6427547600767234, "learning_rate": 1.8973364131936374e-05, "loss": 0.8095, "step": 2080 }, { "epoch": 0.17124048549681137, "grad_norm": 5.314413481064159, "learning_rate": 1.8972187489172544e-05, "loss": 0.8585, "step": 2081 }, { "epoch": 0.1713227730919564, "grad_norm": 3.9553194322089307, "learning_rate": 1.8971010209038864e-05, "loss": 0.8398, "step": 2082 }, { "epoch": 0.17140506068710143, "grad_norm": 3.6803775188325965, "learning_rate": 1.8969832291618963e-05, "loss": 0.8357, "step": 2083 }, { "epoch": 0.17148734828224646, "grad_norm": 2.910049437995119, "learning_rate": 1.896865373699652e-05, "loss": 0.864, "step": 2084 }, { "epoch": 0.1715696358773915, "grad_norm": 3.1274442749778775, "learning_rate": 1.8967474545255264e-05, "loss": 0.8159, "step": 2085 }, { "epoch": 0.17165192347253652, "grad_norm": 2.769262707484498, "learning_rate": 1.8966294716478955e-05, "loss": 0.8091, "step": 2086 }, { "epoch": 0.17173421106768155, "grad_norm": 2.814835709251874, "learning_rate": 1.896511425075141e-05, "loss": 0.836, "step": 2087 }, { "epoch": 0.17181649866282658, "grad_norm": 2.749983643046786, "learning_rate": 1.8963933148156484e-05, "loss": 0.819, "step": 2088 }, { "epoch": 0.17189878625797161, "grad_norm": 2.9841402775497534, "learning_rate": 1.8962751408778083e-05, "loss": 0.8329, "step": 2089 }, { "epoch": 0.17198107385311664, "grad_norm": 4.490372651250418, "learning_rate": 1.8961569032700158e-05, "loss": 0.8227, "step": 2090 }, { "epoch": 0.17206336144826168, "grad_norm": 0.5195677303990671, "learning_rate": 1.89603860200067e-05, "loss": 0.5714, "step": 2091 }, { "epoch": 0.1721456490434067, "grad_norm": 4.7162828671329535, "learning_rate": 1.895920237078175e-05, "loss": 0.8219, "step": 2092 }, { "epoch": 0.17222793663855174, "grad_norm": 3.1845930400558493, "learning_rate": 1.895801808510939e-05, "loss": 0.8057, "step": 2093 }, { "epoch": 0.17231022423369677, "grad_norm": 3.856861957628647, "learning_rate": 1.895683316307375e-05, "loss": 0.8164, "step": 2094 }, { "epoch": 0.1723925118288418, "grad_norm": 4.3911680999892715, "learning_rate": 1.8955647604759007e-05, "loss": 0.805, "step": 2095 }, { "epoch": 0.17247479942398683, "grad_norm": 3.1328124148047105, "learning_rate": 1.8954461410249383e-05, "loss": 0.8281, "step": 2096 }, { "epoch": 0.17255708701913186, "grad_norm": 2.8193595324575518, "learning_rate": 1.895327457962914e-05, "loss": 0.8265, "step": 2097 }, { "epoch": 0.1726393746142769, "grad_norm": 3.0449731741865556, "learning_rate": 1.895208711298259e-05, "loss": 0.8186, "step": 2098 }, { "epoch": 0.17272166220942192, "grad_norm": 2.842969553300606, "learning_rate": 1.8950899010394086e-05, "loss": 0.8155, "step": 2099 }, { "epoch": 0.17280394980456695, "grad_norm": 2.908844344437982, "learning_rate": 1.8949710271948032e-05, "loss": 0.8327, "step": 2100 }, { "epoch": 0.17288623739971198, "grad_norm": 12.840651351543812, "learning_rate": 1.8948520897728873e-05, "loss": 0.8483, "step": 2101 }, { "epoch": 0.17296852499485701, "grad_norm": 5.194797922834837, "learning_rate": 1.8947330887821103e-05, "loss": 0.8508, "step": 2102 }, { "epoch": 0.17305081259000205, "grad_norm": 3.247249400708314, "learning_rate": 1.8946140242309252e-05, "loss": 0.8728, "step": 2103 }, { "epoch": 0.17313310018514708, "grad_norm": 2.3298477189440168, "learning_rate": 1.894494896127791e-05, "loss": 0.8029, "step": 2104 }, { "epoch": 0.1732153877802921, "grad_norm": 3.4162144543592334, "learning_rate": 1.8943757044811698e-05, "loss": 0.8188, "step": 2105 }, { "epoch": 0.17329767537543717, "grad_norm": 3.1878432889019943, "learning_rate": 1.8942564492995285e-05, "loss": 0.8428, "step": 2106 }, { "epoch": 0.1733799629705822, "grad_norm": 2.59384199789863, "learning_rate": 1.8941371305913395e-05, "loss": 0.7934, "step": 2107 }, { "epoch": 0.17346225056572723, "grad_norm": 2.870188493472898, "learning_rate": 1.8940177483650787e-05, "loss": 0.804, "step": 2108 }, { "epoch": 0.17354453816087226, "grad_norm": 0.5251962887433449, "learning_rate": 1.8938983026292268e-05, "loss": 0.5658, "step": 2109 }, { "epoch": 0.1736268257560173, "grad_norm": 0.4726930814322826, "learning_rate": 1.893778793392269e-05, "loss": 0.5555, "step": 2110 }, { "epoch": 0.17370911335116232, "grad_norm": 2.8381361257445836, "learning_rate": 1.893659220662695e-05, "loss": 0.8473, "step": 2111 }, { "epoch": 0.17379140094630735, "grad_norm": 2.513114361510452, "learning_rate": 1.8935395844489993e-05, "loss": 0.832, "step": 2112 }, { "epoch": 0.17387368854145238, "grad_norm": 3.2468542434176597, "learning_rate": 1.8934198847596807e-05, "loss": 0.8507, "step": 2113 }, { "epoch": 0.1739559761365974, "grad_norm": 3.4980797369338186, "learning_rate": 1.8933001216032422e-05, "loss": 0.8275, "step": 2114 }, { "epoch": 0.17403826373174244, "grad_norm": 3.1993556523484075, "learning_rate": 1.8931802949881913e-05, "loss": 0.8563, "step": 2115 }, { "epoch": 0.17412055132688747, "grad_norm": 2.287521279159301, "learning_rate": 1.893060404923041e-05, "loss": 0.8244, "step": 2116 }, { "epoch": 0.1742028389220325, "grad_norm": 2.6968982726780824, "learning_rate": 1.892940451416308e-05, "loss": 0.8495, "step": 2117 }, { "epoch": 0.17428512651717754, "grad_norm": 2.2600810375191656, "learning_rate": 1.892820434476513e-05, "loss": 0.8619, "step": 2118 }, { "epoch": 0.17436741411232257, "grad_norm": 2.4966318778937953, "learning_rate": 1.8927003541121823e-05, "loss": 0.843, "step": 2119 }, { "epoch": 0.1744497017074676, "grad_norm": 2.4997626727819773, "learning_rate": 1.8925802103318463e-05, "loss": 0.8273, "step": 2120 }, { "epoch": 0.17453198930261263, "grad_norm": 3.1828425470337995, "learning_rate": 1.8924600031440398e-05, "loss": 0.8407, "step": 2121 }, { "epoch": 0.17461427689775766, "grad_norm": 3.0897156196532816, "learning_rate": 1.8923397325573015e-05, "loss": 0.8385, "step": 2122 }, { "epoch": 0.1746965644929027, "grad_norm": 2.871249207390649, "learning_rate": 1.892219398580176e-05, "loss": 0.8133, "step": 2123 }, { "epoch": 0.17477885208804772, "grad_norm": 3.207694935164232, "learning_rate": 1.8920990012212108e-05, "loss": 0.8474, "step": 2124 }, { "epoch": 0.17486113968319275, "grad_norm": 3.0197963812764996, "learning_rate": 1.8919785404889596e-05, "loss": 0.8259, "step": 2125 }, { "epoch": 0.17494342727833778, "grad_norm": 2.3693479079198645, "learning_rate": 1.8918580163919795e-05, "loss": 0.8511, "step": 2126 }, { "epoch": 0.1750257148734828, "grad_norm": 0.6715909607363658, "learning_rate": 1.891737428938832e-05, "loss": 0.5611, "step": 2127 }, { "epoch": 0.17510800246862784, "grad_norm": 0.7381501490976441, "learning_rate": 1.891616778138084e-05, "loss": 0.5944, "step": 2128 }, { "epoch": 0.17519029006377287, "grad_norm": 2.945923101304315, "learning_rate": 1.8914960639983056e-05, "loss": 0.8633, "step": 2129 }, { "epoch": 0.1752725776589179, "grad_norm": 3.276267836360085, "learning_rate": 1.891375286528073e-05, "loss": 0.8466, "step": 2130 }, { "epoch": 0.17535486525406296, "grad_norm": 2.6643947216924886, "learning_rate": 1.891254445735965e-05, "loss": 0.855, "step": 2131 }, { "epoch": 0.175437152849208, "grad_norm": 2.5634784016407615, "learning_rate": 1.891133541630567e-05, "loss": 0.85, "step": 2132 }, { "epoch": 0.17551944044435303, "grad_norm": 3.122244685290648, "learning_rate": 1.8910125742204674e-05, "loss": 0.8372, "step": 2133 }, { "epoch": 0.17560172803949806, "grad_norm": 3.587765275523561, "learning_rate": 1.8908915435142593e-05, "loss": 0.8405, "step": 2134 }, { "epoch": 0.1756840156346431, "grad_norm": 2.9341338775130046, "learning_rate": 1.8907704495205408e-05, "loss": 0.8305, "step": 2135 }, { "epoch": 0.17576630322978812, "grad_norm": 0.8269887826396807, "learning_rate": 1.8906492922479138e-05, "loss": 0.5875, "step": 2136 }, { "epoch": 0.17584859082493315, "grad_norm": 3.0404267692410323, "learning_rate": 1.890528071704986e-05, "loss": 0.8665, "step": 2137 }, { "epoch": 0.17593087842007818, "grad_norm": 3.8451043737340926, "learning_rate": 1.8904067879003678e-05, "loss": 0.829, "step": 2138 }, { "epoch": 0.1760131660152232, "grad_norm": 2.9384675238670286, "learning_rate": 1.8902854408426754e-05, "loss": 0.8368, "step": 2139 }, { "epoch": 0.17609545361036824, "grad_norm": 3.1352954501943757, "learning_rate": 1.8901640305405293e-05, "loss": 0.8595, "step": 2140 }, { "epoch": 0.17617774120551327, "grad_norm": 0.5265036938780178, "learning_rate": 1.890042557002554e-05, "loss": 0.5988, "step": 2141 }, { "epoch": 0.1762600288006583, "grad_norm": 3.213441123374223, "learning_rate": 1.8899210202373787e-05, "loss": 0.8325, "step": 2142 }, { "epoch": 0.17634231639580333, "grad_norm": 5.0681082814516865, "learning_rate": 1.8897994202536377e-05, "loss": 0.8512, "step": 2143 }, { "epoch": 0.17642460399094836, "grad_norm": 3.210520788729363, "learning_rate": 1.8896777570599685e-05, "loss": 0.8836, "step": 2144 }, { "epoch": 0.1765068915860934, "grad_norm": 2.923135957435321, "learning_rate": 1.8895560306650145e-05, "loss": 0.8766, "step": 2145 }, { "epoch": 0.17658917918123843, "grad_norm": 2.888963184179868, "learning_rate": 1.8894342410774226e-05, "loss": 0.8824, "step": 2146 }, { "epoch": 0.17667146677638346, "grad_norm": 2.739409826930194, "learning_rate": 1.8893123883058448e-05, "loss": 0.8197, "step": 2147 }, { "epoch": 0.1767537543715285, "grad_norm": 2.650298520937812, "learning_rate": 1.8891904723589373e-05, "loss": 0.8215, "step": 2148 }, { "epoch": 0.17683604196667352, "grad_norm": 3.0306167804289426, "learning_rate": 1.8890684932453602e-05, "loss": 0.8613, "step": 2149 }, { "epoch": 0.17691832956181855, "grad_norm": 2.812060438572058, "learning_rate": 1.8889464509737795e-05, "loss": 0.8626, "step": 2150 }, { "epoch": 0.17700061715696358, "grad_norm": 3.0994618400593175, "learning_rate": 1.8888243455528648e-05, "loss": 0.8097, "step": 2151 }, { "epoch": 0.1770829047521086, "grad_norm": 2.976960760141478, "learning_rate": 1.8887021769912896e-05, "loss": 0.873, "step": 2152 }, { "epoch": 0.17716519234725364, "grad_norm": 2.9470177979558816, "learning_rate": 1.8885799452977332e-05, "loss": 0.8498, "step": 2153 }, { "epoch": 0.17724747994239867, "grad_norm": 2.9807070159935933, "learning_rate": 1.8884576504808787e-05, "loss": 0.8261, "step": 2154 }, { "epoch": 0.1773297675375437, "grad_norm": 3.55425495238515, "learning_rate": 1.8883352925494132e-05, "loss": 0.8763, "step": 2155 }, { "epoch": 0.17741205513268873, "grad_norm": 3.4293358551956596, "learning_rate": 1.8882128715120295e-05, "loss": 0.859, "step": 2156 }, { "epoch": 0.1774943427278338, "grad_norm": 3.5065020238393316, "learning_rate": 1.888090387377424e-05, "loss": 0.8426, "step": 2157 }, { "epoch": 0.17757663032297882, "grad_norm": 3.4946671740505963, "learning_rate": 1.8879678401542977e-05, "loss": 0.8238, "step": 2158 }, { "epoch": 0.17765891791812385, "grad_norm": 3.08246124790394, "learning_rate": 1.8878452298513558e-05, "loss": 0.855, "step": 2159 }, { "epoch": 0.17774120551326889, "grad_norm": 3.304694159764267, "learning_rate": 1.887722556477309e-05, "loss": 0.8558, "step": 2160 }, { "epoch": 0.17782349310841392, "grad_norm": 2.8888763632081638, "learning_rate": 1.8875998200408715e-05, "loss": 0.8549, "step": 2161 }, { "epoch": 0.17790578070355895, "grad_norm": 3.0898876229980377, "learning_rate": 1.887477020550762e-05, "loss": 0.8719, "step": 2162 }, { "epoch": 0.17798806829870398, "grad_norm": 0.5400253029327587, "learning_rate": 1.8873541580157044e-05, "loss": 0.5705, "step": 2163 }, { "epoch": 0.178070355893849, "grad_norm": 3.2094499045309814, "learning_rate": 1.8872312324444263e-05, "loss": 0.8729, "step": 2164 }, { "epoch": 0.17815264348899404, "grad_norm": 4.768655096715249, "learning_rate": 1.8871082438456607e-05, "loss": 0.8439, "step": 2165 }, { "epoch": 0.17823493108413907, "grad_norm": 0.46479355653430865, "learning_rate": 1.8869851922281443e-05, "loss": 0.5564, "step": 2166 }, { "epoch": 0.1783172186792841, "grad_norm": 3.2141222616690452, "learning_rate": 1.8868620776006177e-05, "loss": 0.8033, "step": 2167 }, { "epoch": 0.17839950627442913, "grad_norm": 2.8977339445629355, "learning_rate": 1.8867388999718282e-05, "loss": 0.826, "step": 2168 }, { "epoch": 0.17848179386957416, "grad_norm": 4.146203873336381, "learning_rate": 1.8866156593505248e-05, "loss": 0.8297, "step": 2169 }, { "epoch": 0.1785640814647192, "grad_norm": 0.4677473869816791, "learning_rate": 1.8864923557454635e-05, "loss": 0.5713, "step": 2170 }, { "epoch": 0.17864636905986422, "grad_norm": 3.1766961448469506, "learning_rate": 1.8863689891654027e-05, "loss": 0.8487, "step": 2171 }, { "epoch": 0.17872865665500925, "grad_norm": 3.983573244064544, "learning_rate": 1.886245559619106e-05, "loss": 0.818, "step": 2172 }, { "epoch": 0.17881094425015429, "grad_norm": 3.813218593430025, "learning_rate": 1.8861220671153427e-05, "loss": 0.8538, "step": 2173 }, { "epoch": 0.17889323184529932, "grad_norm": 3.5142200829496835, "learning_rate": 1.8859985116628845e-05, "loss": 0.8509, "step": 2174 }, { "epoch": 0.17897551944044435, "grad_norm": 0.47604867648075594, "learning_rate": 1.8858748932705093e-05, "loss": 0.5557, "step": 2175 }, { "epoch": 0.17905780703558938, "grad_norm": 5.865810912897995, "learning_rate": 1.8857512119469982e-05, "loss": 0.8107, "step": 2176 }, { "epoch": 0.1791400946307344, "grad_norm": 4.34519437878938, "learning_rate": 1.8856274677011375e-05, "loss": 0.8393, "step": 2177 }, { "epoch": 0.17922238222587944, "grad_norm": 4.675423402486858, "learning_rate": 1.8855036605417182e-05, "loss": 0.8242, "step": 2178 }, { "epoch": 0.17930466982102447, "grad_norm": 3.0460405802151294, "learning_rate": 1.8853797904775347e-05, "loss": 0.8499, "step": 2179 }, { "epoch": 0.1793869574161695, "grad_norm": 4.6587197890485, "learning_rate": 1.885255857517387e-05, "loss": 0.862, "step": 2180 }, { "epoch": 0.17946924501131453, "grad_norm": 3.595468610999008, "learning_rate": 1.8851318616700785e-05, "loss": 0.8425, "step": 2181 }, { "epoch": 0.17955153260645956, "grad_norm": 4.6716822565870935, "learning_rate": 1.8850078029444184e-05, "loss": 0.8323, "step": 2182 }, { "epoch": 0.17963382020160462, "grad_norm": 3.4397771250372275, "learning_rate": 1.8848836813492198e-05, "loss": 0.8398, "step": 2183 }, { "epoch": 0.17971610779674965, "grad_norm": 3.1128586403508813, "learning_rate": 1.8847594968932988e-05, "loss": 0.8073, "step": 2184 }, { "epoch": 0.17979839539189468, "grad_norm": 4.137928527045669, "learning_rate": 1.884635249585479e-05, "loss": 0.8435, "step": 2185 }, { "epoch": 0.17988068298703971, "grad_norm": 3.9153345854479276, "learning_rate": 1.884510939434585e-05, "loss": 0.8529, "step": 2186 }, { "epoch": 0.17996297058218474, "grad_norm": 3.132348968761736, "learning_rate": 1.884386566449449e-05, "loss": 0.8046, "step": 2187 }, { "epoch": 0.18004525817732978, "grad_norm": 3.736988154773954, "learning_rate": 1.8842621306389055e-05, "loss": 0.8396, "step": 2188 }, { "epoch": 0.1801275457724748, "grad_norm": 6.114132305121805, "learning_rate": 1.8841376320117942e-05, "loss": 0.8277, "step": 2189 }, { "epoch": 0.18020983336761984, "grad_norm": 3.328527236630722, "learning_rate": 1.8840130705769598e-05, "loss": 0.7974, "step": 2190 }, { "epoch": 0.18029212096276487, "grad_norm": 3.881694260683007, "learning_rate": 1.8838884463432505e-05, "loss": 0.8145, "step": 2191 }, { "epoch": 0.1803744085579099, "grad_norm": 4.009278703627772, "learning_rate": 1.8837637593195196e-05, "loss": 0.8377, "step": 2192 }, { "epoch": 0.18045669615305493, "grad_norm": 3.953343089533144, "learning_rate": 1.8836390095146246e-05, "loss": 0.8501, "step": 2193 }, { "epoch": 0.18053898374819996, "grad_norm": 3.8815659926184716, "learning_rate": 1.8835141969374274e-05, "loss": 0.8278, "step": 2194 }, { "epoch": 0.180621271343345, "grad_norm": 9.408638288023258, "learning_rate": 1.883389321596795e-05, "loss": 0.862, "step": 2195 }, { "epoch": 0.18070355893849002, "grad_norm": 3.00111833564058, "learning_rate": 1.8832643835015977e-05, "loss": 0.8334, "step": 2196 }, { "epoch": 0.18078584653363505, "grad_norm": 3.9288506872472997, "learning_rate": 1.8831393826607112e-05, "loss": 0.8268, "step": 2197 }, { "epoch": 0.18086813412878008, "grad_norm": 3.151568521244493, "learning_rate": 1.883014319083015e-05, "loss": 0.8345, "step": 2198 }, { "epoch": 0.18095042172392511, "grad_norm": 3.6006903348835126, "learning_rate": 1.882889192777394e-05, "loss": 0.8291, "step": 2199 }, { "epoch": 0.18103270931907015, "grad_norm": 3.064791691782749, "learning_rate": 1.882764003752737e-05, "loss": 0.8236, "step": 2200 }, { "epoch": 0.18111499691421518, "grad_norm": 2.687808317055217, "learning_rate": 1.8826387520179366e-05, "loss": 0.836, "step": 2201 }, { "epoch": 0.1811972845093602, "grad_norm": 3.020220830213787, "learning_rate": 1.8825134375818907e-05, "loss": 0.8315, "step": 2202 }, { "epoch": 0.18127957210450524, "grad_norm": 3.0945061455675784, "learning_rate": 1.882388060453502e-05, "loss": 0.8298, "step": 2203 }, { "epoch": 0.18136185969965027, "grad_norm": 6.79903493713913, "learning_rate": 1.8822626206416765e-05, "loss": 0.8438, "step": 2204 }, { "epoch": 0.1814441472947953, "grad_norm": 3.2340255761593224, "learning_rate": 1.8821371181553255e-05, "loss": 0.8217, "step": 2205 }, { "epoch": 0.18152643488994033, "grad_norm": 3.3572188316655374, "learning_rate": 1.882011553003364e-05, "loss": 0.8471, "step": 2206 }, { "epoch": 0.18160872248508536, "grad_norm": 2.407818729109725, "learning_rate": 1.8818859251947126e-05, "loss": 0.8119, "step": 2207 }, { "epoch": 0.1816910100802304, "grad_norm": 2.8458563815348534, "learning_rate": 1.8817602347382956e-05, "loss": 0.8398, "step": 2208 }, { "epoch": 0.18177329767537545, "grad_norm": 3.8214337213544702, "learning_rate": 1.8816344816430414e-05, "loss": 0.8369, "step": 2209 }, { "epoch": 0.18185558527052048, "grad_norm": 0.5075834375751112, "learning_rate": 1.8815086659178838e-05, "loss": 0.5619, "step": 2210 }, { "epoch": 0.1819378728656655, "grad_norm": 2.8951969539455296, "learning_rate": 1.8813827875717603e-05, "loss": 0.8389, "step": 2211 }, { "epoch": 0.18202016046081054, "grad_norm": 2.6573677658322485, "learning_rate": 1.8812568466136128e-05, "loss": 0.822, "step": 2212 }, { "epoch": 0.18210244805595557, "grad_norm": 3.766069897146698, "learning_rate": 1.8811308430523888e-05, "loss": 0.8325, "step": 2213 }, { "epoch": 0.1821847356511006, "grad_norm": 3.0191461597717706, "learning_rate": 1.8810047768970387e-05, "loss": 0.8303, "step": 2214 }, { "epoch": 0.18226702324624564, "grad_norm": 4.4512451838577345, "learning_rate": 1.880878648156518e-05, "loss": 0.8336, "step": 2215 }, { "epoch": 0.18234931084139067, "grad_norm": 0.46711778997403974, "learning_rate": 1.8807524568397873e-05, "loss": 0.584, "step": 2216 }, { "epoch": 0.1824315984365357, "grad_norm": 3.34850100547728, "learning_rate": 1.88062620295581e-05, "loss": 0.8361, "step": 2217 }, { "epoch": 0.18251388603168073, "grad_norm": 2.5871023413010223, "learning_rate": 1.880499886513556e-05, "loss": 0.8372, "step": 2218 }, { "epoch": 0.18259617362682576, "grad_norm": 2.7853773719177233, "learning_rate": 1.8803735075219985e-05, "loss": 0.8164, "step": 2219 }, { "epoch": 0.1826784612219708, "grad_norm": 0.43484423574492026, "learning_rate": 1.8802470659901143e-05, "loss": 0.5434, "step": 2220 }, { "epoch": 0.18276074881711582, "grad_norm": 2.270864089725784, "learning_rate": 1.8801205619268867e-05, "loss": 0.8195, "step": 2221 }, { "epoch": 0.18284303641226085, "grad_norm": 2.5762907517132447, "learning_rate": 1.8799939953413017e-05, "loss": 0.8508, "step": 2222 }, { "epoch": 0.18292532400740588, "grad_norm": 3.792182714917021, "learning_rate": 1.879867366242351e-05, "loss": 0.8724, "step": 2223 }, { "epoch": 0.1830076116025509, "grad_norm": 2.737508395308536, "learning_rate": 1.8797406746390295e-05, "loss": 0.8249, "step": 2224 }, { "epoch": 0.18308989919769594, "grad_norm": 3.3931450650826553, "learning_rate": 1.8796139205403373e-05, "loss": 0.8194, "step": 2225 }, { "epoch": 0.18317218679284097, "grad_norm": 3.244932583038611, "learning_rate": 1.8794871039552792e-05, "loss": 0.8389, "step": 2226 }, { "epoch": 0.183254474387986, "grad_norm": 2.6631938105606485, "learning_rate": 1.8793602248928636e-05, "loss": 0.8195, "step": 2227 }, { "epoch": 0.18333676198313104, "grad_norm": 0.499931675074203, "learning_rate": 1.8792332833621038e-05, "loss": 0.5945, "step": 2228 }, { "epoch": 0.18341904957827607, "grad_norm": 0.4348697621921401, "learning_rate": 1.879106279372018e-05, "loss": 0.5656, "step": 2229 }, { "epoch": 0.1835013371734211, "grad_norm": 3.2176301339993567, "learning_rate": 1.878979212931628e-05, "loss": 0.8338, "step": 2230 }, { "epoch": 0.18358362476856613, "grad_norm": 3.385877652749623, "learning_rate": 1.8788520840499602e-05, "loss": 0.8569, "step": 2231 }, { "epoch": 0.18366591236371116, "grad_norm": 2.6879516134859016, "learning_rate": 1.8787248927360456e-05, "loss": 0.8562, "step": 2232 }, { "epoch": 0.1837481999588562, "grad_norm": 3.1793592151350816, "learning_rate": 1.8785976389989206e-05, "loss": 0.8183, "step": 2233 }, { "epoch": 0.18383048755400122, "grad_norm": 3.5810162565636308, "learning_rate": 1.878470322847624e-05, "loss": 0.8458, "step": 2234 }, { "epoch": 0.18391277514914628, "grad_norm": 2.7189540947205972, "learning_rate": 1.878342944291201e-05, "loss": 0.8702, "step": 2235 }, { "epoch": 0.1839950627442913, "grad_norm": 2.8595668370124354, "learning_rate": 1.8782155033386994e-05, "loss": 0.8538, "step": 2236 }, { "epoch": 0.18407735033943634, "grad_norm": 3.703233163107493, "learning_rate": 1.8780879999991733e-05, "loss": 0.7903, "step": 2237 }, { "epoch": 0.18415963793458137, "grad_norm": 0.4798818692626459, "learning_rate": 1.87796043428168e-05, "loss": 0.558, "step": 2238 }, { "epoch": 0.1842419255297264, "grad_norm": 3.9357951606994956, "learning_rate": 1.8778328061952812e-05, "loss": 0.8604, "step": 2239 }, { "epoch": 0.18432421312487143, "grad_norm": 0.4613130280558913, "learning_rate": 1.877705115749044e-05, "loss": 0.5557, "step": 2240 }, { "epoch": 0.18440650072001646, "grad_norm": 0.4468114582336002, "learning_rate": 1.877577362952039e-05, "loss": 0.5699, "step": 2241 }, { "epoch": 0.1844887883151615, "grad_norm": 3.7317140281652956, "learning_rate": 1.8774495478133413e-05, "loss": 0.8726, "step": 2242 }, { "epoch": 0.18457107591030653, "grad_norm": 3.622122345180218, "learning_rate": 1.8773216703420316e-05, "loss": 0.8426, "step": 2243 }, { "epoch": 0.18465336350545156, "grad_norm": 2.7851269583116323, "learning_rate": 1.8771937305471933e-05, "loss": 0.8274, "step": 2244 }, { "epoch": 0.1847356511005966, "grad_norm": 3.0839548975497557, "learning_rate": 1.877065728437915e-05, "loss": 0.8522, "step": 2245 }, { "epoch": 0.18481793869574162, "grad_norm": 3.3013680164209305, "learning_rate": 1.87693766402329e-05, "loss": 0.8541, "step": 2246 }, { "epoch": 0.18490022629088665, "grad_norm": 2.9028412908174626, "learning_rate": 1.8768095373124163e-05, "loss": 0.8221, "step": 2247 }, { "epoch": 0.18498251388603168, "grad_norm": 3.138930779052706, "learning_rate": 1.8766813483143948e-05, "loss": 0.8398, "step": 2248 }, { "epoch": 0.1850648014811767, "grad_norm": 3.381505399118227, "learning_rate": 1.8765530970383327e-05, "loss": 0.8375, "step": 2249 }, { "epoch": 0.18514708907632174, "grad_norm": 2.9033167705998433, "learning_rate": 1.87642478349334e-05, "loss": 0.8365, "step": 2250 }, { "epoch": 0.18522937667146677, "grad_norm": 3.2505234992977696, "learning_rate": 1.8762964076885328e-05, "loss": 0.8237, "step": 2251 }, { "epoch": 0.1853116642666118, "grad_norm": 2.5321575440720956, "learning_rate": 1.8761679696330298e-05, "loss": 0.7935, "step": 2252 }, { "epoch": 0.18539395186175683, "grad_norm": 3.04670905608212, "learning_rate": 1.876039469335956e-05, "loss": 0.8158, "step": 2253 }, { "epoch": 0.18547623945690186, "grad_norm": 3.0164781648706636, "learning_rate": 1.875910906806439e-05, "loss": 0.83, "step": 2254 }, { "epoch": 0.1855585270520469, "grad_norm": 4.150397095497481, "learning_rate": 1.875782282053612e-05, "loss": 0.8448, "step": 2255 }, { "epoch": 0.18564081464719193, "grad_norm": 5.5579579203703435, "learning_rate": 1.875653595086612e-05, "loss": 0.8278, "step": 2256 }, { "epoch": 0.18572310224233696, "grad_norm": 2.7887068893538762, "learning_rate": 1.875524845914581e-05, "loss": 0.862, "step": 2257 }, { "epoch": 0.185805389837482, "grad_norm": 4.214553200363649, "learning_rate": 1.8753960345466658e-05, "loss": 0.842, "step": 2258 }, { "epoch": 0.18588767743262702, "grad_norm": 3.3289965629927916, "learning_rate": 1.875267160992016e-05, "loss": 0.834, "step": 2259 }, { "epoch": 0.18596996502777205, "grad_norm": 20.600639052601434, "learning_rate": 1.8751382252597868e-05, "loss": 0.8326, "step": 2260 }, { "epoch": 0.1860522526229171, "grad_norm": 2.710388129719034, "learning_rate": 1.8750092273591374e-05, "loss": 0.8377, "step": 2261 }, { "epoch": 0.18613454021806214, "grad_norm": 0.5909281433096669, "learning_rate": 1.8748801672992324e-05, "loss": 0.5645, "step": 2262 }, { "epoch": 0.18621682781320717, "grad_norm": 0.4908149271615487, "learning_rate": 1.874751045089239e-05, "loss": 0.5609, "step": 2263 }, { "epoch": 0.1862991154083522, "grad_norm": 3.1296412097537996, "learning_rate": 1.8746218607383304e-05, "loss": 0.8435, "step": 2264 }, { "epoch": 0.18638140300349723, "grad_norm": 2.9727680344314704, "learning_rate": 1.874492614255684e-05, "loss": 0.8584, "step": 2265 }, { "epoch": 0.18646369059864226, "grad_norm": 3.078013174658112, "learning_rate": 1.87436330565048e-05, "loss": 0.8797, "step": 2266 }, { "epoch": 0.1865459781937873, "grad_norm": 2.683367791467896, "learning_rate": 1.8742339349319056e-05, "loss": 0.8384, "step": 2267 }, { "epoch": 0.18662826578893232, "grad_norm": 3.1448392740468853, "learning_rate": 1.874104502109151e-05, "loss": 0.8489, "step": 2268 }, { "epoch": 0.18671055338407735, "grad_norm": 2.727871775490667, "learning_rate": 1.8739750071914096e-05, "loss": 0.8488, "step": 2269 }, { "epoch": 0.18679284097922239, "grad_norm": 3.3007638198806455, "learning_rate": 1.873845450187882e-05, "loss": 0.8398, "step": 2270 }, { "epoch": 0.18687512857436742, "grad_norm": 3.2603426010994294, "learning_rate": 1.873715831107771e-05, "loss": 0.8339, "step": 2271 }, { "epoch": 0.18695741616951245, "grad_norm": 2.4598611597771702, "learning_rate": 1.873586149960285e-05, "loss": 0.86, "step": 2272 }, { "epoch": 0.18703970376465748, "grad_norm": 0.7355623803063913, "learning_rate": 1.8734564067546354e-05, "loss": 0.594, "step": 2273 }, { "epoch": 0.1871219913598025, "grad_norm": 2.8609086065159235, "learning_rate": 1.8733266015000397e-05, "loss": 0.8542, "step": 2274 }, { "epoch": 0.18720427895494754, "grad_norm": 3.213644959041243, "learning_rate": 1.8731967342057192e-05, "loss": 0.8496, "step": 2275 }, { "epoch": 0.18728656655009257, "grad_norm": 2.839023213264933, "learning_rate": 1.8730668048808992e-05, "loss": 0.8466, "step": 2276 }, { "epoch": 0.1873688541452376, "grad_norm": 2.8302014001992735, "learning_rate": 1.8729368135348092e-05, "loss": 0.8277, "step": 2277 }, { "epoch": 0.18745114174038263, "grad_norm": 2.336413397518663, "learning_rate": 1.8728067601766843e-05, "loss": 0.8037, "step": 2278 }, { "epoch": 0.18753342933552766, "grad_norm": 3.093535380847658, "learning_rate": 1.872676644815763e-05, "loss": 0.8531, "step": 2279 }, { "epoch": 0.1876157169306727, "grad_norm": 2.923666112609439, "learning_rate": 1.8725464674612886e-05, "loss": 0.8425, "step": 2280 }, { "epoch": 0.18769800452581772, "grad_norm": 2.8664539298672573, "learning_rate": 1.8724162281225085e-05, "loss": 0.849, "step": 2281 }, { "epoch": 0.18778029212096276, "grad_norm": 3.538690882300127, "learning_rate": 1.8722859268086745e-05, "loss": 0.8086, "step": 2282 }, { "epoch": 0.18786257971610779, "grad_norm": 0.5196858721316295, "learning_rate": 1.8721555635290435e-05, "loss": 0.5675, "step": 2283 }, { "epoch": 0.18794486731125282, "grad_norm": 2.93418455305132, "learning_rate": 1.8720251382928762e-05, "loss": 0.8235, "step": 2284 }, { "epoch": 0.18802715490639785, "grad_norm": 3.2004573663386946, "learning_rate": 1.8718946511094375e-05, "loss": 0.8581, "step": 2285 }, { "epoch": 0.1881094425015429, "grad_norm": 2.9202248368901285, "learning_rate": 1.8717641019879972e-05, "loss": 0.8238, "step": 2286 }, { "epoch": 0.18819173009668794, "grad_norm": 3.1236514413686636, "learning_rate": 1.8716334909378294e-05, "loss": 0.8501, "step": 2287 }, { "epoch": 0.18827401769183297, "grad_norm": 3.1609170877521557, "learning_rate": 1.8715028179682122e-05, "loss": 0.8686, "step": 2288 }, { "epoch": 0.188356305286978, "grad_norm": 2.6161046242258497, "learning_rate": 1.871372083088429e-05, "loss": 0.8283, "step": 2289 }, { "epoch": 0.18843859288212303, "grad_norm": 2.8853603189305623, "learning_rate": 1.871241286307766e-05, "loss": 0.8106, "step": 2290 }, { "epoch": 0.18852088047726806, "grad_norm": 3.085578931198705, "learning_rate": 1.8711104276355153e-05, "loss": 0.8465, "step": 2291 }, { "epoch": 0.1886031680724131, "grad_norm": 0.48302128504076397, "learning_rate": 1.8709795070809737e-05, "loss": 0.5678, "step": 2292 }, { "epoch": 0.18868545566755812, "grad_norm": 3.288958363632039, "learning_rate": 1.87084852465344e-05, "loss": 0.8566, "step": 2293 }, { "epoch": 0.18876774326270315, "grad_norm": 3.4831897332745387, "learning_rate": 1.8707174803622202e-05, "loss": 0.8428, "step": 2294 }, { "epoch": 0.18885003085784818, "grad_norm": 0.4349912345338283, "learning_rate": 1.8705863742166232e-05, "loss": 0.5621, "step": 2295 }, { "epoch": 0.18893231845299321, "grad_norm": 2.931342167177246, "learning_rate": 1.8704552062259624e-05, "loss": 0.8505, "step": 2296 }, { "epoch": 0.18901460604813825, "grad_norm": 4.189017728249225, "learning_rate": 1.870323976399556e-05, "loss": 0.8631, "step": 2297 }, { "epoch": 0.18909689364328328, "grad_norm": 4.18904839099143, "learning_rate": 1.870192684746726e-05, "loss": 0.8408, "step": 2298 }, { "epoch": 0.1891791812384283, "grad_norm": 0.4624111826904663, "learning_rate": 1.8700613312767994e-05, "loss": 0.5512, "step": 2299 }, { "epoch": 0.18926146883357334, "grad_norm": 3.8345017192105604, "learning_rate": 1.8699299159991072e-05, "loss": 0.8251, "step": 2300 }, { "epoch": 0.18934375642871837, "grad_norm": 2.81460141306639, "learning_rate": 1.869798438922985e-05, "loss": 0.8566, "step": 2301 }, { "epoch": 0.1894260440238634, "grad_norm": 3.4766589626322815, "learning_rate": 1.8696669000577726e-05, "loss": 0.8427, "step": 2302 }, { "epoch": 0.18950833161900843, "grad_norm": 3.7855392027129473, "learning_rate": 1.869535299412815e-05, "loss": 0.8159, "step": 2303 }, { "epoch": 0.18959061921415346, "grad_norm": 2.9712686170543345, "learning_rate": 1.86940363699746e-05, "loss": 0.8284, "step": 2304 }, { "epoch": 0.1896729068092985, "grad_norm": 3.3034575793695518, "learning_rate": 1.8692719128210607e-05, "loss": 0.8163, "step": 2305 }, { "epoch": 0.18975519440444352, "grad_norm": 3.44328822803266, "learning_rate": 1.8691401268929754e-05, "loss": 0.8304, "step": 2306 }, { "epoch": 0.18983748199958855, "grad_norm": 3.393030951997428, "learning_rate": 1.8690082792225653e-05, "loss": 0.826, "step": 2307 }, { "epoch": 0.18991976959473358, "grad_norm": 3.112853842296299, "learning_rate": 1.868876369819197e-05, "loss": 0.8202, "step": 2308 }, { "epoch": 0.19000205718987861, "grad_norm": 3.018481253544045, "learning_rate": 1.8687443986922408e-05, "loss": 0.8323, "step": 2309 }, { "epoch": 0.19008434478502365, "grad_norm": 4.303472704157856, "learning_rate": 1.8686123658510715e-05, "loss": 0.8461, "step": 2310 }, { "epoch": 0.19016663238016868, "grad_norm": 11.860345394687899, "learning_rate": 1.8684802713050692e-05, "loss": 0.8089, "step": 2311 }, { "epoch": 0.19024891997531373, "grad_norm": 2.8572470787809126, "learning_rate": 1.8683481150636176e-05, "loss": 0.8382, "step": 2312 }, { "epoch": 0.19033120757045877, "grad_norm": 0.4654004570208718, "learning_rate": 1.8682158971361044e-05, "loss": 0.5562, "step": 2313 }, { "epoch": 0.1904134951656038, "grad_norm": 2.9838829688293855, "learning_rate": 1.8680836175319223e-05, "loss": 0.8277, "step": 2314 }, { "epoch": 0.19049578276074883, "grad_norm": 2.810228948693419, "learning_rate": 1.8679512762604683e-05, "loss": 0.8648, "step": 2315 }, { "epoch": 0.19057807035589386, "grad_norm": 2.881269477726161, "learning_rate": 1.8678188733311436e-05, "loss": 0.8557, "step": 2316 }, { "epoch": 0.1906603579510389, "grad_norm": 2.7469044292228353, "learning_rate": 1.8676864087533542e-05, "loss": 0.8506, "step": 2317 }, { "epoch": 0.19074264554618392, "grad_norm": 3.34744286595029, "learning_rate": 1.8675538825365104e-05, "loss": 0.8227, "step": 2318 }, { "epoch": 0.19082493314132895, "grad_norm": 2.737049820008201, "learning_rate": 1.8674212946900257e-05, "loss": 0.8469, "step": 2319 }, { "epoch": 0.19090722073647398, "grad_norm": 2.6458546665277303, "learning_rate": 1.8672886452233195e-05, "loss": 0.8492, "step": 2320 }, { "epoch": 0.190989508331619, "grad_norm": 0.4571970442666248, "learning_rate": 1.8671559341458148e-05, "loss": 0.5322, "step": 2321 }, { "epoch": 0.19107179592676404, "grad_norm": 2.482157540329627, "learning_rate": 1.8670231614669395e-05, "loss": 0.8354, "step": 2322 }, { "epoch": 0.19115408352190907, "grad_norm": 3.095874840133814, "learning_rate": 1.8668903271961258e-05, "loss": 0.8443, "step": 2323 }, { "epoch": 0.1912363711170541, "grad_norm": 2.638500405402555, "learning_rate": 1.8667574313428096e-05, "loss": 0.8592, "step": 2324 }, { "epoch": 0.19131865871219914, "grad_norm": 2.6137043327453715, "learning_rate": 1.866624473916431e-05, "loss": 0.8019, "step": 2325 }, { "epoch": 0.19140094630734417, "grad_norm": 2.394976700091314, "learning_rate": 1.8664914549264362e-05, "loss": 0.8372, "step": 2326 }, { "epoch": 0.1914832339024892, "grad_norm": 2.888092102030868, "learning_rate": 1.866358374382274e-05, "loss": 0.8372, "step": 2327 }, { "epoch": 0.19156552149763423, "grad_norm": 2.6980010643491124, "learning_rate": 1.8662252322933986e-05, "loss": 0.847, "step": 2328 }, { "epoch": 0.19164780909277926, "grad_norm": 2.796077785977032, "learning_rate": 1.866092028669268e-05, "loss": 0.8385, "step": 2329 }, { "epoch": 0.1917300966879243, "grad_norm": 2.9754697600807343, "learning_rate": 1.8659587635193447e-05, "loss": 0.8177, "step": 2330 }, { "epoch": 0.19181238428306932, "grad_norm": 2.5725791673462703, "learning_rate": 1.865825436853096e-05, "loss": 0.8363, "step": 2331 }, { "epoch": 0.19189467187821435, "grad_norm": 2.4801240805526388, "learning_rate": 1.8656920486799927e-05, "loss": 0.8175, "step": 2332 }, { "epoch": 0.19197695947335938, "grad_norm": 0.4704649399722642, "learning_rate": 1.8655585990095105e-05, "loss": 0.5492, "step": 2333 }, { "epoch": 0.1920592470685044, "grad_norm": 2.529497640955037, "learning_rate": 1.86542508785113e-05, "loss": 0.81, "step": 2334 }, { "epoch": 0.19214153466364944, "grad_norm": 0.4428561186683141, "learning_rate": 1.8652915152143353e-05, "loss": 0.5552, "step": 2335 }, { "epoch": 0.19222382225879447, "grad_norm": 2.446374147239063, "learning_rate": 1.8651578811086152e-05, "loss": 0.8422, "step": 2336 }, { "epoch": 0.1923061098539395, "grad_norm": 2.2281607455842605, "learning_rate": 1.8650241855434625e-05, "loss": 0.837, "step": 2337 }, { "epoch": 0.19238839744908456, "grad_norm": 0.4770791744471931, "learning_rate": 1.8648904285283754e-05, "loss": 0.5636, "step": 2338 }, { "epoch": 0.1924706850442296, "grad_norm": 2.8785971320310972, "learning_rate": 1.8647566100728553e-05, "loss": 0.8528, "step": 2339 }, { "epoch": 0.19255297263937463, "grad_norm": 3.100990086853741, "learning_rate": 1.864622730186409e-05, "loss": 0.819, "step": 2340 }, { "epoch": 0.19263526023451966, "grad_norm": 2.1552854912066515, "learning_rate": 1.8644887888785464e-05, "loss": 0.8404, "step": 2341 }, { "epoch": 0.1927175478296647, "grad_norm": 2.2255369657187076, "learning_rate": 1.8643547861587827e-05, "loss": 0.8721, "step": 2342 }, { "epoch": 0.19279983542480972, "grad_norm": 0.46985992482292277, "learning_rate": 1.8642207220366373e-05, "loss": 0.5565, "step": 2343 }, { "epoch": 0.19288212301995475, "grad_norm": 2.0616661872374173, "learning_rate": 1.8640865965216338e-05, "loss": 0.8231, "step": 2344 }, { "epoch": 0.19296441061509978, "grad_norm": 2.2729463116759927, "learning_rate": 1.8639524096233008e-05, "loss": 0.8361, "step": 2345 }, { "epoch": 0.1930466982102448, "grad_norm": 2.417986376102915, "learning_rate": 1.8638181613511702e-05, "loss": 0.8203, "step": 2346 }, { "epoch": 0.19312898580538984, "grad_norm": 2.0973418519154996, "learning_rate": 1.8636838517147785e-05, "loss": 0.8541, "step": 2347 }, { "epoch": 0.19321127340053487, "grad_norm": 5.893917396678879, "learning_rate": 1.8635494807236675e-05, "loss": 0.8374, "step": 2348 }, { "epoch": 0.1932935609956799, "grad_norm": 2.3484552007853186, "learning_rate": 1.8634150483873824e-05, "loss": 0.8721, "step": 2349 }, { "epoch": 0.19337584859082493, "grad_norm": 2.1061799447137677, "learning_rate": 1.8632805547154735e-05, "loss": 0.8276, "step": 2350 }, { "epoch": 0.19345813618596996, "grad_norm": 2.231070914752389, "learning_rate": 1.8631459997174942e-05, "loss": 0.8332, "step": 2351 }, { "epoch": 0.193540423781115, "grad_norm": 0.4785412716293615, "learning_rate": 1.8630113834030035e-05, "loss": 0.5741, "step": 2352 }, { "epoch": 0.19362271137626003, "grad_norm": 2.933088496173211, "learning_rate": 1.8628767057815643e-05, "loss": 0.8362, "step": 2353 }, { "epoch": 0.19370499897140506, "grad_norm": 3.3919255724286694, "learning_rate": 1.862741966862744e-05, "loss": 0.8178, "step": 2354 }, { "epoch": 0.1937872865665501, "grad_norm": 3.969290909229754, "learning_rate": 1.862607166656114e-05, "loss": 0.8415, "step": 2355 }, { "epoch": 0.19386957416169512, "grad_norm": 0.43363832800498575, "learning_rate": 1.8624723051712504e-05, "loss": 0.5441, "step": 2356 }, { "epoch": 0.19395186175684015, "grad_norm": 1.9923432140871553, "learning_rate": 1.8623373824177337e-05, "loss": 0.8504, "step": 2357 }, { "epoch": 0.19403414935198518, "grad_norm": 2.138729060920641, "learning_rate": 1.8622023984051486e-05, "loss": 0.8531, "step": 2358 }, { "epoch": 0.1941164369471302, "grad_norm": 2.996103398885607, "learning_rate": 1.8620673531430835e-05, "loss": 0.8295, "step": 2359 }, { "epoch": 0.19419872454227524, "grad_norm": 1.9881513928273853, "learning_rate": 1.8619322466411327e-05, "loss": 0.8423, "step": 2360 }, { "epoch": 0.19428101213742027, "grad_norm": 4.609204489054207, "learning_rate": 1.8617970789088936e-05, "loss": 0.8215, "step": 2361 }, { "epoch": 0.1943632997325653, "grad_norm": 2.4870160713294323, "learning_rate": 1.861661849955968e-05, "loss": 0.8403, "step": 2362 }, { "epoch": 0.19444558732771033, "grad_norm": 2.5663255039551793, "learning_rate": 1.8615265597919628e-05, "loss": 0.8504, "step": 2363 }, { "epoch": 0.1945278749228554, "grad_norm": 0.4462116251981424, "learning_rate": 1.8613912084264885e-05, "loss": 0.5743, "step": 2364 }, { "epoch": 0.19461016251800042, "grad_norm": 2.599011971955559, "learning_rate": 1.8612557958691603e-05, "loss": 0.8316, "step": 2365 }, { "epoch": 0.19469245011314545, "grad_norm": 2.162713113376905, "learning_rate": 1.861120322129598e-05, "loss": 0.8602, "step": 2366 }, { "epoch": 0.19477473770829049, "grad_norm": 2.427465370256237, "learning_rate": 1.860984787217425e-05, "loss": 0.852, "step": 2367 }, { "epoch": 0.19485702530343552, "grad_norm": 2.1567218802548083, "learning_rate": 1.8608491911422696e-05, "loss": 0.7971, "step": 2368 }, { "epoch": 0.19493931289858055, "grad_norm": 3.099411551421204, "learning_rate": 1.860713533913764e-05, "loss": 0.8112, "step": 2369 }, { "epoch": 0.19502160049372558, "grad_norm": 0.458001012562976, "learning_rate": 1.8605778155415462e-05, "loss": 0.5699, "step": 2370 }, { "epoch": 0.1951038880888706, "grad_norm": 2.490707822113723, "learning_rate": 1.860442036035256e-05, "loss": 0.8604, "step": 2371 }, { "epoch": 0.19518617568401564, "grad_norm": 0.4231122112274931, "learning_rate": 1.8603061954045404e-05, "loss": 0.5447, "step": 2372 }, { "epoch": 0.19526846327916067, "grad_norm": 0.43311431879242407, "learning_rate": 1.860170293659048e-05, "loss": 0.577, "step": 2373 }, { "epoch": 0.1953507508743057, "grad_norm": 3.3753031020556534, "learning_rate": 1.8600343308084338e-05, "loss": 0.8346, "step": 2374 }, { "epoch": 0.19543303846945073, "grad_norm": 2.504799282753451, "learning_rate": 1.859898306862356e-05, "loss": 0.8768, "step": 2375 }, { "epoch": 0.19551532606459576, "grad_norm": 2.4364845514832423, "learning_rate": 1.8597622218304775e-05, "loss": 0.8097, "step": 2376 }, { "epoch": 0.1955976136597408, "grad_norm": 0.43615558945077293, "learning_rate": 1.8596260757224664e-05, "loss": 0.5263, "step": 2377 }, { "epoch": 0.19567990125488582, "grad_norm": 0.43233615700524114, "learning_rate": 1.859489868547993e-05, "loss": 0.5498, "step": 2378 }, { "epoch": 0.19576218885003085, "grad_norm": 2.567122012275527, "learning_rate": 1.8593536003167343e-05, "loss": 0.8423, "step": 2379 }, { "epoch": 0.19584447644517589, "grad_norm": 2.205559994325501, "learning_rate": 1.8592172710383698e-05, "loss": 0.8249, "step": 2380 }, { "epoch": 0.19592676404032092, "grad_norm": 2.6574824466331655, "learning_rate": 1.8590808807225848e-05, "loss": 0.8404, "step": 2381 }, { "epoch": 0.19600905163546595, "grad_norm": 4.540012850243885, "learning_rate": 1.8589444293790676e-05, "loss": 0.8608, "step": 2382 }, { "epoch": 0.19609133923061098, "grad_norm": 2.376925428453061, "learning_rate": 1.858807917017512e-05, "loss": 0.8189, "step": 2383 }, { "epoch": 0.196173626825756, "grad_norm": 3.0647815868191, "learning_rate": 1.8586713436476157e-05, "loss": 0.8576, "step": 2384 }, { "epoch": 0.19625591442090104, "grad_norm": 2.205726772660387, "learning_rate": 1.85853470927908e-05, "loss": 0.8207, "step": 2385 }, { "epoch": 0.19633820201604607, "grad_norm": 2.5188989094601104, "learning_rate": 1.8583980139216118e-05, "loss": 0.8313, "step": 2386 }, { "epoch": 0.1964204896111911, "grad_norm": 3.4042284049906733, "learning_rate": 1.8582612575849213e-05, "loss": 0.8392, "step": 2387 }, { "epoch": 0.19650277720633613, "grad_norm": 2.626590132398553, "learning_rate": 1.858124440278724e-05, "loss": 0.8152, "step": 2388 }, { "epoch": 0.19658506480148116, "grad_norm": 0.4681042846841439, "learning_rate": 1.8579875620127383e-05, "loss": 0.55, "step": 2389 }, { "epoch": 0.19666735239662622, "grad_norm": 2.616631543764943, "learning_rate": 1.8578506227966888e-05, "loss": 0.8676, "step": 2390 }, { "epoch": 0.19674963999177125, "grad_norm": 2.7293629687423233, "learning_rate": 1.857713622640303e-05, "loss": 0.8578, "step": 2391 }, { "epoch": 0.19683192758691628, "grad_norm": 2.7035619612336235, "learning_rate": 1.8575765615533127e-05, "loss": 0.8684, "step": 2392 }, { "epoch": 0.19691421518206131, "grad_norm": 2.9052169281551765, "learning_rate": 1.8574394395454553e-05, "loss": 0.8565, "step": 2393 }, { "epoch": 0.19699650277720634, "grad_norm": 2.7331128487533873, "learning_rate": 1.8573022566264714e-05, "loss": 0.8281, "step": 2394 }, { "epoch": 0.19707879037235138, "grad_norm": 3.636132586598684, "learning_rate": 1.857165012806106e-05, "loss": 0.8503, "step": 2395 }, { "epoch": 0.1971610779674964, "grad_norm": 0.4439175289748809, "learning_rate": 1.8570277080941094e-05, "loss": 0.5331, "step": 2396 }, { "epoch": 0.19724336556264144, "grad_norm": 2.6806719070401916, "learning_rate": 1.8568903425002345e-05, "loss": 0.8419, "step": 2397 }, { "epoch": 0.19732565315778647, "grad_norm": 2.6079792907790753, "learning_rate": 1.8567529160342402e-05, "loss": 0.8212, "step": 2398 }, { "epoch": 0.1974079407529315, "grad_norm": 3.2683764361117613, "learning_rate": 1.8566154287058893e-05, "loss": 0.8368, "step": 2399 }, { "epoch": 0.19749022834807653, "grad_norm": 2.7142642134286192, "learning_rate": 1.8564778805249478e-05, "loss": 0.8486, "step": 2400 }, { "epoch": 0.19757251594322156, "grad_norm": 3.1786352003299063, "learning_rate": 1.856340271501188e-05, "loss": 0.8377, "step": 2401 }, { "epoch": 0.1976548035383666, "grad_norm": 0.4578846665888037, "learning_rate": 1.856202601644384e-05, "loss": 0.5602, "step": 2402 }, { "epoch": 0.19773709113351162, "grad_norm": 3.3414556761978127, "learning_rate": 1.856064870964317e-05, "loss": 0.8377, "step": 2403 }, { "epoch": 0.19781937872865665, "grad_norm": 3.5318250292925404, "learning_rate": 1.8559270794707705e-05, "loss": 0.8063, "step": 2404 }, { "epoch": 0.19790166632380168, "grad_norm": 5.51079782302022, "learning_rate": 1.855789227173533e-05, "loss": 0.8297, "step": 2405 }, { "epoch": 0.19798395391894671, "grad_norm": 2.7529644026691935, "learning_rate": 1.855651314082398e-05, "loss": 0.8244, "step": 2406 }, { "epoch": 0.19806624151409175, "grad_norm": 2.5494023443174614, "learning_rate": 1.8555133402071614e-05, "loss": 0.8445, "step": 2407 }, { "epoch": 0.19814852910923678, "grad_norm": 4.775236277519297, "learning_rate": 1.8553753055576254e-05, "loss": 0.8293, "step": 2408 }, { "epoch": 0.1982308167043818, "grad_norm": 7.835477243177075, "learning_rate": 1.8552372101435962e-05, "loss": 0.8173, "step": 2409 }, { "epoch": 0.19831310429952684, "grad_norm": 0.44220497880015885, "learning_rate": 1.855099053974883e-05, "loss": 0.5415, "step": 2410 }, { "epoch": 0.19839539189467187, "grad_norm": 0.4449089177486766, "learning_rate": 1.8549608370613006e-05, "loss": 0.5428, "step": 2411 }, { "epoch": 0.1984776794898169, "grad_norm": 3.004236595412859, "learning_rate": 1.8548225594126675e-05, "loss": 0.8524, "step": 2412 }, { "epoch": 0.19855996708496193, "grad_norm": 3.1225989803474627, "learning_rate": 1.8546842210388068e-05, "loss": 0.9006, "step": 2413 }, { "epoch": 0.19864225468010696, "grad_norm": 2.8258591055606126, "learning_rate": 1.854545821949546e-05, "loss": 0.8293, "step": 2414 }, { "epoch": 0.198724542275252, "grad_norm": 2.913102608268323, "learning_rate": 1.8544073621547166e-05, "loss": 0.8325, "step": 2415 }, { "epoch": 0.19880682987039705, "grad_norm": 2.694231768464871, "learning_rate": 1.854268841664155e-05, "loss": 0.8299, "step": 2416 }, { "epoch": 0.19888911746554208, "grad_norm": 2.548369516157711, "learning_rate": 1.8541302604877006e-05, "loss": 0.8597, "step": 2417 }, { "epoch": 0.1989714050606871, "grad_norm": 3.8359828941267007, "learning_rate": 1.8539916186351984e-05, "loss": 0.8384, "step": 2418 }, { "epoch": 0.19905369265583214, "grad_norm": 0.4921292366844118, "learning_rate": 1.8538529161164977e-05, "loss": 0.5389, "step": 2419 }, { "epoch": 0.19913598025097717, "grad_norm": 2.626437336085917, "learning_rate": 1.8537141529414516e-05, "loss": 0.8366, "step": 2420 }, { "epoch": 0.1992182678461222, "grad_norm": 2.9140289684900327, "learning_rate": 1.853575329119917e-05, "loss": 0.8311, "step": 2421 }, { "epoch": 0.19930055544126724, "grad_norm": 2.785950193969917, "learning_rate": 1.8534364446617564e-05, "loss": 0.8464, "step": 2422 }, { "epoch": 0.19938284303641227, "grad_norm": 2.57493998214015, "learning_rate": 1.853297499576835e-05, "loss": 0.8371, "step": 2423 }, { "epoch": 0.1994651306315573, "grad_norm": 3.038908047752371, "learning_rate": 1.8531584938750248e-05, "loss": 0.8134, "step": 2424 }, { "epoch": 0.19954741822670233, "grad_norm": 3.2786711756817204, "learning_rate": 1.8530194275661988e-05, "loss": 0.8103, "step": 2425 }, { "epoch": 0.19962970582184736, "grad_norm": 2.963800782951755, "learning_rate": 1.852880300660237e-05, "loss": 0.8317, "step": 2426 }, { "epoch": 0.1997119934169924, "grad_norm": 3.337774402134451, "learning_rate": 1.852741113167023e-05, "loss": 0.8266, "step": 2427 }, { "epoch": 0.19979428101213742, "grad_norm": 2.5929113251908276, "learning_rate": 1.852601865096444e-05, "loss": 0.8395, "step": 2428 }, { "epoch": 0.19987656860728245, "grad_norm": 2.5937196786890477, "learning_rate": 1.852462556458392e-05, "loss": 0.8333, "step": 2429 }, { "epoch": 0.19995885620242748, "grad_norm": 0.5152107323732656, "learning_rate": 1.852323187262763e-05, "loss": 0.557, "step": 2430 }, { "epoch": 0.2000411437975725, "grad_norm": 3.3407100425128715, "learning_rate": 1.8521837575194583e-05, "loss": 0.8529, "step": 2431 }, { "epoch": 0.20012343139271754, "grad_norm": 2.6538977779080963, "learning_rate": 1.852044267238382e-05, "loss": 0.8414, "step": 2432 }, { "epoch": 0.20020571898786257, "grad_norm": 2.314162147595894, "learning_rate": 1.851904716429444e-05, "loss": 0.8125, "step": 2433 }, { "epoch": 0.2002880065830076, "grad_norm": 2.2460408127807168, "learning_rate": 1.851765105102557e-05, "loss": 0.8334, "step": 2434 }, { "epoch": 0.20037029417815264, "grad_norm": 3.0959390544791385, "learning_rate": 1.8516254332676393e-05, "loss": 0.8331, "step": 2435 }, { "epoch": 0.20045258177329767, "grad_norm": 2.5566174122477814, "learning_rate": 1.8514857009346125e-05, "loss": 0.8476, "step": 2436 }, { "epoch": 0.2005348693684427, "grad_norm": 0.44869119102794647, "learning_rate": 1.8513459081134036e-05, "loss": 0.5641, "step": 2437 }, { "epoch": 0.20061715696358773, "grad_norm": 0.45617551602186374, "learning_rate": 1.8512060548139427e-05, "loss": 0.5813, "step": 2438 }, { "epoch": 0.20069944455873276, "grad_norm": 0.43039996828216154, "learning_rate": 1.8510661410461655e-05, "loss": 0.5552, "step": 2439 }, { "epoch": 0.2007817321538778, "grad_norm": 0.4327012016490628, "learning_rate": 1.8509261668200103e-05, "loss": 0.5479, "step": 2440 }, { "epoch": 0.20086401974902285, "grad_norm": 2.525208048517637, "learning_rate": 1.8507861321454207e-05, "loss": 0.8417, "step": 2441 }, { "epoch": 0.20094630734416788, "grad_norm": 2.311404228985444, "learning_rate": 1.8506460370323452e-05, "loss": 0.8064, "step": 2442 }, { "epoch": 0.2010285949393129, "grad_norm": 2.039238767153375, "learning_rate": 1.8505058814907358e-05, "loss": 0.8351, "step": 2443 }, { "epoch": 0.20111088253445794, "grad_norm": 0.47468901596411156, "learning_rate": 1.8503656655305488e-05, "loss": 0.538, "step": 2444 }, { "epoch": 0.20119317012960297, "grad_norm": 2.275462288576652, "learning_rate": 1.8502253891617447e-05, "loss": 0.8254, "step": 2445 }, { "epoch": 0.201275457724748, "grad_norm": 2.1855146818106106, "learning_rate": 1.8500850523942886e-05, "loss": 0.8405, "step": 2446 }, { "epoch": 0.20135774531989303, "grad_norm": 2.4976463221502985, "learning_rate": 1.84994465523815e-05, "loss": 0.8452, "step": 2447 }, { "epoch": 0.20144003291503806, "grad_norm": 2.367625706280731, "learning_rate": 1.8498041977033027e-05, "loss": 0.8341, "step": 2448 }, { "epoch": 0.2015223205101831, "grad_norm": 2.260678210056219, "learning_rate": 1.8496636797997238e-05, "loss": 0.8404, "step": 2449 }, { "epoch": 0.20160460810532813, "grad_norm": 2.5435304382753827, "learning_rate": 1.849523101537396e-05, "loss": 0.8238, "step": 2450 }, { "epoch": 0.20168689570047316, "grad_norm": 2.6618203820717263, "learning_rate": 1.8493824629263058e-05, "loss": 0.8193, "step": 2451 }, { "epoch": 0.2017691832956182, "grad_norm": 2.6893590793056763, "learning_rate": 1.8492417639764438e-05, "loss": 0.85, "step": 2452 }, { "epoch": 0.20185147089076322, "grad_norm": 0.48216153771983744, "learning_rate": 1.849101004697805e-05, "loss": 0.5712, "step": 2453 }, { "epoch": 0.20193375848590825, "grad_norm": 2.5294000953688967, "learning_rate": 1.8489601851003888e-05, "loss": 0.8515, "step": 2454 }, { "epoch": 0.20201604608105328, "grad_norm": 0.4643243447618097, "learning_rate": 1.8488193051941986e-05, "loss": 0.5578, "step": 2455 }, { "epoch": 0.2020983336761983, "grad_norm": 2.319559503794457, "learning_rate": 1.8486783649892426e-05, "loss": 0.8621, "step": 2456 }, { "epoch": 0.20218062127134334, "grad_norm": 2.4512435702486406, "learning_rate": 1.848537364495533e-05, "loss": 0.7958, "step": 2457 }, { "epoch": 0.20226290886648837, "grad_norm": 2.6411049911355655, "learning_rate": 1.848396303723086e-05, "loss": 0.8378, "step": 2458 }, { "epoch": 0.2023451964616334, "grad_norm": 2.8276073390762066, "learning_rate": 1.8482551826819222e-05, "loss": 0.8325, "step": 2459 }, { "epoch": 0.20242748405677843, "grad_norm": 2.2864955529084185, "learning_rate": 1.848114001382067e-05, "loss": 0.8112, "step": 2460 }, { "epoch": 0.20250977165192346, "grad_norm": 2.0420321250633875, "learning_rate": 1.847972759833549e-05, "loss": 0.8234, "step": 2461 }, { "epoch": 0.2025920592470685, "grad_norm": 0.48223680461587926, "learning_rate": 1.847831458046403e-05, "loss": 0.554, "step": 2462 }, { "epoch": 0.20267434684221353, "grad_norm": 2.129881382514526, "learning_rate": 1.8476900960306652e-05, "loss": 0.8267, "step": 2463 }, { "epoch": 0.20275663443735856, "grad_norm": 2.859892646285134, "learning_rate": 1.8475486737963796e-05, "loss": 0.8667, "step": 2464 }, { "epoch": 0.2028389220325036, "grad_norm": 2.624248665968573, "learning_rate": 1.847407191353591e-05, "loss": 0.8523, "step": 2465 }, { "epoch": 0.20292120962764862, "grad_norm": 2.8516232385657068, "learning_rate": 1.847265648712351e-05, "loss": 0.8541, "step": 2466 }, { "epoch": 0.20300349722279368, "grad_norm": 2.6055637034912404, "learning_rate": 1.8471240458827146e-05, "loss": 0.8502, "step": 2467 }, { "epoch": 0.2030857848179387, "grad_norm": 2.4577761773176783, "learning_rate": 1.8469823828747407e-05, "loss": 0.8413, "step": 2468 }, { "epoch": 0.20316807241308374, "grad_norm": 2.47006715525194, "learning_rate": 1.8468406596984926e-05, "loss": 0.8081, "step": 2469 }, { "epoch": 0.20325036000822877, "grad_norm": 2.5083809333937372, "learning_rate": 1.8466988763640384e-05, "loss": 0.833, "step": 2470 }, { "epoch": 0.2033326476033738, "grad_norm": 2.2500043659647817, "learning_rate": 1.84655703288145e-05, "loss": 0.8458, "step": 2471 }, { "epoch": 0.20341493519851883, "grad_norm": 0.47109708584420956, "learning_rate": 1.846415129260804e-05, "loss": 0.533, "step": 2472 }, { "epoch": 0.20349722279366386, "grad_norm": 2.735180153674388, "learning_rate": 1.846273165512181e-05, "loss": 0.8279, "step": 2473 }, { "epoch": 0.2035795103888089, "grad_norm": 2.5548649604074583, "learning_rate": 1.8461311416456656e-05, "loss": 0.8504, "step": 2474 }, { "epoch": 0.20366179798395392, "grad_norm": 2.7375890325929926, "learning_rate": 1.845989057671347e-05, "loss": 0.8506, "step": 2475 }, { "epoch": 0.20374408557909895, "grad_norm": 2.218342428166774, "learning_rate": 1.8458469135993188e-05, "loss": 0.8416, "step": 2476 }, { "epoch": 0.20382637317424399, "grad_norm": 2.44498954622628, "learning_rate": 1.845704709439679e-05, "loss": 0.7982, "step": 2477 }, { "epoch": 0.20390866076938902, "grad_norm": 2.3257198537986996, "learning_rate": 1.8455624452025284e-05, "loss": 0.8516, "step": 2478 }, { "epoch": 0.20399094836453405, "grad_norm": 3.589919113908559, "learning_rate": 1.845420120897974e-05, "loss": 0.848, "step": 2479 }, { "epoch": 0.20407323595967908, "grad_norm": 2.6683764973422375, "learning_rate": 1.8452777365361266e-05, "loss": 0.843, "step": 2480 }, { "epoch": 0.2041555235548241, "grad_norm": 0.46016880736381627, "learning_rate": 1.8451352921271007e-05, "loss": 0.5541, "step": 2481 }, { "epoch": 0.20423781114996914, "grad_norm": 0.45308129612654763, "learning_rate": 1.844992787681015e-05, "loss": 0.5546, "step": 2482 }, { "epoch": 0.20432009874511417, "grad_norm": 2.163903037214254, "learning_rate": 1.8448502232079933e-05, "loss": 0.8245, "step": 2483 }, { "epoch": 0.2044023863402592, "grad_norm": 2.5581655100420733, "learning_rate": 1.844707598718163e-05, "loss": 0.7763, "step": 2484 }, { "epoch": 0.20448467393540423, "grad_norm": 2.288182080148139, "learning_rate": 1.8445649142216553e-05, "loss": 0.8515, "step": 2485 }, { "epoch": 0.20456696153054926, "grad_norm": 2.559408657335864, "learning_rate": 1.844422169728607e-05, "loss": 0.8182, "step": 2486 }, { "epoch": 0.2046492491256943, "grad_norm": 2.2948972501014118, "learning_rate": 1.8442793652491583e-05, "loss": 0.816, "step": 2487 }, { "epoch": 0.20473153672083932, "grad_norm": 3.059449870890419, "learning_rate": 1.8441365007934537e-05, "loss": 0.8346, "step": 2488 }, { "epoch": 0.20481382431598436, "grad_norm": 2.3426692532650955, "learning_rate": 1.843993576371642e-05, "loss": 0.8382, "step": 2489 }, { "epoch": 0.20489611191112939, "grad_norm": 3.668869085131925, "learning_rate": 1.8438505919938764e-05, "loss": 0.8097, "step": 2490 }, { "epoch": 0.20497839950627442, "grad_norm": 0.4843583755464429, "learning_rate": 1.8437075476703145e-05, "loss": 0.5594, "step": 2491 }, { "epoch": 0.20506068710141945, "grad_norm": 2.9411304009446413, "learning_rate": 1.8435644434111172e-05, "loss": 0.8315, "step": 2492 }, { "epoch": 0.2051429746965645, "grad_norm": 3.0347377147594616, "learning_rate": 1.8434212792264512e-05, "loss": 0.8413, "step": 2493 }, { "epoch": 0.20522526229170954, "grad_norm": 2.948040171867707, "learning_rate": 1.8432780551264864e-05, "loss": 0.8287, "step": 2494 }, { "epoch": 0.20530754988685457, "grad_norm": 2.526042320792797, "learning_rate": 1.8431347711213975e-05, "loss": 0.8192, "step": 2495 }, { "epoch": 0.2053898374819996, "grad_norm": 2.4529626679650933, "learning_rate": 1.8429914272213624e-05, "loss": 0.8374, "step": 2496 }, { "epoch": 0.20547212507714463, "grad_norm": 3.0109020139279776, "learning_rate": 1.8428480234365648e-05, "loss": 0.8538, "step": 2497 }, { "epoch": 0.20555441267228966, "grad_norm": 2.7309518930126084, "learning_rate": 1.8427045597771915e-05, "loss": 0.8288, "step": 2498 }, { "epoch": 0.2056367002674347, "grad_norm": 2.7571851152572324, "learning_rate": 1.8425610362534336e-05, "loss": 0.818, "step": 2499 }, { "epoch": 0.20571898786257972, "grad_norm": 3.6541677601404463, "learning_rate": 1.8424174528754874e-05, "loss": 0.8242, "step": 2500 }, { "epoch": 0.20580127545772475, "grad_norm": 3.2673973935668363, "learning_rate": 1.8422738096535528e-05, "loss": 0.8559, "step": 2501 }, { "epoch": 0.20588356305286978, "grad_norm": 3.280268088475709, "learning_rate": 1.8421301065978336e-05, "loss": 0.8289, "step": 2502 }, { "epoch": 0.20596585064801481, "grad_norm": 3.644658448351934, "learning_rate": 1.8419863437185385e-05, "loss": 0.8492, "step": 2503 }, { "epoch": 0.20604813824315985, "grad_norm": 3.9409239786685584, "learning_rate": 1.84184252102588e-05, "loss": 0.861, "step": 2504 }, { "epoch": 0.20613042583830488, "grad_norm": 2.770614222633164, "learning_rate": 1.841698638530075e-05, "loss": 0.8466, "step": 2505 }, { "epoch": 0.2062127134334499, "grad_norm": 2.6802796451434037, "learning_rate": 1.841554696241345e-05, "loss": 0.8178, "step": 2506 }, { "epoch": 0.20629500102859494, "grad_norm": 5.256236524078497, "learning_rate": 1.8414106941699152e-05, "loss": 0.8525, "step": 2507 }, { "epoch": 0.20637728862373997, "grad_norm": 2.8550407022060478, "learning_rate": 1.8412666323260154e-05, "loss": 0.8244, "step": 2508 }, { "epoch": 0.206459576218885, "grad_norm": 2.837058387140895, "learning_rate": 1.8411225107198795e-05, "loss": 0.8158, "step": 2509 }, { "epoch": 0.20654186381403003, "grad_norm": 3.2657972442124312, "learning_rate": 1.8409783293617454e-05, "loss": 0.8482, "step": 2510 }, { "epoch": 0.20662415140917506, "grad_norm": 3.107298788236777, "learning_rate": 1.8408340882618557e-05, "loss": 0.8323, "step": 2511 }, { "epoch": 0.2067064390043201, "grad_norm": 3.7888502892935323, "learning_rate": 1.8406897874304576e-05, "loss": 0.8402, "step": 2512 }, { "epoch": 0.20678872659946512, "grad_norm": 2.793788251488496, "learning_rate": 1.840545426877801e-05, "loss": 0.8629, "step": 2513 }, { "epoch": 0.20687101419461015, "grad_norm": 2.7013088321665997, "learning_rate": 1.8404010066141414e-05, "loss": 0.8053, "step": 2514 }, { "epoch": 0.20695330178975518, "grad_norm": 3.230473671989884, "learning_rate": 1.840256526649739e-05, "loss": 0.8322, "step": 2515 }, { "epoch": 0.20703558938490021, "grad_norm": 4.101974661085115, "learning_rate": 1.840111986994856e-05, "loss": 0.8134, "step": 2516 }, { "epoch": 0.20711787698004525, "grad_norm": 4.152939298483347, "learning_rate": 1.8399673876597615e-05, "loss": 0.8474, "step": 2517 }, { "epoch": 0.20720016457519028, "grad_norm": 3.0769463216688115, "learning_rate": 1.839822728654727e-05, "loss": 0.8299, "step": 2518 }, { "epoch": 0.20728245217033534, "grad_norm": 2.9770483204175453, "learning_rate": 1.8396780099900287e-05, "loss": 0.8034, "step": 2519 }, { "epoch": 0.20736473976548037, "grad_norm": 10.913510804926778, "learning_rate": 1.8395332316759474e-05, "loss": 0.8102, "step": 2520 }, { "epoch": 0.2074470273606254, "grad_norm": 3.301389018336796, "learning_rate": 1.8393883937227682e-05, "loss": 0.8269, "step": 2521 }, { "epoch": 0.20752931495577043, "grad_norm": 3.9626609639773585, "learning_rate": 1.83924349614078e-05, "loss": 0.8484, "step": 2522 }, { "epoch": 0.20761160255091546, "grad_norm": 3.308462805380347, "learning_rate": 1.8390985389402757e-05, "loss": 0.8193, "step": 2523 }, { "epoch": 0.2076938901460605, "grad_norm": 3.446980055063646, "learning_rate": 1.8389535221315533e-05, "loss": 0.8165, "step": 2524 }, { "epoch": 0.20777617774120552, "grad_norm": 3.361166331767642, "learning_rate": 1.8388084457249145e-05, "loss": 0.8403, "step": 2525 }, { "epoch": 0.20785846533635055, "grad_norm": 0.4489733132461202, "learning_rate": 1.8386633097306652e-05, "loss": 0.5779, "step": 2526 }, { "epoch": 0.20794075293149558, "grad_norm": 3.3763074588542126, "learning_rate": 1.8385181141591155e-05, "loss": 0.8355, "step": 2527 }, { "epoch": 0.2080230405266406, "grad_norm": 3.1426058775191716, "learning_rate": 1.83837285902058e-05, "loss": 0.8228, "step": 2528 }, { "epoch": 0.20810532812178564, "grad_norm": 3.408831364628426, "learning_rate": 1.838227544325377e-05, "loss": 0.8374, "step": 2529 }, { "epoch": 0.20818761571693067, "grad_norm": 3.4386486634409086, "learning_rate": 1.8380821700838306e-05, "loss": 0.8511, "step": 2530 }, { "epoch": 0.2082699033120757, "grad_norm": 3.2638778987282717, "learning_rate": 1.8379367363062667e-05, "loss": 0.8327, "step": 2531 }, { "epoch": 0.20835219090722074, "grad_norm": 3.5679775987459994, "learning_rate": 1.8377912430030172e-05, "loss": 0.8329, "step": 2532 }, { "epoch": 0.20843447850236577, "grad_norm": 3.2528208627335955, "learning_rate": 1.8376456901844174e-05, "loss": 0.8053, "step": 2533 }, { "epoch": 0.2085167660975108, "grad_norm": 3.264120422841936, "learning_rate": 1.8375000778608077e-05, "loss": 0.825, "step": 2534 }, { "epoch": 0.20859905369265583, "grad_norm": 0.45110572381254516, "learning_rate": 1.8373544060425318e-05, "loss": 0.5506, "step": 2535 }, { "epoch": 0.20868134128780086, "grad_norm": 2.593216453166507, "learning_rate": 1.8372086747399377e-05, "loss": 0.8277, "step": 2536 }, { "epoch": 0.2087636288829459, "grad_norm": 4.31170410793178, "learning_rate": 1.8370628839633786e-05, "loss": 0.8103, "step": 2537 }, { "epoch": 0.20884591647809092, "grad_norm": 3.069700353630395, "learning_rate": 1.836917033723211e-05, "loss": 0.8451, "step": 2538 }, { "epoch": 0.20892820407323595, "grad_norm": 2.727415507565117, "learning_rate": 1.8367711240297955e-05, "loss": 0.8193, "step": 2539 }, { "epoch": 0.20901049166838098, "grad_norm": 2.583035618535975, "learning_rate": 1.836625154893498e-05, "loss": 0.8272, "step": 2540 }, { "epoch": 0.209092779263526, "grad_norm": 2.8510094205198984, "learning_rate": 1.8364791263246872e-05, "loss": 0.8416, "step": 2541 }, { "epoch": 0.20917506685867104, "grad_norm": 2.5411935858791237, "learning_rate": 1.8363330383337367e-05, "loss": 0.8165, "step": 2542 }, { "epoch": 0.20925735445381607, "grad_norm": 3.818919343874717, "learning_rate": 1.8361868909310252e-05, "loss": 0.816, "step": 2543 }, { "epoch": 0.2093396420489611, "grad_norm": 3.227265299702525, "learning_rate": 1.836040684126934e-05, "loss": 0.8406, "step": 2544 }, { "epoch": 0.20942192964410616, "grad_norm": 2.8657353537124775, "learning_rate": 1.8358944179318493e-05, "loss": 0.8013, "step": 2545 }, { "epoch": 0.2095042172392512, "grad_norm": 2.834396947381362, "learning_rate": 1.8357480923561626e-05, "loss": 0.816, "step": 2546 }, { "epoch": 0.20958650483439623, "grad_norm": 3.236954335848728, "learning_rate": 1.835601707410268e-05, "loss": 0.8182, "step": 2547 }, { "epoch": 0.20966879242954126, "grad_norm": 3.2899460403922824, "learning_rate": 1.835455263104564e-05, "loss": 0.8374, "step": 2548 }, { "epoch": 0.2097510800246863, "grad_norm": 3.436877513077752, "learning_rate": 1.8353087594494543e-05, "loss": 0.8474, "step": 2549 }, { "epoch": 0.20983336761983132, "grad_norm": 2.9944542174818594, "learning_rate": 1.8351621964553463e-05, "loss": 0.84, "step": 2550 }, { "epoch": 0.20991565521497635, "grad_norm": 3.6190473827362992, "learning_rate": 1.8350155741326518e-05, "loss": 0.8452, "step": 2551 }, { "epoch": 0.20999794281012138, "grad_norm": 2.4354433594418294, "learning_rate": 1.834868892491786e-05, "loss": 0.8122, "step": 2552 }, { "epoch": 0.2100802304052664, "grad_norm": 2.5426108248638855, "learning_rate": 1.8347221515431692e-05, "loss": 0.7887, "step": 2553 }, { "epoch": 0.21016251800041144, "grad_norm": 3.25334084668476, "learning_rate": 1.8345753512972258e-05, "loss": 0.8431, "step": 2554 }, { "epoch": 0.21024480559555647, "grad_norm": 2.7030206230832645, "learning_rate": 1.834428491764384e-05, "loss": 0.8317, "step": 2555 }, { "epoch": 0.2103270931907015, "grad_norm": 0.47494536114521607, "learning_rate": 1.834281572955077e-05, "loss": 0.5453, "step": 2556 }, { "epoch": 0.21040938078584653, "grad_norm": 4.098758048826732, "learning_rate": 1.834134594879741e-05, "loss": 0.8475, "step": 2557 }, { "epoch": 0.21049166838099156, "grad_norm": 3.3335138581703943, "learning_rate": 1.8339875575488176e-05, "loss": 0.8151, "step": 2558 }, { "epoch": 0.2105739559761366, "grad_norm": 3.66003173031739, "learning_rate": 1.8338404609727517e-05, "loss": 0.8462, "step": 2559 }, { "epoch": 0.21065624357128163, "grad_norm": 3.3051867621868847, "learning_rate": 1.833693305161993e-05, "loss": 0.8143, "step": 2560 }, { "epoch": 0.21073853116642666, "grad_norm": 3.3695611559510197, "learning_rate": 1.833546090126995e-05, "loss": 0.8315, "step": 2561 }, { "epoch": 0.2108208187615717, "grad_norm": 0.4589791313932974, "learning_rate": 1.8333988158782162e-05, "loss": 0.5253, "step": 2562 }, { "epoch": 0.21090310635671672, "grad_norm": 3.5936763964991374, "learning_rate": 1.833251482426118e-05, "loss": 0.8455, "step": 2563 }, { "epoch": 0.21098539395186175, "grad_norm": 3.2932378054726503, "learning_rate": 1.8331040897811672e-05, "loss": 0.8313, "step": 2564 }, { "epoch": 0.21106768154700678, "grad_norm": 3.232556623686311, "learning_rate": 1.8329566379538342e-05, "loss": 0.8091, "step": 2565 }, { "epoch": 0.2111499691421518, "grad_norm": 3.104605617968999, "learning_rate": 1.832809126954594e-05, "loss": 0.8163, "step": 2566 }, { "epoch": 0.21123225673729684, "grad_norm": 3.496201671234622, "learning_rate": 1.832661556793925e-05, "loss": 0.8261, "step": 2567 }, { "epoch": 0.21131454433244187, "grad_norm": 3.136057053194723, "learning_rate": 1.8325139274823108e-05, "loss": 0.8178, "step": 2568 }, { "epoch": 0.2113968319275869, "grad_norm": 3.0653813480017713, "learning_rate": 1.8323662390302385e-05, "loss": 0.8255, "step": 2569 }, { "epoch": 0.21147911952273196, "grad_norm": 3.8601064789131834, "learning_rate": 1.8322184914482e-05, "loss": 0.8372, "step": 2570 }, { "epoch": 0.211561407117877, "grad_norm": 2.8367654502679, "learning_rate": 1.8320706847466905e-05, "loss": 0.8198, "step": 2571 }, { "epoch": 0.21164369471302202, "grad_norm": 4.618800398679875, "learning_rate": 1.8319228189362105e-05, "loss": 0.8153, "step": 2572 }, { "epoch": 0.21172598230816705, "grad_norm": 3.2117553898555675, "learning_rate": 1.8317748940272637e-05, "loss": 0.799, "step": 2573 }, { "epoch": 0.21180826990331209, "grad_norm": 3.427210044976752, "learning_rate": 1.8316269100303586e-05, "loss": 0.8151, "step": 2574 }, { "epoch": 0.21189055749845712, "grad_norm": 3.5133491900994476, "learning_rate": 1.8314788669560083e-05, "loss": 0.8532, "step": 2575 }, { "epoch": 0.21197284509360215, "grad_norm": 3.720043866224015, "learning_rate": 1.8313307648147286e-05, "loss": 0.8238, "step": 2576 }, { "epoch": 0.21205513268874718, "grad_norm": 3.635938030216419, "learning_rate": 1.831182603617041e-05, "loss": 0.8029, "step": 2577 }, { "epoch": 0.2121374202838922, "grad_norm": 3.6691540244521414, "learning_rate": 1.8310343833734704e-05, "loss": 0.8238, "step": 2578 }, { "epoch": 0.21221970787903724, "grad_norm": 2.9683873093103847, "learning_rate": 1.830886104094546e-05, "loss": 0.8452, "step": 2579 }, { "epoch": 0.21230199547418227, "grad_norm": 3.92284826064443, "learning_rate": 1.830737765790802e-05, "loss": 0.7959, "step": 2580 }, { "epoch": 0.2123842830693273, "grad_norm": 6.433500257947444, "learning_rate": 1.8305893684727758e-05, "loss": 0.8113, "step": 2581 }, { "epoch": 0.21246657066447233, "grad_norm": 0.4783127403947231, "learning_rate": 1.830440912151009e-05, "loss": 0.5652, "step": 2582 }, { "epoch": 0.21254885825961736, "grad_norm": 3.060683872511528, "learning_rate": 1.8302923968360476e-05, "loss": 0.8024, "step": 2583 }, { "epoch": 0.2126311458547624, "grad_norm": 3.0053452613095413, "learning_rate": 1.830143822538443e-05, "loss": 0.8557, "step": 2584 }, { "epoch": 0.21271343344990742, "grad_norm": 0.4673855917288871, "learning_rate": 1.829995189268748e-05, "loss": 0.5718, "step": 2585 }, { "epoch": 0.21279572104505246, "grad_norm": 6.640886036898429, "learning_rate": 1.8298464970375228e-05, "loss": 0.831, "step": 2586 }, { "epoch": 0.21287800864019749, "grad_norm": 0.42158033064070444, "learning_rate": 1.8296977458553292e-05, "loss": 0.5438, "step": 2587 }, { "epoch": 0.21296029623534252, "grad_norm": 2.555875718353797, "learning_rate": 1.8295489357327345e-05, "loss": 0.8275, "step": 2588 }, { "epoch": 0.21304258383048755, "grad_norm": 3.406091656124588, "learning_rate": 1.8294000666803104e-05, "loss": 0.8295, "step": 2589 }, { "epoch": 0.21312487142563258, "grad_norm": 2.5938644299533613, "learning_rate": 1.8292511387086317e-05, "loss": 0.8365, "step": 2590 }, { "epoch": 0.2132071590207776, "grad_norm": 2.8496068081536685, "learning_rate": 1.8291021518282786e-05, "loss": 0.8452, "step": 2591 }, { "epoch": 0.21328944661592264, "grad_norm": 3.3752857892659893, "learning_rate": 1.8289531060498345e-05, "loss": 0.8271, "step": 2592 }, { "epoch": 0.21337173421106767, "grad_norm": 3.1170922614104795, "learning_rate": 1.8288040013838873e-05, "loss": 0.7921, "step": 2593 }, { "epoch": 0.2134540218062127, "grad_norm": 2.9388409386634513, "learning_rate": 1.8286548378410295e-05, "loss": 0.7972, "step": 2594 }, { "epoch": 0.21353630940135773, "grad_norm": 3.4323486322698193, "learning_rate": 1.8285056154318573e-05, "loss": 0.8038, "step": 2595 }, { "epoch": 0.2136185969965028, "grad_norm": 3.2115666371727642, "learning_rate": 1.828356334166971e-05, "loss": 0.8441, "step": 2596 }, { "epoch": 0.21370088459164782, "grad_norm": 0.4663816033019578, "learning_rate": 1.8282069940569756e-05, "loss": 0.5721, "step": 2597 }, { "epoch": 0.21378317218679285, "grad_norm": 0.4355461155177228, "learning_rate": 1.8280575951124796e-05, "loss": 0.5343, "step": 2598 }, { "epoch": 0.21386545978193788, "grad_norm": 3.747322259261724, "learning_rate": 1.8279081373440967e-05, "loss": 0.8277, "step": 2599 }, { "epoch": 0.21394774737708291, "grad_norm": 2.6362418750782846, "learning_rate": 1.8277586207624436e-05, "loss": 0.8239, "step": 2600 }, { "epoch": 0.21403003497222794, "grad_norm": 2.8752720486831325, "learning_rate": 1.827609045378142e-05, "loss": 0.8247, "step": 2601 }, { "epoch": 0.21411232256737298, "grad_norm": 2.5111329831110525, "learning_rate": 1.8274594112018172e-05, "loss": 0.818, "step": 2602 }, { "epoch": 0.214194610162518, "grad_norm": 0.47382323197265425, "learning_rate": 1.827309718244099e-05, "loss": 0.552, "step": 2603 }, { "epoch": 0.21427689775766304, "grad_norm": 2.30810347940035, "learning_rate": 1.827159966515622e-05, "loss": 0.814, "step": 2604 }, { "epoch": 0.21435918535280807, "grad_norm": 2.3416958083524873, "learning_rate": 1.8270101560270234e-05, "loss": 0.8063, "step": 2605 }, { "epoch": 0.2144414729479531, "grad_norm": 2.880474026652732, "learning_rate": 1.8268602867889462e-05, "loss": 0.8221, "step": 2606 }, { "epoch": 0.21452376054309813, "grad_norm": 0.4761050768702208, "learning_rate": 1.8267103588120364e-05, "loss": 0.5639, "step": 2607 }, { "epoch": 0.21460604813824316, "grad_norm": 2.478967954728775, "learning_rate": 1.8265603721069453e-05, "loss": 0.8187, "step": 2608 }, { "epoch": 0.2146883357333882, "grad_norm": 4.3294368537334345, "learning_rate": 1.8264103266843264e-05, "loss": 0.8205, "step": 2609 }, { "epoch": 0.21477062332853322, "grad_norm": 0.4533011865343612, "learning_rate": 1.8262602225548403e-05, "loss": 0.5647, "step": 2610 }, { "epoch": 0.21485291092367825, "grad_norm": 3.5416383187210565, "learning_rate": 1.826110059729149e-05, "loss": 0.8418, "step": 2611 }, { "epoch": 0.21493519851882328, "grad_norm": 2.932491105752039, "learning_rate": 1.82595983821792e-05, "loss": 0.8319, "step": 2612 }, { "epoch": 0.21501748611396831, "grad_norm": 2.254319838401644, "learning_rate": 1.8258095580318258e-05, "loss": 0.7838, "step": 2613 }, { "epoch": 0.21509977370911335, "grad_norm": 2.4112108755813737, "learning_rate": 1.8256592191815407e-05, "loss": 0.8142, "step": 2614 }, { "epoch": 0.21518206130425838, "grad_norm": 2.0638729753662615, "learning_rate": 1.8255088216777454e-05, "loss": 0.8094, "step": 2615 }, { "epoch": 0.2152643488994034, "grad_norm": 0.4540631567309475, "learning_rate": 1.8253583655311232e-05, "loss": 0.5444, "step": 2616 }, { "epoch": 0.21534663649454844, "grad_norm": 2.868258138099938, "learning_rate": 1.8252078507523633e-05, "loss": 0.8408, "step": 2617 }, { "epoch": 0.21542892408969347, "grad_norm": 0.46467846212608765, "learning_rate": 1.8250572773521568e-05, "loss": 0.527, "step": 2618 }, { "epoch": 0.2155112116848385, "grad_norm": 0.4244812006406836, "learning_rate": 1.824906645341201e-05, "loss": 0.5297, "step": 2619 }, { "epoch": 0.21559349927998353, "grad_norm": 2.530046080436701, "learning_rate": 1.8247559547301966e-05, "loss": 0.8493, "step": 2620 }, { "epoch": 0.21567578687512856, "grad_norm": 2.273032185986812, "learning_rate": 1.8246052055298478e-05, "loss": 0.8309, "step": 2621 }, { "epoch": 0.21575807447027362, "grad_norm": 3.6569740933446746, "learning_rate": 1.824454397750864e-05, "loss": 0.8308, "step": 2622 }, { "epoch": 0.21584036206541865, "grad_norm": 3.0536589173423576, "learning_rate": 1.8243035314039587e-05, "loss": 0.8164, "step": 2623 }, { "epoch": 0.21592264966056368, "grad_norm": 3.0019301593091954, "learning_rate": 1.8241526064998485e-05, "loss": 0.8532, "step": 2624 }, { "epoch": 0.2160049372557087, "grad_norm": 3.427353544502481, "learning_rate": 1.8240016230492554e-05, "loss": 0.8472, "step": 2625 }, { "epoch": 0.21608722485085374, "grad_norm": 3.6884329481060103, "learning_rate": 1.8238505810629045e-05, "loss": 0.8313, "step": 2626 }, { "epoch": 0.21616951244599877, "grad_norm": 4.17700274528595, "learning_rate": 1.8236994805515263e-05, "loss": 0.858, "step": 2627 }, { "epoch": 0.2162518000411438, "grad_norm": 3.5142493762355222, "learning_rate": 1.8235483215258538e-05, "loss": 0.8013, "step": 2628 }, { "epoch": 0.21633408763628884, "grad_norm": 0.5701316952851451, "learning_rate": 1.823397103996626e-05, "loss": 0.5865, "step": 2629 }, { "epoch": 0.21641637523143387, "grad_norm": 6.342171267674201, "learning_rate": 1.8232458279745845e-05, "loss": 0.7946, "step": 2630 }, { "epoch": 0.2164986628265789, "grad_norm": 4.56657322487598, "learning_rate": 1.823094493470476e-05, "loss": 0.8507, "step": 2631 }, { "epoch": 0.21658095042172393, "grad_norm": 4.7063511682274, "learning_rate": 1.822943100495051e-05, "loss": 0.793, "step": 2632 }, { "epoch": 0.21666323801686896, "grad_norm": 5.375144420205079, "learning_rate": 1.8227916490590644e-05, "loss": 0.8057, "step": 2633 }, { "epoch": 0.216745525612014, "grad_norm": 3.8597372199138364, "learning_rate": 1.822640139173275e-05, "loss": 0.8367, "step": 2634 }, { "epoch": 0.21682781320715902, "grad_norm": 3.734335633179637, "learning_rate": 1.8224885708484454e-05, "loss": 0.8396, "step": 2635 }, { "epoch": 0.21691010080230405, "grad_norm": 4.897625227987675, "learning_rate": 1.8223369440953434e-05, "loss": 0.8158, "step": 2636 }, { "epoch": 0.21699238839744908, "grad_norm": 0.5368699806869238, "learning_rate": 1.82218525892474e-05, "loss": 0.5561, "step": 2637 }, { "epoch": 0.2170746759925941, "grad_norm": 3.8252691208651184, "learning_rate": 1.8220335153474104e-05, "loss": 0.7976, "step": 2638 }, { "epoch": 0.21715696358773914, "grad_norm": 3.7756639892156727, "learning_rate": 1.8218817133741348e-05, "loss": 0.8426, "step": 2639 }, { "epoch": 0.21723925118288417, "grad_norm": 4.658025704820251, "learning_rate": 1.8217298530156963e-05, "loss": 0.8187, "step": 2640 }, { "epoch": 0.2173215387780292, "grad_norm": 3.733061302198968, "learning_rate": 1.8215779342828835e-05, "loss": 0.8049, "step": 2641 }, { "epoch": 0.21740382637317424, "grad_norm": 3.6973346475021387, "learning_rate": 1.8214259571864886e-05, "loss": 0.8648, "step": 2642 }, { "epoch": 0.21748611396831927, "grad_norm": 3.184437840138913, "learning_rate": 1.821273921737307e-05, "loss": 0.8411, "step": 2643 }, { "epoch": 0.2175684015634643, "grad_norm": 3.8392363631998614, "learning_rate": 1.8211218279461393e-05, "loss": 0.8379, "step": 2644 }, { "epoch": 0.21765068915860933, "grad_norm": 3.88941746570052, "learning_rate": 1.82096967582379e-05, "loss": 0.8284, "step": 2645 }, { "epoch": 0.21773297675375436, "grad_norm": 4.027135426349342, "learning_rate": 1.8208174653810683e-05, "loss": 0.8311, "step": 2646 }, { "epoch": 0.2178152643488994, "grad_norm": 3.719374349183988, "learning_rate": 1.8206651966287863e-05, "loss": 0.8015, "step": 2647 }, { "epoch": 0.21789755194404445, "grad_norm": 3.94239636368207, "learning_rate": 1.8205128695777613e-05, "loss": 0.8184, "step": 2648 }, { "epoch": 0.21797983953918948, "grad_norm": 0.5068293052496876, "learning_rate": 1.820360484238814e-05, "loss": 0.5834, "step": 2649 }, { "epoch": 0.2180621271343345, "grad_norm": 3.0598014030618925, "learning_rate": 1.8202080406227703e-05, "loss": 0.8277, "step": 2650 }, { "epoch": 0.21814441472947954, "grad_norm": 2.8718911462563117, "learning_rate": 1.820055538740459e-05, "loss": 0.8531, "step": 2651 }, { "epoch": 0.21822670232462457, "grad_norm": 2.882907738566527, "learning_rate": 1.8199029786027133e-05, "loss": 0.818, "step": 2652 }, { "epoch": 0.2183089899197696, "grad_norm": 0.4460061330621163, "learning_rate": 1.8197503602203716e-05, "loss": 0.5489, "step": 2653 }, { "epoch": 0.21839127751491463, "grad_norm": 2.787502149270521, "learning_rate": 1.8195976836042753e-05, "loss": 0.8272, "step": 2654 }, { "epoch": 0.21847356511005966, "grad_norm": 3.6657569464084636, "learning_rate": 1.8194449487652704e-05, "loss": 0.8209, "step": 2655 }, { "epoch": 0.2185558527052047, "grad_norm": 4.190422309727537, "learning_rate": 1.8192921557142068e-05, "loss": 0.8266, "step": 2656 }, { "epoch": 0.21863814030034973, "grad_norm": 0.4720717771807136, "learning_rate": 1.8191393044619386e-05, "loss": 0.5499, "step": 2657 }, { "epoch": 0.21872042789549476, "grad_norm": 3.5968065813132255, "learning_rate": 1.818986395019324e-05, "loss": 0.8364, "step": 2658 }, { "epoch": 0.2188027154906398, "grad_norm": 2.6988580933542057, "learning_rate": 1.818833427397226e-05, "loss": 0.812, "step": 2659 }, { "epoch": 0.21888500308578482, "grad_norm": 3.1563976116639547, "learning_rate": 1.818680401606511e-05, "loss": 0.802, "step": 2660 }, { "epoch": 0.21896729068092985, "grad_norm": 2.9698731854020854, "learning_rate": 1.8185273176580494e-05, "loss": 0.8007, "step": 2661 }, { "epoch": 0.21904957827607488, "grad_norm": 3.209600169593195, "learning_rate": 1.818374175562716e-05, "loss": 0.8291, "step": 2662 }, { "epoch": 0.2191318658712199, "grad_norm": 3.2855530219150206, "learning_rate": 1.8182209753313903e-05, "loss": 0.8061, "step": 2663 }, { "epoch": 0.21921415346636494, "grad_norm": 2.9586512787993042, "learning_rate": 1.8180677169749547e-05, "loss": 0.848, "step": 2664 }, { "epoch": 0.21929644106150997, "grad_norm": 2.355790190628477, "learning_rate": 1.817914400504297e-05, "loss": 0.8436, "step": 2665 }, { "epoch": 0.219378728656655, "grad_norm": 3.9499318826017222, "learning_rate": 1.8177610259303085e-05, "loss": 0.8172, "step": 2666 }, { "epoch": 0.21946101625180003, "grad_norm": 3.587633446988715, "learning_rate": 1.8176075932638842e-05, "loss": 0.8162, "step": 2667 }, { "epoch": 0.21954330384694506, "grad_norm": 2.408074965663899, "learning_rate": 1.8174541025159242e-05, "loss": 0.8084, "step": 2668 }, { "epoch": 0.2196255914420901, "grad_norm": 2.3988549424557015, "learning_rate": 1.817300553697332e-05, "loss": 0.8377, "step": 2669 }, { "epoch": 0.21970787903723513, "grad_norm": 2.6276552918579172, "learning_rate": 1.8171469468190156e-05, "loss": 0.8095, "step": 2670 }, { "epoch": 0.21979016663238016, "grad_norm": 0.4931309790183725, "learning_rate": 1.816993281891887e-05, "loss": 0.5831, "step": 2671 }, { "epoch": 0.2198724542275252, "grad_norm": 2.862710610340563, "learning_rate": 1.8168395589268624e-05, "loss": 0.8434, "step": 2672 }, { "epoch": 0.21995474182267022, "grad_norm": 2.7040519252261705, "learning_rate": 1.8166857779348618e-05, "loss": 0.835, "step": 2673 }, { "epoch": 0.22003702941781528, "grad_norm": 2.2027133058893513, "learning_rate": 1.816531938926809e-05, "loss": 0.808, "step": 2674 }, { "epoch": 0.2201193170129603, "grad_norm": 2.1305283929039462, "learning_rate": 1.816378041913634e-05, "loss": 0.801, "step": 2675 }, { "epoch": 0.22020160460810534, "grad_norm": 2.1727604185232288, "learning_rate": 1.816224086906268e-05, "loss": 0.7963, "step": 2676 }, { "epoch": 0.22028389220325037, "grad_norm": 2.0940359561475055, "learning_rate": 1.816070073915648e-05, "loss": 0.8036, "step": 2677 }, { "epoch": 0.2203661797983954, "grad_norm": 2.6473855146481498, "learning_rate": 1.815916002952716e-05, "loss": 0.8284, "step": 2678 }, { "epoch": 0.22044846739354043, "grad_norm": 2.8347082933489376, "learning_rate": 1.8157618740284153e-05, "loss": 0.7905, "step": 2679 }, { "epoch": 0.22053075498868546, "grad_norm": 3.203459633707998, "learning_rate": 1.8156076871536958e-05, "loss": 0.8463, "step": 2680 }, { "epoch": 0.2206130425838305, "grad_norm": 2.5115557324427824, "learning_rate": 1.8154534423395107e-05, "loss": 0.8296, "step": 2681 }, { "epoch": 0.22069533017897552, "grad_norm": 2.6138770428847358, "learning_rate": 1.815299139596817e-05, "loss": 0.8051, "step": 2682 }, { "epoch": 0.22077761777412055, "grad_norm": 3.2232601828969694, "learning_rate": 1.8151447789365764e-05, "loss": 0.8251, "step": 2683 }, { "epoch": 0.22085990536926559, "grad_norm": 2.35388990966657, "learning_rate": 1.814990360369754e-05, "loss": 0.8204, "step": 2684 }, { "epoch": 0.22094219296441062, "grad_norm": 2.9872998389223513, "learning_rate": 1.8148358839073205e-05, "loss": 0.8165, "step": 2685 }, { "epoch": 0.22102448055955565, "grad_norm": 2.8245273602696557, "learning_rate": 1.8146813495602484e-05, "loss": 0.8324, "step": 2686 }, { "epoch": 0.22110676815470068, "grad_norm": 3.1009018581544483, "learning_rate": 1.8145267573395163e-05, "loss": 0.7939, "step": 2687 }, { "epoch": 0.2211890557498457, "grad_norm": 3.0898385828991026, "learning_rate": 1.8143721072561062e-05, "loss": 0.8672, "step": 2688 }, { "epoch": 0.22127134334499074, "grad_norm": 2.4901137109662494, "learning_rate": 1.8142173993210034e-05, "loss": 0.8209, "step": 2689 }, { "epoch": 0.22135363094013577, "grad_norm": 2.6992537201209488, "learning_rate": 1.814062633545199e-05, "loss": 0.8152, "step": 2690 }, { "epoch": 0.2214359185352808, "grad_norm": 2.47623420446661, "learning_rate": 1.813907809939687e-05, "loss": 0.8095, "step": 2691 }, { "epoch": 0.22151820613042583, "grad_norm": 2.483085492924412, "learning_rate": 1.813752928515466e-05, "loss": 0.8188, "step": 2692 }, { "epoch": 0.22160049372557086, "grad_norm": 2.7071635220071273, "learning_rate": 1.8135979892835383e-05, "loss": 0.8264, "step": 2693 }, { "epoch": 0.2216827813207159, "grad_norm": 2.2474688512352206, "learning_rate": 1.8134429922549106e-05, "loss": 0.8026, "step": 2694 }, { "epoch": 0.22176506891586092, "grad_norm": 2.4233908375852065, "learning_rate": 1.8132879374405937e-05, "loss": 0.8337, "step": 2695 }, { "epoch": 0.22184735651100596, "grad_norm": 3.165042749257207, "learning_rate": 1.813132824851602e-05, "loss": 0.8341, "step": 2696 }, { "epoch": 0.221929644106151, "grad_norm": 3.0657787022724947, "learning_rate": 1.812977654498955e-05, "loss": 0.8126, "step": 2697 }, { "epoch": 0.22201193170129602, "grad_norm": 2.7194251670803857, "learning_rate": 1.812822426393676e-05, "loss": 0.8572, "step": 2698 }, { "epoch": 0.22209421929644105, "grad_norm": 2.755261631156646, "learning_rate": 1.8126671405467914e-05, "loss": 0.809, "step": 2699 }, { "epoch": 0.2221765068915861, "grad_norm": 2.7498923006459854, "learning_rate": 1.812511796969333e-05, "loss": 0.8131, "step": 2700 }, { "epoch": 0.22225879448673114, "grad_norm": 2.435235398052274, "learning_rate": 1.8123563956723357e-05, "loss": 0.8362, "step": 2701 }, { "epoch": 0.22234108208187617, "grad_norm": 0.457018645766829, "learning_rate": 1.8122009366668394e-05, "loss": 0.5594, "step": 2702 }, { "epoch": 0.2224233696770212, "grad_norm": 4.292693915757756, "learning_rate": 1.8120454199638874e-05, "loss": 0.8575, "step": 2703 }, { "epoch": 0.22250565727216623, "grad_norm": 2.8863182053111234, "learning_rate": 1.8118898455745276e-05, "loss": 0.7982, "step": 2704 }, { "epoch": 0.22258794486731126, "grad_norm": 2.7521147693360715, "learning_rate": 1.811734213509811e-05, "loss": 0.839, "step": 2705 }, { "epoch": 0.2226702324624563, "grad_norm": 3.2656957555222323, "learning_rate": 1.8115785237807948e-05, "loss": 0.834, "step": 2706 }, { "epoch": 0.22275252005760132, "grad_norm": 3.789129894489883, "learning_rate": 1.811422776398538e-05, "loss": 0.8128, "step": 2707 }, { "epoch": 0.22283480765274635, "grad_norm": 2.3356779771566245, "learning_rate": 1.8112669713741046e-05, "loss": 0.8131, "step": 2708 }, { "epoch": 0.22291709524789138, "grad_norm": 3.555488258057913, "learning_rate": 1.8111111087185633e-05, "loss": 0.8236, "step": 2709 }, { "epoch": 0.22299938284303641, "grad_norm": 2.0143410520363356, "learning_rate": 1.8109551884429858e-05, "loss": 0.8185, "step": 2710 }, { "epoch": 0.22308167043818145, "grad_norm": 2.63263049454595, "learning_rate": 1.8107992105584488e-05, "loss": 0.8389, "step": 2711 }, { "epoch": 0.22316395803332648, "grad_norm": 2.279152679520782, "learning_rate": 1.8106431750760326e-05, "loss": 0.7975, "step": 2712 }, { "epoch": 0.2232462456284715, "grad_norm": 2.6298664235486275, "learning_rate": 1.8104870820068214e-05, "loss": 0.8051, "step": 2713 }, { "epoch": 0.22332853322361654, "grad_norm": 2.6437941616288865, "learning_rate": 1.8103309313619042e-05, "loss": 0.81, "step": 2714 }, { "epoch": 0.22341082081876157, "grad_norm": 2.77868949196984, "learning_rate": 1.8101747231523735e-05, "loss": 0.8422, "step": 2715 }, { "epoch": 0.2234931084139066, "grad_norm": 2.386387317962988, "learning_rate": 1.8100184573893265e-05, "loss": 0.8189, "step": 2716 }, { "epoch": 0.22357539600905163, "grad_norm": 0.47172623629404353, "learning_rate": 1.8098621340838635e-05, "loss": 0.5472, "step": 2717 }, { "epoch": 0.22365768360419666, "grad_norm": 2.2257199614717864, "learning_rate": 1.8097057532470893e-05, "loss": 0.8239, "step": 2718 }, { "epoch": 0.2237399711993417, "grad_norm": 2.2490034040330222, "learning_rate": 1.809549314890114e-05, "loss": 0.8069, "step": 2719 }, { "epoch": 0.22382225879448672, "grad_norm": 2.0704084361324018, "learning_rate": 1.8093928190240496e-05, "loss": 0.8194, "step": 2720 }, { "epoch": 0.22390454638963175, "grad_norm": 1.9673046749980998, "learning_rate": 1.809236265660014e-05, "loss": 0.8221, "step": 2721 }, { "epoch": 0.22398683398477678, "grad_norm": 0.4623247008641807, "learning_rate": 1.809079654809128e-05, "loss": 0.5534, "step": 2722 }, { "epoch": 0.22406912157992182, "grad_norm": 2.435738230686578, "learning_rate": 1.8089229864825175e-05, "loss": 0.8088, "step": 2723 }, { "epoch": 0.22415140917506685, "grad_norm": 2.4030263135753187, "learning_rate": 1.8087662606913116e-05, "loss": 0.7986, "step": 2724 }, { "epoch": 0.2242336967702119, "grad_norm": 2.3586380489701733, "learning_rate": 1.808609477446644e-05, "loss": 0.8449, "step": 2725 }, { "epoch": 0.22431598436535694, "grad_norm": 2.7952344002987983, "learning_rate": 1.808452636759652e-05, "loss": 0.8458, "step": 2726 }, { "epoch": 0.22439827196050197, "grad_norm": 2.7536534246615854, "learning_rate": 1.8082957386414782e-05, "loss": 0.829, "step": 2727 }, { "epoch": 0.224480559555647, "grad_norm": 0.4342296596543259, "learning_rate": 1.8081387831032675e-05, "loss": 0.5589, "step": 2728 }, { "epoch": 0.22456284715079203, "grad_norm": 2.3765572582680106, "learning_rate": 1.8079817701561702e-05, "loss": 0.8591, "step": 2729 }, { "epoch": 0.22464513474593706, "grad_norm": 2.807707088671858, "learning_rate": 1.80782469981134e-05, "loss": 0.8012, "step": 2730 }, { "epoch": 0.2247274223410821, "grad_norm": 2.87764660553564, "learning_rate": 1.807667572079935e-05, "loss": 0.839, "step": 2731 }, { "epoch": 0.22480970993622712, "grad_norm": 0.43016929955597694, "learning_rate": 1.8075103869731174e-05, "loss": 0.5304, "step": 2732 }, { "epoch": 0.22489199753137215, "grad_norm": 2.8712395483576296, "learning_rate": 1.8073531445020533e-05, "loss": 0.8004, "step": 2733 }, { "epoch": 0.22497428512651718, "grad_norm": 3.9125598864320894, "learning_rate": 1.8071958446779133e-05, "loss": 0.8353, "step": 2734 }, { "epoch": 0.2250565727216622, "grad_norm": 2.8766794137375276, "learning_rate": 1.807038487511871e-05, "loss": 0.7854, "step": 2735 }, { "epoch": 0.22513886031680724, "grad_norm": 3.76834116747564, "learning_rate": 1.8068810730151053e-05, "loss": 0.8137, "step": 2736 }, { "epoch": 0.22522114791195227, "grad_norm": 3.853892293284356, "learning_rate": 1.8067236011987987e-05, "loss": 0.8351, "step": 2737 }, { "epoch": 0.2253034355070973, "grad_norm": 3.415055201893161, "learning_rate": 1.8065660720741374e-05, "loss": 0.8034, "step": 2738 }, { "epoch": 0.22538572310224234, "grad_norm": 2.940132599899046, "learning_rate": 1.8064084856523126e-05, "loss": 0.8307, "step": 2739 }, { "epoch": 0.22546801069738737, "grad_norm": 3.581335962493952, "learning_rate": 1.8062508419445187e-05, "loss": 0.8331, "step": 2740 }, { "epoch": 0.2255502982925324, "grad_norm": 3.63356357971262, "learning_rate": 1.806093140961954e-05, "loss": 0.8409, "step": 2741 }, { "epoch": 0.22563258588767743, "grad_norm": 6.103630461426989, "learning_rate": 1.805935382715822e-05, "loss": 0.805, "step": 2742 }, { "epoch": 0.22571487348282246, "grad_norm": 4.939277422318664, "learning_rate": 1.8057775672173292e-05, "loss": 0.8172, "step": 2743 }, { "epoch": 0.2257971610779675, "grad_norm": 3.5615037872775708, "learning_rate": 1.8056196944776867e-05, "loss": 0.8284, "step": 2744 }, { "epoch": 0.22587944867311252, "grad_norm": 4.683351176812795, "learning_rate": 1.8054617645081094e-05, "loss": 0.797, "step": 2745 }, { "epoch": 0.22596173626825755, "grad_norm": 3.9070880231642082, "learning_rate": 1.8053037773198165e-05, "loss": 0.7933, "step": 2746 }, { "epoch": 0.22604402386340258, "grad_norm": 3.919312161038904, "learning_rate": 1.8051457329240315e-05, "loss": 0.8028, "step": 2747 }, { "epoch": 0.2261263114585476, "grad_norm": 3.340988457131581, "learning_rate": 1.8049876313319807e-05, "loss": 0.8357, "step": 2748 }, { "epoch": 0.22620859905369264, "grad_norm": 3.8575957682483053, "learning_rate": 1.8048294725548966e-05, "loss": 0.8367, "step": 2749 }, { "epoch": 0.22629088664883767, "grad_norm": 5.825965824067105, "learning_rate": 1.8046712566040135e-05, "loss": 0.8053, "step": 2750 }, { "epoch": 0.22637317424398273, "grad_norm": 3.497742944083964, "learning_rate": 1.8045129834905713e-05, "loss": 0.8078, "step": 2751 }, { "epoch": 0.22645546183912776, "grad_norm": 3.7369114255439047, "learning_rate": 1.8043546532258133e-05, "loss": 0.7694, "step": 2752 }, { "epoch": 0.2265377494342728, "grad_norm": 3.5319658700375576, "learning_rate": 1.8041962658209873e-05, "loss": 0.8042, "step": 2753 }, { "epoch": 0.22662003702941783, "grad_norm": 0.4604383030756832, "learning_rate": 1.8040378212873445e-05, "loss": 0.5335, "step": 2754 }, { "epoch": 0.22670232462456286, "grad_norm": 0.4470665522275032, "learning_rate": 1.803879319636141e-05, "loss": 0.5761, "step": 2755 }, { "epoch": 0.2267846122197079, "grad_norm": 0.4366989067725965, "learning_rate": 1.803720760878636e-05, "loss": 0.5476, "step": 2756 }, { "epoch": 0.22686689981485292, "grad_norm": 0.4340565178454343, "learning_rate": 1.8035621450260934e-05, "loss": 0.5576, "step": 2757 }, { "epoch": 0.22694918740999795, "grad_norm": 3.256507120264482, "learning_rate": 1.8034034720897815e-05, "loss": 0.8254, "step": 2758 }, { "epoch": 0.22703147500514298, "grad_norm": 3.547568746283773, "learning_rate": 1.8032447420809714e-05, "loss": 0.7996, "step": 2759 }, { "epoch": 0.227113762600288, "grad_norm": 3.162031884420745, "learning_rate": 1.8030859550109395e-05, "loss": 0.7906, "step": 2760 }, { "epoch": 0.22719605019543304, "grad_norm": 5.324282934442062, "learning_rate": 1.8029271108909658e-05, "loss": 0.8107, "step": 2761 }, { "epoch": 0.22727833779057807, "grad_norm": 0.5110136509069735, "learning_rate": 1.8027682097323345e-05, "loss": 0.5632, "step": 2762 }, { "epoch": 0.2273606253857231, "grad_norm": 3.1907523347530984, "learning_rate": 1.802609251546333e-05, "loss": 0.8165, "step": 2763 }, { "epoch": 0.22744291298086813, "grad_norm": 0.45997827251280465, "learning_rate": 1.802450236344254e-05, "loss": 0.5599, "step": 2764 }, { "epoch": 0.22752520057601316, "grad_norm": 4.260820446140328, "learning_rate": 1.802291164137394e-05, "loss": 0.8009, "step": 2765 }, { "epoch": 0.2276074881711582, "grad_norm": 3.0574181449459585, "learning_rate": 1.802132034937052e-05, "loss": 0.827, "step": 2766 }, { "epoch": 0.22768977576630323, "grad_norm": 19.53527055975888, "learning_rate": 1.8019728487545337e-05, "loss": 0.8205, "step": 2767 }, { "epoch": 0.22777206336144826, "grad_norm": 2.5986421856673134, "learning_rate": 1.8018136056011464e-05, "loss": 0.8297, "step": 2768 }, { "epoch": 0.2278543509565933, "grad_norm": 2.5704257965519504, "learning_rate": 1.801654305488203e-05, "loss": 0.8153, "step": 2769 }, { "epoch": 0.22793663855173832, "grad_norm": 3.16912014397892, "learning_rate": 1.8014949484270196e-05, "loss": 0.8531, "step": 2770 }, { "epoch": 0.22801892614688335, "grad_norm": 2.4673152456843064, "learning_rate": 1.8013355344289172e-05, "loss": 0.7883, "step": 2771 }, { "epoch": 0.22810121374202838, "grad_norm": 11.922974258036625, "learning_rate": 1.8011760635052198e-05, "loss": 0.8156, "step": 2772 }, { "epoch": 0.2281835013371734, "grad_norm": 3.306458350222661, "learning_rate": 1.801016535667256e-05, "loss": 0.7997, "step": 2773 }, { "epoch": 0.22826578893231844, "grad_norm": 3.5841902170576923, "learning_rate": 1.8008569509263588e-05, "loss": 0.8171, "step": 2774 }, { "epoch": 0.22834807652746347, "grad_norm": 2.7601808874883744, "learning_rate": 1.8006973092938645e-05, "loss": 0.8349, "step": 2775 }, { "epoch": 0.2284303641226085, "grad_norm": 2.4278025413104993, "learning_rate": 1.8005376107811136e-05, "loss": 0.8082, "step": 2776 }, { "epoch": 0.22851265171775356, "grad_norm": 3.01193194541238, "learning_rate": 1.8003778553994515e-05, "loss": 0.8296, "step": 2777 }, { "epoch": 0.2285949393128986, "grad_norm": 0.5176617522878121, "learning_rate": 1.8002180431602264e-05, "loss": 0.5477, "step": 2778 }, { "epoch": 0.22867722690804362, "grad_norm": 3.0329698356134895, "learning_rate": 1.8000581740747913e-05, "loss": 0.8466, "step": 2779 }, { "epoch": 0.22875951450318865, "grad_norm": 2.6835536612614037, "learning_rate": 1.799898248154503e-05, "loss": 0.8423, "step": 2780 }, { "epoch": 0.22884180209833369, "grad_norm": 3.424392110523395, "learning_rate": 1.7997382654107227e-05, "loss": 0.8356, "step": 2781 }, { "epoch": 0.22892408969347872, "grad_norm": 2.7777316016252547, "learning_rate": 1.7995782258548146e-05, "loss": 0.8572, "step": 2782 }, { "epoch": 0.22900637728862375, "grad_norm": 3.304412721528233, "learning_rate": 1.799418129498148e-05, "loss": 0.8053, "step": 2783 }, { "epoch": 0.22908866488376878, "grad_norm": 2.9525198650899975, "learning_rate": 1.7992579763520964e-05, "loss": 0.8395, "step": 2784 }, { "epoch": 0.2291709524789138, "grad_norm": 3.224856904335899, "learning_rate": 1.799097766428036e-05, "loss": 0.8374, "step": 2785 }, { "epoch": 0.22925324007405884, "grad_norm": 4.018838225746211, "learning_rate": 1.7989374997373486e-05, "loss": 0.811, "step": 2786 }, { "epoch": 0.22933552766920387, "grad_norm": 2.934213298461492, "learning_rate": 1.7987771762914185e-05, "loss": 0.8178, "step": 2787 }, { "epoch": 0.2294178152643489, "grad_norm": 3.174148191405845, "learning_rate": 1.7986167961016355e-05, "loss": 0.8313, "step": 2788 }, { "epoch": 0.22950010285949393, "grad_norm": 2.606152650937208, "learning_rate": 1.7984563591793923e-05, "loss": 0.8251, "step": 2789 }, { "epoch": 0.22958239045463896, "grad_norm": 3.111609899574041, "learning_rate": 1.7982958655360866e-05, "loss": 0.8243, "step": 2790 }, { "epoch": 0.229664678049784, "grad_norm": 2.986420250781335, "learning_rate": 1.7981353151831193e-05, "loss": 0.8266, "step": 2791 }, { "epoch": 0.22974696564492902, "grad_norm": 3.883321008722349, "learning_rate": 1.7979747081318956e-05, "loss": 0.8164, "step": 2792 }, { "epoch": 0.22982925324007406, "grad_norm": 3.106270550600261, "learning_rate": 1.7978140443938244e-05, "loss": 0.8306, "step": 2793 }, { "epoch": 0.22991154083521909, "grad_norm": 3.2300498098741723, "learning_rate": 1.79765332398032e-05, "loss": 0.8203, "step": 2794 }, { "epoch": 0.22999382843036412, "grad_norm": 4.038394248381576, "learning_rate": 1.7974925469027986e-05, "loss": 0.822, "step": 2795 }, { "epoch": 0.23007611602550915, "grad_norm": 2.7286891766184125, "learning_rate": 1.7973317131726823e-05, "loss": 0.8417, "step": 2796 }, { "epoch": 0.23015840362065418, "grad_norm": 0.535474203626563, "learning_rate": 1.7971708228013966e-05, "loss": 0.5823, "step": 2797 }, { "epoch": 0.2302406912157992, "grad_norm": 0.4653735052850396, "learning_rate": 1.7970098758003697e-05, "loss": 0.5227, "step": 2798 }, { "epoch": 0.23032297881094424, "grad_norm": 2.797506340326556, "learning_rate": 1.7968488721810364e-05, "loss": 0.8128, "step": 2799 }, { "epoch": 0.23040526640608927, "grad_norm": 4.896588128397363, "learning_rate": 1.7966878119548335e-05, "loss": 0.8044, "step": 2800 }, { "epoch": 0.2304875540012343, "grad_norm": 2.876903760083837, "learning_rate": 1.7965266951332027e-05, "loss": 0.7948, "step": 2801 }, { "epoch": 0.23056984159637933, "grad_norm": 2.5982431023174777, "learning_rate": 1.796365521727589e-05, "loss": 0.8107, "step": 2802 }, { "epoch": 0.2306521291915244, "grad_norm": 2.615159312652529, "learning_rate": 1.7962042917494427e-05, "loss": 0.8297, "step": 2803 }, { "epoch": 0.23073441678666942, "grad_norm": 2.9217124157634693, "learning_rate": 1.7960430052102166e-05, "loss": 0.844, "step": 2804 }, { "epoch": 0.23081670438181445, "grad_norm": 0.5660495591221812, "learning_rate": 1.7958816621213684e-05, "loss": 0.5424, "step": 2805 }, { "epoch": 0.23089899197695948, "grad_norm": 2.664884669703133, "learning_rate": 1.79572026249436e-05, "loss": 0.7837, "step": 2806 }, { "epoch": 0.23098127957210451, "grad_norm": 3.2720350383314964, "learning_rate": 1.7955588063406564e-05, "loss": 0.7938, "step": 2807 }, { "epoch": 0.23106356716724955, "grad_norm": 2.689662319312349, "learning_rate": 1.7953972936717278e-05, "loss": 0.8643, "step": 2808 }, { "epoch": 0.23114585476239458, "grad_norm": 2.6269361429987277, "learning_rate": 1.795235724499047e-05, "loss": 0.8291, "step": 2809 }, { "epoch": 0.2312281423575396, "grad_norm": 0.47857928016042517, "learning_rate": 1.7950740988340926e-05, "loss": 0.5723, "step": 2810 }, { "epoch": 0.23131042995268464, "grad_norm": 0.4619817927680795, "learning_rate": 1.7949124166883457e-05, "loss": 0.5529, "step": 2811 }, { "epoch": 0.23139271754782967, "grad_norm": 4.420187743344128, "learning_rate": 1.794750678073292e-05, "loss": 0.7903, "step": 2812 }, { "epoch": 0.2314750051429747, "grad_norm": 2.674173263406235, "learning_rate": 1.794588883000421e-05, "loss": 0.8273, "step": 2813 }, { "epoch": 0.23155729273811973, "grad_norm": 3.903932346463788, "learning_rate": 1.7944270314812265e-05, "loss": 0.8509, "step": 2814 }, { "epoch": 0.23163958033326476, "grad_norm": 2.374337227802502, "learning_rate": 1.7942651235272064e-05, "loss": 0.8112, "step": 2815 }, { "epoch": 0.2317218679284098, "grad_norm": 2.8597494534517365, "learning_rate": 1.7941031591498623e-05, "loss": 0.8384, "step": 2816 }, { "epoch": 0.23180415552355482, "grad_norm": 2.5462037393295383, "learning_rate": 1.793941138360699e-05, "loss": 0.846, "step": 2817 }, { "epoch": 0.23188644311869985, "grad_norm": 2.5528082911592986, "learning_rate": 1.7937790611712275e-05, "loss": 0.8105, "step": 2818 }, { "epoch": 0.23196873071384488, "grad_norm": 2.5290198154963535, "learning_rate": 1.793616927592961e-05, "loss": 0.8408, "step": 2819 }, { "epoch": 0.23205101830898991, "grad_norm": 2.401467555558881, "learning_rate": 1.793454737637417e-05, "loss": 0.817, "step": 2820 }, { "epoch": 0.23213330590413495, "grad_norm": 0.7866378048410645, "learning_rate": 1.7932924913161173e-05, "loss": 0.5505, "step": 2821 }, { "epoch": 0.23221559349927998, "grad_norm": 3.133879280551612, "learning_rate": 1.793130188640588e-05, "loss": 0.7989, "step": 2822 }, { "epoch": 0.232297881094425, "grad_norm": 2.6587889416921553, "learning_rate": 1.7929678296223585e-05, "loss": 0.8313, "step": 2823 }, { "epoch": 0.23238016868957004, "grad_norm": 0.458861730857705, "learning_rate": 1.7928054142729622e-05, "loss": 0.5455, "step": 2824 }, { "epoch": 0.23246245628471507, "grad_norm": 0.4621236247774008, "learning_rate": 1.7926429426039376e-05, "loss": 0.555, "step": 2825 }, { "epoch": 0.2325447438798601, "grad_norm": 3.52670740753614, "learning_rate": 1.7924804146268257e-05, "loss": 0.8106, "step": 2826 }, { "epoch": 0.23262703147500513, "grad_norm": 2.873579756099866, "learning_rate": 1.7923178303531727e-05, "loss": 0.8178, "step": 2827 }, { "epoch": 0.23270931907015016, "grad_norm": 3.9109130325921884, "learning_rate": 1.792155189794528e-05, "loss": 0.804, "step": 2828 }, { "epoch": 0.23279160666529522, "grad_norm": 3.486775955872337, "learning_rate": 1.7919924929624457e-05, "loss": 0.8197, "step": 2829 }, { "epoch": 0.23287389426044025, "grad_norm": 2.7880849629169937, "learning_rate": 1.7918297398684828e-05, "loss": 0.8276, "step": 2830 }, { "epoch": 0.23295618185558528, "grad_norm": 3.0345851221812734, "learning_rate": 1.791666930524202e-05, "loss": 0.8391, "step": 2831 }, { "epoch": 0.2330384694507303, "grad_norm": 2.7802856478901328, "learning_rate": 1.7915040649411687e-05, "loss": 0.8261, "step": 2832 }, { "epoch": 0.23312075704587534, "grad_norm": 3.01147576928371, "learning_rate": 1.7913411431309523e-05, "loss": 0.8339, "step": 2833 }, { "epoch": 0.23320304464102037, "grad_norm": 3.2421585360985836, "learning_rate": 1.7911781651051263e-05, "loss": 0.8345, "step": 2834 }, { "epoch": 0.2332853322361654, "grad_norm": 2.984325765197047, "learning_rate": 1.791015130875269e-05, "loss": 0.8164, "step": 2835 }, { "epoch": 0.23336761983131044, "grad_norm": 0.6096625903346664, "learning_rate": 1.7908520404529618e-05, "loss": 0.559, "step": 2836 }, { "epoch": 0.23344990742645547, "grad_norm": 2.873760656894134, "learning_rate": 1.7906888938497906e-05, "loss": 0.812, "step": 2837 }, { "epoch": 0.2335321950216005, "grad_norm": 0.5020987306387462, "learning_rate": 1.7905256910773446e-05, "loss": 0.5341, "step": 2838 }, { "epoch": 0.23361448261674553, "grad_norm": 4.234624919345039, "learning_rate": 1.7903624321472183e-05, "loss": 0.8344, "step": 2839 }, { "epoch": 0.23369677021189056, "grad_norm": 2.8206007894552187, "learning_rate": 1.790199117071008e-05, "loss": 0.7979, "step": 2840 }, { "epoch": 0.2337790578070356, "grad_norm": 2.739405453759977, "learning_rate": 1.7900357458603168e-05, "loss": 0.8385, "step": 2841 }, { "epoch": 0.23386134540218062, "grad_norm": 3.0773254877839733, "learning_rate": 1.7898723185267496e-05, "loss": 0.8249, "step": 2842 }, { "epoch": 0.23394363299732565, "grad_norm": 4.644462021471566, "learning_rate": 1.789708835081916e-05, "loss": 0.8543, "step": 2843 }, { "epoch": 0.23402592059247068, "grad_norm": 3.052268322199209, "learning_rate": 1.7895452955374296e-05, "loss": 0.8183, "step": 2844 }, { "epoch": 0.2341082081876157, "grad_norm": 3.2071646031808556, "learning_rate": 1.789381699904908e-05, "loss": 0.8029, "step": 2845 }, { "epoch": 0.23419049578276074, "grad_norm": 7.718251805801209, "learning_rate": 1.789218048195973e-05, "loss": 0.8158, "step": 2846 }, { "epoch": 0.23427278337790577, "grad_norm": 3.693545358611462, "learning_rate": 1.78905434042225e-05, "loss": 0.831, "step": 2847 }, { "epoch": 0.2343550709730508, "grad_norm": 3.871319336017675, "learning_rate": 1.788890576595369e-05, "loss": 0.7702, "step": 2848 }, { "epoch": 0.23443735856819584, "grad_norm": 3.3689317720232053, "learning_rate": 1.7887267567269627e-05, "loss": 0.8003, "step": 2849 }, { "epoch": 0.23451964616334087, "grad_norm": 0.6691224385810717, "learning_rate": 1.788562880828669e-05, "loss": 0.5571, "step": 2850 }, { "epoch": 0.2346019337584859, "grad_norm": 3.202024357123697, "learning_rate": 1.7883989489121293e-05, "loss": 0.8483, "step": 2851 }, { "epoch": 0.23468422135363093, "grad_norm": 2.3964130234978556, "learning_rate": 1.7882349609889896e-05, "loss": 0.807, "step": 2852 }, { "epoch": 0.23476650894877596, "grad_norm": 2.6580648611006334, "learning_rate": 1.788070917070898e-05, "loss": 0.799, "step": 2853 }, { "epoch": 0.234848796543921, "grad_norm": 2.538196363098514, "learning_rate": 1.7879068171695095e-05, "loss": 0.8416, "step": 2854 }, { "epoch": 0.23493108413906605, "grad_norm": 3.285871614256874, "learning_rate": 1.7877426612964805e-05, "loss": 0.8215, "step": 2855 }, { "epoch": 0.23501337173421108, "grad_norm": 0.476710228187294, "learning_rate": 1.7875784494634727e-05, "loss": 0.5773, "step": 2856 }, { "epoch": 0.2350956593293561, "grad_norm": 2.710230028079557, "learning_rate": 1.7874141816821516e-05, "loss": 0.7879, "step": 2857 }, { "epoch": 0.23517794692450114, "grad_norm": 0.4302482991466844, "learning_rate": 1.787249857964186e-05, "loss": 0.5361, "step": 2858 }, { "epoch": 0.23526023451964617, "grad_norm": 2.55335830830582, "learning_rate": 1.7870854783212497e-05, "loss": 0.8313, "step": 2859 }, { "epoch": 0.2353425221147912, "grad_norm": 2.9154561286287004, "learning_rate": 1.7869210427650197e-05, "loss": 0.8045, "step": 2860 }, { "epoch": 0.23542480970993623, "grad_norm": 2.967332392427343, "learning_rate": 1.7867565513071775e-05, "loss": 0.8084, "step": 2861 }, { "epoch": 0.23550709730508126, "grad_norm": 2.4313743351898625, "learning_rate": 1.7865920039594077e-05, "loss": 0.8063, "step": 2862 }, { "epoch": 0.2355893849002263, "grad_norm": 0.49104905859343323, "learning_rate": 1.7864274007334e-05, "loss": 0.5597, "step": 2863 }, { "epoch": 0.23567167249537133, "grad_norm": 0.44956185246382396, "learning_rate": 1.786262741640848e-05, "loss": 0.5427, "step": 2864 }, { "epoch": 0.23575396009051636, "grad_norm": 2.719530503696874, "learning_rate": 1.7860980266934477e-05, "loss": 0.8003, "step": 2865 }, { "epoch": 0.2358362476856614, "grad_norm": 3.9138205896080156, "learning_rate": 1.7859332559029007e-05, "loss": 0.8104, "step": 2866 }, { "epoch": 0.23591853528080642, "grad_norm": 2.3110630830595973, "learning_rate": 1.7857684292809125e-05, "loss": 0.8174, "step": 2867 }, { "epoch": 0.23600082287595145, "grad_norm": 4.057356326105194, "learning_rate": 1.7856035468391916e-05, "loss": 0.8488, "step": 2868 }, { "epoch": 0.23608311047109648, "grad_norm": 3.178059190227258, "learning_rate": 1.785438608589451e-05, "loss": 0.8503, "step": 2869 }, { "epoch": 0.2361653980662415, "grad_norm": 3.742005522748046, "learning_rate": 1.785273614543408e-05, "loss": 0.8381, "step": 2870 }, { "epoch": 0.23624768566138654, "grad_norm": 3.611675148428781, "learning_rate": 1.7851085647127834e-05, "loss": 0.8436, "step": 2871 }, { "epoch": 0.23632997325653157, "grad_norm": 0.52588085352646, "learning_rate": 1.7849434591093016e-05, "loss": 0.5563, "step": 2872 }, { "epoch": 0.2364122608516766, "grad_norm": 3.0529399713122354, "learning_rate": 1.784778297744692e-05, "loss": 0.8279, "step": 2873 }, { "epoch": 0.23649454844682163, "grad_norm": 3.8698554755964962, "learning_rate": 1.784613080630687e-05, "loss": 0.8416, "step": 2874 }, { "epoch": 0.23657683604196666, "grad_norm": 3.2625105635611513, "learning_rate": 1.7844478077790233e-05, "loss": 0.8031, "step": 2875 }, { "epoch": 0.2366591236371117, "grad_norm": 3.2943509488596137, "learning_rate": 1.7842824792014427e-05, "loss": 0.8336, "step": 2876 }, { "epoch": 0.23674141123225673, "grad_norm": 3.235947871537383, "learning_rate": 1.7841170949096883e-05, "loss": 0.8093, "step": 2877 }, { "epoch": 0.23682369882740176, "grad_norm": 3.77576612731386, "learning_rate": 1.78395165491551e-05, "loss": 0.8065, "step": 2878 }, { "epoch": 0.2369059864225468, "grad_norm": 5.456827759241825, "learning_rate": 1.7837861592306597e-05, "loss": 0.8474, "step": 2879 }, { "epoch": 0.23698827401769185, "grad_norm": 3.6312496615136545, "learning_rate": 1.783620607866894e-05, "loss": 0.8046, "step": 2880 }, { "epoch": 0.23707056161283688, "grad_norm": 3.862392789417281, "learning_rate": 1.7834550008359738e-05, "loss": 0.809, "step": 2881 }, { "epoch": 0.2371528492079819, "grad_norm": 3.1797948789980706, "learning_rate": 1.783289338149663e-05, "loss": 0.8026, "step": 2882 }, { "epoch": 0.23723513680312694, "grad_norm": 4.0496067473943835, "learning_rate": 1.7831236198197305e-05, "loss": 0.8172, "step": 2883 }, { "epoch": 0.23731742439827197, "grad_norm": 3.201889161643371, "learning_rate": 1.7829578458579483e-05, "loss": 0.8203, "step": 2884 }, { "epoch": 0.237399711993417, "grad_norm": 3.0994372587653944, "learning_rate": 1.782792016276093e-05, "loss": 0.8286, "step": 2885 }, { "epoch": 0.23748199958856203, "grad_norm": 6.101329439325836, "learning_rate": 1.7826261310859447e-05, "loss": 0.8153, "step": 2886 }, { "epoch": 0.23756428718370706, "grad_norm": 3.0580009754371407, "learning_rate": 1.782460190299288e-05, "loss": 0.8317, "step": 2887 }, { "epoch": 0.2376465747788521, "grad_norm": 3.1252717952475177, "learning_rate": 1.78229419392791e-05, "loss": 0.8358, "step": 2888 }, { "epoch": 0.23772886237399712, "grad_norm": 0.4793194081351795, "learning_rate": 1.7821281419836044e-05, "loss": 0.5412, "step": 2889 }, { "epoch": 0.23781114996914215, "grad_norm": 3.541850890247188, "learning_rate": 1.7819620344781657e-05, "loss": 0.8569, "step": 2890 }, { "epoch": 0.23789343756428719, "grad_norm": 3.882668007827672, "learning_rate": 1.7817958714233952e-05, "loss": 0.8249, "step": 2891 }, { "epoch": 0.23797572515943222, "grad_norm": 3.3992855029015074, "learning_rate": 1.781629652831096e-05, "loss": 0.8093, "step": 2892 }, { "epoch": 0.23805801275457725, "grad_norm": 3.5866512708918097, "learning_rate": 1.781463378713076e-05, "loss": 0.7686, "step": 2893 }, { "epoch": 0.23814030034972228, "grad_norm": 4.804221247572846, "learning_rate": 1.781297049081148e-05, "loss": 0.8252, "step": 2894 }, { "epoch": 0.2382225879448673, "grad_norm": 4.3070667034437085, "learning_rate": 1.7811306639471267e-05, "loss": 0.7975, "step": 2895 }, { "epoch": 0.23830487554001234, "grad_norm": 3.1785831184030444, "learning_rate": 1.7809642233228324e-05, "loss": 0.8118, "step": 2896 }, { "epoch": 0.23838716313515737, "grad_norm": 0.4418882975218361, "learning_rate": 1.780797727220089e-05, "loss": 0.5324, "step": 2897 }, { "epoch": 0.2384694507303024, "grad_norm": 2.862289776828012, "learning_rate": 1.7806311756507232e-05, "loss": 0.831, "step": 2898 }, { "epoch": 0.23855173832544743, "grad_norm": 3.4885230093797825, "learning_rate": 1.7804645686265672e-05, "loss": 0.8414, "step": 2899 }, { "epoch": 0.23863402592059246, "grad_norm": 2.584012821454574, "learning_rate": 1.7802979061594564e-05, "loss": 0.846, "step": 2900 }, { "epoch": 0.2387163135157375, "grad_norm": 2.6570487273962646, "learning_rate": 1.7801311882612307e-05, "loss": 0.8186, "step": 2901 }, { "epoch": 0.23879860111088252, "grad_norm": 0.44804613300031815, "learning_rate": 1.7799644149437325e-05, "loss": 0.5788, "step": 2902 }, { "epoch": 0.23888088870602756, "grad_norm": 2.253489610537284, "learning_rate": 1.77979758621881e-05, "loss": 0.7736, "step": 2903 }, { "epoch": 0.2389631763011726, "grad_norm": 2.943977446980764, "learning_rate": 1.7796307020983137e-05, "loss": 0.8284, "step": 2904 }, { "epoch": 0.23904546389631762, "grad_norm": 0.4239993593652877, "learning_rate": 1.7794637625940992e-05, "loss": 0.5514, "step": 2905 }, { "epoch": 0.23912775149146268, "grad_norm": 2.9737445711445463, "learning_rate": 1.7792967677180263e-05, "loss": 0.7948, "step": 2906 }, { "epoch": 0.2392100390866077, "grad_norm": 2.4626561112597805, "learning_rate": 1.7791297174819562e-05, "loss": 0.7985, "step": 2907 }, { "epoch": 0.23929232668175274, "grad_norm": 2.2496018970898675, "learning_rate": 1.778962611897758e-05, "loss": 0.7983, "step": 2908 }, { "epoch": 0.23937461427689777, "grad_norm": 3.8808962895028514, "learning_rate": 1.778795450977301e-05, "loss": 0.8173, "step": 2909 }, { "epoch": 0.2394569018720428, "grad_norm": 2.9670632893908646, "learning_rate": 1.7786282347324607e-05, "loss": 0.8209, "step": 2910 }, { "epoch": 0.23953918946718783, "grad_norm": 2.4468118274384874, "learning_rate": 1.7784609631751162e-05, "loss": 0.8414, "step": 2911 }, { "epoch": 0.23962147706233286, "grad_norm": 2.6242870827617444, "learning_rate": 1.7782936363171496e-05, "loss": 0.7946, "step": 2912 }, { "epoch": 0.2397037646574779, "grad_norm": 2.869216446545581, "learning_rate": 1.778126254170448e-05, "loss": 0.8297, "step": 2913 }, { "epoch": 0.23978605225262292, "grad_norm": 2.1225891935942536, "learning_rate": 1.7779588167469014e-05, "loss": 0.8305, "step": 2914 }, { "epoch": 0.23986833984776795, "grad_norm": 0.4726639378498239, "learning_rate": 1.7777913240584046e-05, "loss": 0.5738, "step": 2915 }, { "epoch": 0.23995062744291298, "grad_norm": 2.39252667296644, "learning_rate": 1.7776237761168562e-05, "loss": 0.7883, "step": 2916 }, { "epoch": 0.24003291503805801, "grad_norm": 2.0755652310001627, "learning_rate": 1.7774561729341583e-05, "loss": 0.8075, "step": 2917 }, { "epoch": 0.24011520263320305, "grad_norm": 2.157183836032903, "learning_rate": 1.7772885145222175e-05, "loss": 0.801, "step": 2918 }, { "epoch": 0.24019749022834808, "grad_norm": 2.886558156591231, "learning_rate": 1.7771208008929434e-05, "loss": 0.8196, "step": 2919 }, { "epoch": 0.2402797778234931, "grad_norm": 2.503925351985564, "learning_rate": 1.7769530320582504e-05, "loss": 0.8104, "step": 2920 }, { "epoch": 0.24036206541863814, "grad_norm": 2.3860694335338177, "learning_rate": 1.776785208030057e-05, "loss": 0.8078, "step": 2921 }, { "epoch": 0.24044435301378317, "grad_norm": 2.892237805255794, "learning_rate": 1.776617328820284e-05, "loss": 0.8211, "step": 2922 }, { "epoch": 0.2405266406089282, "grad_norm": 3.012439927965674, "learning_rate": 1.7764493944408583e-05, "loss": 0.8232, "step": 2923 }, { "epoch": 0.24060892820407323, "grad_norm": 2.996719130927815, "learning_rate": 1.7762814049037096e-05, "loss": 0.8153, "step": 2924 }, { "epoch": 0.24069121579921826, "grad_norm": 2.342317094738393, "learning_rate": 1.7761133602207712e-05, "loss": 0.8192, "step": 2925 }, { "epoch": 0.2407735033943633, "grad_norm": 2.887662051850029, "learning_rate": 1.775945260403981e-05, "loss": 0.7935, "step": 2926 }, { "epoch": 0.24085579098950832, "grad_norm": 2.4837332329260535, "learning_rate": 1.77577710546528e-05, "loss": 0.8325, "step": 2927 }, { "epoch": 0.24093807858465335, "grad_norm": 3.059684047216481, "learning_rate": 1.7756088954166147e-05, "loss": 0.8319, "step": 2928 }, { "epoch": 0.24102036617979838, "grad_norm": 3.4499819342486924, "learning_rate": 1.7754406302699333e-05, "loss": 0.8232, "step": 2929 }, { "epoch": 0.24110265377494342, "grad_norm": 0.4818175678770239, "learning_rate": 1.77527231003719e-05, "loss": 0.55, "step": 2930 }, { "epoch": 0.24118494137008845, "grad_norm": 3.0920558417930617, "learning_rate": 1.7751039347303417e-05, "loss": 0.7753, "step": 2931 }, { "epoch": 0.2412672289652335, "grad_norm": 2.998597893505298, "learning_rate": 1.7749355043613493e-05, "loss": 0.8651, "step": 2932 }, { "epoch": 0.24134951656037854, "grad_norm": 3.9038102066258777, "learning_rate": 1.7747670189421786e-05, "loss": 0.8337, "step": 2933 }, { "epoch": 0.24143180415552357, "grad_norm": 2.5723479252406913, "learning_rate": 1.774598478484797e-05, "loss": 0.7963, "step": 2934 }, { "epoch": 0.2415140917506686, "grad_norm": 0.4300721442737975, "learning_rate": 1.774429883001179e-05, "loss": 0.5481, "step": 2935 }, { "epoch": 0.24159637934581363, "grad_norm": 2.6283222051192503, "learning_rate": 1.7742612325033e-05, "loss": 0.8372, "step": 2936 }, { "epoch": 0.24167866694095866, "grad_norm": 2.415803374736619, "learning_rate": 1.7740925270031417e-05, "loss": 0.8244, "step": 2937 }, { "epoch": 0.2417609545361037, "grad_norm": 1.971274404106693, "learning_rate": 1.7739237665126885e-05, "loss": 0.8127, "step": 2938 }, { "epoch": 0.24184324213124872, "grad_norm": 7.078131438955339, "learning_rate": 1.773754951043928e-05, "loss": 0.799, "step": 2939 }, { "epoch": 0.24192552972639375, "grad_norm": 2.5314945703551155, "learning_rate": 1.7735860806088538e-05, "loss": 0.8475, "step": 2940 }, { "epoch": 0.24200781732153878, "grad_norm": 0.47201850617328384, "learning_rate": 1.7734171552194613e-05, "loss": 0.5476, "step": 2941 }, { "epoch": 0.2420901049166838, "grad_norm": 2.4755906245496977, "learning_rate": 1.773248174887751e-05, "loss": 0.8388, "step": 2942 }, { "epoch": 0.24217239251182884, "grad_norm": 3.512614041752255, "learning_rate": 1.773079139625727e-05, "loss": 0.8126, "step": 2943 }, { "epoch": 0.24225468010697387, "grad_norm": 2.515303401227105, "learning_rate": 1.772910049445397e-05, "loss": 0.8275, "step": 2944 }, { "epoch": 0.2423369677021189, "grad_norm": 3.250687085735003, "learning_rate": 1.7727409043587736e-05, "loss": 0.8373, "step": 2945 }, { "epoch": 0.24241925529726394, "grad_norm": 2.324671566478648, "learning_rate": 1.7725717043778724e-05, "loss": 0.7995, "step": 2946 }, { "epoch": 0.24250154289240897, "grad_norm": 0.468722771305979, "learning_rate": 1.7724024495147123e-05, "loss": 0.5426, "step": 2947 }, { "epoch": 0.242583830487554, "grad_norm": 3.4777394590970947, "learning_rate": 1.7722331397813177e-05, "loss": 0.8027, "step": 2948 }, { "epoch": 0.24266611808269903, "grad_norm": 3.0206885939563164, "learning_rate": 1.772063775189716e-05, "loss": 0.8526, "step": 2949 }, { "epoch": 0.24274840567784406, "grad_norm": 3.331833102443693, "learning_rate": 1.771894355751938e-05, "loss": 0.8295, "step": 2950 }, { "epoch": 0.2428306932729891, "grad_norm": 2.8204423273747867, "learning_rate": 1.7717248814800198e-05, "loss": 0.7961, "step": 2951 }, { "epoch": 0.24291298086813412, "grad_norm": 2.612675828083287, "learning_rate": 1.771555352386e-05, "loss": 0.8055, "step": 2952 }, { "epoch": 0.24299526846327915, "grad_norm": 2.4201788857878737, "learning_rate": 1.771385768481922e-05, "loss": 0.8306, "step": 2953 }, { "epoch": 0.24307755605842418, "grad_norm": 2.3617116892356536, "learning_rate": 1.771216129779833e-05, "loss": 0.8315, "step": 2954 }, { "epoch": 0.2431598436535692, "grad_norm": 2.2479992095226575, "learning_rate": 1.771046436291783e-05, "loss": 0.7626, "step": 2955 }, { "epoch": 0.24324213124871424, "grad_norm": 2.7526501879998015, "learning_rate": 1.7708766880298275e-05, "loss": 0.8079, "step": 2956 }, { "epoch": 0.24332441884385927, "grad_norm": 2.474424972586215, "learning_rate": 1.7707068850060247e-05, "loss": 0.8315, "step": 2957 }, { "epoch": 0.24340670643900433, "grad_norm": 2.7862231336500685, "learning_rate": 1.7705370272324375e-05, "loss": 0.8485, "step": 2958 }, { "epoch": 0.24348899403414936, "grad_norm": 2.5829056955847247, "learning_rate": 1.770367114721132e-05, "loss": 0.8221, "step": 2959 }, { "epoch": 0.2435712816292944, "grad_norm": 2.667211592504044, "learning_rate": 1.7701971474841793e-05, "loss": 0.8145, "step": 2960 }, { "epoch": 0.24365356922443943, "grad_norm": 0.48873734030396343, "learning_rate": 1.7700271255336525e-05, "loss": 0.5677, "step": 2961 }, { "epoch": 0.24373585681958446, "grad_norm": 2.888232728250638, "learning_rate": 1.76985704888163e-05, "loss": 0.8315, "step": 2962 }, { "epoch": 0.2438181444147295, "grad_norm": 20.925313878929853, "learning_rate": 1.769686917540194e-05, "loss": 0.8133, "step": 2963 }, { "epoch": 0.24390043200987452, "grad_norm": 2.762209834419681, "learning_rate": 1.769516731521431e-05, "loss": 0.8132, "step": 2964 }, { "epoch": 0.24398271960501955, "grad_norm": 2.8652691832779413, "learning_rate": 1.7693464908374295e-05, "loss": 0.8303, "step": 2965 }, { "epoch": 0.24406500720016458, "grad_norm": 2.267440807893723, "learning_rate": 1.7691761955002837e-05, "loss": 0.7961, "step": 2966 }, { "epoch": 0.2441472947953096, "grad_norm": 2.5922017142258764, "learning_rate": 1.769005845522091e-05, "loss": 0.8466, "step": 2967 }, { "epoch": 0.24422958239045464, "grad_norm": 2.435470874214321, "learning_rate": 1.768835440914953e-05, "loss": 0.8278, "step": 2968 }, { "epoch": 0.24431186998559967, "grad_norm": 2.734768088007503, "learning_rate": 1.768664981690975e-05, "loss": 0.8162, "step": 2969 }, { "epoch": 0.2443941575807447, "grad_norm": 2.2931159488349095, "learning_rate": 1.768494467862266e-05, "loss": 0.8015, "step": 2970 }, { "epoch": 0.24447644517588973, "grad_norm": 2.33505639795374, "learning_rate": 1.768323899440939e-05, "loss": 0.8067, "step": 2971 }, { "epoch": 0.24455873277103476, "grad_norm": 3.1191561268536265, "learning_rate": 1.7681532764391108e-05, "loss": 0.8209, "step": 2972 }, { "epoch": 0.2446410203661798, "grad_norm": 3.439317442966874, "learning_rate": 1.767982598868902e-05, "loss": 0.8156, "step": 2973 }, { "epoch": 0.24472330796132483, "grad_norm": 4.029913486068659, "learning_rate": 1.767811866742438e-05, "loss": 0.8809, "step": 2974 }, { "epoch": 0.24480559555646986, "grad_norm": 0.47992780240279903, "learning_rate": 1.767641080071847e-05, "loss": 0.527, "step": 2975 }, { "epoch": 0.2448878831516149, "grad_norm": 1.9906042687346728, "learning_rate": 1.7674702388692612e-05, "loss": 0.8284, "step": 2976 }, { "epoch": 0.24497017074675992, "grad_norm": 5.348166325662937, "learning_rate": 1.7672993431468167e-05, "loss": 0.8185, "step": 2977 }, { "epoch": 0.24505245834190495, "grad_norm": 3.0302113443408167, "learning_rate": 1.7671283929166545e-05, "loss": 0.8256, "step": 2978 }, { "epoch": 0.24513474593704998, "grad_norm": 2.765533594139685, "learning_rate": 1.766957388190918e-05, "loss": 0.8216, "step": 2979 }, { "epoch": 0.245217033532195, "grad_norm": 5.965302503829052, "learning_rate": 1.766786328981755e-05, "loss": 0.7883, "step": 2980 }, { "epoch": 0.24529932112734004, "grad_norm": 2.7406380130111803, "learning_rate": 1.7666152153013177e-05, "loss": 0.7926, "step": 2981 }, { "epoch": 0.24538160872248507, "grad_norm": 0.47825800329757423, "learning_rate": 1.766444047161761e-05, "loss": 0.5796, "step": 2982 }, { "epoch": 0.2454638963176301, "grad_norm": 2.310350076864101, "learning_rate": 1.7662728245752453e-05, "loss": 0.8045, "step": 2983 }, { "epoch": 0.24554618391277516, "grad_norm": 0.4363743762926369, "learning_rate": 1.7661015475539337e-05, "loss": 0.5486, "step": 2984 }, { "epoch": 0.2456284715079202, "grad_norm": 3.286691924601928, "learning_rate": 1.7659302161099935e-05, "loss": 0.8471, "step": 2985 }, { "epoch": 0.24571075910306522, "grad_norm": 2.3988825258194364, "learning_rate": 1.7657588302555956e-05, "loss": 0.8123, "step": 2986 }, { "epoch": 0.24579304669821025, "grad_norm": 2.1323590274633224, "learning_rate": 1.7655873900029147e-05, "loss": 0.8215, "step": 2987 }, { "epoch": 0.24587533429335529, "grad_norm": 2.7473213088400184, "learning_rate": 1.7654158953641303e-05, "loss": 0.8162, "step": 2988 }, { "epoch": 0.24595762188850032, "grad_norm": 3.185590661191936, "learning_rate": 1.7652443463514245e-05, "loss": 0.8183, "step": 2989 }, { "epoch": 0.24603990948364535, "grad_norm": 2.676247353052082, "learning_rate": 1.7650727429769844e-05, "loss": 0.8187, "step": 2990 }, { "epoch": 0.24612219707879038, "grad_norm": 0.4753685846545083, "learning_rate": 1.7649010852530005e-05, "loss": 0.5607, "step": 2991 }, { "epoch": 0.2462044846739354, "grad_norm": 2.261811425327102, "learning_rate": 1.7647293731916664e-05, "loss": 0.8185, "step": 2992 }, { "epoch": 0.24628677226908044, "grad_norm": 2.3374305262319464, "learning_rate": 1.7645576068051806e-05, "loss": 0.796, "step": 2993 }, { "epoch": 0.24636905986422547, "grad_norm": 0.45724340821086584, "learning_rate": 1.7643857861057453e-05, "loss": 0.5572, "step": 2994 }, { "epoch": 0.2464513474593705, "grad_norm": 2.8307572222686566, "learning_rate": 1.764213911105566e-05, "loss": 0.8397, "step": 2995 }, { "epoch": 0.24653363505451553, "grad_norm": 2.5129736766859203, "learning_rate": 1.764041981816853e-05, "loss": 0.7905, "step": 2996 }, { "epoch": 0.24661592264966056, "grad_norm": 2.9659850416830102, "learning_rate": 1.7638699982518193e-05, "loss": 0.8138, "step": 2997 }, { "epoch": 0.2466982102448056, "grad_norm": 2.6159577238005025, "learning_rate": 1.7636979604226826e-05, "loss": 0.8193, "step": 2998 }, { "epoch": 0.24678049783995062, "grad_norm": 0.4223222850046514, "learning_rate": 1.763525868341664e-05, "loss": 0.5336, "step": 2999 }, { "epoch": 0.24686278543509566, "grad_norm": 2.5919015592128196, "learning_rate": 1.763353722020989e-05, "loss": 0.8278, "step": 3000 }, { "epoch": 0.24694507303024069, "grad_norm": 2.9501868940145024, "learning_rate": 1.763181521472886e-05, "loss": 0.8371, "step": 3001 }, { "epoch": 0.24702736062538572, "grad_norm": 8.262148809997365, "learning_rate": 1.7630092667095886e-05, "loss": 0.8384, "step": 3002 }, { "epoch": 0.24710964822053075, "grad_norm": 6.492544055589544, "learning_rate": 1.7628369577433328e-05, "loss": 0.8187, "step": 3003 }, { "epoch": 0.24719193581567578, "grad_norm": 0.44702863278235955, "learning_rate": 1.7626645945863598e-05, "loss": 0.5541, "step": 3004 }, { "epoch": 0.2472742234108208, "grad_norm": 2.4050113395335355, "learning_rate": 1.7624921772509137e-05, "loss": 0.8374, "step": 3005 }, { "epoch": 0.24735651100596584, "grad_norm": 2.6401344327287224, "learning_rate": 1.762319705749243e-05, "loss": 0.825, "step": 3006 }, { "epoch": 0.24743879860111087, "grad_norm": 2.7101886526639904, "learning_rate": 1.762147180093599e-05, "loss": 0.8071, "step": 3007 }, { "epoch": 0.2475210861962559, "grad_norm": 0.4569123759115584, "learning_rate": 1.7619746002962385e-05, "loss": 0.5317, "step": 3008 }, { "epoch": 0.24760337379140093, "grad_norm": 2.1143175811102766, "learning_rate": 1.7618019663694213e-05, "loss": 0.8247, "step": 3009 }, { "epoch": 0.247685661386546, "grad_norm": 2.697483320930217, "learning_rate": 1.76162927832541e-05, "loss": 0.828, "step": 3010 }, { "epoch": 0.24776794898169102, "grad_norm": 2.319060517273934, "learning_rate": 1.7614565361764736e-05, "loss": 0.8083, "step": 3011 }, { "epoch": 0.24785023657683605, "grad_norm": 2.1388551387396966, "learning_rate": 1.761283739934882e-05, "loss": 0.815, "step": 3012 }, { "epoch": 0.24793252417198108, "grad_norm": 2.0823734950527504, "learning_rate": 1.761110889612911e-05, "loss": 0.8429, "step": 3013 }, { "epoch": 0.24801481176712611, "grad_norm": 2.7512328496729084, "learning_rate": 1.76093798522284e-05, "loss": 0.8381, "step": 3014 }, { "epoch": 0.24809709936227115, "grad_norm": 3.2382987863079076, "learning_rate": 1.7607650267769518e-05, "loss": 0.8127, "step": 3015 }, { "epoch": 0.24817938695741618, "grad_norm": 2.8477801097635145, "learning_rate": 1.760592014287532e-05, "loss": 0.8231, "step": 3016 }, { "epoch": 0.2482616745525612, "grad_norm": 2.403751343409887, "learning_rate": 1.7604189477668723e-05, "loss": 0.8021, "step": 3017 }, { "epoch": 0.24834396214770624, "grad_norm": 2.530935094848329, "learning_rate": 1.7602458272272664e-05, "loss": 0.8153, "step": 3018 }, { "epoch": 0.24842624974285127, "grad_norm": 2.63352136642727, "learning_rate": 1.760072652681013e-05, "loss": 0.809, "step": 3019 }, { "epoch": 0.2485085373379963, "grad_norm": 3.458558440607959, "learning_rate": 1.7598994241404138e-05, "loss": 0.8249, "step": 3020 }, { "epoch": 0.24859082493314133, "grad_norm": 5.264472625440188, "learning_rate": 1.7597261416177748e-05, "loss": 0.8381, "step": 3021 }, { "epoch": 0.24867311252828636, "grad_norm": 2.7924585028224773, "learning_rate": 1.759552805125406e-05, "loss": 0.8332, "step": 3022 }, { "epoch": 0.2487554001234314, "grad_norm": 2.811529056203523, "learning_rate": 1.75937941467562e-05, "loss": 0.8245, "step": 3023 }, { "epoch": 0.24883768771857642, "grad_norm": 2.3834870192262585, "learning_rate": 1.7592059702807355e-05, "loss": 0.8127, "step": 3024 }, { "epoch": 0.24891997531372145, "grad_norm": 3.080319879970508, "learning_rate": 1.7590324719530727e-05, "loss": 0.8208, "step": 3025 }, { "epoch": 0.24900226290886648, "grad_norm": 2.4573468356132198, "learning_rate": 1.7588589197049567e-05, "loss": 0.792, "step": 3026 }, { "epoch": 0.24908455050401151, "grad_norm": 3.0390976228111892, "learning_rate": 1.7586853135487173e-05, "loss": 0.8175, "step": 3027 }, { "epoch": 0.24916683809915655, "grad_norm": 3.010888603421016, "learning_rate": 1.7585116534966862e-05, "loss": 0.7831, "step": 3028 }, { "epoch": 0.24924912569430158, "grad_norm": 2.2894645992642015, "learning_rate": 1.7583379395612e-05, "loss": 0.8009, "step": 3029 }, { "epoch": 0.2493314132894466, "grad_norm": 2.3589866437890654, "learning_rate": 1.7581641717546e-05, "loss": 0.8162, "step": 3030 }, { "epoch": 0.24941370088459164, "grad_norm": 0.45847778884193263, "learning_rate": 1.7579903500892295e-05, "loss": 0.554, "step": 3031 }, { "epoch": 0.24949598847973667, "grad_norm": 2.667053134073255, "learning_rate": 1.7578164745774365e-05, "loss": 0.805, "step": 3032 }, { "epoch": 0.2495782760748817, "grad_norm": 2.5236571720177006, "learning_rate": 1.7576425452315734e-05, "loss": 0.8178, "step": 3033 }, { "epoch": 0.24966056367002673, "grad_norm": 2.258153479818695, "learning_rate": 1.7574685620639955e-05, "loss": 0.8225, "step": 3034 }, { "epoch": 0.2497428512651718, "grad_norm": 2.3110737984568654, "learning_rate": 1.7572945250870622e-05, "loss": 0.7955, "step": 3035 }, { "epoch": 0.24982513886031682, "grad_norm": 2.5234714054140923, "learning_rate": 1.7571204343131373e-05, "loss": 0.7953, "step": 3036 }, { "epoch": 0.24990742645546185, "grad_norm": 5.904592799601487, "learning_rate": 1.7569462897545873e-05, "loss": 0.8165, "step": 3037 }, { "epoch": 0.24998971405060688, "grad_norm": 2.0965767073497155, "learning_rate": 1.7567720914237835e-05, "loss": 0.7897, "step": 3038 }, { "epoch": 0.2500720016457519, "grad_norm": 4.594650483128967, "learning_rate": 1.7565978393331005e-05, "loss": 0.8458, "step": 3039 }, { "epoch": 0.25015428924089694, "grad_norm": 0.4553532023589861, "learning_rate": 1.756423533494917e-05, "loss": 0.5334, "step": 3040 }, { "epoch": 0.25023657683604195, "grad_norm": 2.6783122193891757, "learning_rate": 1.7562491739216155e-05, "loss": 0.8186, "step": 3041 }, { "epoch": 0.250318864431187, "grad_norm": 2.2723065495937087, "learning_rate": 1.756074760625582e-05, "loss": 0.8018, "step": 3042 }, { "epoch": 0.250401152026332, "grad_norm": 2.9650880609188435, "learning_rate": 1.755900293619207e-05, "loss": 0.8399, "step": 3043 }, { "epoch": 0.25048343962147707, "grad_norm": 3.9171173074396917, "learning_rate": 1.755725772914884e-05, "loss": 0.8322, "step": 3044 }, { "epoch": 0.25056572721662207, "grad_norm": 0.44238452905452086, "learning_rate": 1.75555119852501e-05, "loss": 0.5509, "step": 3045 }, { "epoch": 0.25064801481176713, "grad_norm": 2.532269176697092, "learning_rate": 1.7553765704619877e-05, "loss": 0.8376, "step": 3046 }, { "epoch": 0.25073030240691213, "grad_norm": 2.1601877532443083, "learning_rate": 1.755201888738222e-05, "loss": 0.8079, "step": 3047 }, { "epoch": 0.2508125900020572, "grad_norm": 0.4864105965006659, "learning_rate": 1.7550271533661217e-05, "loss": 0.5467, "step": 3048 }, { "epoch": 0.25089487759720225, "grad_norm": 2.379700540347124, "learning_rate": 1.7548523643581e-05, "loss": 0.7873, "step": 3049 }, { "epoch": 0.25097716519234725, "grad_norm": 4.0656425979348985, "learning_rate": 1.7546775217265734e-05, "loss": 0.7955, "step": 3050 }, { "epoch": 0.2510594527874923, "grad_norm": 2.7719299480072084, "learning_rate": 1.7545026254839627e-05, "loss": 0.8208, "step": 3051 }, { "epoch": 0.2511417403826373, "grad_norm": 2.112697347780536, "learning_rate": 1.754327675642692e-05, "loss": 0.8283, "step": 3052 }, { "epoch": 0.25122402797778237, "grad_norm": 2.7457155480951747, "learning_rate": 1.7541526722151897e-05, "loss": 0.7984, "step": 3053 }, { "epoch": 0.2513063155729274, "grad_norm": 2.964443751057906, "learning_rate": 1.753977615213888e-05, "loss": 0.8019, "step": 3054 }, { "epoch": 0.25138860316807243, "grad_norm": 0.46730221582743114, "learning_rate": 1.7538025046512218e-05, "loss": 0.5251, "step": 3055 }, { "epoch": 0.25147089076321744, "grad_norm": 0.4461078153487976, "learning_rate": 1.7536273405396314e-05, "loss": 0.5648, "step": 3056 }, { "epoch": 0.2515531783583625, "grad_norm": 2.298544228777589, "learning_rate": 1.75345212289156e-05, "loss": 0.8048, "step": 3057 }, { "epoch": 0.2516354659535075, "grad_norm": 2.8616057906402537, "learning_rate": 1.753276851719455e-05, "loss": 0.8246, "step": 3058 }, { "epoch": 0.25171775354865256, "grad_norm": 2.659983701913832, "learning_rate": 1.7531015270357667e-05, "loss": 0.7937, "step": 3059 }, { "epoch": 0.25180004114379756, "grad_norm": 3.5499141636761755, "learning_rate": 1.7529261488529503e-05, "loss": 0.8159, "step": 3060 }, { "epoch": 0.2518823287389426, "grad_norm": 2.3718170006117827, "learning_rate": 1.7527507171834647e-05, "loss": 0.8379, "step": 3061 }, { "epoch": 0.2519646163340876, "grad_norm": 6.495089047174273, "learning_rate": 1.7525752320397717e-05, "loss": 0.8113, "step": 3062 }, { "epoch": 0.2520469039292327, "grad_norm": 0.4670703002111636, "learning_rate": 1.752399693434338e-05, "loss": 0.5719, "step": 3063 }, { "epoch": 0.2521291915243777, "grad_norm": 3.035483833386936, "learning_rate": 1.7522241013796336e-05, "loss": 0.8339, "step": 3064 }, { "epoch": 0.25221147911952274, "grad_norm": 3.1199229312583663, "learning_rate": 1.7520484558881316e-05, "loss": 0.8418, "step": 3065 }, { "epoch": 0.25229376671466774, "grad_norm": 3.4612403146143547, "learning_rate": 1.7518727569723104e-05, "loss": 0.8355, "step": 3066 }, { "epoch": 0.2523760543098128, "grad_norm": 3.3961735804307867, "learning_rate": 1.7516970046446506e-05, "loss": 0.7901, "step": 3067 }, { "epoch": 0.2524583419049578, "grad_norm": 3.058194691007951, "learning_rate": 1.751521198917638e-05, "loss": 0.8025, "step": 3068 }, { "epoch": 0.25254062950010286, "grad_norm": 2.906124566549867, "learning_rate": 1.7513453398037613e-05, "loss": 0.8066, "step": 3069 }, { "epoch": 0.25262291709524787, "grad_norm": 2.4134898879968363, "learning_rate": 1.7511694273155133e-05, "loss": 0.7797, "step": 3070 }, { "epoch": 0.2527052046903929, "grad_norm": 2.3780332334605876, "learning_rate": 1.7509934614653903e-05, "loss": 0.8551, "step": 3071 }, { "epoch": 0.25278749228553793, "grad_norm": 0.4675698799777684, "learning_rate": 1.750817442265893e-05, "loss": 0.5587, "step": 3072 }, { "epoch": 0.252869779880683, "grad_norm": 2.507602846575027, "learning_rate": 1.7506413697295253e-05, "loss": 0.8095, "step": 3073 }, { "epoch": 0.25295206747582805, "grad_norm": 2.423926741802012, "learning_rate": 1.7504652438687952e-05, "loss": 0.7885, "step": 3074 }, { "epoch": 0.25303435507097305, "grad_norm": 2.427193422320289, "learning_rate": 1.7502890646962143e-05, "loss": 0.81, "step": 3075 }, { "epoch": 0.2531166426661181, "grad_norm": 3.0277514236068934, "learning_rate": 1.7501128322242982e-05, "loss": 0.818, "step": 3076 }, { "epoch": 0.2531989302612631, "grad_norm": 2.9834204488960965, "learning_rate": 1.7499365464655663e-05, "loss": 0.8075, "step": 3077 }, { "epoch": 0.25328121785640817, "grad_norm": 2.950887337851664, "learning_rate": 1.7497602074325412e-05, "loss": 0.8039, "step": 3078 }, { "epoch": 0.2533635054515532, "grad_norm": 2.67647405206412, "learning_rate": 1.74958381513775e-05, "loss": 0.7867, "step": 3079 }, { "epoch": 0.25344579304669823, "grad_norm": 2.394261796844176, "learning_rate": 1.7494073695937233e-05, "loss": 0.8258, "step": 3080 }, { "epoch": 0.25352808064184323, "grad_norm": 3.6425048453575566, "learning_rate": 1.749230870812996e-05, "loss": 0.7776, "step": 3081 }, { "epoch": 0.2536103682369883, "grad_norm": 2.6208238884816084, "learning_rate": 1.7490543188081056e-05, "loss": 0.7842, "step": 3082 }, { "epoch": 0.2536926558321333, "grad_norm": 3.017990497572143, "learning_rate": 1.748877713591594e-05, "loss": 0.7864, "step": 3083 }, { "epoch": 0.25377494342727835, "grad_norm": 3.0416081537456017, "learning_rate": 1.748701055176008e-05, "loss": 0.7868, "step": 3084 }, { "epoch": 0.25385723102242336, "grad_norm": 4.6039910995048, "learning_rate": 1.748524343573896e-05, "loss": 0.8093, "step": 3085 }, { "epoch": 0.2539395186175684, "grad_norm": 3.7897892956488795, "learning_rate": 1.7483475787978116e-05, "loss": 0.8165, "step": 3086 }, { "epoch": 0.2540218062127134, "grad_norm": 2.4618042382111094, "learning_rate": 1.748170760860312e-05, "loss": 0.814, "step": 3087 }, { "epoch": 0.2541040938078585, "grad_norm": 3.5491587012727734, "learning_rate": 1.7479938897739584e-05, "loss": 0.8284, "step": 3088 }, { "epoch": 0.2541863814030035, "grad_norm": 3.2529004658417864, "learning_rate": 1.747816965551315e-05, "loss": 0.8226, "step": 3089 }, { "epoch": 0.25426866899814854, "grad_norm": 2.945095187811677, "learning_rate": 1.7476399882049504e-05, "loss": 0.8261, "step": 3090 }, { "epoch": 0.25435095659329354, "grad_norm": 0.4431246499664612, "learning_rate": 1.7474629577474364e-05, "loss": 0.5286, "step": 3091 }, { "epoch": 0.2544332441884386, "grad_norm": 2.8315496862525857, "learning_rate": 1.7472858741913494e-05, "loss": 0.7767, "step": 3092 }, { "epoch": 0.2545155317835836, "grad_norm": 2.7369266356349566, "learning_rate": 1.747108737549269e-05, "loss": 0.7892, "step": 3093 }, { "epoch": 0.25459781937872866, "grad_norm": 2.7472613775880665, "learning_rate": 1.746931547833779e-05, "loss": 0.822, "step": 3094 }, { "epoch": 0.25468010697387367, "grad_norm": 0.43128596916781586, "learning_rate": 1.7467543050574663e-05, "loss": 0.5538, "step": 3095 }, { "epoch": 0.2547623945690187, "grad_norm": 0.4429071764601574, "learning_rate": 1.7465770092329216e-05, "loss": 0.5472, "step": 3096 }, { "epoch": 0.2548446821641637, "grad_norm": 2.5446895062718333, "learning_rate": 1.7463996603727405e-05, "loss": 0.7802, "step": 3097 }, { "epoch": 0.2549269697593088, "grad_norm": 3.206538126288419, "learning_rate": 1.746222258489521e-05, "loss": 0.8024, "step": 3098 }, { "epoch": 0.2550092573544538, "grad_norm": 2.3407871094989745, "learning_rate": 1.746044803595866e-05, "loss": 0.8284, "step": 3099 }, { "epoch": 0.25509154494959885, "grad_norm": 2.683572965991484, "learning_rate": 1.7458672957043807e-05, "loss": 0.8101, "step": 3100 }, { "epoch": 0.2551738325447439, "grad_norm": 0.4420254669332104, "learning_rate": 1.7456897348276764e-05, "loss": 0.5457, "step": 3101 }, { "epoch": 0.2552561201398889, "grad_norm": 2.363029115605435, "learning_rate": 1.745512120978365e-05, "loss": 0.8116, "step": 3102 }, { "epoch": 0.25533840773503397, "grad_norm": 2.1223518351061506, "learning_rate": 1.7453344541690653e-05, "loss": 0.816, "step": 3103 }, { "epoch": 0.25542069533017897, "grad_norm": 3.655723669767227, "learning_rate": 1.7451567344123978e-05, "loss": 0.8411, "step": 3104 }, { "epoch": 0.25550298292532403, "grad_norm": 2.356116906927714, "learning_rate": 1.7449789617209876e-05, "loss": 0.7996, "step": 3105 }, { "epoch": 0.25558527052046903, "grad_norm": 2.309729737380879, "learning_rate": 1.7448011361074634e-05, "loss": 0.8313, "step": 3106 }, { "epoch": 0.2556675581156141, "grad_norm": 2.316446164810882, "learning_rate": 1.7446232575844578e-05, "loss": 0.8265, "step": 3107 }, { "epoch": 0.2557498457107591, "grad_norm": 2.2446147804060566, "learning_rate": 1.744445326164607e-05, "loss": 0.8034, "step": 3108 }, { "epoch": 0.25583213330590415, "grad_norm": 2.101950996193958, "learning_rate": 1.74426734186055e-05, "loss": 0.7628, "step": 3109 }, { "epoch": 0.25591442090104916, "grad_norm": 2.161627019623046, "learning_rate": 1.744089304684932e-05, "loss": 0.7809, "step": 3110 }, { "epoch": 0.2559967084961942, "grad_norm": 2.2963699180473482, "learning_rate": 1.7439112146503994e-05, "loss": 0.7928, "step": 3111 }, { "epoch": 0.2560789960913392, "grad_norm": 2.633370868874133, "learning_rate": 1.743733071769604e-05, "loss": 0.7789, "step": 3112 }, { "epoch": 0.2561612836864843, "grad_norm": 2.329791359543784, "learning_rate": 1.7435548760552005e-05, "loss": 0.8215, "step": 3113 }, { "epoch": 0.2562435712816293, "grad_norm": 2.4293776147390234, "learning_rate": 1.743376627519848e-05, "loss": 0.7877, "step": 3114 }, { "epoch": 0.25632585887677434, "grad_norm": 2.4472772592053995, "learning_rate": 1.7431983261762087e-05, "loss": 0.7643, "step": 3115 }, { "epoch": 0.25640814647191934, "grad_norm": 2.4275086488371405, "learning_rate": 1.743019972036949e-05, "loss": 0.8256, "step": 3116 }, { "epoch": 0.2564904340670644, "grad_norm": 2.6011920159469684, "learning_rate": 1.742841565114738e-05, "loss": 0.7854, "step": 3117 }, { "epoch": 0.2565727216622094, "grad_norm": 2.606686750827679, "learning_rate": 1.742663105422251e-05, "loss": 0.8066, "step": 3118 }, { "epoch": 0.25665500925735446, "grad_norm": 0.46905392054729245, "learning_rate": 1.7424845929721645e-05, "loss": 0.5637, "step": 3119 }, { "epoch": 0.25673729685249946, "grad_norm": 0.43242755372410013, "learning_rate": 1.74230602777716e-05, "loss": 0.5467, "step": 3120 }, { "epoch": 0.2568195844476445, "grad_norm": 2.0432006955888564, "learning_rate": 1.7421274098499223e-05, "loss": 0.7777, "step": 3121 }, { "epoch": 0.2569018720427895, "grad_norm": 2.703151990446935, "learning_rate": 1.74194873920314e-05, "loss": 0.8186, "step": 3122 }, { "epoch": 0.2569841596379346, "grad_norm": 2.3934733083927613, "learning_rate": 1.741770015849506e-05, "loss": 0.8089, "step": 3123 }, { "epoch": 0.2570664472330796, "grad_norm": 2.012062497140323, "learning_rate": 1.7415912398017167e-05, "loss": 0.8043, "step": 3124 }, { "epoch": 0.25714873482822465, "grad_norm": 2.0661448979537003, "learning_rate": 1.7414124110724718e-05, "loss": 0.83, "step": 3125 }, { "epoch": 0.2572310224233697, "grad_norm": 2.501706805105995, "learning_rate": 1.7412335296744744e-05, "loss": 0.8424, "step": 3126 }, { "epoch": 0.2573133100185147, "grad_norm": 2.3208443689743388, "learning_rate": 1.741054595620433e-05, "loss": 0.8136, "step": 3127 }, { "epoch": 0.25739559761365977, "grad_norm": 4.837653791257567, "learning_rate": 1.740875608923058e-05, "loss": 0.8082, "step": 3128 }, { "epoch": 0.25747788520880477, "grad_norm": 2.8197341867948444, "learning_rate": 1.7406965695950644e-05, "loss": 0.8033, "step": 3129 }, { "epoch": 0.2575601728039498, "grad_norm": 1.9629309753444786, "learning_rate": 1.740517477649171e-05, "loss": 0.8161, "step": 3130 }, { "epoch": 0.25764246039909483, "grad_norm": 2.270871953975815, "learning_rate": 1.7403383330981008e-05, "loss": 0.8429, "step": 3131 }, { "epoch": 0.2577247479942399, "grad_norm": 0.4591756755142735, "learning_rate": 1.740159135954579e-05, "loss": 0.5406, "step": 3132 }, { "epoch": 0.2578070355893849, "grad_norm": 2.251577055692386, "learning_rate": 1.739979886231336e-05, "loss": 0.7972, "step": 3133 }, { "epoch": 0.25788932318452995, "grad_norm": 1.859966438912752, "learning_rate": 1.7398005839411056e-05, "loss": 0.8233, "step": 3134 }, { "epoch": 0.25797161077967495, "grad_norm": 2.8639139708150196, "learning_rate": 1.7396212290966247e-05, "loss": 0.8191, "step": 3135 }, { "epoch": 0.25805389837482, "grad_norm": 2.4944107163482157, "learning_rate": 1.7394418217106342e-05, "loss": 0.7591, "step": 3136 }, { "epoch": 0.258136185969965, "grad_norm": 2.366722872021388, "learning_rate": 1.7392623617958795e-05, "loss": 0.8167, "step": 3137 }, { "epoch": 0.2582184735651101, "grad_norm": 2.603370055066394, "learning_rate": 1.739082849365109e-05, "loss": 0.8104, "step": 3138 }, { "epoch": 0.2583007611602551, "grad_norm": 2.754780707706453, "learning_rate": 1.7389032844310746e-05, "loss": 0.8262, "step": 3139 }, { "epoch": 0.25838304875540014, "grad_norm": 7.339005960377376, "learning_rate": 1.7387236670065325e-05, "loss": 0.7942, "step": 3140 }, { "epoch": 0.25846533635054514, "grad_norm": 2.5408794751755517, "learning_rate": 1.7385439971042428e-05, "loss": 0.8389, "step": 3141 }, { "epoch": 0.2585476239456902, "grad_norm": 3.301905373284494, "learning_rate": 1.7383642747369688e-05, "loss": 0.8037, "step": 3142 }, { "epoch": 0.2586299115408352, "grad_norm": 0.46772970353286447, "learning_rate": 1.7381844999174773e-05, "loss": 0.5683, "step": 3143 }, { "epoch": 0.25871219913598026, "grad_norm": 3.331965492546159, "learning_rate": 1.7380046726585396e-05, "loss": 0.8083, "step": 3144 }, { "epoch": 0.25879448673112526, "grad_norm": 2.1550660455142987, "learning_rate": 1.73782479297293e-05, "loss": 0.8134, "step": 3145 }, { "epoch": 0.2588767743262703, "grad_norm": 2.9972605054435735, "learning_rate": 1.7376448608734275e-05, "loss": 0.7925, "step": 3146 }, { "epoch": 0.2589590619214153, "grad_norm": 2.164026283547475, "learning_rate": 1.7374648763728134e-05, "loss": 0.8315, "step": 3147 }, { "epoch": 0.2590413495165604, "grad_norm": 3.129881200517848, "learning_rate": 1.737284839483874e-05, "loss": 0.8187, "step": 3148 }, { "epoch": 0.2591236371117054, "grad_norm": 2.249471320311643, "learning_rate": 1.7371047502193988e-05, "loss": 0.7994, "step": 3149 }, { "epoch": 0.25920592470685044, "grad_norm": 0.43681294157060224, "learning_rate": 1.7369246085921808e-05, "loss": 0.5399, "step": 3150 }, { "epoch": 0.25928821230199545, "grad_norm": 4.811660631207214, "learning_rate": 1.736744414615017e-05, "loss": 0.8236, "step": 3151 }, { "epoch": 0.2593704998971405, "grad_norm": 2.5431996434505657, "learning_rate": 1.7365641683007085e-05, "loss": 0.803, "step": 3152 }, { "epoch": 0.25945278749228556, "grad_norm": 0.4447693002784123, "learning_rate": 1.7363838696620593e-05, "loss": 0.5437, "step": 3153 }, { "epoch": 0.25953507508743057, "grad_norm": 2.817772431461036, "learning_rate": 1.7362035187118777e-05, "loss": 0.8013, "step": 3154 }, { "epoch": 0.2596173626825756, "grad_norm": 3.3812300131117334, "learning_rate": 1.7360231154629756e-05, "loss": 0.8215, "step": 3155 }, { "epoch": 0.25969965027772063, "grad_norm": 2.6892117455209004, "learning_rate": 1.7358426599281686e-05, "loss": 0.8064, "step": 3156 }, { "epoch": 0.2597819378728657, "grad_norm": 3.053879603907526, "learning_rate": 1.7356621521202757e-05, "loss": 0.7923, "step": 3157 }, { "epoch": 0.2598642254680107, "grad_norm": 0.4570535125627652, "learning_rate": 1.73548159205212e-05, "loss": 0.5413, "step": 3158 }, { "epoch": 0.25994651306315575, "grad_norm": 4.174512131282455, "learning_rate": 1.7353009797365283e-05, "loss": 0.7724, "step": 3159 }, { "epoch": 0.26002880065830075, "grad_norm": 2.467743389444022, "learning_rate": 1.735120315186331e-05, "loss": 0.8162, "step": 3160 }, { "epoch": 0.2601110882534458, "grad_norm": 2.4087878649109564, "learning_rate": 1.734939598414362e-05, "loss": 0.8459, "step": 3161 }, { "epoch": 0.2601933758485908, "grad_norm": 2.882879818085009, "learning_rate": 1.7347588294334595e-05, "loss": 0.7865, "step": 3162 }, { "epoch": 0.26027566344373587, "grad_norm": 3.042682180964853, "learning_rate": 1.7345780082564646e-05, "loss": 0.7795, "step": 3163 }, { "epoch": 0.2603579510388809, "grad_norm": 2.684497062312509, "learning_rate": 1.734397134896223e-05, "loss": 0.8, "step": 3164 }, { "epoch": 0.26044023863402593, "grad_norm": 2.168903824528707, "learning_rate": 1.734216209365583e-05, "loss": 0.8216, "step": 3165 }, { "epoch": 0.26052252622917094, "grad_norm": 3.3438572773747843, "learning_rate": 1.734035231677398e-05, "loss": 0.8096, "step": 3166 }, { "epoch": 0.260604813824316, "grad_norm": 2.790640753291628, "learning_rate": 1.7338542018445242e-05, "loss": 0.8268, "step": 3167 }, { "epoch": 0.260687101419461, "grad_norm": 2.122562413274879, "learning_rate": 1.7336731198798214e-05, "loss": 0.8282, "step": 3168 }, { "epoch": 0.26076938901460606, "grad_norm": 2.0906485747970835, "learning_rate": 1.7334919857961533e-05, "loss": 0.7994, "step": 3169 }, { "epoch": 0.26085167660975106, "grad_norm": 2.5413232611159553, "learning_rate": 1.733310799606388e-05, "loss": 0.8384, "step": 3170 }, { "epoch": 0.2609339642048961, "grad_norm": 2.113615118093388, "learning_rate": 1.733129561323396e-05, "loss": 0.807, "step": 3171 }, { "epoch": 0.2610162518000411, "grad_norm": 3.8110046625338994, "learning_rate": 1.732948270960052e-05, "loss": 0.8028, "step": 3172 }, { "epoch": 0.2610985393951862, "grad_norm": 2.274089846845691, "learning_rate": 1.7327669285292357e-05, "loss": 0.8159, "step": 3173 }, { "epoch": 0.2611808269903312, "grad_norm": 2.155339007298501, "learning_rate": 1.7325855340438286e-05, "loss": 0.8151, "step": 3174 }, { "epoch": 0.26126311458547624, "grad_norm": 2.3913425727713538, "learning_rate": 1.7324040875167165e-05, "loss": 0.8157, "step": 3175 }, { "epoch": 0.26134540218062124, "grad_norm": 2.3386302794190748, "learning_rate": 1.7322225889607893e-05, "loss": 0.846, "step": 3176 }, { "epoch": 0.2614276897757663, "grad_norm": 2.678919405039354, "learning_rate": 1.7320410383889404e-05, "loss": 0.8309, "step": 3177 }, { "epoch": 0.26150997737091136, "grad_norm": 2.6864776396729035, "learning_rate": 1.7318594358140672e-05, "loss": 0.8027, "step": 3178 }, { "epoch": 0.26159226496605636, "grad_norm": 2.190560347922986, "learning_rate": 1.73167778124907e-05, "loss": 0.7834, "step": 3179 }, { "epoch": 0.2616745525612014, "grad_norm": 2.0674677033535866, "learning_rate": 1.731496074706853e-05, "loss": 0.8237, "step": 3180 }, { "epoch": 0.2617568401563464, "grad_norm": 2.2650779122141484, "learning_rate": 1.731314316200325e-05, "loss": 0.8189, "step": 3181 }, { "epoch": 0.2618391277514915, "grad_norm": 0.4501782139277584, "learning_rate": 1.7311325057423975e-05, "loss": 0.5598, "step": 3182 }, { "epoch": 0.2619214153466365, "grad_norm": 2.537932066869163, "learning_rate": 1.730950643345986e-05, "loss": 0.8132, "step": 3183 }, { "epoch": 0.26200370294178155, "grad_norm": 1.8526185042324084, "learning_rate": 1.73076872902401e-05, "loss": 0.8154, "step": 3184 }, { "epoch": 0.26208599053692655, "grad_norm": 1.902302784050244, "learning_rate": 1.730586762789392e-05, "loss": 0.7661, "step": 3185 }, { "epoch": 0.2621682781320716, "grad_norm": 2.053819447779948, "learning_rate": 1.7304047446550587e-05, "loss": 0.843, "step": 3186 }, { "epoch": 0.2622505657272166, "grad_norm": 1.8359116217629687, "learning_rate": 1.7302226746339405e-05, "loss": 0.8086, "step": 3187 }, { "epoch": 0.26233285332236167, "grad_norm": 1.6847594337426774, "learning_rate": 1.7300405527389715e-05, "loss": 0.8187, "step": 3188 }, { "epoch": 0.2624151409175067, "grad_norm": 0.41702344315935097, "learning_rate": 1.729858378983089e-05, "loss": 0.5229, "step": 3189 }, { "epoch": 0.26249742851265173, "grad_norm": 1.9228251205119076, "learning_rate": 1.7296761533792344e-05, "loss": 0.7767, "step": 3190 }, { "epoch": 0.26257971610779673, "grad_norm": 0.4265786138023259, "learning_rate": 1.729493875940353e-05, "loss": 0.5629, "step": 3191 }, { "epoch": 0.2626620037029418, "grad_norm": 2.1452185533518406, "learning_rate": 1.729311546679393e-05, "loss": 0.8223, "step": 3192 }, { "epoch": 0.2627442912980868, "grad_norm": 1.7012131918386595, "learning_rate": 1.7291291656093076e-05, "loss": 0.8546, "step": 3193 }, { "epoch": 0.26282657889323185, "grad_norm": 2.648389059941007, "learning_rate": 1.728946732743052e-05, "loss": 0.8148, "step": 3194 }, { "epoch": 0.26290886648837686, "grad_norm": 9.042676560045523, "learning_rate": 1.7287642480935863e-05, "loss": 0.8144, "step": 3195 }, { "epoch": 0.2629911540835219, "grad_norm": 2.7412001255716434, "learning_rate": 1.7285817116738738e-05, "loss": 0.7991, "step": 3196 }, { "epoch": 0.2630734416786669, "grad_norm": 2.3624966281421322, "learning_rate": 1.728399123496882e-05, "loss": 0.7915, "step": 3197 }, { "epoch": 0.263155729273812, "grad_norm": 2.1492800104720264, "learning_rate": 1.728216483575581e-05, "loss": 0.781, "step": 3198 }, { "epoch": 0.263238016868957, "grad_norm": 0.4780175499628937, "learning_rate": 1.728033791922946e-05, "loss": 0.5888, "step": 3199 }, { "epoch": 0.26332030446410204, "grad_norm": 2.5847737094687724, "learning_rate": 1.7278510485519548e-05, "loss": 0.8161, "step": 3200 }, { "epoch": 0.26340259205924704, "grad_norm": 6.83724462118161, "learning_rate": 1.727668253475589e-05, "loss": 0.7952, "step": 3201 }, { "epoch": 0.2634848796543921, "grad_norm": 3.3337817483900976, "learning_rate": 1.7274854067068337e-05, "loss": 0.8391, "step": 3202 }, { "epoch": 0.26356716724953716, "grad_norm": 3.6413631005329785, "learning_rate": 1.727302508258679e-05, "loss": 0.8345, "step": 3203 }, { "epoch": 0.26364945484468216, "grad_norm": 0.4708623517734391, "learning_rate": 1.7271195581441174e-05, "loss": 0.5692, "step": 3204 }, { "epoch": 0.2637317424398272, "grad_norm": 0.4665677729352944, "learning_rate": 1.7269365563761452e-05, "loss": 0.5749, "step": 3205 }, { "epoch": 0.2638140300349722, "grad_norm": 3.2795161486720605, "learning_rate": 1.726753502967762e-05, "loss": 0.8285, "step": 3206 }, { "epoch": 0.2638963176301173, "grad_norm": 20.250846110052386, "learning_rate": 1.726570397931973e-05, "loss": 0.8299, "step": 3207 }, { "epoch": 0.2639786052252623, "grad_norm": 3.9499671072623643, "learning_rate": 1.7263872412817847e-05, "loss": 0.8233, "step": 3208 }, { "epoch": 0.26406089282040734, "grad_norm": 2.2860994668827193, "learning_rate": 1.7262040330302085e-05, "loss": 0.8133, "step": 3209 }, { "epoch": 0.26414318041555235, "grad_norm": 3.473394250186446, "learning_rate": 1.7260207731902586e-05, "loss": 0.8001, "step": 3210 }, { "epoch": 0.2642254680106974, "grad_norm": 3.1593135449863845, "learning_rate": 1.7258374617749547e-05, "loss": 0.804, "step": 3211 }, { "epoch": 0.2643077556058424, "grad_norm": 2.044762678096964, "learning_rate": 1.725654098797318e-05, "loss": 0.8373, "step": 3212 }, { "epoch": 0.26439004320098747, "grad_norm": 2.2672027919069437, "learning_rate": 1.725470684270375e-05, "loss": 0.8107, "step": 3213 }, { "epoch": 0.26447233079613247, "grad_norm": 2.746222310105515, "learning_rate": 1.7252872182071543e-05, "loss": 0.7898, "step": 3214 }, { "epoch": 0.26455461839127753, "grad_norm": 2.34381887200085, "learning_rate": 1.72510370062069e-05, "loss": 0.8069, "step": 3215 }, { "epoch": 0.26463690598642253, "grad_norm": 2.1027650450293893, "learning_rate": 1.724920131524018e-05, "loss": 0.8081, "step": 3216 }, { "epoch": 0.2647191935815676, "grad_norm": 2.681313461674057, "learning_rate": 1.7247365109301797e-05, "loss": 0.8357, "step": 3217 }, { "epoch": 0.2648014811767126, "grad_norm": 0.4873380246831437, "learning_rate": 1.7245528388522184e-05, "loss": 0.5659, "step": 3218 }, { "epoch": 0.26488376877185765, "grad_norm": 2.1376457729444898, "learning_rate": 1.7243691153031824e-05, "loss": 0.785, "step": 3219 }, { "epoch": 0.26496605636700266, "grad_norm": 2.4174703642569852, "learning_rate": 1.7241853402961227e-05, "loss": 0.8179, "step": 3220 }, { "epoch": 0.2650483439621477, "grad_norm": 1.7698760368671045, "learning_rate": 1.7240015138440947e-05, "loss": 0.822, "step": 3221 }, { "epoch": 0.2651306315572927, "grad_norm": 0.45256651843325324, "learning_rate": 1.723817635960157e-05, "loss": 0.5495, "step": 3222 }, { "epoch": 0.2652129191524378, "grad_norm": 1.8859690396403943, "learning_rate": 1.7236337066573717e-05, "loss": 0.8127, "step": 3223 }, { "epoch": 0.2652952067475828, "grad_norm": 1.778989937861356, "learning_rate": 1.7234497259488056e-05, "loss": 0.8183, "step": 3224 }, { "epoch": 0.26537749434272784, "grad_norm": 2.164404304323049, "learning_rate": 1.7232656938475278e-05, "loss": 0.8276, "step": 3225 }, { "epoch": 0.26545978193787284, "grad_norm": 0.4274796517906534, "learning_rate": 1.7230816103666118e-05, "loss": 0.5293, "step": 3226 }, { "epoch": 0.2655420695330179, "grad_norm": 2.515911549884607, "learning_rate": 1.7228974755191346e-05, "loss": 0.8335, "step": 3227 }, { "epoch": 0.2656243571281629, "grad_norm": 2.665495710145411, "learning_rate": 1.722713289318177e-05, "loss": 0.8183, "step": 3228 }, { "epoch": 0.26570664472330796, "grad_norm": 2.1032879064622914, "learning_rate": 1.7225290517768227e-05, "loss": 0.7815, "step": 3229 }, { "epoch": 0.265788932318453, "grad_norm": 0.45918301181841464, "learning_rate": 1.7223447629081606e-05, "loss": 0.5528, "step": 3230 }, { "epoch": 0.265871219913598, "grad_norm": 2.0918680503795812, "learning_rate": 1.7221604227252813e-05, "loss": 0.8358, "step": 3231 }, { "epoch": 0.2659535075087431, "grad_norm": 2.024008414579574, "learning_rate": 1.721976031241281e-05, "loss": 0.8327, "step": 3232 }, { "epoch": 0.2660357951038881, "grad_norm": 1.8557295954758255, "learning_rate": 1.7217915884692575e-05, "loss": 0.8278, "step": 3233 }, { "epoch": 0.26611808269903314, "grad_norm": 2.1019216116121737, "learning_rate": 1.721607094422314e-05, "loss": 0.785, "step": 3234 }, { "epoch": 0.26620037029417815, "grad_norm": 1.8352365534118282, "learning_rate": 1.721422549113557e-05, "loss": 0.8203, "step": 3235 }, { "epoch": 0.2662826578893232, "grad_norm": 2.1039521614565118, "learning_rate": 1.7212379525560956e-05, "loss": 0.8513, "step": 3236 }, { "epoch": 0.2663649454844682, "grad_norm": 2.520211759922124, "learning_rate": 1.7210533047630436e-05, "loss": 0.7667, "step": 3237 }, { "epoch": 0.26644723307961327, "grad_norm": 2.0121725529690613, "learning_rate": 1.720868605747518e-05, "loss": 0.8022, "step": 3238 }, { "epoch": 0.26652952067475827, "grad_norm": 3.0568058022460805, "learning_rate": 1.7206838555226394e-05, "loss": 0.7937, "step": 3239 }, { "epoch": 0.2666118082699033, "grad_norm": 2.7439724996226382, "learning_rate": 1.720499054101532e-05, "loss": 0.8571, "step": 3240 }, { "epoch": 0.26669409586504833, "grad_norm": 2.3999917272443185, "learning_rate": 1.7203142014973245e-05, "loss": 0.8092, "step": 3241 }, { "epoch": 0.2667763834601934, "grad_norm": 2.548770342859101, "learning_rate": 1.7201292977231475e-05, "loss": 0.8116, "step": 3242 }, { "epoch": 0.2668586710553384, "grad_norm": 2.5150954313517304, "learning_rate": 1.7199443427921375e-05, "loss": 0.8253, "step": 3243 }, { "epoch": 0.26694095865048345, "grad_norm": 0.45147598726438465, "learning_rate": 1.7197593367174326e-05, "loss": 0.5544, "step": 3244 }, { "epoch": 0.26702324624562845, "grad_norm": 3.1494051772033442, "learning_rate": 1.7195742795121754e-05, "loss": 0.83, "step": 3245 }, { "epoch": 0.2671055338407735, "grad_norm": 2.4260353862913413, "learning_rate": 1.7193891711895122e-05, "loss": 0.8145, "step": 3246 }, { "epoch": 0.2671878214359185, "grad_norm": 2.366121050333026, "learning_rate": 1.7192040117625927e-05, "loss": 0.8437, "step": 3247 }, { "epoch": 0.2672701090310636, "grad_norm": 0.4277435140922396, "learning_rate": 1.7190188012445707e-05, "loss": 0.533, "step": 3248 }, { "epoch": 0.2673523966262086, "grad_norm": 2.5357390411312393, "learning_rate": 1.7188335396486024e-05, "loss": 0.8084, "step": 3249 }, { "epoch": 0.26743468422135364, "grad_norm": 2.8034007544837753, "learning_rate": 1.7186482269878496e-05, "loss": 0.8069, "step": 3250 }, { "epoch": 0.26751697181649864, "grad_norm": 2.7529826625470846, "learning_rate": 1.718462863275476e-05, "loss": 0.799, "step": 3251 }, { "epoch": 0.2675992594116437, "grad_norm": 2.181450039061867, "learning_rate": 1.7182774485246493e-05, "loss": 0.8287, "step": 3252 }, { "epoch": 0.2676815470067887, "grad_norm": 0.5190998331313099, "learning_rate": 1.7180919827485414e-05, "loss": 0.5556, "step": 3253 }, { "epoch": 0.26776383460193376, "grad_norm": 2.6524747277559366, "learning_rate": 1.7179064659603277e-05, "loss": 0.8174, "step": 3254 }, { "epoch": 0.2678461221970788, "grad_norm": 2.857466915807229, "learning_rate": 1.7177208981731864e-05, "loss": 0.7897, "step": 3255 }, { "epoch": 0.2679284097922238, "grad_norm": 2.1330095635523376, "learning_rate": 1.717535279400301e-05, "loss": 0.7827, "step": 3256 }, { "epoch": 0.2680106973873689, "grad_norm": 1.6286409905128667, "learning_rate": 1.7173496096548562e-05, "loss": 0.8053, "step": 3257 }, { "epoch": 0.2680929849825139, "grad_norm": 1.9075208132843064, "learning_rate": 1.717163888950043e-05, "loss": 0.8175, "step": 3258 }, { "epoch": 0.26817527257765894, "grad_norm": 2.1464092784064395, "learning_rate": 1.7169781172990532e-05, "loss": 0.805, "step": 3259 }, { "epoch": 0.26825756017280394, "grad_norm": 2.354882095513986, "learning_rate": 1.716792294715085e-05, "loss": 0.8168, "step": 3260 }, { "epoch": 0.268339847767949, "grad_norm": 2.4235854013908784, "learning_rate": 1.716606421211339e-05, "loss": 0.8108, "step": 3261 }, { "epoch": 0.268422135363094, "grad_norm": 0.441932485843513, "learning_rate": 1.7164204968010186e-05, "loss": 0.5369, "step": 3262 }, { "epoch": 0.26850442295823906, "grad_norm": 3.8674534779204675, "learning_rate": 1.7162345214973316e-05, "loss": 0.8112, "step": 3263 }, { "epoch": 0.26858671055338407, "grad_norm": 2.0192449661176077, "learning_rate": 1.71604849531349e-05, "loss": 0.8362, "step": 3264 }, { "epoch": 0.2686689981485291, "grad_norm": 2.1024227141859564, "learning_rate": 1.715862418262708e-05, "loss": 0.8234, "step": 3265 }, { "epoch": 0.26875128574367413, "grad_norm": 1.7913169528107178, "learning_rate": 1.715676290358205e-05, "loss": 0.8087, "step": 3266 }, { "epoch": 0.2688335733388192, "grad_norm": 2.4916457168510275, "learning_rate": 1.715490111613203e-05, "loss": 0.8152, "step": 3267 }, { "epoch": 0.2689158609339642, "grad_norm": 0.44607243016910886, "learning_rate": 1.7153038820409272e-05, "loss": 0.5177, "step": 3268 }, { "epoch": 0.26899814852910925, "grad_norm": 1.9388380159999783, "learning_rate": 1.7151176016546078e-05, "loss": 0.8064, "step": 3269 }, { "epoch": 0.26908043612425425, "grad_norm": 1.8908928592795229, "learning_rate": 1.7149312704674778e-05, "loss": 0.8106, "step": 3270 }, { "epoch": 0.2691627237193993, "grad_norm": 2.153614448672576, "learning_rate": 1.7147448884927737e-05, "loss": 0.8105, "step": 3271 }, { "epoch": 0.2692450113145443, "grad_norm": 1.6705307534234544, "learning_rate": 1.7145584557437357e-05, "loss": 0.7938, "step": 3272 }, { "epoch": 0.26932729890968937, "grad_norm": 1.8896210907500344, "learning_rate": 1.714371972233608e-05, "loss": 0.8239, "step": 3273 }, { "epoch": 0.2694095865048344, "grad_norm": 2.8554806232336962, "learning_rate": 1.7141854379756373e-05, "loss": 0.7964, "step": 3274 }, { "epoch": 0.26949187409997943, "grad_norm": 6.209599869394988, "learning_rate": 1.713998852983076e-05, "loss": 0.802, "step": 3275 }, { "epoch": 0.26957416169512444, "grad_norm": 2.07612603899728, "learning_rate": 1.7138122172691774e-05, "loss": 0.8126, "step": 3276 }, { "epoch": 0.2696564492902695, "grad_norm": 0.44897503609566247, "learning_rate": 1.713625530847201e-05, "loss": 0.5385, "step": 3277 }, { "epoch": 0.2697387368854145, "grad_norm": 1.9728522028250042, "learning_rate": 1.7134387937304075e-05, "loss": 0.8193, "step": 3278 }, { "epoch": 0.26982102448055956, "grad_norm": 1.978362275479669, "learning_rate": 1.7132520059320635e-05, "loss": 0.8097, "step": 3279 }, { "epoch": 0.26990331207570456, "grad_norm": 2.757889220842567, "learning_rate": 1.7130651674654374e-05, "loss": 0.8308, "step": 3280 }, { "epoch": 0.2699855996708496, "grad_norm": 2.398643541912809, "learning_rate": 1.7128782783438027e-05, "loss": 0.7921, "step": 3281 }, { "epoch": 0.2700678872659947, "grad_norm": 1.7756416210361876, "learning_rate": 1.712691338580435e-05, "loss": 0.8216, "step": 3282 }, { "epoch": 0.2701501748611397, "grad_norm": 1.8358732316988162, "learning_rate": 1.712504348188614e-05, "loss": 0.7986, "step": 3283 }, { "epoch": 0.27023246245628474, "grad_norm": 1.9998406902584105, "learning_rate": 1.712317307181624e-05, "loss": 0.8137, "step": 3284 }, { "epoch": 0.27031475005142974, "grad_norm": 2.323256865689543, "learning_rate": 1.7121302155727516e-05, "loss": 0.803, "step": 3285 }, { "epoch": 0.2703970376465748, "grad_norm": 1.8768893607837283, "learning_rate": 1.7119430733752875e-05, "loss": 0.8083, "step": 3286 }, { "epoch": 0.2704793252417198, "grad_norm": 2.6528067614762163, "learning_rate": 1.7117558806025262e-05, "loss": 0.7954, "step": 3287 }, { "epoch": 0.27056161283686486, "grad_norm": 1.973609765171818, "learning_rate": 1.7115686372677652e-05, "loss": 0.7853, "step": 3288 }, { "epoch": 0.27064390043200987, "grad_norm": 2.3217110177707623, "learning_rate": 1.7113813433843063e-05, "loss": 0.8039, "step": 3289 }, { "epoch": 0.2707261880271549, "grad_norm": 2.838651310164189, "learning_rate": 1.7111939989654544e-05, "loss": 0.8316, "step": 3290 }, { "epoch": 0.2708084756222999, "grad_norm": 2.382217470261549, "learning_rate": 1.7110066040245183e-05, "loss": 0.8242, "step": 3291 }, { "epoch": 0.270890763217445, "grad_norm": 2.887414017571467, "learning_rate": 1.7108191585748103e-05, "loss": 0.8337, "step": 3292 }, { "epoch": 0.27097305081259, "grad_norm": 0.47604219888877264, "learning_rate": 1.710631662629646e-05, "loss": 0.5184, "step": 3293 }, { "epoch": 0.27105533840773505, "grad_norm": 2.0852978156132593, "learning_rate": 1.7104441162023444e-05, "loss": 0.8153, "step": 3294 }, { "epoch": 0.27113762600288005, "grad_norm": 2.1675255629575583, "learning_rate": 1.7102565193062294e-05, "loss": 0.8, "step": 3295 }, { "epoch": 0.2712199135980251, "grad_norm": 4.09311237887044, "learning_rate": 1.710068871954627e-05, "loss": 0.7962, "step": 3296 }, { "epoch": 0.2713022011931701, "grad_norm": 0.4518855461550148, "learning_rate": 1.7098811741608675e-05, "loss": 0.5521, "step": 3297 }, { "epoch": 0.27138448878831517, "grad_norm": 1.9238316344585502, "learning_rate": 1.709693425938285e-05, "loss": 0.8074, "step": 3298 }, { "epoch": 0.2714667763834602, "grad_norm": 2.1643329311004273, "learning_rate": 1.709505627300216e-05, "loss": 0.8162, "step": 3299 }, { "epoch": 0.27154906397860523, "grad_norm": 2.177575648049798, "learning_rate": 1.7093177782600023e-05, "loss": 0.8079, "step": 3300 }, { "epoch": 0.27163135157375023, "grad_norm": 2.6025431418214824, "learning_rate": 1.709129878830988e-05, "loss": 0.8499, "step": 3301 }, { "epoch": 0.2717136391688953, "grad_norm": 3.4271863171360755, "learning_rate": 1.708941929026521e-05, "loss": 0.8361, "step": 3302 }, { "epoch": 0.2717959267640403, "grad_norm": 2.7718425253153596, "learning_rate": 1.7087539288599533e-05, "loss": 0.8321, "step": 3303 }, { "epoch": 0.27187821435918536, "grad_norm": 2.03764504045628, "learning_rate": 1.70856587834464e-05, "loss": 0.8324, "step": 3304 }, { "epoch": 0.27196050195433036, "grad_norm": 0.4650979904841794, "learning_rate": 1.7083777774939396e-05, "loss": 0.5372, "step": 3305 }, { "epoch": 0.2720427895494754, "grad_norm": 2.0512935842462423, "learning_rate": 1.708189626321215e-05, "loss": 0.8092, "step": 3306 }, { "epoch": 0.2721250771446205, "grad_norm": 2.0198795891242054, "learning_rate": 1.708001424839832e-05, "loss": 0.8012, "step": 3307 }, { "epoch": 0.2722073647397655, "grad_norm": 2.5750179825763673, "learning_rate": 1.70781317306316e-05, "loss": 0.7717, "step": 3308 }, { "epoch": 0.27228965233491054, "grad_norm": 1.6929684242043748, "learning_rate": 1.7076248710045723e-05, "loss": 0.8148, "step": 3309 }, { "epoch": 0.27237193993005554, "grad_norm": 2.553340454097266, "learning_rate": 1.7074365186774452e-05, "loss": 0.8086, "step": 3310 }, { "epoch": 0.2724542275252006, "grad_norm": 0.4513239731225968, "learning_rate": 1.7072481160951592e-05, "loss": 0.5447, "step": 3311 }, { "epoch": 0.2725365151203456, "grad_norm": 2.147515676061674, "learning_rate": 1.707059663271098e-05, "loss": 0.7959, "step": 3312 }, { "epoch": 0.27261880271549066, "grad_norm": 1.781932232000643, "learning_rate": 1.7068711602186495e-05, "loss": 0.7834, "step": 3313 }, { "epoch": 0.27270109031063566, "grad_norm": 2.2065108390079247, "learning_rate": 1.706682606951204e-05, "loss": 0.8294, "step": 3314 }, { "epoch": 0.2727833779057807, "grad_norm": 0.44205612058721017, "learning_rate": 1.706494003482156e-05, "loss": 0.5366, "step": 3315 }, { "epoch": 0.2728656655009257, "grad_norm": 2.305688184989724, "learning_rate": 1.7063053498249043e-05, "loss": 0.8396, "step": 3316 }, { "epoch": 0.2729479530960708, "grad_norm": 2.3601766963878523, "learning_rate": 1.70611664599285e-05, "loss": 0.8406, "step": 3317 }, { "epoch": 0.2730302406912158, "grad_norm": 1.8335893965816523, "learning_rate": 1.7059278919993984e-05, "loss": 0.8155, "step": 3318 }, { "epoch": 0.27311252828636084, "grad_norm": 0.41818738369309927, "learning_rate": 1.705739087857958e-05, "loss": 0.5361, "step": 3319 }, { "epoch": 0.27319481588150585, "grad_norm": 1.9509967112825835, "learning_rate": 1.7055502335819424e-05, "loss": 0.7959, "step": 3320 }, { "epoch": 0.2732771034766509, "grad_norm": 1.8398501099325344, "learning_rate": 1.7053613291847656e-05, "loss": 0.7916, "step": 3321 }, { "epoch": 0.2733593910717959, "grad_norm": 1.6060444808115466, "learning_rate": 1.7051723746798485e-05, "loss": 0.7943, "step": 3322 }, { "epoch": 0.27344167866694097, "grad_norm": 1.8905149545844033, "learning_rate": 1.7049833700806137e-05, "loss": 0.8041, "step": 3323 }, { "epoch": 0.27352396626208597, "grad_norm": 1.8663100124717953, "learning_rate": 1.7047943154004875e-05, "loss": 0.8031, "step": 3324 }, { "epoch": 0.27360625385723103, "grad_norm": 2.0161913834988017, "learning_rate": 1.7046052106529004e-05, "loss": 0.8337, "step": 3325 }, { "epoch": 0.27368854145237603, "grad_norm": 2.0274683750441658, "learning_rate": 1.704416055851286e-05, "loss": 0.7846, "step": 3326 }, { "epoch": 0.2737708290475211, "grad_norm": 0.44113211885882087, "learning_rate": 1.7042268510090814e-05, "loss": 0.5408, "step": 3327 }, { "epoch": 0.2738531166426661, "grad_norm": 2.2224775662705114, "learning_rate": 1.7040375961397278e-05, "loss": 0.7821, "step": 3328 }, { "epoch": 0.27393540423781115, "grad_norm": 1.9250995186429698, "learning_rate": 1.703848291256669e-05, "loss": 0.8445, "step": 3329 }, { "epoch": 0.27401769183295616, "grad_norm": 1.864181688494493, "learning_rate": 1.7036589363733534e-05, "loss": 0.797, "step": 3330 }, { "epoch": 0.2740999794281012, "grad_norm": 1.7449914040884869, "learning_rate": 1.7034695315032323e-05, "loss": 0.8396, "step": 3331 }, { "epoch": 0.2741822670232463, "grad_norm": 2.0412298979855183, "learning_rate": 1.7032800766597608e-05, "loss": 0.8351, "step": 3332 }, { "epoch": 0.2742645546183913, "grad_norm": 2.1068397051206755, "learning_rate": 1.7030905718563972e-05, "loss": 0.7893, "step": 3333 }, { "epoch": 0.27434684221353633, "grad_norm": 1.8306211345628756, "learning_rate": 1.7029010171066042e-05, "loss": 0.8022, "step": 3334 }, { "epoch": 0.27442912980868134, "grad_norm": 1.5634997113375737, "learning_rate": 1.7027114124238466e-05, "loss": 0.8086, "step": 3335 }, { "epoch": 0.2745114174038264, "grad_norm": 1.7569546506128608, "learning_rate": 1.7025217578215943e-05, "loss": 0.8014, "step": 3336 }, { "epoch": 0.2745937049989714, "grad_norm": 2.0232013413047096, "learning_rate": 1.7023320533133198e-05, "loss": 0.8504, "step": 3337 }, { "epoch": 0.27467599259411646, "grad_norm": 1.696664859128998, "learning_rate": 1.702142298912499e-05, "loss": 0.8252, "step": 3338 }, { "epoch": 0.27475828018926146, "grad_norm": 1.9048938072450126, "learning_rate": 1.7019524946326128e-05, "loss": 0.8314, "step": 3339 }, { "epoch": 0.2748405677844065, "grad_norm": 1.6242850934087922, "learning_rate": 1.7017626404871438e-05, "loss": 0.7981, "step": 3340 }, { "epoch": 0.2749228553795515, "grad_norm": 1.5322679268428667, "learning_rate": 1.7015727364895794e-05, "loss": 0.8063, "step": 3341 }, { "epoch": 0.2750051429746966, "grad_norm": 1.8137498841382922, "learning_rate": 1.7013827826534096e-05, "loss": 0.7779, "step": 3342 }, { "epoch": 0.2750874305698416, "grad_norm": 1.7887008817254988, "learning_rate": 1.7011927789921283e-05, "loss": 0.8315, "step": 3343 }, { "epoch": 0.27516971816498664, "grad_norm": 1.940416298679269, "learning_rate": 1.7010027255192337e-05, "loss": 0.8019, "step": 3344 }, { "epoch": 0.27525200576013165, "grad_norm": 1.5770078546653166, "learning_rate": 1.7008126222482265e-05, "loss": 0.7964, "step": 3345 }, { "epoch": 0.2753342933552767, "grad_norm": 2.067596160022381, "learning_rate": 1.7006224691926113e-05, "loss": 0.7864, "step": 3346 }, { "epoch": 0.2754165809504217, "grad_norm": 2.756085711775809, "learning_rate": 1.7004322663658967e-05, "loss": 0.8236, "step": 3347 }, { "epoch": 0.27549886854556677, "grad_norm": 1.9318322636331693, "learning_rate": 1.7002420137815936e-05, "loss": 0.8145, "step": 3348 }, { "epoch": 0.27558115614071177, "grad_norm": 1.7523196389978979, "learning_rate": 1.700051711453218e-05, "loss": 0.7977, "step": 3349 }, { "epoch": 0.27566344373585683, "grad_norm": 1.7864940799926223, "learning_rate": 1.6998613593942886e-05, "loss": 0.8005, "step": 3350 }, { "epoch": 0.27574573133100183, "grad_norm": 1.8804326018457047, "learning_rate": 1.699670957618327e-05, "loss": 0.8339, "step": 3351 }, { "epoch": 0.2758280189261469, "grad_norm": 1.6044837569260737, "learning_rate": 1.6994805061388597e-05, "loss": 0.8433, "step": 3352 }, { "epoch": 0.2759103065212919, "grad_norm": 1.9980538892293163, "learning_rate": 1.699290004969416e-05, "loss": 0.8173, "step": 3353 }, { "epoch": 0.27599259411643695, "grad_norm": 1.4377196318751049, "learning_rate": 1.6990994541235287e-05, "loss": 0.8204, "step": 3354 }, { "epoch": 0.27607488171158195, "grad_norm": 1.945350080976809, "learning_rate": 1.6989088536147343e-05, "loss": 0.8322, "step": 3355 }, { "epoch": 0.276157169306727, "grad_norm": 2.294389988296374, "learning_rate": 1.6987182034565727e-05, "loss": 0.7996, "step": 3356 }, { "epoch": 0.276239456901872, "grad_norm": 1.41771829095886, "learning_rate": 1.698527503662587e-05, "loss": 0.8343, "step": 3357 }, { "epoch": 0.2763217444970171, "grad_norm": 1.5926276389564853, "learning_rate": 1.698336754246325e-05, "loss": 0.7884, "step": 3358 }, { "epoch": 0.27640403209216213, "grad_norm": 1.586524028319989, "learning_rate": 1.6981459552213363e-05, "loss": 0.8287, "step": 3359 }, { "epoch": 0.27648631968730714, "grad_norm": 1.740900423555991, "learning_rate": 1.697955106601176e-05, "loss": 0.8004, "step": 3360 }, { "epoch": 0.2765686072824522, "grad_norm": 1.758580264555415, "learning_rate": 1.6977642083994006e-05, "loss": 0.8201, "step": 3361 }, { "epoch": 0.2766508948775972, "grad_norm": 0.4741258364442609, "learning_rate": 1.697573260629572e-05, "loss": 0.5618, "step": 3362 }, { "epoch": 0.27673318247274226, "grad_norm": 2.1009382288448015, "learning_rate": 1.6973822633052547e-05, "loss": 0.809, "step": 3363 }, { "epoch": 0.27681547006788726, "grad_norm": 1.4906072475343315, "learning_rate": 1.6971912164400163e-05, "loss": 0.7993, "step": 3364 }, { "epoch": 0.2768977576630323, "grad_norm": 0.4185664842935336, "learning_rate": 1.6970001200474296e-05, "loss": 0.5429, "step": 3365 }, { "epoch": 0.2769800452581773, "grad_norm": 2.0662172481020034, "learning_rate": 1.6968089741410684e-05, "loss": 0.8321, "step": 3366 }, { "epoch": 0.2770623328533224, "grad_norm": 0.42729417981410084, "learning_rate": 1.6966177787345125e-05, "loss": 0.5264, "step": 3367 }, { "epoch": 0.2771446204484674, "grad_norm": 1.4645916126136944, "learning_rate": 1.6964265338413434e-05, "loss": 0.8361, "step": 3368 }, { "epoch": 0.27722690804361244, "grad_norm": 2.0284938459903494, "learning_rate": 1.6962352394751473e-05, "loss": 0.7948, "step": 3369 }, { "epoch": 0.27730919563875744, "grad_norm": 1.8160325925978076, "learning_rate": 1.696043895649513e-05, "loss": 0.8187, "step": 3370 }, { "epoch": 0.2773914832339025, "grad_norm": 1.6372608945764426, "learning_rate": 1.6958525023780337e-05, "loss": 0.8418, "step": 3371 }, { "epoch": 0.2774737708290475, "grad_norm": 1.4144229825435075, "learning_rate": 1.6956610596743057e-05, "loss": 0.8092, "step": 3372 }, { "epoch": 0.27755605842419256, "grad_norm": 1.488036616750426, "learning_rate": 1.695469567551928e-05, "loss": 0.8124, "step": 3373 }, { "epoch": 0.27763834601933757, "grad_norm": 1.4610354658407356, "learning_rate": 1.695278026024505e-05, "loss": 0.7949, "step": 3374 }, { "epoch": 0.2777206336144826, "grad_norm": 1.5881282267404018, "learning_rate": 1.6950864351056426e-05, "loss": 0.8117, "step": 3375 }, { "epoch": 0.27780292120962763, "grad_norm": 2.0521286867664714, "learning_rate": 1.6948947948089512e-05, "loss": 0.7922, "step": 3376 }, { "epoch": 0.2778852088047727, "grad_norm": 0.4982262468961942, "learning_rate": 1.6947031051480457e-05, "loss": 0.5381, "step": 3377 }, { "epoch": 0.2779674963999177, "grad_norm": 1.932526443724488, "learning_rate": 1.694511366136542e-05, "loss": 0.8203, "step": 3378 }, { "epoch": 0.27804978399506275, "grad_norm": 2.0835624700650395, "learning_rate": 1.6943195777880615e-05, "loss": 0.8289, "step": 3379 }, { "epoch": 0.27813207159020775, "grad_norm": 0.41924968430352944, "learning_rate": 1.6941277401162292e-05, "loss": 0.5271, "step": 3380 }, { "epoch": 0.2782143591853528, "grad_norm": 0.42900497961077644, "learning_rate": 1.693935853134672e-05, "loss": 0.527, "step": 3381 }, { "epoch": 0.2782966467804978, "grad_norm": 1.70942845007818, "learning_rate": 1.6937439168570217e-05, "loss": 0.8289, "step": 3382 }, { "epoch": 0.2783789343756429, "grad_norm": 0.4169202457645185, "learning_rate": 1.693551931296913e-05, "loss": 0.5405, "step": 3383 }, { "epoch": 0.27846122197078793, "grad_norm": 1.5249997090807386, "learning_rate": 1.693359896467984e-05, "loss": 0.7746, "step": 3384 }, { "epoch": 0.27854350956593293, "grad_norm": 1.844099104485872, "learning_rate": 1.693167812383877e-05, "loss": 0.8101, "step": 3385 }, { "epoch": 0.278625797161078, "grad_norm": 0.4620191998761672, "learning_rate": 1.6929756790582374e-05, "loss": 0.5587, "step": 3386 }, { "epoch": 0.278708084756223, "grad_norm": 1.8276177937378877, "learning_rate": 1.6927834965047134e-05, "loss": 0.8152, "step": 3387 }, { "epoch": 0.27879037235136805, "grad_norm": 1.521403662930178, "learning_rate": 1.692591264736958e-05, "loss": 0.8188, "step": 3388 }, { "epoch": 0.27887265994651306, "grad_norm": 4.367006002729017, "learning_rate": 1.6923989837686266e-05, "loss": 0.7981, "step": 3389 }, { "epoch": 0.2789549475416581, "grad_norm": 1.5441132673864073, "learning_rate": 1.692206653613379e-05, "loss": 0.8155, "step": 3390 }, { "epoch": 0.2790372351368031, "grad_norm": 0.43893254704922163, "learning_rate": 1.6920142742848775e-05, "loss": 0.5416, "step": 3391 }, { "epoch": 0.2791195227319482, "grad_norm": 2.6249098901055126, "learning_rate": 1.6918218457967888e-05, "loss": 0.8534, "step": 3392 }, { "epoch": 0.2792018103270932, "grad_norm": 1.627178716854893, "learning_rate": 1.6916293681627823e-05, "loss": 0.8187, "step": 3393 }, { "epoch": 0.27928409792223824, "grad_norm": 1.9555270765008443, "learning_rate": 1.691436841396532e-05, "loss": 0.8209, "step": 3394 }, { "epoch": 0.27936638551738324, "grad_norm": 0.4305921737725394, "learning_rate": 1.6912442655117144e-05, "loss": 0.5327, "step": 3395 }, { "epoch": 0.2794486731125283, "grad_norm": 1.490785336532508, "learning_rate": 1.691051640522009e-05, "loss": 0.7933, "step": 3396 }, { "epoch": 0.2795309607076733, "grad_norm": 2.677448047961783, "learning_rate": 1.6908589664411007e-05, "loss": 0.7862, "step": 3397 }, { "epoch": 0.27961324830281836, "grad_norm": 2.3547165596097472, "learning_rate": 1.6906662432826763e-05, "loss": 0.8082, "step": 3398 }, { "epoch": 0.27969553589796337, "grad_norm": 1.6494931737575393, "learning_rate": 1.690473471060426e-05, "loss": 0.8217, "step": 3399 }, { "epoch": 0.2797778234931084, "grad_norm": 1.6765976469942176, "learning_rate": 1.6902806497880454e-05, "loss": 0.8225, "step": 3400 }, { "epoch": 0.2798601110882534, "grad_norm": 0.46128265450139233, "learning_rate": 1.690087779479231e-05, "loss": 0.5631, "step": 3401 }, { "epoch": 0.2799423986833985, "grad_norm": 5.213573703540176, "learning_rate": 1.6898948601476842e-05, "loss": 0.8275, "step": 3402 }, { "epoch": 0.2800246862785435, "grad_norm": 1.9002942814116162, "learning_rate": 1.68970189180711e-05, "loss": 0.8315, "step": 3403 }, { "epoch": 0.28010697387368855, "grad_norm": 1.710672469450862, "learning_rate": 1.6895088744712164e-05, "loss": 0.814, "step": 3404 }, { "epoch": 0.28018926146883355, "grad_norm": 2.4750463606181876, "learning_rate": 1.689315808153715e-05, "loss": 0.7951, "step": 3405 }, { "epoch": 0.2802715490639786, "grad_norm": 0.4357926929191528, "learning_rate": 1.6891226928683213e-05, "loss": 0.5428, "step": 3406 }, { "epoch": 0.2803538366591236, "grad_norm": 1.6990175521384374, "learning_rate": 1.688929528628753e-05, "loss": 0.8015, "step": 3407 }, { "epoch": 0.28043612425426867, "grad_norm": 1.8882436532486544, "learning_rate": 1.6887363154487336e-05, "loss": 0.828, "step": 3408 }, { "epoch": 0.2805184118494137, "grad_norm": 2.155423063392042, "learning_rate": 1.688543053341987e-05, "loss": 0.8267, "step": 3409 }, { "epoch": 0.28060069944455873, "grad_norm": 0.4279141131980349, "learning_rate": 1.6883497423222435e-05, "loss": 0.5409, "step": 3410 }, { "epoch": 0.2806829870397038, "grad_norm": 2.2826003792873366, "learning_rate": 1.6881563824032354e-05, "loss": 0.7995, "step": 3411 }, { "epoch": 0.2807652746348488, "grad_norm": 1.9585010119599398, "learning_rate": 1.6879629735986978e-05, "loss": 0.8228, "step": 3412 }, { "epoch": 0.28084756222999385, "grad_norm": 2.6976405683684583, "learning_rate": 1.687769515922371e-05, "loss": 0.8281, "step": 3413 }, { "epoch": 0.28092984982513886, "grad_norm": 0.4670841176023679, "learning_rate": 1.6875760093879978e-05, "loss": 0.5713, "step": 3414 }, { "epoch": 0.2810121374202839, "grad_norm": 0.46234681406345274, "learning_rate": 1.6873824540093245e-05, "loss": 0.5615, "step": 3415 }, { "epoch": 0.2810944250154289, "grad_norm": 1.7765530034273658, "learning_rate": 1.687188849800101e-05, "loss": 0.8235, "step": 3416 }, { "epoch": 0.281176712610574, "grad_norm": 0.42952479338740623, "learning_rate": 1.6869951967740806e-05, "loss": 0.5411, "step": 3417 }, { "epoch": 0.281259000205719, "grad_norm": 2.0545519167604427, "learning_rate": 1.68680149494502e-05, "loss": 0.8392, "step": 3418 }, { "epoch": 0.28134128780086404, "grad_norm": 2.3305775779080875, "learning_rate": 1.68660774432668e-05, "loss": 0.8384, "step": 3419 }, { "epoch": 0.28142357539600904, "grad_norm": 2.2918573491440593, "learning_rate": 1.6864139449328237e-05, "loss": 0.8096, "step": 3420 }, { "epoch": 0.2815058629911541, "grad_norm": 2.065669036400418, "learning_rate": 1.686220096777218e-05, "loss": 0.8052, "step": 3421 }, { "epoch": 0.2815881505862991, "grad_norm": 2.6104333684954164, "learning_rate": 1.6860261998736347e-05, "loss": 0.831, "step": 3422 }, { "epoch": 0.28167043818144416, "grad_norm": 3.4434340957132856, "learning_rate": 1.685832254235847e-05, "loss": 0.7722, "step": 3423 }, { "epoch": 0.28175272577658916, "grad_norm": 1.8938224377515387, "learning_rate": 1.685638259877633e-05, "loss": 0.7882, "step": 3424 }, { "epoch": 0.2818350133717342, "grad_norm": 2.552662398864058, "learning_rate": 1.6854442168127733e-05, "loss": 0.843, "step": 3425 }, { "epoch": 0.2819173009668792, "grad_norm": 3.748955949469818, "learning_rate": 1.6852501250550527e-05, "loss": 0.8169, "step": 3426 }, { "epoch": 0.2819995885620243, "grad_norm": 0.46337970709394594, "learning_rate": 1.685055984618259e-05, "loss": 0.5366, "step": 3427 }, { "epoch": 0.2820818761571693, "grad_norm": 2.703284585055913, "learning_rate": 1.684861795516184e-05, "loss": 0.8095, "step": 3428 }, { "epoch": 0.28216416375231435, "grad_norm": 4.795160195831921, "learning_rate": 1.684667557762622e-05, "loss": 0.8175, "step": 3429 }, { "epoch": 0.28224645134745935, "grad_norm": 1.9984490983691139, "learning_rate": 1.6844732713713718e-05, "loss": 0.8195, "step": 3430 }, { "epoch": 0.2823287389426044, "grad_norm": 2.2528306971361842, "learning_rate": 1.6842789363562354e-05, "loss": 0.802, "step": 3431 }, { "epoch": 0.2824110265377494, "grad_norm": 2.624866901729511, "learning_rate": 1.6840845527310176e-05, "loss": 0.7985, "step": 3432 }, { "epoch": 0.28249331413289447, "grad_norm": 4.036532510382469, "learning_rate": 1.6838901205095267e-05, "loss": 0.8637, "step": 3433 }, { "epoch": 0.28257560172803947, "grad_norm": 2.4014428806472012, "learning_rate": 1.683695639705576e-05, "loss": 0.8383, "step": 3434 }, { "epoch": 0.28265788932318453, "grad_norm": 2.339737367909026, "learning_rate": 1.68350111033298e-05, "loss": 0.821, "step": 3435 }, { "epoch": 0.2827401769183296, "grad_norm": 0.4517504767826104, "learning_rate": 1.6833065324055582e-05, "loss": 0.5493, "step": 3436 }, { "epoch": 0.2828224645134746, "grad_norm": 2.23861206286531, "learning_rate": 1.6831119059371332e-05, "loss": 0.792, "step": 3437 }, { "epoch": 0.28290475210861965, "grad_norm": 3.041436048220035, "learning_rate": 1.6829172309415313e-05, "loss": 0.8169, "step": 3438 }, { "epoch": 0.28298703970376465, "grad_norm": 2.847666493958736, "learning_rate": 1.6827225074325812e-05, "loss": 0.767, "step": 3439 }, { "epoch": 0.2830693272989097, "grad_norm": 0.4506691367955305, "learning_rate": 1.6825277354241156e-05, "loss": 0.5467, "step": 3440 }, { "epoch": 0.2831516148940547, "grad_norm": 2.657056454748742, "learning_rate": 1.6823329149299716e-05, "loss": 0.8191, "step": 3441 }, { "epoch": 0.2832339024891998, "grad_norm": 0.4452525425742602, "learning_rate": 1.6821380459639888e-05, "loss": 0.5525, "step": 3442 }, { "epoch": 0.2833161900843448, "grad_norm": 2.421257029111015, "learning_rate": 1.6819431285400096e-05, "loss": 0.7846, "step": 3443 }, { "epoch": 0.28339847767948984, "grad_norm": 2.16142347243168, "learning_rate": 1.6817481626718818e-05, "loss": 0.8067, "step": 3444 }, { "epoch": 0.28348076527463484, "grad_norm": 2.064303219148629, "learning_rate": 1.6815531483734543e-05, "loss": 0.7633, "step": 3445 }, { "epoch": 0.2835630528697799, "grad_norm": 2.0973469858510887, "learning_rate": 1.681358085658581e-05, "loss": 0.8081, "step": 3446 }, { "epoch": 0.2836453404649249, "grad_norm": 2.3990758947427264, "learning_rate": 1.6811629745411195e-05, "loss": 0.8388, "step": 3447 }, { "epoch": 0.28372762806006996, "grad_norm": 1.821405179949142, "learning_rate": 1.6809678150349293e-05, "loss": 0.8008, "step": 3448 }, { "epoch": 0.28380991565521496, "grad_norm": 0.4627300071898438, "learning_rate": 1.6807726071538745e-05, "loss": 0.5309, "step": 3449 }, { "epoch": 0.28389220325036, "grad_norm": 2.4438573828258385, "learning_rate": 1.6805773509118227e-05, "loss": 0.8391, "step": 3450 }, { "epoch": 0.283974490845505, "grad_norm": 0.4373096974424113, "learning_rate": 1.6803820463226443e-05, "loss": 0.5076, "step": 3451 }, { "epoch": 0.2840567784406501, "grad_norm": 2.5300726616808538, "learning_rate": 1.6801866934002133e-05, "loss": 0.8217, "step": 3452 }, { "epoch": 0.2841390660357951, "grad_norm": 0.4513361038680293, "learning_rate": 1.6799912921584075e-05, "loss": 0.5711, "step": 3453 }, { "epoch": 0.28422135363094014, "grad_norm": 0.44598526131806787, "learning_rate": 1.6797958426111077e-05, "loss": 0.5681, "step": 3454 }, { "epoch": 0.28430364122608515, "grad_norm": 2.1791061463851435, "learning_rate": 1.6796003447721983e-05, "loss": 0.8099, "step": 3455 }, { "epoch": 0.2843859288212302, "grad_norm": 2.1521791907156484, "learning_rate": 1.6794047986555676e-05, "loss": 0.8312, "step": 3456 }, { "epoch": 0.2844682164163752, "grad_norm": 1.7549749460192268, "learning_rate": 1.679209204275106e-05, "loss": 0.7879, "step": 3457 }, { "epoch": 0.28455050401152027, "grad_norm": 1.9520709768223774, "learning_rate": 1.6790135616447095e-05, "loss": 0.7895, "step": 3458 }, { "epoch": 0.28463279160666527, "grad_norm": 1.9994730042248807, "learning_rate": 1.678817870778275e-05, "loss": 0.7944, "step": 3459 }, { "epoch": 0.28471507920181033, "grad_norm": 2.093491173269509, "learning_rate": 1.6786221316897044e-05, "loss": 0.8137, "step": 3460 }, { "epoch": 0.28479736679695533, "grad_norm": 1.5889757859406402, "learning_rate": 1.6784263443929033e-05, "loss": 0.8225, "step": 3461 }, { "epoch": 0.2848796543921004, "grad_norm": 0.4634819706610228, "learning_rate": 1.6782305089017797e-05, "loss": 0.5537, "step": 3462 }, { "epoch": 0.28496194198724545, "grad_norm": 1.8487655355484818, "learning_rate": 1.678034625230245e-05, "loss": 0.8266, "step": 3463 }, { "epoch": 0.28504422958239045, "grad_norm": 2.038986069756992, "learning_rate": 1.6778386933922153e-05, "loss": 0.8367, "step": 3464 }, { "epoch": 0.2851265171775355, "grad_norm": 2.17004702794202, "learning_rate": 1.6776427134016087e-05, "loss": 0.817, "step": 3465 }, { "epoch": 0.2852088047726805, "grad_norm": 1.9199445673909137, "learning_rate": 1.6774466852723474e-05, "loss": 0.8215, "step": 3466 }, { "epoch": 0.28529109236782557, "grad_norm": 1.7262092987215405, "learning_rate": 1.677250609018357e-05, "loss": 0.8053, "step": 3467 }, { "epoch": 0.2853733799629706, "grad_norm": 1.5914902114461569, "learning_rate": 1.6770544846535666e-05, "loss": 0.7638, "step": 3468 }, { "epoch": 0.28545566755811563, "grad_norm": 1.6011141890970073, "learning_rate": 1.676858312191908e-05, "loss": 0.8046, "step": 3469 }, { "epoch": 0.28553795515326064, "grad_norm": 0.4546462728884523, "learning_rate": 1.6766620916473177e-05, "loss": 0.5333, "step": 3470 }, { "epoch": 0.2856202427484057, "grad_norm": 4.032771336860496, "learning_rate": 1.6764658230337346e-05, "loss": 0.785, "step": 3471 }, { "epoch": 0.2857025303435507, "grad_norm": 1.7267029252870196, "learning_rate": 1.6762695063651013e-05, "loss": 0.828, "step": 3472 }, { "epoch": 0.28578481793869576, "grad_norm": 1.5879765058811814, "learning_rate": 1.6760731416553638e-05, "loss": 0.8442, "step": 3473 }, { "epoch": 0.28586710553384076, "grad_norm": 3.4180243470837173, "learning_rate": 1.6758767289184715e-05, "loss": 0.8218, "step": 3474 }, { "epoch": 0.2859493931289858, "grad_norm": 1.7781900009410883, "learning_rate": 1.675680268168377e-05, "loss": 0.8227, "step": 3475 }, { "epoch": 0.2860316807241308, "grad_norm": 1.4123618115578194, "learning_rate": 1.6754837594190372e-05, "loss": 0.7955, "step": 3476 }, { "epoch": 0.2861139683192759, "grad_norm": 1.6657900768321887, "learning_rate": 1.6752872026844114e-05, "loss": 0.7854, "step": 3477 }, { "epoch": 0.2861962559144209, "grad_norm": 1.6508324682525461, "learning_rate": 1.6750905979784622e-05, "loss": 0.8093, "step": 3478 }, { "epoch": 0.28627854350956594, "grad_norm": 2.0792386003784196, "learning_rate": 1.6748939453151573e-05, "loss": 0.8028, "step": 3479 }, { "epoch": 0.28636083110471094, "grad_norm": 1.5017518287802412, "learning_rate": 1.674697244708465e-05, "loss": 0.8267, "step": 3480 }, { "epoch": 0.286443118699856, "grad_norm": 1.581528239807794, "learning_rate": 1.6745004961723604e-05, "loss": 0.8114, "step": 3481 }, { "epoch": 0.286525406295001, "grad_norm": 1.4857891678884565, "learning_rate": 1.674303699720819e-05, "loss": 0.7895, "step": 3482 }, { "epoch": 0.28660769389014606, "grad_norm": 1.5787661760144176, "learning_rate": 1.6741068553678208e-05, "loss": 0.8402, "step": 3483 }, { "epoch": 0.28668998148529107, "grad_norm": 1.900995643556635, "learning_rate": 1.6739099631273497e-05, "loss": 0.8176, "step": 3484 }, { "epoch": 0.2867722690804361, "grad_norm": 1.9360343091947392, "learning_rate": 1.6737130230133927e-05, "loss": 0.799, "step": 3485 }, { "epoch": 0.28685455667558113, "grad_norm": 1.7475632880592815, "learning_rate": 1.67351603503994e-05, "loss": 0.809, "step": 3486 }, { "epoch": 0.2869368442707262, "grad_norm": 2.2072607505968183, "learning_rate": 1.6733189992209852e-05, "loss": 0.8084, "step": 3487 }, { "epoch": 0.28701913186587125, "grad_norm": 2.3932208696484776, "learning_rate": 1.6731219155705258e-05, "loss": 0.8207, "step": 3488 }, { "epoch": 0.28710141946101625, "grad_norm": 1.8002529054592749, "learning_rate": 1.6729247841025618e-05, "loss": 0.7709, "step": 3489 }, { "epoch": 0.2871837070561613, "grad_norm": 2.0561098850829205, "learning_rate": 1.6727276048310974e-05, "loss": 0.8071, "step": 3490 }, { "epoch": 0.2872659946513063, "grad_norm": 1.8680311857539131, "learning_rate": 1.67253037777014e-05, "loss": 0.7797, "step": 3491 }, { "epoch": 0.28734828224645137, "grad_norm": 3.5448812326687045, "learning_rate": 1.6723331029336994e-05, "loss": 0.8386, "step": 3492 }, { "epoch": 0.2874305698415964, "grad_norm": 2.056876501784213, "learning_rate": 1.672135780335791e-05, "loss": 0.8361, "step": 3493 }, { "epoch": 0.28751285743674143, "grad_norm": 2.136317029629569, "learning_rate": 1.6719384099904318e-05, "loss": 0.8258, "step": 3494 }, { "epoch": 0.28759514503188643, "grad_norm": 2.5939735686338232, "learning_rate": 1.671740991911642e-05, "loss": 0.825, "step": 3495 }, { "epoch": 0.2876774326270315, "grad_norm": 2.0944690429347004, "learning_rate": 1.671543526113447e-05, "loss": 0.8222, "step": 3496 }, { "epoch": 0.2877597202221765, "grad_norm": 3.411959560559101, "learning_rate": 1.6713460126098736e-05, "loss": 0.8005, "step": 3497 }, { "epoch": 0.28784200781732155, "grad_norm": 2.110041202716306, "learning_rate": 1.671148451414953e-05, "loss": 0.8173, "step": 3498 }, { "epoch": 0.28792429541246656, "grad_norm": 0.45389707370787435, "learning_rate": 1.6709508425427202e-05, "loss": 0.5691, "step": 3499 }, { "epoch": 0.2880065830076116, "grad_norm": 2.2752430550938874, "learning_rate": 1.6707531860072122e-05, "loss": 0.779, "step": 3500 }, { "epoch": 0.2880888706027566, "grad_norm": 2.9098953632036104, "learning_rate": 1.670555481822471e-05, "loss": 0.7919, "step": 3501 }, { "epoch": 0.2881711581979017, "grad_norm": 1.8995725284770844, "learning_rate": 1.67035773000254e-05, "loss": 0.8184, "step": 3502 }, { "epoch": 0.2882534457930467, "grad_norm": 2.003185274187615, "learning_rate": 1.6701599305614685e-05, "loss": 0.8289, "step": 3503 }, { "epoch": 0.28833573338819174, "grad_norm": 2.084723279405393, "learning_rate": 1.669962083513307e-05, "loss": 0.8283, "step": 3504 }, { "epoch": 0.28841802098333674, "grad_norm": 4.067890464085165, "learning_rate": 1.6697641888721107e-05, "loss": 0.8025, "step": 3505 }, { "epoch": 0.2885003085784818, "grad_norm": 0.4481290034369956, "learning_rate": 1.6695662466519377e-05, "loss": 0.5435, "step": 3506 }, { "epoch": 0.2885825961736268, "grad_norm": 2.278029783924429, "learning_rate": 1.669368256866849e-05, "loss": 0.8009, "step": 3507 }, { "epoch": 0.28866488376877186, "grad_norm": 0.4230219089422917, "learning_rate": 1.6691702195309105e-05, "loss": 0.5255, "step": 3508 }, { "epoch": 0.28874717136391687, "grad_norm": 0.45033485534258255, "learning_rate": 1.6689721346581892e-05, "loss": 0.5458, "step": 3509 }, { "epoch": 0.2888294589590619, "grad_norm": 1.960789229166538, "learning_rate": 1.6687740022627573e-05, "loss": 0.8029, "step": 3510 }, { "epoch": 0.2889117465542069, "grad_norm": 1.9707966632273453, "learning_rate": 1.66857582235869e-05, "loss": 0.8049, "step": 3511 }, { "epoch": 0.288994034149352, "grad_norm": 3.348967297731327, "learning_rate": 1.6683775949600654e-05, "loss": 0.7836, "step": 3512 }, { "epoch": 0.28907632174449704, "grad_norm": 0.4351955326287272, "learning_rate": 1.6681793200809656e-05, "loss": 0.5538, "step": 3513 }, { "epoch": 0.28915860933964205, "grad_norm": 2.0498748058278453, "learning_rate": 1.6679809977354754e-05, "loss": 0.7876, "step": 3514 }, { "epoch": 0.2892408969347871, "grad_norm": 1.8919493558704104, "learning_rate": 1.6677826279376832e-05, "loss": 0.8057, "step": 3515 }, { "epoch": 0.2893231845299321, "grad_norm": 2.334579373937873, "learning_rate": 1.6675842107016814e-05, "loss": 0.7895, "step": 3516 }, { "epoch": 0.28940547212507717, "grad_norm": 5.3766265211063375, "learning_rate": 1.6673857460415647e-05, "loss": 0.7938, "step": 3517 }, { "epoch": 0.28948775972022217, "grad_norm": 1.971049825711844, "learning_rate": 1.667187233971432e-05, "loss": 0.8235, "step": 3518 }, { "epoch": 0.28957004731536723, "grad_norm": 2.2536217470647375, "learning_rate": 1.666988674505385e-05, "loss": 0.7829, "step": 3519 }, { "epoch": 0.28965233491051223, "grad_norm": 6.287185634952787, "learning_rate": 1.666790067657529e-05, "loss": 0.8167, "step": 3520 }, { "epoch": 0.2897346225056573, "grad_norm": 2.0640992367691884, "learning_rate": 1.666591413441974e-05, "loss": 0.786, "step": 3521 }, { "epoch": 0.2898169101008023, "grad_norm": 1.8931435716380844, "learning_rate": 1.6663927118728302e-05, "loss": 0.8118, "step": 3522 }, { "epoch": 0.28989919769594735, "grad_norm": 2.5014981392717015, "learning_rate": 1.6661939629642142e-05, "loss": 0.8086, "step": 3523 }, { "epoch": 0.28998148529109236, "grad_norm": 2.2167087484870898, "learning_rate": 1.665995166730244e-05, "loss": 0.783, "step": 3524 }, { "epoch": 0.2900637728862374, "grad_norm": 2.0101485502511878, "learning_rate": 1.6657963231850432e-05, "loss": 0.7882, "step": 3525 }, { "epoch": 0.2901460604813824, "grad_norm": 2.291672665287918, "learning_rate": 1.6655974323427354e-05, "loss": 0.8176, "step": 3526 }, { "epoch": 0.2902283480765275, "grad_norm": 2.195417285730131, "learning_rate": 1.6653984942174513e-05, "loss": 0.8262, "step": 3527 }, { "epoch": 0.2903106356716725, "grad_norm": 0.45560175155715454, "learning_rate": 1.665199508823322e-05, "loss": 0.5565, "step": 3528 }, { "epoch": 0.29039292326681754, "grad_norm": 0.4457673720306088, "learning_rate": 1.665000476174483e-05, "loss": 0.5617, "step": 3529 }, { "epoch": 0.29047521086196254, "grad_norm": 1.850716696724229, "learning_rate": 1.6648013962850743e-05, "loss": 0.7899, "step": 3530 }, { "epoch": 0.2905574984571076, "grad_norm": 2.246953512173027, "learning_rate": 1.6646022691692373e-05, "loss": 0.7851, "step": 3531 }, { "epoch": 0.2906397860522526, "grad_norm": 2.8944961863444636, "learning_rate": 1.6644030948411177e-05, "loss": 0.8117, "step": 3532 }, { "epoch": 0.29072207364739766, "grad_norm": 2.073029290927673, "learning_rate": 1.6642038733148654e-05, "loss": 0.8105, "step": 3533 }, { "epoch": 0.29080436124254266, "grad_norm": 1.9774566624610654, "learning_rate": 1.664004604604632e-05, "loss": 0.8142, "step": 3534 }, { "epoch": 0.2908866488376877, "grad_norm": 1.84749961783582, "learning_rate": 1.6638052887245733e-05, "loss": 0.8005, "step": 3535 }, { "epoch": 0.2909689364328327, "grad_norm": 0.48264343720441555, "learning_rate": 1.6636059256888484e-05, "loss": 0.5329, "step": 3536 }, { "epoch": 0.2910512240279778, "grad_norm": 2.3891852329936207, "learning_rate": 1.66340651551162e-05, "loss": 0.7946, "step": 3537 }, { "epoch": 0.2911335116231228, "grad_norm": 1.9987906573804815, "learning_rate": 1.6632070582070536e-05, "loss": 0.7835, "step": 3538 }, { "epoch": 0.29121579921826785, "grad_norm": 2.7784467982667107, "learning_rate": 1.6630075537893183e-05, "loss": 0.8092, "step": 3539 }, { "epoch": 0.2912980868134129, "grad_norm": 0.4415210978944295, "learning_rate": 1.6628080022725866e-05, "loss": 0.5425, "step": 3540 }, { "epoch": 0.2913803744085579, "grad_norm": 2.380535919426329, "learning_rate": 1.662608403671035e-05, "loss": 0.8228, "step": 3541 }, { "epoch": 0.29146266200370297, "grad_norm": 5.061045389787459, "learning_rate": 1.6624087579988416e-05, "loss": 0.8045, "step": 3542 }, { "epoch": 0.29154494959884797, "grad_norm": 1.8563695073025637, "learning_rate": 1.6622090652701896e-05, "loss": 0.8018, "step": 3543 }, { "epoch": 0.291627237193993, "grad_norm": 1.999734316894825, "learning_rate": 1.6620093254992646e-05, "loss": 0.7917, "step": 3544 }, { "epoch": 0.29170952478913803, "grad_norm": 1.9433176007559034, "learning_rate": 1.6618095387002556e-05, "loss": 0.7849, "step": 3545 }, { "epoch": 0.2917918123842831, "grad_norm": 2.2309708240767576, "learning_rate": 1.6616097048873557e-05, "loss": 0.8153, "step": 3546 }, { "epoch": 0.2918740999794281, "grad_norm": 2.341814818226442, "learning_rate": 1.6614098240747606e-05, "loss": 0.8647, "step": 3547 }, { "epoch": 0.29195638757457315, "grad_norm": 2.9791401434190417, "learning_rate": 1.661209896276669e-05, "loss": 0.7999, "step": 3548 }, { "epoch": 0.29203867516971815, "grad_norm": 0.430303260464014, "learning_rate": 1.661009921507284e-05, "loss": 0.5291, "step": 3549 }, { "epoch": 0.2921209627648632, "grad_norm": 1.8123400429439562, "learning_rate": 1.6608098997808114e-05, "loss": 0.8052, "step": 3550 }, { "epoch": 0.2922032503600082, "grad_norm": 2.347163456518436, "learning_rate": 1.66060983111146e-05, "loss": 0.8203, "step": 3551 }, { "epoch": 0.2922855379551533, "grad_norm": 1.721626934446873, "learning_rate": 1.6604097155134427e-05, "loss": 0.797, "step": 3552 }, { "epoch": 0.2923678255502983, "grad_norm": 2.201616811051578, "learning_rate": 1.660209553000976e-05, "loss": 0.7654, "step": 3553 }, { "epoch": 0.29245011314544334, "grad_norm": 1.816300835480641, "learning_rate": 1.6600093435882777e-05, "loss": 0.8073, "step": 3554 }, { "epoch": 0.29253240074058834, "grad_norm": 0.4658253741741589, "learning_rate": 1.6598090872895715e-05, "loss": 0.5435, "step": 3555 }, { "epoch": 0.2926146883357334, "grad_norm": 0.4081132818015702, "learning_rate": 1.6596087841190832e-05, "loss": 0.5015, "step": 3556 }, { "epoch": 0.2926969759308784, "grad_norm": 1.9217498282118224, "learning_rate": 1.6594084340910416e-05, "loss": 0.8015, "step": 3557 }, { "epoch": 0.29277926352602346, "grad_norm": 2.459476842735801, "learning_rate": 1.659208037219679e-05, "loss": 0.822, "step": 3558 }, { "epoch": 0.29286155112116846, "grad_norm": 0.4187755012731122, "learning_rate": 1.659007593519232e-05, "loss": 0.539, "step": 3559 }, { "epoch": 0.2929438387163135, "grad_norm": 0.4439044605797784, "learning_rate": 1.6588071030039395e-05, "loss": 0.5437, "step": 3560 }, { "epoch": 0.2930261263114585, "grad_norm": 1.9430080081113947, "learning_rate": 1.6586065656880442e-05, "loss": 0.8191, "step": 3561 }, { "epoch": 0.2931084139066036, "grad_norm": 0.4261739322393414, "learning_rate": 1.6584059815857917e-05, "loss": 0.5266, "step": 3562 }, { "epoch": 0.2931907015017486, "grad_norm": 2.0059153612762817, "learning_rate": 1.658205350711431e-05, "loss": 0.7954, "step": 3563 }, { "epoch": 0.29327298909689364, "grad_norm": 1.992803682953668, "learning_rate": 1.658004673079215e-05, "loss": 0.8117, "step": 3564 }, { "epoch": 0.2933552766920387, "grad_norm": 1.9218001671195901, "learning_rate": 1.657803948703399e-05, "loss": 0.8139, "step": 3565 }, { "epoch": 0.2934375642871837, "grad_norm": 2.202882243209258, "learning_rate": 1.6576031775982428e-05, "loss": 0.7937, "step": 3566 }, { "epoch": 0.29351985188232876, "grad_norm": 1.6770391115358514, "learning_rate": 1.6574023597780086e-05, "loss": 0.8279, "step": 3567 }, { "epoch": 0.29360213947747377, "grad_norm": 1.8078657382158536, "learning_rate": 1.6572014952569622e-05, "loss": 0.8006, "step": 3568 }, { "epoch": 0.2936844270726188, "grad_norm": 2.1096069549377243, "learning_rate": 1.6570005840493723e-05, "loss": 0.7904, "step": 3569 }, { "epoch": 0.29376671466776383, "grad_norm": 0.42460506300564094, "learning_rate": 1.656799626169512e-05, "loss": 0.5501, "step": 3570 }, { "epoch": 0.2938490022629089, "grad_norm": 2.0963250119352423, "learning_rate": 1.6565986216316564e-05, "loss": 0.8106, "step": 3571 }, { "epoch": 0.2939312898580539, "grad_norm": 2.0656139084436598, "learning_rate": 1.6563975704500847e-05, "loss": 0.8339, "step": 3572 }, { "epoch": 0.29401357745319895, "grad_norm": 2.500541500884672, "learning_rate": 1.6561964726390797e-05, "loss": 0.8418, "step": 3573 }, { "epoch": 0.29409586504834395, "grad_norm": 1.8127364690862025, "learning_rate": 1.6559953282129262e-05, "loss": 0.8124, "step": 3574 }, { "epoch": 0.294178152643489, "grad_norm": 2.0207547834774857, "learning_rate": 1.655794137185914e-05, "loss": 0.807, "step": 3575 }, { "epoch": 0.294260440238634, "grad_norm": 0.45181317862494913, "learning_rate": 1.655592899572335e-05, "loss": 0.5462, "step": 3576 }, { "epoch": 0.29434272783377907, "grad_norm": 1.9261816815698125, "learning_rate": 1.655391615386485e-05, "loss": 0.8061, "step": 3577 }, { "epoch": 0.2944250154289241, "grad_norm": 1.787594653223662, "learning_rate": 1.6551902846426626e-05, "loss": 0.8171, "step": 3578 }, { "epoch": 0.29450730302406913, "grad_norm": 2.2310815549232825, "learning_rate": 1.6549889073551705e-05, "loss": 0.8099, "step": 3579 }, { "epoch": 0.29458959061921414, "grad_norm": 0.4485931026862, "learning_rate": 1.6547874835383137e-05, "loss": 0.5532, "step": 3580 }, { "epoch": 0.2946718782143592, "grad_norm": 0.4301956132003114, "learning_rate": 1.6545860132064015e-05, "loss": 0.5388, "step": 3581 }, { "epoch": 0.2947541658095042, "grad_norm": 4.830932316927706, "learning_rate": 1.6543844963737454e-05, "loss": 0.7782, "step": 3582 }, { "epoch": 0.29483645340464926, "grad_norm": 1.8174347345887978, "learning_rate": 1.6541829330546616e-05, "loss": 0.801, "step": 3583 }, { "epoch": 0.29491874099979426, "grad_norm": 1.6929817178799809, "learning_rate": 1.653981323263468e-05, "loss": 0.7876, "step": 3584 }, { "epoch": 0.2950010285949393, "grad_norm": 1.958737548661962, "learning_rate": 1.6537796670144873e-05, "loss": 0.7925, "step": 3585 }, { "epoch": 0.2950833161900843, "grad_norm": 1.7981141842359325, "learning_rate": 1.653577964322045e-05, "loss": 0.817, "step": 3586 }, { "epoch": 0.2951656037852294, "grad_norm": 1.8185287740526173, "learning_rate": 1.6533762152004687e-05, "loss": 0.7807, "step": 3587 }, { "epoch": 0.2952478913803744, "grad_norm": 0.4537506028450974, "learning_rate": 1.6531744196640915e-05, "loss": 0.5344, "step": 3588 }, { "epoch": 0.29533017897551944, "grad_norm": 0.44577322939836633, "learning_rate": 1.6529725777272476e-05, "loss": 0.5653, "step": 3589 }, { "epoch": 0.29541246657066444, "grad_norm": 2.0387141776373214, "learning_rate": 1.6527706894042765e-05, "loss": 0.8222, "step": 3590 }, { "epoch": 0.2954947541658095, "grad_norm": 0.438508063126075, "learning_rate": 1.6525687547095194e-05, "loss": 0.5543, "step": 3591 }, { "epoch": 0.29557704176095456, "grad_norm": 1.3420787245609647, "learning_rate": 1.6523667736573216e-05, "loss": 0.7896, "step": 3592 }, { "epoch": 0.29565932935609957, "grad_norm": 1.4944961177832303, "learning_rate": 1.652164746262032e-05, "loss": 0.7956, "step": 3593 }, { "epoch": 0.2957416169512446, "grad_norm": 2.067155769754942, "learning_rate": 1.651962672538001e-05, "loss": 0.8027, "step": 3594 }, { "epoch": 0.2958239045463896, "grad_norm": 0.4880661651769194, "learning_rate": 1.651760552499585e-05, "loss": 0.575, "step": 3595 }, { "epoch": 0.2959061921415347, "grad_norm": 1.549996096043789, "learning_rate": 1.6515583861611413e-05, "loss": 0.824, "step": 3596 }, { "epoch": 0.2959884797366797, "grad_norm": 1.4569372189154544, "learning_rate": 1.651356173537032e-05, "loss": 0.8063, "step": 3597 }, { "epoch": 0.29607076733182475, "grad_norm": 2.1045506375221876, "learning_rate": 1.6511539146416217e-05, "loss": 0.8321, "step": 3598 }, { "epoch": 0.29615305492696975, "grad_norm": 1.603101180312977, "learning_rate": 1.6509516094892788e-05, "loss": 0.7873, "step": 3599 }, { "epoch": 0.2962353425221148, "grad_norm": 1.5464961268220638, "learning_rate": 1.6507492580943746e-05, "loss": 0.7793, "step": 3600 }, { "epoch": 0.2963176301172598, "grad_norm": 1.4547001761588105, "learning_rate": 1.650546860471284e-05, "loss": 0.8206, "step": 3601 }, { "epoch": 0.29639991771240487, "grad_norm": 1.5156790874536277, "learning_rate": 1.6503444166343846e-05, "loss": 0.807, "step": 3602 }, { "epoch": 0.2964822053075499, "grad_norm": 1.6607693253933526, "learning_rate": 1.650141926598058e-05, "loss": 0.818, "step": 3603 }, { "epoch": 0.29656449290269493, "grad_norm": 0.45200961268789075, "learning_rate": 1.6499393903766886e-05, "loss": 0.5318, "step": 3604 }, { "epoch": 0.29664678049783993, "grad_norm": 0.4281520155308469, "learning_rate": 1.6497368079846646e-05, "loss": 0.51, "step": 3605 }, { "epoch": 0.296729068092985, "grad_norm": 1.4844329190620529, "learning_rate": 1.6495341794363768e-05, "loss": 0.8147, "step": 3606 }, { "epoch": 0.29681135568813, "grad_norm": 1.4468093740635921, "learning_rate": 1.649331504746219e-05, "loss": 0.8401, "step": 3607 }, { "epoch": 0.29689364328327505, "grad_norm": 7.349607156958685, "learning_rate": 1.6491287839285903e-05, "loss": 0.8087, "step": 3608 }, { "epoch": 0.29697593087842006, "grad_norm": 0.48797003096312785, "learning_rate": 1.6489260169978908e-05, "loss": 0.5222, "step": 3609 }, { "epoch": 0.2970582184735651, "grad_norm": 1.4909203688886028, "learning_rate": 1.6487232039685246e-05, "loss": 0.8239, "step": 3610 }, { "epoch": 0.2971405060687101, "grad_norm": 1.45832375397737, "learning_rate": 1.6485203448548995e-05, "loss": 0.8116, "step": 3611 }, { "epoch": 0.2972227936638552, "grad_norm": 1.451419169359552, "learning_rate": 1.6483174396714265e-05, "loss": 0.8069, "step": 3612 }, { "epoch": 0.2973050812590002, "grad_norm": 0.5008600469907893, "learning_rate": 1.6481144884325193e-05, "loss": 0.5822, "step": 3613 }, { "epoch": 0.29738736885414524, "grad_norm": 0.45991870974867094, "learning_rate": 1.6479114911525952e-05, "loss": 0.5504, "step": 3614 }, { "epoch": 0.29746965644929024, "grad_norm": 1.9837374203990883, "learning_rate": 1.647708447846075e-05, "loss": 0.8216, "step": 3615 }, { "epoch": 0.2975519440444353, "grad_norm": 1.4163826272645652, "learning_rate": 1.647505358527383e-05, "loss": 0.812, "step": 3616 }, { "epoch": 0.29763423163958036, "grad_norm": 2.0305024301193457, "learning_rate": 1.6473022232109453e-05, "loss": 0.8104, "step": 3617 }, { "epoch": 0.29771651923472536, "grad_norm": 1.5169338642840535, "learning_rate": 1.647099041911193e-05, "loss": 0.805, "step": 3618 }, { "epoch": 0.2977988068298704, "grad_norm": 1.7748947218775537, "learning_rate": 1.64689581464256e-05, "loss": 0.8281, "step": 3619 }, { "epoch": 0.2978810944250154, "grad_norm": 0.5102719187792992, "learning_rate": 1.6466925414194827e-05, "loss": 0.5624, "step": 3620 }, { "epoch": 0.2979633820201605, "grad_norm": 1.5103048053312222, "learning_rate": 1.646489222256401e-05, "loss": 0.8261, "step": 3621 }, { "epoch": 0.2980456696153055, "grad_norm": 2.285924143020317, "learning_rate": 1.6462858571677593e-05, "loss": 0.7961, "step": 3622 }, { "epoch": 0.29812795721045054, "grad_norm": 1.5727073825492892, "learning_rate": 1.6460824461680037e-05, "loss": 0.8123, "step": 3623 }, { "epoch": 0.29821024480559555, "grad_norm": 1.5823667385972964, "learning_rate": 1.6458789892715845e-05, "loss": 0.8246, "step": 3624 }, { "epoch": 0.2982925324007406, "grad_norm": 0.44216108395055426, "learning_rate": 1.645675486492955e-05, "loss": 0.5517, "step": 3625 }, { "epoch": 0.2983748199958856, "grad_norm": 0.4443349389522302, "learning_rate": 1.6454719378465714e-05, "loss": 0.5221, "step": 3626 }, { "epoch": 0.29845710759103067, "grad_norm": 1.7290493167311836, "learning_rate": 1.6452683433468934e-05, "loss": 0.7989, "step": 3627 }, { "epoch": 0.29853939518617567, "grad_norm": 0.43576759443993884, "learning_rate": 1.6450647030083845e-05, "loss": 0.5295, "step": 3628 }, { "epoch": 0.29862168278132073, "grad_norm": 1.4800601153777706, "learning_rate": 1.6448610168455105e-05, "loss": 0.8092, "step": 3629 }, { "epoch": 0.29870397037646573, "grad_norm": 0.4482543666374695, "learning_rate": 1.6446572848727416e-05, "loss": 0.5222, "step": 3630 }, { "epoch": 0.2987862579716108, "grad_norm": 1.9043113521649464, "learning_rate": 1.64445350710455e-05, "loss": 0.7889, "step": 3631 }, { "epoch": 0.2988685455667558, "grad_norm": 0.47399637243483855, "learning_rate": 1.6442496835554112e-05, "loss": 0.5404, "step": 3632 }, { "epoch": 0.29895083316190085, "grad_norm": 1.694965329913979, "learning_rate": 1.644045814239806e-05, "loss": 0.7979, "step": 3633 }, { "epoch": 0.29903312075704586, "grad_norm": 1.7030898842474096, "learning_rate": 1.643841899172216e-05, "loss": 0.7616, "step": 3634 }, { "epoch": 0.2991154083521909, "grad_norm": 1.711673123185248, "learning_rate": 1.643637938367127e-05, "loss": 0.7985, "step": 3635 }, { "epoch": 0.2991976959473359, "grad_norm": 1.4704819630463015, "learning_rate": 1.6434339318390286e-05, "loss": 0.8161, "step": 3636 }, { "epoch": 0.299279983542481, "grad_norm": 1.5446355311555044, "learning_rate": 1.643229879602412e-05, "loss": 0.8116, "step": 3637 }, { "epoch": 0.299362271137626, "grad_norm": 1.571432630304453, "learning_rate": 1.6430257816717743e-05, "loss": 0.8019, "step": 3638 }, { "epoch": 0.29944455873277104, "grad_norm": 0.4422729175523287, "learning_rate": 1.642821638061613e-05, "loss": 0.5291, "step": 3639 }, { "epoch": 0.29952684632791604, "grad_norm": 1.8673988008895486, "learning_rate": 1.6426174487864304e-05, "loss": 0.8319, "step": 3640 }, { "epoch": 0.2996091339230611, "grad_norm": 1.6221967539246775, "learning_rate": 1.642413213860732e-05, "loss": 0.8057, "step": 3641 }, { "epoch": 0.29969142151820616, "grad_norm": 1.5028216429220547, "learning_rate": 1.6422089332990264e-05, "loss": 0.8044, "step": 3642 }, { "epoch": 0.29977370911335116, "grad_norm": 1.6762577400321679, "learning_rate": 1.6420046071158253e-05, "loss": 0.8024, "step": 3643 }, { "epoch": 0.2998559967084962, "grad_norm": 1.699525059905768, "learning_rate": 1.6418002353256436e-05, "loss": 0.8158, "step": 3644 }, { "epoch": 0.2999382843036412, "grad_norm": 1.7940824039037504, "learning_rate": 1.6415958179429996e-05, "loss": 0.8126, "step": 3645 }, { "epoch": 0.3000205718987863, "grad_norm": 1.7449849959524295, "learning_rate": 1.6413913549824147e-05, "loss": 0.8142, "step": 3646 }, { "epoch": 0.3001028594939313, "grad_norm": 0.43320109979126875, "learning_rate": 1.641186846458414e-05, "loss": 0.5415, "step": 3647 }, { "epoch": 0.30018514708907634, "grad_norm": 2.2913328597836684, "learning_rate": 1.6409822923855248e-05, "loss": 0.7865, "step": 3648 }, { "epoch": 0.30026743468422135, "grad_norm": 1.8618971692247046, "learning_rate": 1.6407776927782787e-05, "loss": 0.8344, "step": 3649 }, { "epoch": 0.3003497222793664, "grad_norm": 1.8089576166987351, "learning_rate": 1.64057304765121e-05, "loss": 0.8059, "step": 3650 }, { "epoch": 0.3004320098745114, "grad_norm": 0.4394608719087332, "learning_rate": 1.6403683570188567e-05, "loss": 0.5263, "step": 3651 }, { "epoch": 0.30051429746965647, "grad_norm": 2.4289340317821773, "learning_rate": 1.640163620895759e-05, "loss": 0.796, "step": 3652 }, { "epoch": 0.30059658506480147, "grad_norm": 1.625348899423735, "learning_rate": 1.639958839296462e-05, "loss": 0.7909, "step": 3653 }, { "epoch": 0.30067887265994653, "grad_norm": 1.5166507240089266, "learning_rate": 1.6397540122355122e-05, "loss": 0.8002, "step": 3654 }, { "epoch": 0.30076116025509153, "grad_norm": 1.4184455757549488, "learning_rate": 1.6395491397274608e-05, "loss": 0.8222, "step": 3655 }, { "epoch": 0.3008434478502366, "grad_norm": 0.4327869707190793, "learning_rate": 1.639344221786861e-05, "loss": 0.5416, "step": 3656 }, { "epoch": 0.3009257354453816, "grad_norm": 1.4562376716327414, "learning_rate": 1.6391392584282705e-05, "loss": 0.8151, "step": 3657 }, { "epoch": 0.30100802304052665, "grad_norm": 0.438335227303146, "learning_rate": 1.638934249666249e-05, "loss": 0.5366, "step": 3658 }, { "epoch": 0.30109031063567165, "grad_norm": 1.5033019033013582, "learning_rate": 1.6387291955153603e-05, "loss": 0.7965, "step": 3659 }, { "epoch": 0.3011725982308167, "grad_norm": 1.5225162562640404, "learning_rate": 1.638524095990171e-05, "loss": 0.8058, "step": 3660 }, { "epoch": 0.3012548858259617, "grad_norm": 1.5016565914911202, "learning_rate": 1.6383189511052507e-05, "loss": 0.8118, "step": 3661 }, { "epoch": 0.3013371734211068, "grad_norm": 1.4769933744123498, "learning_rate": 1.6381137608751733e-05, "loss": 0.8256, "step": 3662 }, { "epoch": 0.3014194610162518, "grad_norm": 1.3113247016849063, "learning_rate": 1.637908525314515e-05, "loss": 0.7982, "step": 3663 }, { "epoch": 0.30150174861139684, "grad_norm": 0.45340394266460904, "learning_rate": 1.637703244437855e-05, "loss": 0.5557, "step": 3664 }, { "epoch": 0.30158403620654184, "grad_norm": 1.522511390412077, "learning_rate": 1.6374979182597766e-05, "loss": 0.8411, "step": 3665 }, { "epoch": 0.3016663238016869, "grad_norm": 0.4256116592130053, "learning_rate": 1.6372925467948656e-05, "loss": 0.5497, "step": 3666 }, { "epoch": 0.3017486113968319, "grad_norm": 1.3973244380801353, "learning_rate": 1.6370871300577112e-05, "loss": 0.8016, "step": 3667 }, { "epoch": 0.30183089899197696, "grad_norm": 0.42105939597568615, "learning_rate": 1.6368816680629058e-05, "loss": 0.5377, "step": 3668 }, { "epoch": 0.301913186587122, "grad_norm": 1.622145350120707, "learning_rate": 1.6366761608250453e-05, "loss": 0.8198, "step": 3669 }, { "epoch": 0.301995474182267, "grad_norm": 1.4773642812742485, "learning_rate": 1.6364706083587287e-05, "loss": 0.7939, "step": 3670 }, { "epoch": 0.3020777617774121, "grad_norm": 1.4255914313834837, "learning_rate": 1.6362650106785577e-05, "loss": 0.8244, "step": 3671 }, { "epoch": 0.3021600493725571, "grad_norm": 1.8726203492671103, "learning_rate": 1.6360593677991383e-05, "loss": 0.8347, "step": 3672 }, { "epoch": 0.30224233696770214, "grad_norm": 1.4678091772662945, "learning_rate": 1.6358536797350783e-05, "loss": 0.7806, "step": 3673 }, { "epoch": 0.30232462456284714, "grad_norm": 0.43833813379823816, "learning_rate": 1.6356479465009898e-05, "loss": 0.5562, "step": 3674 }, { "epoch": 0.3024069121579922, "grad_norm": 1.292202911636895, "learning_rate": 1.635442168111488e-05, "loss": 0.8095, "step": 3675 }, { "epoch": 0.3024891997531372, "grad_norm": 1.3720240213000705, "learning_rate": 1.6352363445811907e-05, "loss": 0.8064, "step": 3676 }, { "epoch": 0.30257148734828226, "grad_norm": 1.828225512415719, "learning_rate": 1.6350304759247194e-05, "loss": 0.8045, "step": 3677 }, { "epoch": 0.30265377494342727, "grad_norm": 1.6394823344470917, "learning_rate": 1.6348245621566987e-05, "loss": 0.8236, "step": 3678 }, { "epoch": 0.3027360625385723, "grad_norm": 0.4391593547423728, "learning_rate": 1.634618603291756e-05, "loss": 0.5518, "step": 3679 }, { "epoch": 0.30281835013371733, "grad_norm": 1.4520181940736636, "learning_rate": 1.634412599344523e-05, "loss": 0.809, "step": 3680 }, { "epoch": 0.3029006377288624, "grad_norm": 1.4174240110221976, "learning_rate": 1.6342065503296333e-05, "loss": 0.7964, "step": 3681 }, { "epoch": 0.3029829253240074, "grad_norm": 1.4677308594644047, "learning_rate": 1.6340004562617248e-05, "loss": 0.791, "step": 3682 }, { "epoch": 0.30306521291915245, "grad_norm": 1.4786293669077117, "learning_rate": 1.633794317155438e-05, "loss": 0.8026, "step": 3683 }, { "epoch": 0.30314750051429745, "grad_norm": 1.8994862403985697, "learning_rate": 1.633588133025416e-05, "loss": 0.8178, "step": 3684 }, { "epoch": 0.3032297881094425, "grad_norm": 1.5848395704359917, "learning_rate": 1.633381903886307e-05, "loss": 0.8295, "step": 3685 }, { "epoch": 0.3033120757045875, "grad_norm": 1.7457959954236808, "learning_rate": 1.6331756297527595e-05, "loss": 0.8141, "step": 3686 }, { "epoch": 0.3033943632997326, "grad_norm": 1.953308606724481, "learning_rate": 1.6329693106394285e-05, "loss": 0.7854, "step": 3687 }, { "epoch": 0.3034766508948776, "grad_norm": 3.1688517850914857, "learning_rate": 1.6327629465609697e-05, "loss": 0.819, "step": 3688 }, { "epoch": 0.30355893849002263, "grad_norm": 0.4226930045061616, "learning_rate": 1.6325565375320437e-05, "loss": 0.5135, "step": 3689 }, { "epoch": 0.30364122608516764, "grad_norm": 1.9921789316023617, "learning_rate": 1.632350083567312e-05, "loss": 0.841, "step": 3690 }, { "epoch": 0.3037235136803127, "grad_norm": 2.144021981352117, "learning_rate": 1.6321435846814425e-05, "loss": 0.7732, "step": 3691 }, { "epoch": 0.3038058012754577, "grad_norm": 2.1150059213115098, "learning_rate": 1.6319370408891033e-05, "loss": 0.7836, "step": 3692 }, { "epoch": 0.30388808887060276, "grad_norm": 0.4225605763671702, "learning_rate": 1.6317304522049676e-05, "loss": 0.5257, "step": 3693 }, { "epoch": 0.3039703764657478, "grad_norm": 1.8297556296284003, "learning_rate": 1.6315238186437105e-05, "loss": 0.7853, "step": 3694 }, { "epoch": 0.3040526640608928, "grad_norm": 1.834168255019643, "learning_rate": 1.6313171402200113e-05, "loss": 0.7548, "step": 3695 }, { "epoch": 0.3041349516560379, "grad_norm": 3.037930138675158, "learning_rate": 1.6311104169485524e-05, "loss": 0.8082, "step": 3696 }, { "epoch": 0.3042172392511829, "grad_norm": 2.0655840771497047, "learning_rate": 1.6309036488440188e-05, "loss": 0.7983, "step": 3697 }, { "epoch": 0.30429952684632794, "grad_norm": 1.633093211202821, "learning_rate": 1.630696835921099e-05, "loss": 0.8161, "step": 3698 }, { "epoch": 0.30438181444147294, "grad_norm": 1.691470651596845, "learning_rate": 1.6304899781944843e-05, "loss": 0.8128, "step": 3699 }, { "epoch": 0.304464102036618, "grad_norm": 2.0532750008623837, "learning_rate": 1.63028307567887e-05, "loss": 0.7754, "step": 3700 }, { "epoch": 0.304546389631763, "grad_norm": 1.9264864628222687, "learning_rate": 1.630076128388954e-05, "loss": 0.8237, "step": 3701 }, { "epoch": 0.30462867722690806, "grad_norm": 1.7260987270858321, "learning_rate": 1.6298691363394376e-05, "loss": 0.8195, "step": 3702 }, { "epoch": 0.30471096482205307, "grad_norm": 1.5578352879677724, "learning_rate": 1.629662099545025e-05, "loss": 0.7841, "step": 3703 }, { "epoch": 0.3047932524171981, "grad_norm": 2.2106924467540554, "learning_rate": 1.6294550180204238e-05, "loss": 0.8069, "step": 3704 }, { "epoch": 0.3048755400123431, "grad_norm": 1.8921781975692649, "learning_rate": 1.6292478917803448e-05, "loss": 0.7939, "step": 3705 }, { "epoch": 0.3049578276074882, "grad_norm": 1.673642078076112, "learning_rate": 1.629040720839502e-05, "loss": 0.8063, "step": 3706 }, { "epoch": 0.3050401152026332, "grad_norm": 1.8564314006699936, "learning_rate": 1.6288335052126127e-05, "loss": 0.8101, "step": 3707 }, { "epoch": 0.30512240279777825, "grad_norm": 0.4393600634993367, "learning_rate": 1.628626244914396e-05, "loss": 0.5327, "step": 3708 }, { "epoch": 0.30520469039292325, "grad_norm": 1.5957945769842101, "learning_rate": 1.6284189399595767e-05, "loss": 0.7861, "step": 3709 }, { "epoch": 0.3052869779880683, "grad_norm": 1.7141665761524336, "learning_rate": 1.628211590362881e-05, "loss": 0.7668, "step": 3710 }, { "epoch": 0.3053692655832133, "grad_norm": 1.7253634739463022, "learning_rate": 1.6280041961390387e-05, "loss": 0.8165, "step": 3711 }, { "epoch": 0.30545155317835837, "grad_norm": 2.218877930057157, "learning_rate": 1.6277967573027823e-05, "loss": 0.7945, "step": 3712 }, { "epoch": 0.3055338407735034, "grad_norm": 0.4341320111159625, "learning_rate": 1.6275892738688484e-05, "loss": 0.5605, "step": 3713 }, { "epoch": 0.30561612836864843, "grad_norm": 0.43930733101248265, "learning_rate": 1.6273817458519764e-05, "loss": 0.5232, "step": 3714 }, { "epoch": 0.30569841596379344, "grad_norm": 2.0033466027439313, "learning_rate": 1.627174173266908e-05, "loss": 0.8007, "step": 3715 }, { "epoch": 0.3057807035589385, "grad_norm": 1.758130594597491, "learning_rate": 1.6269665561283898e-05, "loss": 0.808, "step": 3716 }, { "epoch": 0.3058629911540835, "grad_norm": 2.083476995210029, "learning_rate": 1.62675889445117e-05, "loss": 0.7918, "step": 3717 }, { "epoch": 0.30594527874922856, "grad_norm": 0.4606851012547155, "learning_rate": 1.6265511882500008e-05, "loss": 0.5494, "step": 3718 }, { "epoch": 0.30602756634437356, "grad_norm": 1.9313786148077983, "learning_rate": 1.626343437539637e-05, "loss": 0.8238, "step": 3719 }, { "epoch": 0.3061098539395186, "grad_norm": 1.8681515209882509, "learning_rate": 1.626135642334837e-05, "loss": 0.7808, "step": 3720 }, { "epoch": 0.3061921415346637, "grad_norm": 2.03958255232041, "learning_rate": 1.6259278026503625e-05, "loss": 0.8132, "step": 3721 }, { "epoch": 0.3062744291298087, "grad_norm": 2.180998620356756, "learning_rate": 1.625719918500978e-05, "loss": 0.7985, "step": 3722 }, { "epoch": 0.30635671672495374, "grad_norm": 2.1147018237252615, "learning_rate": 1.6255119899014514e-05, "loss": 0.8145, "step": 3723 }, { "epoch": 0.30643900432009874, "grad_norm": 2.2660656384183224, "learning_rate": 1.625304016866553e-05, "loss": 0.8245, "step": 3724 }, { "epoch": 0.3065212919152438, "grad_norm": 2.0843798415517343, "learning_rate": 1.6250959994110575e-05, "loss": 0.8288, "step": 3725 }, { "epoch": 0.3066035795103888, "grad_norm": 2.433152350777801, "learning_rate": 1.6248879375497416e-05, "loss": 0.8016, "step": 3726 }, { "epoch": 0.30668586710553386, "grad_norm": 1.8635624104923694, "learning_rate": 1.6246798312973862e-05, "loss": 0.8302, "step": 3727 }, { "epoch": 0.30676815470067886, "grad_norm": 1.9733434384603887, "learning_rate": 1.6244716806687746e-05, "loss": 0.8362, "step": 3728 }, { "epoch": 0.3068504422958239, "grad_norm": 0.4827561540472388, "learning_rate": 1.6242634856786934e-05, "loss": 0.5584, "step": 3729 }, { "epoch": 0.3069327298909689, "grad_norm": 2.0026515293908065, "learning_rate": 1.6240552463419325e-05, "loss": 0.8013, "step": 3730 }, { "epoch": 0.307015017486114, "grad_norm": 2.421089896619913, "learning_rate": 1.623846962673285e-05, "loss": 0.7854, "step": 3731 }, { "epoch": 0.307097305081259, "grad_norm": 2.004747768730618, "learning_rate": 1.6236386346875473e-05, "loss": 0.796, "step": 3732 }, { "epoch": 0.30717959267640405, "grad_norm": 1.8191195652060745, "learning_rate": 1.623430262399518e-05, "loss": 0.7972, "step": 3733 }, { "epoch": 0.30726188027154905, "grad_norm": 0.43757093610523434, "learning_rate": 1.623221845824e-05, "loss": 0.5311, "step": 3734 }, { "epoch": 0.3073441678666941, "grad_norm": 2.240566483832002, "learning_rate": 1.6230133849757984e-05, "loss": 0.7828, "step": 3735 }, { "epoch": 0.3074264554618391, "grad_norm": 2.1385825153781566, "learning_rate": 1.6228048798697228e-05, "loss": 0.7757, "step": 3736 }, { "epoch": 0.30750874305698417, "grad_norm": 2.127148299389216, "learning_rate": 1.6225963305205845e-05, "loss": 0.8251, "step": 3737 }, { "epoch": 0.30759103065212917, "grad_norm": 2.120414731679044, "learning_rate": 1.6223877369431983e-05, "loss": 0.8012, "step": 3738 }, { "epoch": 0.30767331824727423, "grad_norm": 1.9228746790037947, "learning_rate": 1.622179099152383e-05, "loss": 0.7766, "step": 3739 }, { "epoch": 0.30775560584241923, "grad_norm": 5.699396120222066, "learning_rate": 1.621970417162959e-05, "loss": 0.8022, "step": 3740 }, { "epoch": 0.3078378934375643, "grad_norm": 0.4309867200604675, "learning_rate": 1.6217616909897516e-05, "loss": 0.5309, "step": 3741 }, { "epoch": 0.3079201810327093, "grad_norm": 1.9015914971003123, "learning_rate": 1.621552920647588e-05, "loss": 0.8028, "step": 3742 }, { "epoch": 0.30800246862785435, "grad_norm": 1.9000230700954626, "learning_rate": 1.621344106151299e-05, "loss": 0.7936, "step": 3743 }, { "epoch": 0.30808475622299936, "grad_norm": 0.4426055699337771, "learning_rate": 1.6211352475157183e-05, "loss": 0.5109, "step": 3744 }, { "epoch": 0.3081670438181444, "grad_norm": 2.2152525796595812, "learning_rate": 1.620926344755683e-05, "loss": 0.8065, "step": 3745 }, { "epoch": 0.3082493314132895, "grad_norm": 0.4481125883668462, "learning_rate": 1.620717397886033e-05, "loss": 0.5508, "step": 3746 }, { "epoch": 0.3083316190084345, "grad_norm": 2.3399247026612757, "learning_rate": 1.6205084069216122e-05, "loss": 0.796, "step": 3747 }, { "epoch": 0.30841390660357954, "grad_norm": 1.8345421801615338, "learning_rate": 1.6202993718772662e-05, "loss": 0.801, "step": 3748 }, { "epoch": 0.30849619419872454, "grad_norm": 1.7932846729941831, "learning_rate": 1.6200902927678447e-05, "loss": 0.7722, "step": 3749 }, { "epoch": 0.3085784817938696, "grad_norm": 1.687207271881378, "learning_rate": 1.6198811696082008e-05, "loss": 0.7969, "step": 3750 }, { "epoch": 0.3086607693890146, "grad_norm": 1.8714625068204986, "learning_rate": 1.61967200241319e-05, "loss": 0.7993, "step": 3751 }, { "epoch": 0.30874305698415966, "grad_norm": 1.7409895731594691, "learning_rate": 1.619462791197671e-05, "loss": 0.7805, "step": 3752 }, { "epoch": 0.30882534457930466, "grad_norm": 1.739045393950046, "learning_rate": 1.619253535976506e-05, "loss": 0.7832, "step": 3753 }, { "epoch": 0.3089076321744497, "grad_norm": 0.44313475687242815, "learning_rate": 1.6190442367645603e-05, "loss": 0.5453, "step": 3754 }, { "epoch": 0.3089899197695947, "grad_norm": 1.6893521002681389, "learning_rate": 1.6188348935767018e-05, "loss": 0.7769, "step": 3755 }, { "epoch": 0.3090722073647398, "grad_norm": 2.308498242005326, "learning_rate": 1.618625506427802e-05, "loss": 0.7868, "step": 3756 }, { "epoch": 0.3091544949598848, "grad_norm": 1.8145631519769951, "learning_rate": 1.618416075332736e-05, "loss": 0.7977, "step": 3757 }, { "epoch": 0.30923678255502984, "grad_norm": 1.7295348876425896, "learning_rate": 1.618206600306381e-05, "loss": 0.798, "step": 3758 }, { "epoch": 0.30931907015017485, "grad_norm": 2.0147213977428864, "learning_rate": 1.6179970813636175e-05, "loss": 0.7525, "step": 3759 }, { "epoch": 0.3094013577453199, "grad_norm": 0.4449268535613779, "learning_rate": 1.61778751851933e-05, "loss": 0.5563, "step": 3760 }, { "epoch": 0.3094836453404649, "grad_norm": 0.40231439760617077, "learning_rate": 1.6175779117884046e-05, "loss": 0.5245, "step": 3761 }, { "epoch": 0.30956593293560997, "grad_norm": 1.7766369949909324, "learning_rate": 1.6173682611857327e-05, "loss": 0.8161, "step": 3762 }, { "epoch": 0.30964822053075497, "grad_norm": 1.5194083385934598, "learning_rate": 1.6171585667262068e-05, "loss": 0.7975, "step": 3763 }, { "epoch": 0.30973050812590003, "grad_norm": 0.4391010126701544, "learning_rate": 1.6169488284247227e-05, "loss": 0.5278, "step": 3764 }, { "epoch": 0.30981279572104503, "grad_norm": 1.9057323951157967, "learning_rate": 1.6167390462961812e-05, "loss": 0.7963, "step": 3765 }, { "epoch": 0.3098950833161901, "grad_norm": 1.496900484594616, "learning_rate": 1.6165292203554835e-05, "loss": 0.8146, "step": 3766 }, { "epoch": 0.3099773709113351, "grad_norm": 1.3203974600978459, "learning_rate": 1.6163193506175365e-05, "loss": 0.8, "step": 3767 }, { "epoch": 0.31005965850648015, "grad_norm": 1.4659118769716732, "learning_rate": 1.6161094370972486e-05, "loss": 0.8395, "step": 3768 }, { "epoch": 0.31014194610162515, "grad_norm": 1.6799103341614017, "learning_rate": 1.615899479809531e-05, "loss": 0.7951, "step": 3769 }, { "epoch": 0.3102242336967702, "grad_norm": 0.42356524852903504, "learning_rate": 1.6156894787693002e-05, "loss": 0.5293, "step": 3770 }, { "epoch": 0.3103065212919152, "grad_norm": 1.5727058387890596, "learning_rate": 1.615479433991473e-05, "loss": 0.8121, "step": 3771 }, { "epoch": 0.3103888088870603, "grad_norm": 1.2670970557759353, "learning_rate": 1.6152693454909706e-05, "loss": 0.7827, "step": 3772 }, { "epoch": 0.31047109648220533, "grad_norm": 1.3323156008193104, "learning_rate": 1.6150592132827186e-05, "loss": 0.7858, "step": 3773 }, { "epoch": 0.31055338407735034, "grad_norm": 1.2028385372754486, "learning_rate": 1.6148490373816435e-05, "loss": 0.7607, "step": 3774 }, { "epoch": 0.3106356716724954, "grad_norm": 1.4909932774781338, "learning_rate": 1.614638817802676e-05, "loss": 0.8046, "step": 3775 }, { "epoch": 0.3107179592676404, "grad_norm": 1.4754248739906664, "learning_rate": 1.61442855456075e-05, "loss": 0.7874, "step": 3776 }, { "epoch": 0.31080024686278546, "grad_norm": 1.5094229241942867, "learning_rate": 1.614218247670802e-05, "loss": 0.8253, "step": 3777 }, { "epoch": 0.31088253445793046, "grad_norm": 0.4295052349695629, "learning_rate": 1.614007897147772e-05, "loss": 0.543, "step": 3778 }, { "epoch": 0.3109648220530755, "grad_norm": 1.6799174553659821, "learning_rate": 1.613797503006603e-05, "loss": 0.7956, "step": 3779 }, { "epoch": 0.3110471096482205, "grad_norm": 1.5824840220914298, "learning_rate": 1.613587065262241e-05, "loss": 0.7853, "step": 3780 }, { "epoch": 0.3111293972433656, "grad_norm": 1.4575534258065173, "learning_rate": 1.613376583929635e-05, "loss": 0.7989, "step": 3781 }, { "epoch": 0.3112116848385106, "grad_norm": 1.6635486970673155, "learning_rate": 1.613166059023738e-05, "loss": 0.8021, "step": 3782 }, { "epoch": 0.31129397243365564, "grad_norm": 1.3912852168757746, "learning_rate": 1.6129554905595043e-05, "loss": 0.7991, "step": 3783 }, { "epoch": 0.31137626002880064, "grad_norm": 1.8348284522634324, "learning_rate": 1.612744878551893e-05, "loss": 0.7884, "step": 3784 }, { "epoch": 0.3114585476239457, "grad_norm": 0.45404800627571795, "learning_rate": 1.6125342230158653e-05, "loss": 0.5297, "step": 3785 }, { "epoch": 0.3115408352190907, "grad_norm": 1.568224830463089, "learning_rate": 1.612323523966386e-05, "loss": 0.7889, "step": 3786 }, { "epoch": 0.31162312281423576, "grad_norm": 1.4615826506999914, "learning_rate": 1.612112781418423e-05, "loss": 0.8027, "step": 3787 }, { "epoch": 0.31170541040938077, "grad_norm": 1.3600983387015941, "learning_rate": 1.611901995386947e-05, "loss": 0.7957, "step": 3788 }, { "epoch": 0.3117876980045258, "grad_norm": 2.4823284104691257, "learning_rate": 1.6116911658869313e-05, "loss": 0.8086, "step": 3789 }, { "epoch": 0.31186998559967083, "grad_norm": 1.4425764579983513, "learning_rate": 1.611480292933354e-05, "loss": 0.86, "step": 3790 }, { "epoch": 0.3119522731948159, "grad_norm": 3.5898890527626572, "learning_rate": 1.6112693765411944e-05, "loss": 0.7922, "step": 3791 }, { "epoch": 0.3120345607899609, "grad_norm": 0.4405848046693677, "learning_rate": 1.611058416725436e-05, "loss": 0.5164, "step": 3792 }, { "epoch": 0.31211684838510595, "grad_norm": 1.930749253968764, "learning_rate": 1.6108474135010647e-05, "loss": 0.8474, "step": 3793 }, { "epoch": 0.31219913598025095, "grad_norm": 1.6005631874722352, "learning_rate": 1.61063636688307e-05, "loss": 0.7985, "step": 3794 }, { "epoch": 0.312281423575396, "grad_norm": 2.1228915952675322, "learning_rate": 1.6104252768864447e-05, "loss": 0.8236, "step": 3795 }, { "epoch": 0.312363711170541, "grad_norm": 1.5504485182917005, "learning_rate": 1.6102141435261837e-05, "loss": 0.8064, "step": 3796 }, { "epoch": 0.3124459987656861, "grad_norm": 2.174381516183099, "learning_rate": 1.610002966817286e-05, "loss": 0.7895, "step": 3797 }, { "epoch": 0.31252828636083113, "grad_norm": 2.1588757115892836, "learning_rate": 1.609791746774753e-05, "loss": 0.8281, "step": 3798 }, { "epoch": 0.31261057395597613, "grad_norm": 1.8016195608269203, "learning_rate": 1.6095804834135895e-05, "loss": 0.8017, "step": 3799 }, { "epoch": 0.3126928615511212, "grad_norm": 1.9215240891550083, "learning_rate": 1.6093691767488032e-05, "loss": 0.8288, "step": 3800 }, { "epoch": 0.3127751491462662, "grad_norm": 1.7081490290132, "learning_rate": 1.609157826795406e-05, "loss": 0.8244, "step": 3801 }, { "epoch": 0.31285743674141125, "grad_norm": 1.4362733132847159, "learning_rate": 1.6089464335684097e-05, "loss": 0.8203, "step": 3802 }, { "epoch": 0.31293972433655626, "grad_norm": 1.5092490177822409, "learning_rate": 1.6087349970828335e-05, "loss": 0.8063, "step": 3803 }, { "epoch": 0.3130220119317013, "grad_norm": 1.4989585796869414, "learning_rate": 1.6085235173536965e-05, "loss": 0.7916, "step": 3804 }, { "epoch": 0.3131042995268463, "grad_norm": 1.4282121061703668, "learning_rate": 1.6083119943960215e-05, "loss": 0.7771, "step": 3805 }, { "epoch": 0.3131865871219914, "grad_norm": 1.361727204221599, "learning_rate": 1.6081004282248358e-05, "loss": 0.7847, "step": 3806 }, { "epoch": 0.3132688747171364, "grad_norm": 1.8247981932325896, "learning_rate": 1.607888818855168e-05, "loss": 0.7565, "step": 3807 }, { "epoch": 0.31335116231228144, "grad_norm": 1.4669885684346229, "learning_rate": 1.6076771663020507e-05, "loss": 0.8057, "step": 3808 }, { "epoch": 0.31343344990742644, "grad_norm": 0.4719992291304323, "learning_rate": 1.6074654705805194e-05, "loss": 0.5486, "step": 3809 }, { "epoch": 0.3135157375025715, "grad_norm": 1.6647995724657256, "learning_rate": 1.6072537317056128e-05, "loss": 0.7938, "step": 3810 }, { "epoch": 0.3135980250977165, "grad_norm": 2.476341315156464, "learning_rate": 1.6070419496923716e-05, "loss": 0.808, "step": 3811 }, { "epoch": 0.31368031269286156, "grad_norm": 1.6305978435794288, "learning_rate": 1.606830124555842e-05, "loss": 0.815, "step": 3812 }, { "epoch": 0.31376260028800657, "grad_norm": 1.7651591384088836, "learning_rate": 1.6066182563110698e-05, "loss": 0.8375, "step": 3813 }, { "epoch": 0.3138448878831516, "grad_norm": 1.6865103523908131, "learning_rate": 1.6064063449731076e-05, "loss": 0.8222, "step": 3814 }, { "epoch": 0.3139271754782966, "grad_norm": 0.4241649811874777, "learning_rate": 1.606194390557008e-05, "loss": 0.5088, "step": 3815 }, { "epoch": 0.3140094630734417, "grad_norm": 1.451751859628819, "learning_rate": 1.6059823930778286e-05, "loss": 0.7924, "step": 3816 }, { "epoch": 0.3140917506685867, "grad_norm": 1.5746528155678103, "learning_rate": 1.6057703525506293e-05, "loss": 0.7977, "step": 3817 }, { "epoch": 0.31417403826373175, "grad_norm": 1.633038667302806, "learning_rate": 1.6055582689904724e-05, "loss": 0.8224, "step": 3818 }, { "epoch": 0.31425632585887675, "grad_norm": 1.3997285893805254, "learning_rate": 1.605346142412425e-05, "loss": 0.8236, "step": 3819 }, { "epoch": 0.3143386134540218, "grad_norm": 1.3776423446334434, "learning_rate": 1.6051339728315557e-05, "loss": 0.8028, "step": 3820 }, { "epoch": 0.3144209010491668, "grad_norm": 0.4296997716679929, "learning_rate": 1.6049217602629368e-05, "loss": 0.5383, "step": 3821 }, { "epoch": 0.31450318864431187, "grad_norm": 1.5112326971489343, "learning_rate": 1.604709504721643e-05, "loss": 0.8011, "step": 3822 }, { "epoch": 0.31458547623945693, "grad_norm": 1.5000778581455252, "learning_rate": 1.6044972062227536e-05, "loss": 0.788, "step": 3823 }, { "epoch": 0.31466776383460193, "grad_norm": 1.8304966534923168, "learning_rate": 1.604284864781349e-05, "loss": 0.8036, "step": 3824 }, { "epoch": 0.314750051429747, "grad_norm": 1.447229013784352, "learning_rate": 1.6040724804125144e-05, "loss": 0.788, "step": 3825 }, { "epoch": 0.314832339024892, "grad_norm": 1.400055065113336, "learning_rate": 1.6038600531313365e-05, "loss": 0.8338, "step": 3826 }, { "epoch": 0.31491462662003705, "grad_norm": 1.6462850322118918, "learning_rate": 1.6036475829529065e-05, "loss": 0.8159, "step": 3827 }, { "epoch": 0.31499691421518206, "grad_norm": 1.4405452087209327, "learning_rate": 1.6034350698923175e-05, "loss": 0.8021, "step": 3828 }, { "epoch": 0.3150792018103271, "grad_norm": 1.238962748264279, "learning_rate": 1.6032225139646663e-05, "loss": 0.7933, "step": 3829 }, { "epoch": 0.3151614894054721, "grad_norm": 1.3760217055651613, "learning_rate": 1.603009915185052e-05, "loss": 0.8448, "step": 3830 }, { "epoch": 0.3152437770006172, "grad_norm": 0.4592060820215616, "learning_rate": 1.602797273568578e-05, "loss": 0.5474, "step": 3831 }, { "epoch": 0.3153260645957622, "grad_norm": 1.46346971437999, "learning_rate": 1.60258458913035e-05, "loss": 0.8075, "step": 3832 }, { "epoch": 0.31540835219090724, "grad_norm": 0.43688001977430013, "learning_rate": 1.6023718618854756e-05, "loss": 0.5195, "step": 3833 }, { "epoch": 0.31549063978605224, "grad_norm": 1.387966012985661, "learning_rate": 1.6021590918490685e-05, "loss": 0.7928, "step": 3834 }, { "epoch": 0.3155729273811973, "grad_norm": 1.2845135247879658, "learning_rate": 1.6019462790362415e-05, "loss": 0.7991, "step": 3835 }, { "epoch": 0.3156552149763423, "grad_norm": 1.3529603536381467, "learning_rate": 1.6017334234621143e-05, "loss": 0.7881, "step": 3836 }, { "epoch": 0.31573750257148736, "grad_norm": 0.4413166562719957, "learning_rate": 1.601520525141807e-05, "loss": 0.5475, "step": 3837 }, { "epoch": 0.31581979016663236, "grad_norm": 1.5798220607086506, "learning_rate": 1.6013075840904433e-05, "loss": 0.8072, "step": 3838 }, { "epoch": 0.3159020777617774, "grad_norm": 1.3374478512636028, "learning_rate": 1.6010946003231507e-05, "loss": 0.774, "step": 3839 }, { "epoch": 0.3159843653569224, "grad_norm": 1.3317452201789195, "learning_rate": 1.6008815738550588e-05, "loss": 0.7837, "step": 3840 }, { "epoch": 0.3160666529520675, "grad_norm": 1.524499254315705, "learning_rate": 1.6006685047013008e-05, "loss": 0.8071, "step": 3841 }, { "epoch": 0.3161489405472125, "grad_norm": 1.3306539588843487, "learning_rate": 1.600455392877013e-05, "loss": 0.8184, "step": 3842 }, { "epoch": 0.31623122814235755, "grad_norm": 0.42763710421613876, "learning_rate": 1.6002422383973345e-05, "loss": 0.5117, "step": 3843 }, { "epoch": 0.31631351573750255, "grad_norm": 1.5219746227372761, "learning_rate": 1.6000290412774072e-05, "loss": 0.8044, "step": 3844 }, { "epoch": 0.3163958033326476, "grad_norm": 1.41718191129118, "learning_rate": 1.599815801532376e-05, "loss": 0.7955, "step": 3845 }, { "epoch": 0.3164780909277926, "grad_norm": 1.2798703247927727, "learning_rate": 1.59960251917739e-05, "loss": 0.7954, "step": 3846 }, { "epoch": 0.31656037852293767, "grad_norm": 4.202688452888293, "learning_rate": 1.5993891942276e-05, "loss": 0.8213, "step": 3847 }, { "epoch": 0.31664266611808267, "grad_norm": 3.028615193665598, "learning_rate": 1.59917582669816e-05, "loss": 0.7849, "step": 3848 }, { "epoch": 0.31672495371322773, "grad_norm": 1.4614417133635351, "learning_rate": 1.5989624166042275e-05, "loss": 0.7868, "step": 3849 }, { "epoch": 0.3168072413083728, "grad_norm": 1.4622588657748734, "learning_rate": 1.598748963960963e-05, "loss": 0.7858, "step": 3850 }, { "epoch": 0.3168895289035178, "grad_norm": 1.411182313839077, "learning_rate": 1.5985354687835296e-05, "loss": 0.8097, "step": 3851 }, { "epoch": 0.31697181649866285, "grad_norm": 1.4862535038642217, "learning_rate": 1.598321931087094e-05, "loss": 0.8158, "step": 3852 }, { "epoch": 0.31705410409380785, "grad_norm": 1.7517411737646655, "learning_rate": 1.598108350886825e-05, "loss": 0.7836, "step": 3853 }, { "epoch": 0.3171363916889529, "grad_norm": 1.50628024287692, "learning_rate": 1.597894728197895e-05, "loss": 0.825, "step": 3854 }, { "epoch": 0.3172186792840979, "grad_norm": 1.5551702890796064, "learning_rate": 1.59768106303548e-05, "loss": 0.7912, "step": 3855 }, { "epoch": 0.317300966879243, "grad_norm": 1.565139044705377, "learning_rate": 1.5974673554147583e-05, "loss": 0.8298, "step": 3856 }, { "epoch": 0.317383254474388, "grad_norm": 1.6719645431305064, "learning_rate": 1.597253605350911e-05, "loss": 0.8159, "step": 3857 }, { "epoch": 0.31746554206953304, "grad_norm": 1.738755656953427, "learning_rate": 1.5970398128591226e-05, "loss": 0.8111, "step": 3858 }, { "epoch": 0.31754782966467804, "grad_norm": 1.3016455355427752, "learning_rate": 1.596825977954581e-05, "loss": 0.796, "step": 3859 }, { "epoch": 0.3176301172598231, "grad_norm": 1.6339301852364194, "learning_rate": 1.5966121006524763e-05, "loss": 0.8089, "step": 3860 }, { "epoch": 0.3177124048549681, "grad_norm": 1.6554656500069764, "learning_rate": 1.5963981809680017e-05, "loss": 0.857, "step": 3861 }, { "epoch": 0.31779469245011316, "grad_norm": 1.7785982317572142, "learning_rate": 1.5961842189163547e-05, "loss": 0.8098, "step": 3862 }, { "epoch": 0.31787698004525816, "grad_norm": 2.1597556737395562, "learning_rate": 1.595970214512734e-05, "loss": 0.798, "step": 3863 }, { "epoch": 0.3179592676404032, "grad_norm": 1.9370568707163913, "learning_rate": 1.5957561677723426e-05, "loss": 0.8079, "step": 3864 }, { "epoch": 0.3180415552355482, "grad_norm": 1.9444571258095107, "learning_rate": 1.5955420787103856e-05, "loss": 0.8109, "step": 3865 }, { "epoch": 0.3181238428306933, "grad_norm": 1.760393730100222, "learning_rate": 1.5953279473420715e-05, "loss": 0.7758, "step": 3866 }, { "epoch": 0.3182061304258383, "grad_norm": 1.6420492720718982, "learning_rate": 1.5951137736826122e-05, "loss": 0.8169, "step": 3867 }, { "epoch": 0.31828841802098334, "grad_norm": 0.44768553469151645, "learning_rate": 1.5948995577472226e-05, "loss": 0.5237, "step": 3868 }, { "epoch": 0.31837070561612835, "grad_norm": 0.4704272888843954, "learning_rate": 1.5946852995511196e-05, "loss": 0.5489, "step": 3869 }, { "epoch": 0.3184529932112734, "grad_norm": 1.5467122402037703, "learning_rate": 1.5944709991095238e-05, "loss": 0.8043, "step": 3870 }, { "epoch": 0.3185352808064184, "grad_norm": 2.390616971685316, "learning_rate": 1.594256656437659e-05, "loss": 0.7991, "step": 3871 }, { "epoch": 0.31861756840156347, "grad_norm": 1.7214781780864123, "learning_rate": 1.5940422715507522e-05, "loss": 0.7695, "step": 3872 }, { "epoch": 0.31869985599670847, "grad_norm": 1.6526754937961543, "learning_rate": 1.593827844464032e-05, "loss": 0.786, "step": 3873 }, { "epoch": 0.31878214359185353, "grad_norm": 1.7516686291235084, "learning_rate": 1.593613375192731e-05, "loss": 0.8132, "step": 3874 }, { "epoch": 0.3188644311869986, "grad_norm": 1.9016648443724014, "learning_rate": 1.593398863752086e-05, "loss": 0.8149, "step": 3875 }, { "epoch": 0.3189467187821436, "grad_norm": 0.4667168130769043, "learning_rate": 1.5931843101573345e-05, "loss": 0.5291, "step": 3876 }, { "epoch": 0.31902900637728865, "grad_norm": 2.1732316364322637, "learning_rate": 1.592969714423718e-05, "loss": 0.8116, "step": 3877 }, { "epoch": 0.31911129397243365, "grad_norm": 2.0042075733499396, "learning_rate": 1.5927550765664814e-05, "loss": 0.7947, "step": 3878 }, { "epoch": 0.3191935815675787, "grad_norm": 2.00997410027192, "learning_rate": 1.592540396600872e-05, "loss": 0.8145, "step": 3879 }, { "epoch": 0.3192758691627237, "grad_norm": 1.5637086608174724, "learning_rate": 1.5923256745421408e-05, "loss": 0.827, "step": 3880 }, { "epoch": 0.31935815675786877, "grad_norm": 1.6181242256485258, "learning_rate": 1.592110910405541e-05, "loss": 0.8151, "step": 3881 }, { "epoch": 0.3194404443530138, "grad_norm": 3.370578012508591, "learning_rate": 1.5918961042063285e-05, "loss": 0.7889, "step": 3882 }, { "epoch": 0.31952273194815883, "grad_norm": 1.882982469706157, "learning_rate": 1.5916812559597635e-05, "loss": 0.7844, "step": 3883 }, { "epoch": 0.31960501954330384, "grad_norm": 2.002582131058503, "learning_rate": 1.5914663656811086e-05, "loss": 0.8074, "step": 3884 }, { "epoch": 0.3196873071384489, "grad_norm": 1.600771949322908, "learning_rate": 1.591251433385629e-05, "loss": 0.805, "step": 3885 }, { "epoch": 0.3197695947335939, "grad_norm": 1.5408535050061394, "learning_rate": 1.591036459088593e-05, "loss": 0.8019, "step": 3886 }, { "epoch": 0.31985188232873896, "grad_norm": 1.47785259187151, "learning_rate": 1.590821442805272e-05, "loss": 0.7725, "step": 3887 }, { "epoch": 0.31993416992388396, "grad_norm": 1.649957375486492, "learning_rate": 1.590606384550941e-05, "loss": 0.8423, "step": 3888 }, { "epoch": 0.320016457519029, "grad_norm": 1.722398959070732, "learning_rate": 1.590391284340877e-05, "loss": 0.7958, "step": 3889 }, { "epoch": 0.320098745114174, "grad_norm": 0.4439275700511717, "learning_rate": 1.5901761421903602e-05, "loss": 0.5155, "step": 3890 }, { "epoch": 0.3201810327093191, "grad_norm": 1.6385779929802615, "learning_rate": 1.589960958114674e-05, "loss": 0.8037, "step": 3891 }, { "epoch": 0.3202633203044641, "grad_norm": 2.2677890806726233, "learning_rate": 1.589745732129105e-05, "loss": 0.8014, "step": 3892 }, { "epoch": 0.32034560789960914, "grad_norm": 1.7440331245033582, "learning_rate": 1.589530464248942e-05, "loss": 0.7926, "step": 3893 }, { "epoch": 0.32042789549475414, "grad_norm": 1.7572278727107002, "learning_rate": 1.589315154489478e-05, "loss": 0.799, "step": 3894 }, { "epoch": 0.3205101830898992, "grad_norm": 1.8522770471221919, "learning_rate": 1.5890998028660077e-05, "loss": 0.8021, "step": 3895 }, { "epoch": 0.3205924706850442, "grad_norm": 1.6887691382789587, "learning_rate": 1.5888844093938295e-05, "loss": 0.7935, "step": 3896 }, { "epoch": 0.32067475828018926, "grad_norm": 1.6838500779017664, "learning_rate": 1.5886689740882448e-05, "loss": 0.7934, "step": 3897 }, { "epoch": 0.32075704587533427, "grad_norm": 1.7224600932545493, "learning_rate": 1.5884534969645574e-05, "loss": 0.7891, "step": 3898 }, { "epoch": 0.3208393334704793, "grad_norm": 1.7210572191001172, "learning_rate": 1.588237978038075e-05, "loss": 0.7813, "step": 3899 }, { "epoch": 0.32092162106562433, "grad_norm": 2.8344387409880674, "learning_rate": 1.588022417324107e-05, "loss": 0.8094, "step": 3900 }, { "epoch": 0.3210039086607694, "grad_norm": 2.1217231122168143, "learning_rate": 1.587806814837967e-05, "loss": 0.7753, "step": 3901 }, { "epoch": 0.32108619625591445, "grad_norm": 2.214733800606323, "learning_rate": 1.587591170594971e-05, "loss": 0.7817, "step": 3902 }, { "epoch": 0.32116848385105945, "grad_norm": 0.43017597210839653, "learning_rate": 1.587375484610438e-05, "loss": 0.5431, "step": 3903 }, { "epoch": 0.3212507714462045, "grad_norm": 2.259495257156286, "learning_rate": 1.58715975689969e-05, "loss": 0.814, "step": 3904 }, { "epoch": 0.3213330590413495, "grad_norm": 0.4247657565225192, "learning_rate": 1.5869439874780518e-05, "loss": 0.5151, "step": 3905 }, { "epoch": 0.32141534663649457, "grad_norm": 1.612160032316507, "learning_rate": 1.5867281763608514e-05, "loss": 0.7756, "step": 3906 }, { "epoch": 0.3214976342316396, "grad_norm": 2.9126245659122993, "learning_rate": 1.5865123235634196e-05, "loss": 0.8098, "step": 3907 }, { "epoch": 0.32157992182678463, "grad_norm": 1.8880678211217063, "learning_rate": 1.5862964291010904e-05, "loss": 0.8025, "step": 3908 }, { "epoch": 0.32166220942192963, "grad_norm": 2.470962406465848, "learning_rate": 1.5860804929892007e-05, "loss": 0.7807, "step": 3909 }, { "epoch": 0.3217444970170747, "grad_norm": 1.8096274546446176, "learning_rate": 1.58586451524309e-05, "loss": 0.7938, "step": 3910 }, { "epoch": 0.3218267846122197, "grad_norm": 1.7357976383540734, "learning_rate": 1.5856484958781007e-05, "loss": 0.7698, "step": 3911 }, { "epoch": 0.32190907220736475, "grad_norm": 1.6751104297921913, "learning_rate": 1.5854324349095794e-05, "loss": 0.7985, "step": 3912 }, { "epoch": 0.32199135980250976, "grad_norm": 0.45078868165814406, "learning_rate": 1.5852163323528736e-05, "loss": 0.5611, "step": 3913 }, { "epoch": 0.3220736473976548, "grad_norm": 1.8160018709189711, "learning_rate": 1.585000188223336e-05, "loss": 0.7767, "step": 3914 }, { "epoch": 0.3221559349927998, "grad_norm": 1.8270349432081778, "learning_rate": 1.5847840025363206e-05, "loss": 0.7747, "step": 3915 }, { "epoch": 0.3222382225879449, "grad_norm": 1.8128097632259197, "learning_rate": 1.5845677753071847e-05, "loss": 0.785, "step": 3916 }, { "epoch": 0.3223205101830899, "grad_norm": 1.8944803424632484, "learning_rate": 1.5843515065512885e-05, "loss": 0.804, "step": 3917 }, { "epoch": 0.32240279777823494, "grad_norm": 2.2596012157276135, "learning_rate": 1.5841351962839966e-05, "loss": 0.7594, "step": 3918 }, { "epoch": 0.32248508537337994, "grad_norm": 0.4063307625629307, "learning_rate": 1.583918844520674e-05, "loss": 0.5257, "step": 3919 }, { "epoch": 0.322567372968525, "grad_norm": 1.7705394326449508, "learning_rate": 1.5837024512766905e-05, "loss": 0.7968, "step": 3920 }, { "epoch": 0.32264966056367, "grad_norm": 2.2793356779989806, "learning_rate": 1.583486016567419e-05, "loss": 0.7715, "step": 3921 }, { "epoch": 0.32273194815881506, "grad_norm": 2.3770074512733346, "learning_rate": 1.5832695404082334e-05, "loss": 0.7833, "step": 3922 }, { "epoch": 0.32281423575396007, "grad_norm": 1.9767747470853723, "learning_rate": 1.5830530228145125e-05, "loss": 0.799, "step": 3923 }, { "epoch": 0.3228965233491051, "grad_norm": 2.1188118908486704, "learning_rate": 1.5828364638016377e-05, "loss": 0.8134, "step": 3924 }, { "epoch": 0.3229788109442501, "grad_norm": 0.431863066119262, "learning_rate": 1.5826198633849922e-05, "loss": 0.5202, "step": 3925 }, { "epoch": 0.3230610985393952, "grad_norm": 2.0264993947923973, "learning_rate": 1.5824032215799635e-05, "loss": 0.7724, "step": 3926 }, { "epoch": 0.32314338613454024, "grad_norm": 2.080019534214767, "learning_rate": 1.582186538401941e-05, "loss": 0.7923, "step": 3927 }, { "epoch": 0.32322567372968525, "grad_norm": 2.3466362434119823, "learning_rate": 1.5819698138663185e-05, "loss": 0.789, "step": 3928 }, { "epoch": 0.3233079613248303, "grad_norm": 2.676529299854149, "learning_rate": 1.581753047988491e-05, "loss": 0.7975, "step": 3929 }, { "epoch": 0.3233902489199753, "grad_norm": 2.084847629393202, "learning_rate": 1.5815362407838572e-05, "loss": 0.8186, "step": 3930 }, { "epoch": 0.32347253651512037, "grad_norm": 1.8539793305616092, "learning_rate": 1.581319392267819e-05, "loss": 0.7899, "step": 3931 }, { "epoch": 0.32355482411026537, "grad_norm": 1.946873396330972, "learning_rate": 1.5811025024557806e-05, "loss": 0.8188, "step": 3932 }, { "epoch": 0.32363711170541043, "grad_norm": 2.159098166320808, "learning_rate": 1.58088557136315e-05, "loss": 0.7848, "step": 3933 }, { "epoch": 0.32371939930055543, "grad_norm": 2.047580727362364, "learning_rate": 1.5806685990053374e-05, "loss": 0.7795, "step": 3934 }, { "epoch": 0.3238016868957005, "grad_norm": 2.671302249322026, "learning_rate": 1.5804515853977562e-05, "loss": 0.7878, "step": 3935 }, { "epoch": 0.3238839744908455, "grad_norm": 1.8591097758701614, "learning_rate": 1.5802345305558224e-05, "loss": 0.7879, "step": 3936 }, { "epoch": 0.32396626208599055, "grad_norm": 1.8516339214566484, "learning_rate": 1.580017434494956e-05, "loss": 0.7827, "step": 3937 }, { "epoch": 0.32404854968113556, "grad_norm": 2.366887865572097, "learning_rate": 1.5798002972305782e-05, "loss": 0.7994, "step": 3938 }, { "epoch": 0.3241308372762806, "grad_norm": 2.14665278036224, "learning_rate": 1.5795831187781147e-05, "loss": 0.7899, "step": 3939 }, { "epoch": 0.3242131248714256, "grad_norm": 1.9282270888047135, "learning_rate": 1.5793658991529934e-05, "loss": 0.7822, "step": 3940 }, { "epoch": 0.3242954124665707, "grad_norm": 1.7470566661407605, "learning_rate": 1.5791486383706448e-05, "loss": 0.7985, "step": 3941 }, { "epoch": 0.3243777000617157, "grad_norm": 2.2568501368655274, "learning_rate": 1.5789313364465037e-05, "loss": 0.7811, "step": 3942 }, { "epoch": 0.32445998765686074, "grad_norm": 1.8202413829686064, "learning_rate": 1.578713993396006e-05, "loss": 0.7751, "step": 3943 }, { "epoch": 0.32454227525200574, "grad_norm": 1.6164720356317872, "learning_rate": 1.5784966092345916e-05, "loss": 0.7684, "step": 3944 }, { "epoch": 0.3246245628471508, "grad_norm": 1.7664920640395763, "learning_rate": 1.5782791839777035e-05, "loss": 0.8171, "step": 3945 }, { "epoch": 0.3247068504422958, "grad_norm": 1.7140385425234788, "learning_rate": 1.578061717640787e-05, "loss": 0.7983, "step": 3946 }, { "epoch": 0.32478913803744086, "grad_norm": 1.9915284832242997, "learning_rate": 1.5778442102392903e-05, "loss": 0.7752, "step": 3947 }, { "epoch": 0.32487142563258586, "grad_norm": 1.8606268367017726, "learning_rate": 1.5776266617886652e-05, "loss": 0.7885, "step": 3948 }, { "epoch": 0.3249537132277309, "grad_norm": 1.8214192299435588, "learning_rate": 1.577409072304366e-05, "loss": 0.8146, "step": 3949 }, { "epoch": 0.3250360008228759, "grad_norm": 2.027591412924204, "learning_rate": 1.5771914418018493e-05, "loss": 0.7706, "step": 3950 }, { "epoch": 0.325118288418021, "grad_norm": 2.1354790579381695, "learning_rate": 1.5769737702965762e-05, "loss": 0.8089, "step": 3951 }, { "epoch": 0.32520057601316604, "grad_norm": 2.074699176889147, "learning_rate": 1.576756057804009e-05, "loss": 0.773, "step": 3952 }, { "epoch": 0.32528286360831105, "grad_norm": 2.100870397111902, "learning_rate": 1.5765383043396137e-05, "loss": 0.7644, "step": 3953 }, { "epoch": 0.3253651512034561, "grad_norm": 0.45137516125086635, "learning_rate": 1.5763205099188594e-05, "loss": 0.5643, "step": 3954 }, { "epoch": 0.3254474387986011, "grad_norm": 2.217278953007144, "learning_rate": 1.5761026745572178e-05, "loss": 0.7821, "step": 3955 }, { "epoch": 0.32552972639374617, "grad_norm": 1.9618940028761858, "learning_rate": 1.5758847982701636e-05, "loss": 0.8177, "step": 3956 }, { "epoch": 0.32561201398889117, "grad_norm": 2.1199923845726, "learning_rate": 1.575666881073174e-05, "loss": 0.7761, "step": 3957 }, { "epoch": 0.3256943015840362, "grad_norm": 0.43239596333275854, "learning_rate": 1.57544892298173e-05, "loss": 0.5101, "step": 3958 }, { "epoch": 0.32577658917918123, "grad_norm": 2.850191356164409, "learning_rate": 1.575230924011315e-05, "loss": 0.7955, "step": 3959 }, { "epoch": 0.3258588767743263, "grad_norm": 1.8692796068426931, "learning_rate": 1.5750128841774147e-05, "loss": 0.7897, "step": 3960 }, { "epoch": 0.3259411643694713, "grad_norm": 2.1827524276448274, "learning_rate": 1.574794803495519e-05, "loss": 0.8264, "step": 3961 }, { "epoch": 0.32602345196461635, "grad_norm": 2.263658558707186, "learning_rate": 1.5745766819811197e-05, "loss": 0.8098, "step": 3962 }, { "epoch": 0.32610573955976135, "grad_norm": 0.4244020766463776, "learning_rate": 1.5743585196497114e-05, "loss": 0.5129, "step": 3963 }, { "epoch": 0.3261880271549064, "grad_norm": 2.595927481531137, "learning_rate": 1.574140316516793e-05, "loss": 0.8184, "step": 3964 }, { "epoch": 0.3262703147500514, "grad_norm": 0.419897869617388, "learning_rate": 1.5739220725978642e-05, "loss": 0.5457, "step": 3965 }, { "epoch": 0.3263526023451965, "grad_norm": 2.258978016931398, "learning_rate": 1.5737037879084298e-05, "loss": 0.7558, "step": 3966 }, { "epoch": 0.3264348899403415, "grad_norm": 2.542891238910011, "learning_rate": 1.5734854624639956e-05, "loss": 0.807, "step": 3967 }, { "epoch": 0.32651717753548654, "grad_norm": 0.4265054536091749, "learning_rate": 1.5732670962800712e-05, "loss": 0.5115, "step": 3968 }, { "epoch": 0.32659946513063154, "grad_norm": 1.8955838199608996, "learning_rate": 1.5730486893721688e-05, "loss": 0.7797, "step": 3969 }, { "epoch": 0.3266817527257766, "grad_norm": 2.0601699426832663, "learning_rate": 1.5728302417558043e-05, "loss": 0.7884, "step": 3970 }, { "epoch": 0.3267640403209216, "grad_norm": 0.40840965019656833, "learning_rate": 1.5726117534464954e-05, "loss": 0.53, "step": 3971 }, { "epoch": 0.32684632791606666, "grad_norm": 1.8843960611451949, "learning_rate": 1.5723932244597634e-05, "loss": 0.767, "step": 3972 }, { "epoch": 0.32692861551121166, "grad_norm": 1.904639008422634, "learning_rate": 1.5721746548111322e-05, "loss": 0.8159, "step": 3973 }, { "epoch": 0.3270109031063567, "grad_norm": 1.9290444263661974, "learning_rate": 1.5719560445161284e-05, "loss": 0.829, "step": 3974 }, { "epoch": 0.3270931907015017, "grad_norm": 2.297219569544417, "learning_rate": 1.571737393590282e-05, "loss": 0.7835, "step": 3975 }, { "epoch": 0.3271754782966468, "grad_norm": 1.8435362368311108, "learning_rate": 1.5715187020491254e-05, "loss": 0.7753, "step": 3976 }, { "epoch": 0.3272577658917918, "grad_norm": 2.067228837195023, "learning_rate": 1.5712999699081947e-05, "loss": 0.8031, "step": 3977 }, { "epoch": 0.32734005348693684, "grad_norm": 2.074862181219466, "learning_rate": 1.5710811971830274e-05, "loss": 0.8123, "step": 3978 }, { "epoch": 0.3274223410820819, "grad_norm": 0.4119220619691624, "learning_rate": 1.570862383889165e-05, "loss": 0.526, "step": 3979 }, { "epoch": 0.3275046286772269, "grad_norm": 2.2681013473374505, "learning_rate": 1.570643530042152e-05, "loss": 0.7865, "step": 3980 }, { "epoch": 0.32758691627237196, "grad_norm": 2.346967460159985, "learning_rate": 1.5704246356575352e-05, "loss": 0.7951, "step": 3981 }, { "epoch": 0.32766920386751697, "grad_norm": 2.0280379580514323, "learning_rate": 1.5702057007508648e-05, "loss": 0.7967, "step": 3982 }, { "epoch": 0.327751491462662, "grad_norm": 2.645494755217657, "learning_rate": 1.5699867253376928e-05, "loss": 0.7813, "step": 3983 }, { "epoch": 0.32783377905780703, "grad_norm": 2.0400139913657713, "learning_rate": 1.5697677094335758e-05, "loss": 0.787, "step": 3984 }, { "epoch": 0.3279160666529521, "grad_norm": 2.205406469269344, "learning_rate": 1.5695486530540717e-05, "loss": 0.7531, "step": 3985 }, { "epoch": 0.3279983542480971, "grad_norm": 0.43167951522059045, "learning_rate": 1.5693295562147423e-05, "loss": 0.5336, "step": 3986 }, { "epoch": 0.32808064184324215, "grad_norm": 2.107204959767559, "learning_rate": 1.569110418931152e-05, "loss": 0.7802, "step": 3987 }, { "epoch": 0.32816292943838715, "grad_norm": 2.4392908759184007, "learning_rate": 1.5688912412188673e-05, "loss": 0.8002, "step": 3988 }, { "epoch": 0.3282452170335322, "grad_norm": 2.14764265908603, "learning_rate": 1.5686720230934587e-05, "loss": 0.8399, "step": 3989 }, { "epoch": 0.3283275046286772, "grad_norm": 2.3680824327659655, "learning_rate": 1.568452764570499e-05, "loss": 0.8197, "step": 3990 }, { "epoch": 0.32840979222382227, "grad_norm": 2.3280330211238662, "learning_rate": 1.5682334656655642e-05, "loss": 0.7957, "step": 3991 }, { "epoch": 0.3284920798189673, "grad_norm": 1.8107832679614735, "learning_rate": 1.5680141263942325e-05, "loss": 0.8044, "step": 3992 }, { "epoch": 0.32857436741411233, "grad_norm": 2.998434288538815, "learning_rate": 1.5677947467720856e-05, "loss": 0.7835, "step": 3993 }, { "epoch": 0.32865665500925734, "grad_norm": 0.4392336814515646, "learning_rate": 1.5675753268147085e-05, "loss": 0.5206, "step": 3994 }, { "epoch": 0.3287389426044024, "grad_norm": 2.36967808615286, "learning_rate": 1.5673558665376873e-05, "loss": 0.8124, "step": 3995 }, { "epoch": 0.3288212301995474, "grad_norm": 2.0335764345348855, "learning_rate": 1.567136365956613e-05, "loss": 0.7935, "step": 3996 }, { "epoch": 0.32890351779469246, "grad_norm": 2.13170838537876, "learning_rate": 1.5669168250870784e-05, "loss": 0.7875, "step": 3997 }, { "epoch": 0.32898580538983746, "grad_norm": 1.9456780606139135, "learning_rate": 1.566697243944679e-05, "loss": 0.7759, "step": 3998 }, { "epoch": 0.3290680929849825, "grad_norm": 2.773312723528782, "learning_rate": 1.5664776225450132e-05, "loss": 0.7478, "step": 3999 }, { "epoch": 0.3291503805801275, "grad_norm": 2.4541175140476814, "learning_rate": 1.5662579609036836e-05, "loss": 0.8043, "step": 4000 }, { "epoch": 0.3292326681752726, "grad_norm": 3.6905082234410176, "learning_rate": 1.566038259036294e-05, "loss": 0.7849, "step": 4001 }, { "epoch": 0.3293149557704176, "grad_norm": 2.130518471319136, "learning_rate": 1.5658185169584518e-05, "loss": 0.794, "step": 4002 }, { "epoch": 0.32939724336556264, "grad_norm": 2.8597433510479378, "learning_rate": 1.565598734685767e-05, "loss": 0.8093, "step": 4003 }, { "epoch": 0.3294795309607077, "grad_norm": 0.46263416600642066, "learning_rate": 1.5653789122338526e-05, "loss": 0.5274, "step": 4004 }, { "epoch": 0.3295618185558527, "grad_norm": 3.503175456230659, "learning_rate": 1.565159049618324e-05, "loss": 0.8216, "step": 4005 }, { "epoch": 0.32964410615099776, "grad_norm": 4.067051927797743, "learning_rate": 1.5649391468548013e-05, "loss": 0.7893, "step": 4006 }, { "epoch": 0.32972639374614277, "grad_norm": 2.6026548196115122, "learning_rate": 1.5647192039589042e-05, "loss": 0.8011, "step": 4007 }, { "epoch": 0.3298086813412878, "grad_norm": 2.1311953566519763, "learning_rate": 1.5644992209462583e-05, "loss": 0.7826, "step": 4008 }, { "epoch": 0.3298909689364328, "grad_norm": 2.4932806360122854, "learning_rate": 1.5642791978324908e-05, "loss": 0.805, "step": 4009 }, { "epoch": 0.3299732565315779, "grad_norm": 2.728699047628324, "learning_rate": 1.5640591346332313e-05, "loss": 0.7736, "step": 4010 }, { "epoch": 0.3300555441267229, "grad_norm": 1.8896891934256532, "learning_rate": 1.563839031364113e-05, "loss": 0.7609, "step": 4011 }, { "epoch": 0.33013783172186795, "grad_norm": 2.4997541576257656, "learning_rate": 1.5636188880407717e-05, "loss": 0.7786, "step": 4012 }, { "epoch": 0.33022011931701295, "grad_norm": 0.4302448310310332, "learning_rate": 1.5633987046788458e-05, "loss": 0.5157, "step": 4013 }, { "epoch": 0.330302406912158, "grad_norm": 2.5275586558015863, "learning_rate": 1.563178481293977e-05, "loss": 0.8039, "step": 4014 }, { "epoch": 0.330384694507303, "grad_norm": 2.2740611908051758, "learning_rate": 1.5629582179018097e-05, "loss": 0.7724, "step": 4015 }, { "epoch": 0.33046698210244807, "grad_norm": 2.356752663446273, "learning_rate": 1.5627379145179907e-05, "loss": 0.7805, "step": 4016 }, { "epoch": 0.3305492696975931, "grad_norm": 2.0804581335562267, "learning_rate": 1.5625175711581702e-05, "loss": 0.7573, "step": 4017 }, { "epoch": 0.33063155729273813, "grad_norm": 1.9908887619880642, "learning_rate": 1.5622971878380014e-05, "loss": 0.791, "step": 4018 }, { "epoch": 0.33071384488788313, "grad_norm": 2.1469774019491155, "learning_rate": 1.5620767645731394e-05, "loss": 0.7831, "step": 4019 }, { "epoch": 0.3307961324830282, "grad_norm": 1.8937606278549608, "learning_rate": 1.5618563013792426e-05, "loss": 0.7737, "step": 4020 }, { "epoch": 0.3308784200781732, "grad_norm": 2.5642012322205394, "learning_rate": 1.5616357982719732e-05, "loss": 0.7675, "step": 4021 }, { "epoch": 0.33096070767331826, "grad_norm": 0.4356450645178253, "learning_rate": 1.561415255266995e-05, "loss": 0.5228, "step": 4022 }, { "epoch": 0.33104299526846326, "grad_norm": 2.574916998095043, "learning_rate": 1.5611946723799745e-05, "loss": 0.778, "step": 4023 }, { "epoch": 0.3311252828636083, "grad_norm": 2.7688836825179655, "learning_rate": 1.560974049626582e-05, "loss": 0.8046, "step": 4024 }, { "epoch": 0.3312075704587533, "grad_norm": 2.171637815428403, "learning_rate": 1.5607533870224905e-05, "loss": 0.7726, "step": 4025 }, { "epoch": 0.3312898580538984, "grad_norm": 2.6121128273222545, "learning_rate": 1.5605326845833747e-05, "loss": 0.7622, "step": 4026 }, { "epoch": 0.3313721456490434, "grad_norm": 3.8572921705382037, "learning_rate": 1.5603119423249138e-05, "loss": 0.7663, "step": 4027 }, { "epoch": 0.33145443324418844, "grad_norm": 2.4190187644468533, "learning_rate": 1.5600911602627887e-05, "loss": 0.7833, "step": 4028 }, { "epoch": 0.33153672083933344, "grad_norm": 2.2438722540135903, "learning_rate": 1.559870338412683e-05, "loss": 0.8005, "step": 4029 }, { "epoch": 0.3316190084344785, "grad_norm": 2.0470890034029168, "learning_rate": 1.559649476790284e-05, "loss": 0.7796, "step": 4030 }, { "epoch": 0.33170129602962356, "grad_norm": 2.306122681003319, "learning_rate": 1.5594285754112813e-05, "loss": 0.7822, "step": 4031 }, { "epoch": 0.33178358362476856, "grad_norm": 2.258462595107799, "learning_rate": 1.559207634291367e-05, "loss": 0.7889, "step": 4032 }, { "epoch": 0.3318658712199136, "grad_norm": 2.4594697996504116, "learning_rate": 1.558986653446237e-05, "loss": 0.7824, "step": 4033 }, { "epoch": 0.3319481588150586, "grad_norm": 2.5711750789918475, "learning_rate": 1.5587656328915886e-05, "loss": 0.7918, "step": 4034 }, { "epoch": 0.3320304464102037, "grad_norm": 0.4433676817708056, "learning_rate": 1.5585445726431235e-05, "loss": 0.509, "step": 4035 }, { "epoch": 0.3321127340053487, "grad_norm": 2.283865584285344, "learning_rate": 1.5583234727165456e-05, "loss": 0.7443, "step": 4036 }, { "epoch": 0.33219502160049375, "grad_norm": 0.4101973792108834, "learning_rate": 1.5581023331275607e-05, "loss": 0.4913, "step": 4037 }, { "epoch": 0.33227730919563875, "grad_norm": 3.581975059566234, "learning_rate": 1.5578811538918788e-05, "loss": 0.764, "step": 4038 }, { "epoch": 0.3323595967907838, "grad_norm": 2.061623283699238, "learning_rate": 1.5576599350252118e-05, "loss": 0.8047, "step": 4039 }, { "epoch": 0.3324418843859288, "grad_norm": 2.5185115205311805, "learning_rate": 1.5574386765432747e-05, "loss": 0.7757, "step": 4040 }, { "epoch": 0.33252417198107387, "grad_norm": 0.4373356793724688, "learning_rate": 1.557217378461786e-05, "loss": 0.5516, "step": 4041 }, { "epoch": 0.33260645957621887, "grad_norm": 0.4175419290847868, "learning_rate": 1.5569960407964656e-05, "loss": 0.5028, "step": 4042 }, { "epoch": 0.33268874717136393, "grad_norm": 0.4193817285666343, "learning_rate": 1.556774663563037e-05, "loss": 0.5275, "step": 4043 }, { "epoch": 0.33277103476650893, "grad_norm": 2.7178153363796858, "learning_rate": 1.556553246777227e-05, "loss": 0.8058, "step": 4044 }, { "epoch": 0.332853322361654, "grad_norm": 2.3231797903294433, "learning_rate": 1.5563317904547647e-05, "loss": 0.8015, "step": 4045 }, { "epoch": 0.332935609956799, "grad_norm": 2.3013934131802776, "learning_rate": 1.556110294611381e-05, "loss": 0.7931, "step": 4046 }, { "epoch": 0.33301789755194405, "grad_norm": 0.41864141680826006, "learning_rate": 1.5558887592628118e-05, "loss": 0.5046, "step": 4047 }, { "epoch": 0.33310018514708906, "grad_norm": 2.063054176010626, "learning_rate": 1.555667184424794e-05, "loss": 0.8141, "step": 4048 }, { "epoch": 0.3331824727422341, "grad_norm": 0.43526008271738104, "learning_rate": 1.555445570113068e-05, "loss": 0.5364, "step": 4049 }, { "epoch": 0.3332647603373791, "grad_norm": 2.5772664816408586, "learning_rate": 1.5552239163433774e-05, "loss": 0.7868, "step": 4050 }, { "epoch": 0.3333470479325242, "grad_norm": 0.46894680139071393, "learning_rate": 1.5550022231314678e-05, "loss": 0.5729, "step": 4051 }, { "epoch": 0.3334293355276692, "grad_norm": 1.9956817792375223, "learning_rate": 1.5547804904930873e-05, "loss": 0.7839, "step": 4052 }, { "epoch": 0.33351162312281424, "grad_norm": 2.21909870705443, "learning_rate": 1.5545587184439883e-05, "loss": 0.769, "step": 4053 }, { "epoch": 0.33359391071795924, "grad_norm": 2.037384083154615, "learning_rate": 1.554336906999925e-05, "loss": 0.7825, "step": 4054 }, { "epoch": 0.3336761983131043, "grad_norm": 0.436862139649715, "learning_rate": 1.554115056176654e-05, "loss": 0.537, "step": 4055 }, { "epoch": 0.33375848590824936, "grad_norm": 2.414719241435678, "learning_rate": 1.5538931659899357e-05, "loss": 0.7639, "step": 4056 }, { "epoch": 0.33384077350339436, "grad_norm": 3.080897428013041, "learning_rate": 1.553671236455533e-05, "loss": 0.8175, "step": 4057 }, { "epoch": 0.3339230610985394, "grad_norm": 3.6067928304673678, "learning_rate": 1.553449267589211e-05, "loss": 0.7767, "step": 4058 }, { "epoch": 0.3340053486936844, "grad_norm": 2.2683890374702798, "learning_rate": 1.5532272594067378e-05, "loss": 0.788, "step": 4059 }, { "epoch": 0.3340876362888295, "grad_norm": 4.657508972032548, "learning_rate": 1.5530052119238848e-05, "loss": 0.7629, "step": 4060 }, { "epoch": 0.3341699238839745, "grad_norm": 2.6649940020873686, "learning_rate": 1.5527831251564264e-05, "loss": 0.8005, "step": 4061 }, { "epoch": 0.33425221147911954, "grad_norm": 2.5930973491609897, "learning_rate": 1.5525609991201384e-05, "loss": 0.7978, "step": 4062 }, { "epoch": 0.33433449907426455, "grad_norm": 0.4321773872749387, "learning_rate": 1.5523388338308014e-05, "loss": 0.5579, "step": 4063 }, { "epoch": 0.3344167866694096, "grad_norm": 2.4268824148494375, "learning_rate": 1.552116629304196e-05, "loss": 0.8035, "step": 4064 }, { "epoch": 0.3344990742645546, "grad_norm": 2.2973387188444834, "learning_rate": 1.551894385556109e-05, "loss": 0.7847, "step": 4065 }, { "epoch": 0.33458136185969967, "grad_norm": 2.729428390786781, "learning_rate": 1.5516721026023272e-05, "loss": 0.82, "step": 4066 }, { "epoch": 0.33466364945484467, "grad_norm": 2.3680876747481876, "learning_rate": 1.5514497804586416e-05, "loss": 0.7786, "step": 4067 }, { "epoch": 0.33474593704998973, "grad_norm": 2.1300210233336654, "learning_rate": 1.5512274191408456e-05, "loss": 0.7309, "step": 4068 }, { "epoch": 0.33482822464513473, "grad_norm": 3.2729300200283262, "learning_rate": 1.551005018664735e-05, "loss": 0.7738, "step": 4069 }, { "epoch": 0.3349105122402798, "grad_norm": 1.9704989368246766, "learning_rate": 1.5507825790461093e-05, "loss": 0.7596, "step": 4070 }, { "epoch": 0.3349927998354248, "grad_norm": 0.4365947099302801, "learning_rate": 1.55056010030077e-05, "loss": 0.5131, "step": 4071 }, { "epoch": 0.33507508743056985, "grad_norm": 2.0074999190203124, "learning_rate": 1.5503375824445218e-05, "loss": 0.786, "step": 4072 }, { "epoch": 0.33515737502571485, "grad_norm": 1.748780315236836, "learning_rate": 1.5501150254931716e-05, "loss": 0.7874, "step": 4073 }, { "epoch": 0.3352396626208599, "grad_norm": 2.4331820741114027, "learning_rate": 1.54989242946253e-05, "loss": 0.7733, "step": 4074 }, { "epoch": 0.3353219502160049, "grad_norm": 1.907826778457189, "learning_rate": 1.5496697943684094e-05, "loss": 0.7798, "step": 4075 }, { "epoch": 0.33540423781115, "grad_norm": 1.8091432034316774, "learning_rate": 1.549447120226626e-05, "loss": 0.779, "step": 4076 }, { "epoch": 0.335486525406295, "grad_norm": 2.1216937869082813, "learning_rate": 1.5492244070529975e-05, "loss": 0.8008, "step": 4077 }, { "epoch": 0.33556881300144004, "grad_norm": 0.4665356572253128, "learning_rate": 1.5490016548633455e-05, "loss": 0.5399, "step": 4078 }, { "epoch": 0.33565110059658504, "grad_norm": 2.0588554112755584, "learning_rate": 1.5487788636734943e-05, "loss": 0.7858, "step": 4079 }, { "epoch": 0.3357333881917301, "grad_norm": 3.1048131482979073, "learning_rate": 1.54855603349927e-05, "loss": 0.7484, "step": 4080 }, { "epoch": 0.33581567578687516, "grad_norm": 1.9977468749376908, "learning_rate": 1.548333164356502e-05, "loss": 0.7747, "step": 4081 }, { "epoch": 0.33589796338202016, "grad_norm": 1.9507568482794677, "learning_rate": 1.5481102562610236e-05, "loss": 0.7856, "step": 4082 }, { "epoch": 0.3359802509771652, "grad_norm": 2.4700140913823962, "learning_rate": 1.5478873092286694e-05, "loss": 0.8011, "step": 4083 }, { "epoch": 0.3360625385723102, "grad_norm": 2.2523640725668215, "learning_rate": 1.5476643232752763e-05, "loss": 0.7944, "step": 4084 }, { "epoch": 0.3361448261674553, "grad_norm": 2.17172088234952, "learning_rate": 1.5474412984166858e-05, "loss": 0.7984, "step": 4085 }, { "epoch": 0.3362271137626003, "grad_norm": 1.8796567105026312, "learning_rate": 1.547218234668741e-05, "loss": 0.7782, "step": 4086 }, { "epoch": 0.33630940135774534, "grad_norm": 1.7950177915791392, "learning_rate": 1.5469951320472874e-05, "loss": 0.7964, "step": 4087 }, { "epoch": 0.33639168895289034, "grad_norm": 1.7626809722482157, "learning_rate": 1.5467719905681752e-05, "loss": 0.7889, "step": 4088 }, { "epoch": 0.3364739765480354, "grad_norm": 1.991539753326125, "learning_rate": 1.546548810247255e-05, "loss": 0.79, "step": 4089 }, { "epoch": 0.3365562641431804, "grad_norm": 2.115231252527963, "learning_rate": 1.5463255911003808e-05, "loss": 0.7908, "step": 4090 }, { "epoch": 0.33663855173832546, "grad_norm": 1.6321369721550663, "learning_rate": 1.5461023331434112e-05, "loss": 0.7894, "step": 4091 }, { "epoch": 0.33672083933347047, "grad_norm": 2.1896541961345846, "learning_rate": 1.545879036392205e-05, "loss": 0.7944, "step": 4092 }, { "epoch": 0.3368031269286155, "grad_norm": 0.5113893318754602, "learning_rate": 1.5456557008626244e-05, "loss": 0.5442, "step": 4093 }, { "epoch": 0.33688541452376053, "grad_norm": 1.7351416694095272, "learning_rate": 1.545432326570536e-05, "loss": 0.777, "step": 4094 }, { "epoch": 0.3369677021189056, "grad_norm": 2.230496217825373, "learning_rate": 1.5452089135318074e-05, "loss": 0.7605, "step": 4095 }, { "epoch": 0.3370499897140506, "grad_norm": 1.7918268706931566, "learning_rate": 1.5449854617623096e-05, "loss": 0.8035, "step": 4096 }, { "epoch": 0.33713227730919565, "grad_norm": 2.2531298457396263, "learning_rate": 1.544761971277916e-05, "loss": 0.7871, "step": 4097 }, { "epoch": 0.33721456490434065, "grad_norm": 2.386839052979061, "learning_rate": 1.544538442094503e-05, "loss": 0.8035, "step": 4098 }, { "epoch": 0.3372968524994857, "grad_norm": 2.2183148555618057, "learning_rate": 1.5443148742279504e-05, "loss": 0.7676, "step": 4099 }, { "epoch": 0.3373791400946307, "grad_norm": 1.6436385462158203, "learning_rate": 1.5440912676941392e-05, "loss": 0.7738, "step": 4100 }, { "epoch": 0.3374614276897758, "grad_norm": 0.44570601705566626, "learning_rate": 1.543867622508955e-05, "loss": 0.543, "step": 4101 }, { "epoch": 0.3375437152849208, "grad_norm": 2.2551194306581515, "learning_rate": 1.543643938688284e-05, "loss": 0.815, "step": 4102 }, { "epoch": 0.33762600288006583, "grad_norm": 1.9418876796336597, "learning_rate": 1.5434202162480175e-05, "loss": 0.7817, "step": 4103 }, { "epoch": 0.33770829047521084, "grad_norm": 2.0876894686853817, "learning_rate": 1.5431964552040478e-05, "loss": 0.7784, "step": 4104 }, { "epoch": 0.3377905780703559, "grad_norm": 1.987353387707695, "learning_rate": 1.5429726555722708e-05, "loss": 0.8066, "step": 4105 }, { "epoch": 0.3378728656655009, "grad_norm": 3.259218369318484, "learning_rate": 1.5427488173685842e-05, "loss": 0.7775, "step": 4106 }, { "epoch": 0.33795515326064596, "grad_norm": 1.8533066136922023, "learning_rate": 1.54252494060889e-05, "loss": 0.7956, "step": 4107 }, { "epoch": 0.338037440855791, "grad_norm": 1.9536184886449355, "learning_rate": 1.542301025309092e-05, "loss": 0.8026, "step": 4108 }, { "epoch": 0.338119728450936, "grad_norm": 1.8613583097264637, "learning_rate": 1.5420770714850956e-05, "loss": 0.7763, "step": 4109 }, { "epoch": 0.3382020160460811, "grad_norm": 0.42751450227861165, "learning_rate": 1.5418530791528115e-05, "loss": 0.5328, "step": 4110 }, { "epoch": 0.3382843036412261, "grad_norm": 1.7004766564081137, "learning_rate": 1.5416290483281512e-05, "loss": 0.7659, "step": 4111 }, { "epoch": 0.33836659123637114, "grad_norm": 2.8240946495342465, "learning_rate": 1.5414049790270294e-05, "loss": 0.7836, "step": 4112 }, { "epoch": 0.33844887883151614, "grad_norm": 1.8021331146733643, "learning_rate": 1.541180871265364e-05, "loss": 0.8015, "step": 4113 }, { "epoch": 0.3385311664266612, "grad_norm": 3.449214192179231, "learning_rate": 1.5409567250590746e-05, "loss": 0.781, "step": 4114 }, { "epoch": 0.3386134540218062, "grad_norm": 0.4383606675842084, "learning_rate": 1.540732540424085e-05, "loss": 0.5093, "step": 4115 }, { "epoch": 0.33869574161695126, "grad_norm": 1.9850473052707085, "learning_rate": 1.54050831737632e-05, "loss": 0.7694, "step": 4116 }, { "epoch": 0.33877802921209627, "grad_norm": 1.9317148764512087, "learning_rate": 1.540284055931709e-05, "loss": 0.7903, "step": 4117 }, { "epoch": 0.3388603168072413, "grad_norm": 1.6480666719876866, "learning_rate": 1.5400597561061825e-05, "loss": 0.7929, "step": 4118 }, { "epoch": 0.3389426044023863, "grad_norm": 0.4286688284073161, "learning_rate": 1.5398354179156747e-05, "loss": 0.5475, "step": 4119 }, { "epoch": 0.3390248919975314, "grad_norm": 0.4232030691579948, "learning_rate": 1.539611041376122e-05, "loss": 0.5282, "step": 4120 }, { "epoch": 0.3391071795926764, "grad_norm": 2.188095931719702, "learning_rate": 1.539386626503464e-05, "loss": 0.8267, "step": 4121 }, { "epoch": 0.33918946718782145, "grad_norm": 2.2336043337964013, "learning_rate": 1.539162173313643e-05, "loss": 0.7968, "step": 4122 }, { "epoch": 0.33927175478296645, "grad_norm": 1.7327144878573497, "learning_rate": 1.538937681822603e-05, "loss": 0.7926, "step": 4123 }, { "epoch": 0.3393540423781115, "grad_norm": 0.4099620413261847, "learning_rate": 1.538713152046292e-05, "loss": 0.518, "step": 4124 }, { "epoch": 0.3394363299732565, "grad_norm": 1.7009331434739732, "learning_rate": 1.5384885840006604e-05, "loss": 0.8089, "step": 4125 }, { "epoch": 0.33951861756840157, "grad_norm": 1.5319943743654452, "learning_rate": 1.538263977701661e-05, "loss": 0.7932, "step": 4126 }, { "epoch": 0.3396009051635466, "grad_norm": 0.42235284493781433, "learning_rate": 1.5380393331652495e-05, "loss": 0.5097, "step": 4127 }, { "epoch": 0.33968319275869163, "grad_norm": 11.1615298912984, "learning_rate": 1.537814650407384e-05, "loss": 0.7968, "step": 4128 }, { "epoch": 0.33976548035383664, "grad_norm": 1.7586320233591177, "learning_rate": 1.537589929444026e-05, "loss": 0.7995, "step": 4129 }, { "epoch": 0.3398477679489817, "grad_norm": 1.5528213595126805, "learning_rate": 1.5373651702911393e-05, "loss": 0.7535, "step": 4130 }, { "epoch": 0.3399300555441267, "grad_norm": 0.4485178263335829, "learning_rate": 1.5371403729646905e-05, "loss": 0.5046, "step": 4131 }, { "epoch": 0.34001234313927176, "grad_norm": 1.7964144593662237, "learning_rate": 1.536915537480648e-05, "loss": 0.7851, "step": 4132 }, { "epoch": 0.3400946307344168, "grad_norm": 1.8147768900972738, "learning_rate": 1.5366906638549845e-05, "loss": 0.7978, "step": 4133 }, { "epoch": 0.3401769183295618, "grad_norm": 3.348717434702876, "learning_rate": 1.5364657521036747e-05, "loss": 0.7832, "step": 4134 }, { "epoch": 0.3402592059247069, "grad_norm": 1.7615874202683148, "learning_rate": 1.5362408022426958e-05, "loss": 0.8017, "step": 4135 }, { "epoch": 0.3403414935198519, "grad_norm": 2.0426562833962247, "learning_rate": 1.536015814288028e-05, "loss": 0.7941, "step": 4136 }, { "epoch": 0.34042378111499694, "grad_norm": 1.5361881219172584, "learning_rate": 1.5357907882556537e-05, "loss": 0.8022, "step": 4137 }, { "epoch": 0.34050606871014194, "grad_norm": 3.0995959121715253, "learning_rate": 1.5355657241615588e-05, "loss": 0.7737, "step": 4138 }, { "epoch": 0.340588356305287, "grad_norm": 2.588606371301522, "learning_rate": 1.535340622021732e-05, "loss": 0.793, "step": 4139 }, { "epoch": 0.340670643900432, "grad_norm": 1.8902237364663586, "learning_rate": 1.5351154818521626e-05, "loss": 0.7944, "step": 4140 }, { "epoch": 0.34075293149557706, "grad_norm": 1.7204007865686732, "learning_rate": 1.5348903036688456e-05, "loss": 0.7956, "step": 4141 }, { "epoch": 0.34083521909072206, "grad_norm": 1.9999028390813671, "learning_rate": 1.534665087487777e-05, "loss": 0.7936, "step": 4142 }, { "epoch": 0.3409175066858671, "grad_norm": 1.8644536096936406, "learning_rate": 1.5344398333249554e-05, "loss": 0.7921, "step": 4143 }, { "epoch": 0.3409997942810121, "grad_norm": 1.7722552179841173, "learning_rate": 1.534214541196383e-05, "loss": 0.794, "step": 4144 }, { "epoch": 0.3410820818761572, "grad_norm": 1.6203041917519163, "learning_rate": 1.5339892111180637e-05, "loss": 0.7749, "step": 4145 }, { "epoch": 0.3411643694713022, "grad_norm": 1.7129423869701084, "learning_rate": 1.533763843106005e-05, "loss": 0.796, "step": 4146 }, { "epoch": 0.34124665706644725, "grad_norm": 1.751627667300706, "learning_rate": 1.5335384371762163e-05, "loss": 0.7904, "step": 4147 }, { "epoch": 0.34132894466159225, "grad_norm": 1.693787450935078, "learning_rate": 1.5333129933447103e-05, "loss": 0.7835, "step": 4148 }, { "epoch": 0.3414112322567373, "grad_norm": 1.849766722664072, "learning_rate": 1.5330875116275022e-05, "loss": 0.7825, "step": 4149 }, { "epoch": 0.3414935198518823, "grad_norm": 2.335996008891753, "learning_rate": 1.5328619920406102e-05, "loss": 0.784, "step": 4150 }, { "epoch": 0.34157580744702737, "grad_norm": 0.44901144941744836, "learning_rate": 1.532636434600054e-05, "loss": 0.5513, "step": 4151 }, { "epoch": 0.34165809504217237, "grad_norm": 1.7605434715758692, "learning_rate": 1.5324108393218576e-05, "loss": 0.7851, "step": 4152 }, { "epoch": 0.34174038263731743, "grad_norm": 1.9583447241776644, "learning_rate": 1.5321852062220467e-05, "loss": 0.817, "step": 4153 }, { "epoch": 0.34182267023246243, "grad_norm": 1.9555340733009015, "learning_rate": 1.5319595353166496e-05, "loss": 0.7928, "step": 4154 }, { "epoch": 0.3419049578276075, "grad_norm": 1.6234717616083876, "learning_rate": 1.531733826621698e-05, "loss": 0.77, "step": 4155 }, { "epoch": 0.3419872454227525, "grad_norm": 0.4456752890716857, "learning_rate": 1.5315080801532255e-05, "loss": 0.5504, "step": 4156 }, { "epoch": 0.34206953301789755, "grad_norm": 0.44365575104674465, "learning_rate": 1.531282295927269e-05, "loss": 0.5112, "step": 4157 }, { "epoch": 0.34215182061304256, "grad_norm": 1.947857220556244, "learning_rate": 1.531056473959868e-05, "loss": 0.7754, "step": 4158 }, { "epoch": 0.3422341082081876, "grad_norm": 2.12391634258539, "learning_rate": 1.530830614267065e-05, "loss": 0.7968, "step": 4159 }, { "epoch": 0.3423163958033327, "grad_norm": 1.750950386734388, "learning_rate": 1.530604716864903e-05, "loss": 0.7881, "step": 4160 }, { "epoch": 0.3423986833984777, "grad_norm": 1.9947279914057716, "learning_rate": 1.530378781769431e-05, "loss": 0.7887, "step": 4161 }, { "epoch": 0.34248097099362274, "grad_norm": 0.45005457810175087, "learning_rate": 1.5301528089966987e-05, "loss": 0.5424, "step": 4162 }, { "epoch": 0.34256325858876774, "grad_norm": 2.2805189849580034, "learning_rate": 1.529926798562759e-05, "loss": 0.7734, "step": 4163 }, { "epoch": 0.3426455461839128, "grad_norm": 1.5993081613216364, "learning_rate": 1.529700750483666e-05, "loss": 0.7817, "step": 4164 }, { "epoch": 0.3427278337790578, "grad_norm": 1.880393324380173, "learning_rate": 1.5294746647754796e-05, "loss": 0.791, "step": 4165 }, { "epoch": 0.34281012137420286, "grad_norm": 2.2495773107601904, "learning_rate": 1.52924854145426e-05, "loss": 0.7674, "step": 4166 }, { "epoch": 0.34289240896934786, "grad_norm": 1.6974789508594508, "learning_rate": 1.52902238053607e-05, "loss": 0.8326, "step": 4167 }, { "epoch": 0.3429746965644929, "grad_norm": 0.4563080678296591, "learning_rate": 1.528796182036976e-05, "loss": 0.5343, "step": 4168 }, { "epoch": 0.3430569841596379, "grad_norm": 1.556776041063212, "learning_rate": 1.528569945973047e-05, "loss": 0.8141, "step": 4169 }, { "epoch": 0.343139271754783, "grad_norm": 1.5065087672450441, "learning_rate": 1.5283436723603545e-05, "loss": 0.7871, "step": 4170 }, { "epoch": 0.343221559349928, "grad_norm": 1.4917252771642058, "learning_rate": 1.5281173612149723e-05, "loss": 0.7942, "step": 4171 }, { "epoch": 0.34330384694507304, "grad_norm": 1.7600396605299424, "learning_rate": 1.5278910125529776e-05, "loss": 0.7875, "step": 4172 }, { "epoch": 0.34338613454021805, "grad_norm": 0.4718144474080084, "learning_rate": 1.5276646263904493e-05, "loss": 0.528, "step": 4173 }, { "epoch": 0.3434684221353631, "grad_norm": 1.7921169307664222, "learning_rate": 1.52743820274347e-05, "loss": 0.7934, "step": 4174 }, { "epoch": 0.3435507097305081, "grad_norm": 1.906441376187714, "learning_rate": 1.5272117416281242e-05, "loss": 0.8022, "step": 4175 }, { "epoch": 0.34363299732565317, "grad_norm": 0.4163744964764687, "learning_rate": 1.5269852430604997e-05, "loss": 0.4966, "step": 4176 }, { "epoch": 0.34371528492079817, "grad_norm": 1.8296072306072015, "learning_rate": 1.5267587070566864e-05, "loss": 0.8, "step": 4177 }, { "epoch": 0.34379757251594323, "grad_norm": 1.674607986023617, "learning_rate": 1.5265321336327766e-05, "loss": 0.7773, "step": 4178 }, { "epoch": 0.34387986011108823, "grad_norm": 1.6711256431021402, "learning_rate": 1.526305522804866e-05, "loss": 0.7988, "step": 4179 }, { "epoch": 0.3439621477062333, "grad_norm": 1.6731906814384143, "learning_rate": 1.526078874589053e-05, "loss": 0.7912, "step": 4180 }, { "epoch": 0.3440444353013783, "grad_norm": 1.5370806580936933, "learning_rate": 1.5258521890014381e-05, "loss": 0.7959, "step": 4181 }, { "epoch": 0.34412672289652335, "grad_norm": 1.7509253626808505, "learning_rate": 1.5256254660581247e-05, "loss": 0.789, "step": 4182 }, { "epoch": 0.34420901049166835, "grad_norm": 1.6598945176515831, "learning_rate": 1.5253987057752186e-05, "loss": 0.806, "step": 4183 }, { "epoch": 0.3442912980868134, "grad_norm": 1.9159792462949143, "learning_rate": 1.5251719081688288e-05, "loss": 0.8085, "step": 4184 }, { "epoch": 0.34437358568195847, "grad_norm": 1.7556028330987612, "learning_rate": 1.5249450732550668e-05, "loss": 0.787, "step": 4185 }, { "epoch": 0.3444558732771035, "grad_norm": 2.318757101763348, "learning_rate": 1.5247182010500458e-05, "loss": 0.7872, "step": 4186 }, { "epoch": 0.34453816087224853, "grad_norm": 3.031324644798477, "learning_rate": 1.5244912915698833e-05, "loss": 0.8013, "step": 4187 }, { "epoch": 0.34462044846739354, "grad_norm": 2.0434488021905586, "learning_rate": 1.5242643448306981e-05, "loss": 0.8163, "step": 4188 }, { "epoch": 0.3447027360625386, "grad_norm": 1.6799979144229933, "learning_rate": 1.5240373608486123e-05, "loss": 0.7715, "step": 4189 }, { "epoch": 0.3447850236576836, "grad_norm": 0.4467407100216385, "learning_rate": 1.5238103396397505e-05, "loss": 0.5467, "step": 4190 }, { "epoch": 0.34486731125282866, "grad_norm": 1.9693649433882872, "learning_rate": 1.52358328122024e-05, "loss": 0.7841, "step": 4191 }, { "epoch": 0.34494959884797366, "grad_norm": 1.2915651169899338, "learning_rate": 1.5233561856062104e-05, "loss": 0.8025, "step": 4192 }, { "epoch": 0.3450318864431187, "grad_norm": 1.7854079753024872, "learning_rate": 1.5231290528137943e-05, "loss": 0.7722, "step": 4193 }, { "epoch": 0.3451141740382637, "grad_norm": 1.6367298314154521, "learning_rate": 1.5229018828591273e-05, "loss": 0.8148, "step": 4194 }, { "epoch": 0.3451964616334088, "grad_norm": 1.7294806681563275, "learning_rate": 1.5226746757583465e-05, "loss": 0.7904, "step": 4195 }, { "epoch": 0.3452787492285538, "grad_norm": 0.4373012276375105, "learning_rate": 1.5224474315275926e-05, "loss": 0.54, "step": 4196 }, { "epoch": 0.34536103682369884, "grad_norm": 1.5022173094188904, "learning_rate": 1.5222201501830088e-05, "loss": 0.7818, "step": 4197 }, { "epoch": 0.34544332441884384, "grad_norm": 1.7240451693707013, "learning_rate": 1.5219928317407404e-05, "loss": 0.7736, "step": 4198 }, { "epoch": 0.3455256120139889, "grad_norm": 2.205567965000888, "learning_rate": 1.5217654762169364e-05, "loss": 0.7661, "step": 4199 }, { "epoch": 0.3456078996091339, "grad_norm": 1.586922527517016, "learning_rate": 1.5215380836277474e-05, "loss": 0.7783, "step": 4200 }, { "epoch": 0.34569018720427896, "grad_norm": 1.7853572284210082, "learning_rate": 1.521310653989327e-05, "loss": 0.8258, "step": 4201 }, { "epoch": 0.34577247479942397, "grad_norm": 1.4672545450198033, "learning_rate": 1.5210831873178311e-05, "loss": 0.7834, "step": 4202 }, { "epoch": 0.345854762394569, "grad_norm": 1.5690386844013842, "learning_rate": 1.5208556836294192e-05, "loss": 0.7796, "step": 4203 }, { "epoch": 0.34593704998971403, "grad_norm": 0.4192692613470579, "learning_rate": 1.5206281429402524e-05, "loss": 0.5253, "step": 4204 }, { "epoch": 0.3460193375848591, "grad_norm": 1.5149887373516677, "learning_rate": 1.520400565266495e-05, "loss": 0.8028, "step": 4205 }, { "epoch": 0.3461016251800041, "grad_norm": 1.453553109481577, "learning_rate": 1.520172950624314e-05, "loss": 0.7909, "step": 4206 }, { "epoch": 0.34618391277514915, "grad_norm": 0.4297334154612609, "learning_rate": 1.5199452990298781e-05, "loss": 0.5441, "step": 4207 }, { "epoch": 0.34626620037029415, "grad_norm": 1.6152493810058053, "learning_rate": 1.5197176104993598e-05, "loss": 0.7648, "step": 4208 }, { "epoch": 0.3463484879654392, "grad_norm": 1.4384567755187307, "learning_rate": 1.5194898850489338e-05, "loss": 0.8243, "step": 4209 }, { "epoch": 0.3464307755605842, "grad_norm": 1.9412054957510942, "learning_rate": 1.519262122694777e-05, "loss": 0.7981, "step": 4210 }, { "epoch": 0.3465130631557293, "grad_norm": 1.6101767213118356, "learning_rate": 1.5190343234530694e-05, "loss": 0.766, "step": 4211 }, { "epoch": 0.34659535075087433, "grad_norm": 3.3062058052946153, "learning_rate": 1.5188064873399935e-05, "loss": 0.769, "step": 4212 }, { "epoch": 0.34667763834601933, "grad_norm": 0.42009599275022463, "learning_rate": 1.5185786143717347e-05, "loss": 0.5446, "step": 4213 }, { "epoch": 0.3467599259411644, "grad_norm": 1.4677687844466423, "learning_rate": 1.51835070456448e-05, "loss": 0.7933, "step": 4214 }, { "epoch": 0.3468422135363094, "grad_norm": 1.493217503475276, "learning_rate": 1.5181227579344207e-05, "loss": 0.7651, "step": 4215 }, { "epoch": 0.34692450113145445, "grad_norm": 1.74764198103528, "learning_rate": 1.5178947744977493e-05, "loss": 0.7622, "step": 4216 }, { "epoch": 0.34700678872659946, "grad_norm": 1.862412917914253, "learning_rate": 1.5176667542706611e-05, "loss": 0.8079, "step": 4217 }, { "epoch": 0.3470890763217445, "grad_norm": 1.758942491124323, "learning_rate": 1.5174386972693546e-05, "loss": 0.7467, "step": 4218 }, { "epoch": 0.3471713639168895, "grad_norm": 1.5665368455310102, "learning_rate": 1.5172106035100305e-05, "loss": 0.8025, "step": 4219 }, { "epoch": 0.3472536515120346, "grad_norm": 1.711947014681679, "learning_rate": 1.5169824730088926e-05, "loss": 0.7928, "step": 4220 }, { "epoch": 0.3473359391071796, "grad_norm": 2.0085808038513737, "learning_rate": 1.5167543057821463e-05, "loss": 0.7835, "step": 4221 }, { "epoch": 0.34741822670232464, "grad_norm": 1.7430833856993977, "learning_rate": 1.5165261018460004e-05, "loss": 0.7675, "step": 4222 }, { "epoch": 0.34750051429746964, "grad_norm": 1.845518885363521, "learning_rate": 1.5162978612166668e-05, "loss": 0.7779, "step": 4223 }, { "epoch": 0.3475828018926147, "grad_norm": 2.7130830798869385, "learning_rate": 1.5160695839103587e-05, "loss": 0.7705, "step": 4224 }, { "epoch": 0.3476650894877597, "grad_norm": 1.9547052151210038, "learning_rate": 1.5158412699432923e-05, "loss": 0.783, "step": 4225 }, { "epoch": 0.34774737708290476, "grad_norm": 2.5145943348093507, "learning_rate": 1.5156129193316876e-05, "loss": 0.782, "step": 4226 }, { "epoch": 0.34782966467804977, "grad_norm": 1.65257979402394, "learning_rate": 1.5153845320917653e-05, "loss": 0.8015, "step": 4227 }, { "epoch": 0.3479119522731948, "grad_norm": 1.92403730012545, "learning_rate": 1.51515610823975e-05, "loss": 0.8318, "step": 4228 }, { "epoch": 0.3479942398683398, "grad_norm": 2.394929975442086, "learning_rate": 1.5149276477918691e-05, "loss": 0.8012, "step": 4229 }, { "epoch": 0.3480765274634849, "grad_norm": 2.7461350080669233, "learning_rate": 1.5146991507643514e-05, "loss": 0.8125, "step": 4230 }, { "epoch": 0.3481588150586299, "grad_norm": 1.8120751425621948, "learning_rate": 1.5144706171734289e-05, "loss": 0.7726, "step": 4231 }, { "epoch": 0.34824110265377495, "grad_norm": 1.880205900342263, "learning_rate": 1.514242047035337e-05, "loss": 0.7893, "step": 4232 }, { "epoch": 0.34832339024891995, "grad_norm": 0.4647853530055138, "learning_rate": 1.5140134403663123e-05, "loss": 0.5387, "step": 4233 }, { "epoch": 0.348405677844065, "grad_norm": 1.856891163678968, "learning_rate": 1.5137847971825945e-05, "loss": 0.7911, "step": 4234 }, { "epoch": 0.34848796543921, "grad_norm": 2.5751394290321663, "learning_rate": 1.5135561175004267e-05, "loss": 0.802, "step": 4235 }, { "epoch": 0.34857025303435507, "grad_norm": 2.10679008451569, "learning_rate": 1.5133274013360537e-05, "loss": 0.7561, "step": 4236 }, { "epoch": 0.34865254062950013, "grad_norm": 6.051687839873176, "learning_rate": 1.513098648705723e-05, "loss": 0.7989, "step": 4237 }, { "epoch": 0.34873482822464513, "grad_norm": 1.8938964041180257, "learning_rate": 1.5128698596256848e-05, "loss": 0.7753, "step": 4238 }, { "epoch": 0.3488171158197902, "grad_norm": 0.43663972439972254, "learning_rate": 1.5126410341121918e-05, "loss": 0.5499, "step": 4239 }, { "epoch": 0.3488994034149352, "grad_norm": 1.7524512839146058, "learning_rate": 1.5124121721814997e-05, "loss": 0.7988, "step": 4240 }, { "epoch": 0.34898169101008025, "grad_norm": 2.147678489537462, "learning_rate": 1.5121832738498668e-05, "loss": 0.8009, "step": 4241 }, { "epoch": 0.34906397860522526, "grad_norm": 1.9916009072361731, "learning_rate": 1.5119543391335528e-05, "loss": 0.8214, "step": 4242 }, { "epoch": 0.3491462662003703, "grad_norm": 2.085216813138033, "learning_rate": 1.5117253680488213e-05, "loss": 0.7498, "step": 4243 }, { "epoch": 0.3492285537955153, "grad_norm": 1.714008577700925, "learning_rate": 1.5114963606119385e-05, "loss": 0.8042, "step": 4244 }, { "epoch": 0.3493108413906604, "grad_norm": 1.7263799829530475, "learning_rate": 1.5112673168391717e-05, "loss": 0.7906, "step": 4245 }, { "epoch": 0.3493931289858054, "grad_norm": 2.130672830015722, "learning_rate": 1.5110382367467923e-05, "loss": 0.7749, "step": 4246 }, { "epoch": 0.34947541658095044, "grad_norm": 1.7780716625039947, "learning_rate": 1.5108091203510742e-05, "loss": 0.7774, "step": 4247 }, { "epoch": 0.34955770417609544, "grad_norm": 2.1644882894071458, "learning_rate": 1.510579967668293e-05, "loss": 0.7883, "step": 4248 }, { "epoch": 0.3496399917712405, "grad_norm": 1.760334884478578, "learning_rate": 1.5103507787147273e-05, "loss": 0.7546, "step": 4249 }, { "epoch": 0.3497222793663855, "grad_norm": 1.9435333039412346, "learning_rate": 1.5101215535066589e-05, "loss": 0.8172, "step": 4250 }, { "epoch": 0.34980456696153056, "grad_norm": 1.631519680132551, "learning_rate": 1.5098922920603709e-05, "loss": 0.7752, "step": 4251 }, { "epoch": 0.34988685455667556, "grad_norm": 1.6116949765501842, "learning_rate": 1.5096629943921502e-05, "loss": 0.7568, "step": 4252 }, { "epoch": 0.3499691421518206, "grad_norm": 2.2422131361565327, "learning_rate": 1.509433660518285e-05, "loss": 0.8293, "step": 4253 }, { "epoch": 0.3500514297469656, "grad_norm": 2.596112128877894, "learning_rate": 1.5092042904550673e-05, "loss": 0.7793, "step": 4254 }, { "epoch": 0.3501337173421107, "grad_norm": 0.4581743573064218, "learning_rate": 1.5089748842187914e-05, "loss": 0.548, "step": 4255 }, { "epoch": 0.3502160049372557, "grad_norm": 1.776109036903087, "learning_rate": 1.5087454418257537e-05, "loss": 0.8004, "step": 4256 }, { "epoch": 0.35029829253240075, "grad_norm": 2.1083981886382475, "learning_rate": 1.5085159632922532e-05, "loss": 0.805, "step": 4257 }, { "epoch": 0.35038058012754575, "grad_norm": 0.4169899132477313, "learning_rate": 1.5082864486345923e-05, "loss": 0.5054, "step": 4258 }, { "epoch": 0.3504628677226908, "grad_norm": 0.40960518547708463, "learning_rate": 1.5080568978690746e-05, "loss": 0.5182, "step": 4259 }, { "epoch": 0.3505451553178358, "grad_norm": 2.1915036808679096, "learning_rate": 1.5078273110120074e-05, "loss": 0.7803, "step": 4260 }, { "epoch": 0.35062744291298087, "grad_norm": 1.8696022940477977, "learning_rate": 1.5075976880797006e-05, "loss": 0.7757, "step": 4261 }, { "epoch": 0.3507097305081259, "grad_norm": 2.363058279324034, "learning_rate": 1.5073680290884654e-05, "loss": 0.7653, "step": 4262 }, { "epoch": 0.35079201810327093, "grad_norm": 0.46062839853449933, "learning_rate": 1.5071383340546169e-05, "loss": 0.5365, "step": 4263 }, { "epoch": 0.350874305698416, "grad_norm": 0.43011203608729137, "learning_rate": 1.5069086029944723e-05, "loss": 0.5271, "step": 4264 }, { "epoch": 0.350956593293561, "grad_norm": 2.037853153702179, "learning_rate": 1.5066788359243512e-05, "loss": 0.7989, "step": 4265 }, { "epoch": 0.35103888088870605, "grad_norm": 1.639424871080346, "learning_rate": 1.5064490328605756e-05, "loss": 0.7834, "step": 4266 }, { "epoch": 0.35112116848385105, "grad_norm": 1.6125841712238647, "learning_rate": 1.5062191938194712e-05, "loss": 0.802, "step": 4267 }, { "epoch": 0.3512034560789961, "grad_norm": 1.8753737259352214, "learning_rate": 1.5059893188173647e-05, "loss": 0.7807, "step": 4268 }, { "epoch": 0.3512857436741411, "grad_norm": 1.6341123032672547, "learning_rate": 1.5057594078705857e-05, "loss": 0.7787, "step": 4269 }, { "epoch": 0.3513680312692862, "grad_norm": 2.237770860720103, "learning_rate": 1.5055294609954678e-05, "loss": 0.7762, "step": 4270 }, { "epoch": 0.3514503188644312, "grad_norm": 1.5519616780677832, "learning_rate": 1.5052994782083454e-05, "loss": 0.8126, "step": 4271 }, { "epoch": 0.35153260645957624, "grad_norm": 1.509703879953318, "learning_rate": 1.5050694595255558e-05, "loss": 0.7957, "step": 4272 }, { "epoch": 0.35161489405472124, "grad_norm": 1.844428715887101, "learning_rate": 1.5048394049634398e-05, "loss": 0.7769, "step": 4273 }, { "epoch": 0.3516971816498663, "grad_norm": 0.4780655911411019, "learning_rate": 1.5046093145383397e-05, "loss": 0.5073, "step": 4274 }, { "epoch": 0.3517794692450113, "grad_norm": 1.8122459480963558, "learning_rate": 1.5043791882666013e-05, "loss": 0.7866, "step": 4275 }, { "epoch": 0.35186175684015636, "grad_norm": 2.7196919535131583, "learning_rate": 1.5041490261645717e-05, "loss": 0.803, "step": 4276 }, { "epoch": 0.35194404443530136, "grad_norm": 1.947362916758738, "learning_rate": 1.5039188282486015e-05, "loss": 0.7593, "step": 4277 }, { "epoch": 0.3520263320304464, "grad_norm": 1.6755592332557474, "learning_rate": 1.5036885945350437e-05, "loss": 0.7979, "step": 4278 }, { "epoch": 0.3521086196255914, "grad_norm": 0.44549387741601987, "learning_rate": 1.5034583250402536e-05, "loss": 0.5504, "step": 4279 }, { "epoch": 0.3521909072207365, "grad_norm": 1.6959891194348287, "learning_rate": 1.5032280197805894e-05, "loss": 0.7979, "step": 4280 }, { "epoch": 0.3522731948158815, "grad_norm": 0.4334185076699465, "learning_rate": 1.5029976787724115e-05, "loss": 0.5261, "step": 4281 }, { "epoch": 0.35235548241102654, "grad_norm": 1.5551353847283433, "learning_rate": 1.5027673020320828e-05, "loss": 0.7556, "step": 4282 }, { "epoch": 0.35243777000617155, "grad_norm": 1.897905580799512, "learning_rate": 1.502536889575969e-05, "loss": 0.7847, "step": 4283 }, { "epoch": 0.3525200576013166, "grad_norm": 1.4099051942622551, "learning_rate": 1.5023064414204383e-05, "loss": 0.8067, "step": 4284 }, { "epoch": 0.3526023451964616, "grad_norm": 1.3115240490795925, "learning_rate": 1.5020759575818615e-05, "loss": 0.769, "step": 4285 }, { "epoch": 0.35268463279160667, "grad_norm": 1.4639194067957033, "learning_rate": 1.5018454380766114e-05, "loss": 0.7802, "step": 4286 }, { "epoch": 0.35276692038675167, "grad_norm": 2.7851546916573082, "learning_rate": 1.501614882921064e-05, "loss": 0.7753, "step": 4287 }, { "epoch": 0.35284920798189673, "grad_norm": 1.5293399433307024, "learning_rate": 1.5013842921315975e-05, "loss": 0.7762, "step": 4288 }, { "epoch": 0.3529314955770418, "grad_norm": 0.4384726124476116, "learning_rate": 1.5011536657245929e-05, "loss": 0.506, "step": 4289 }, { "epoch": 0.3530137831721868, "grad_norm": 0.4596793540848056, "learning_rate": 1.5009230037164334e-05, "loss": 0.5755, "step": 4290 }, { "epoch": 0.35309607076733185, "grad_norm": 2.2532046831265706, "learning_rate": 1.5006923061235044e-05, "loss": 0.7833, "step": 4291 }, { "epoch": 0.35317835836247685, "grad_norm": 1.4800954913943396, "learning_rate": 1.5004615729621948e-05, "loss": 0.8004, "step": 4292 }, { "epoch": 0.3532606459576219, "grad_norm": 4.369531230294371, "learning_rate": 1.5002308042488957e-05, "loss": 0.7948, "step": 4293 }, { "epoch": 0.3533429335527669, "grad_norm": 1.4581301798184974, "learning_rate": 1.5000000000000002e-05, "loss": 0.8079, "step": 4294 }, { "epoch": 0.35342522114791197, "grad_norm": 0.42557480757840555, "learning_rate": 1.4997691602319043e-05, "loss": 0.5219, "step": 4295 }, { "epoch": 0.353507508743057, "grad_norm": 1.7232340708525398, "learning_rate": 1.4995382849610067e-05, "loss": 0.7887, "step": 4296 }, { "epoch": 0.35358979633820203, "grad_norm": 1.4294977274858944, "learning_rate": 1.499307374203708e-05, "loss": 0.8211, "step": 4297 }, { "epoch": 0.35367208393334704, "grad_norm": 1.6951785203767, "learning_rate": 1.4990764279764119e-05, "loss": 0.7464, "step": 4298 }, { "epoch": 0.3537543715284921, "grad_norm": 1.7342801853234957, "learning_rate": 1.4988454462955247e-05, "loss": 0.8096, "step": 4299 }, { "epoch": 0.3538366591236371, "grad_norm": 1.8333889306402888, "learning_rate": 1.4986144291774547e-05, "loss": 0.8065, "step": 4300 }, { "epoch": 0.35391894671878216, "grad_norm": 1.8109674757270033, "learning_rate": 1.498383376638613e-05, "loss": 0.8099, "step": 4301 }, { "epoch": 0.35400123431392716, "grad_norm": 1.7127351891082367, "learning_rate": 1.4981522886954134e-05, "loss": 0.776, "step": 4302 }, { "epoch": 0.3540835219090722, "grad_norm": 2.534337989440837, "learning_rate": 1.4979211653642717e-05, "loss": 0.7926, "step": 4303 }, { "epoch": 0.3541658095042172, "grad_norm": 3.2956799725671764, "learning_rate": 1.4976900066616069e-05, "loss": 0.7775, "step": 4304 }, { "epoch": 0.3542480970993623, "grad_norm": 1.3450479980394232, "learning_rate": 1.49745881260384e-05, "loss": 0.7548, "step": 4305 }, { "epoch": 0.3543303846945073, "grad_norm": 1.6505077821163292, "learning_rate": 1.4972275832073946e-05, "loss": 0.7936, "step": 4306 }, { "epoch": 0.35441267228965234, "grad_norm": 2.0578215330096588, "learning_rate": 1.4969963184886966e-05, "loss": 0.7781, "step": 4307 }, { "epoch": 0.35449495988479734, "grad_norm": 1.585819640723395, "learning_rate": 1.4967650184641753e-05, "loss": 0.7997, "step": 4308 }, { "epoch": 0.3545772474799424, "grad_norm": 1.7645604991703887, "learning_rate": 1.4965336831502614e-05, "loss": 0.8015, "step": 4309 }, { "epoch": 0.3546595350750874, "grad_norm": 0.4372974520778442, "learning_rate": 1.4963023125633887e-05, "loss": 0.5125, "step": 4310 }, { "epoch": 0.35474182267023247, "grad_norm": 1.4258633483589316, "learning_rate": 1.4960709067199937e-05, "loss": 0.7803, "step": 4311 }, { "epoch": 0.35482411026537747, "grad_norm": 1.5188146591151062, "learning_rate": 1.4958394656365146e-05, "loss": 0.7933, "step": 4312 }, { "epoch": 0.3549063978605225, "grad_norm": 2.6880047243594043, "learning_rate": 1.4956079893293926e-05, "loss": 0.7793, "step": 4313 }, { "epoch": 0.3549886854556676, "grad_norm": 0.43100430278635776, "learning_rate": 1.495376477815072e-05, "loss": 0.5538, "step": 4314 }, { "epoch": 0.3550709730508126, "grad_norm": 0.43658011623034215, "learning_rate": 1.4951449311099988e-05, "loss": 0.5187, "step": 4315 }, { "epoch": 0.35515326064595765, "grad_norm": 1.4978063264835995, "learning_rate": 1.4949133492306212e-05, "loss": 0.7808, "step": 4316 }, { "epoch": 0.35523554824110265, "grad_norm": 2.017109062281373, "learning_rate": 1.4946817321933908e-05, "loss": 0.7691, "step": 4317 }, { "epoch": 0.3553178358362477, "grad_norm": 0.43278384502390105, "learning_rate": 1.4944500800147614e-05, "loss": 0.5224, "step": 4318 }, { "epoch": 0.3554001234313927, "grad_norm": 2.547518864388293, "learning_rate": 1.4942183927111894e-05, "loss": 0.7816, "step": 4319 }, { "epoch": 0.35548241102653777, "grad_norm": 1.5296634536888702, "learning_rate": 1.4939866702991326e-05, "loss": 0.7757, "step": 4320 }, { "epoch": 0.3555646986216828, "grad_norm": 2.5812116396659763, "learning_rate": 1.493754912795053e-05, "loss": 0.8021, "step": 4321 }, { "epoch": 0.35564698621682783, "grad_norm": 0.4565619768179845, "learning_rate": 1.493523120215414e-05, "loss": 0.5301, "step": 4322 }, { "epoch": 0.35572927381197283, "grad_norm": 0.4322188785723711, "learning_rate": 1.4932912925766818e-05, "loss": 0.5188, "step": 4323 }, { "epoch": 0.3558115614071179, "grad_norm": 1.6949003738215151, "learning_rate": 1.493059429895325e-05, "loss": 0.8034, "step": 4324 }, { "epoch": 0.3558938490022629, "grad_norm": 1.8602893281816135, "learning_rate": 1.4928275321878152e-05, "loss": 0.8028, "step": 4325 }, { "epoch": 0.35597613659740795, "grad_norm": 1.4907644504514543, "learning_rate": 1.4925955994706255e-05, "loss": 0.7807, "step": 4326 }, { "epoch": 0.35605842419255296, "grad_norm": 1.4239535398006118, "learning_rate": 1.4923636317602318e-05, "loss": 0.7921, "step": 4327 }, { "epoch": 0.356140711787698, "grad_norm": 1.4902110723412587, "learning_rate": 1.4921316290731134e-05, "loss": 0.7928, "step": 4328 }, { "epoch": 0.356222999382843, "grad_norm": 1.8527416755681474, "learning_rate": 1.491899591425751e-05, "loss": 0.7832, "step": 4329 }, { "epoch": 0.3563052869779881, "grad_norm": 1.8637668135802596, "learning_rate": 1.4916675188346284e-05, "loss": 0.7864, "step": 4330 }, { "epoch": 0.3563875745731331, "grad_norm": 3.0640721383251632, "learning_rate": 1.491435411316232e-05, "loss": 0.7782, "step": 4331 }, { "epoch": 0.35646986216827814, "grad_norm": 2.388721813885545, "learning_rate": 1.4912032688870493e-05, "loss": 0.7903, "step": 4332 }, { "epoch": 0.35655214976342314, "grad_norm": 0.44245453915069405, "learning_rate": 1.4909710915635722e-05, "loss": 0.529, "step": 4333 }, { "epoch": 0.3566344373585682, "grad_norm": 1.6032963266044926, "learning_rate": 1.4907388793622939e-05, "loss": 0.7919, "step": 4334 }, { "epoch": 0.3567167249537132, "grad_norm": 1.6393045541929858, "learning_rate": 1.4905066322997105e-05, "loss": 0.7842, "step": 4335 }, { "epoch": 0.35679901254885826, "grad_norm": 1.507884493635332, "learning_rate": 1.4902743503923205e-05, "loss": 0.8016, "step": 4336 }, { "epoch": 0.35688130014400327, "grad_norm": 1.3560049215640406, "learning_rate": 1.4900420336566243e-05, "loss": 0.807, "step": 4337 }, { "epoch": 0.3569635877391483, "grad_norm": 1.4349679556693968, "learning_rate": 1.4898096821091262e-05, "loss": 0.7375, "step": 4338 }, { "epoch": 0.35704587533429333, "grad_norm": 1.5183757647071592, "learning_rate": 1.4895772957663315e-05, "loss": 0.7875, "step": 4339 }, { "epoch": 0.3571281629294384, "grad_norm": 2.2178816644971873, "learning_rate": 1.4893448746447485e-05, "loss": 0.8037, "step": 4340 }, { "epoch": 0.35721045052458344, "grad_norm": 1.6676595172038777, "learning_rate": 1.4891124187608883e-05, "loss": 0.8064, "step": 4341 }, { "epoch": 0.35729273811972845, "grad_norm": 1.6292009573164543, "learning_rate": 1.488879928131264e-05, "loss": 0.784, "step": 4342 }, { "epoch": 0.3573750257148735, "grad_norm": 2.3299089741042924, "learning_rate": 1.4886474027723916e-05, "loss": 0.7779, "step": 4343 }, { "epoch": 0.3574573133100185, "grad_norm": 2.5843007019758164, "learning_rate": 1.488414842700789e-05, "loss": 0.7615, "step": 4344 }, { "epoch": 0.35753960090516357, "grad_norm": 2.01821967326205, "learning_rate": 1.4881822479329776e-05, "loss": 0.7861, "step": 4345 }, { "epoch": 0.35762188850030857, "grad_norm": 1.5113228304964665, "learning_rate": 1.4879496184854794e-05, "loss": 0.7825, "step": 4346 }, { "epoch": 0.35770417609545363, "grad_norm": 1.364429323660587, "learning_rate": 1.4877169543748209e-05, "loss": 0.7601, "step": 4347 }, { "epoch": 0.35778646369059863, "grad_norm": 1.9001598124952053, "learning_rate": 1.48748425561753e-05, "loss": 0.7942, "step": 4348 }, { "epoch": 0.3578687512857437, "grad_norm": 1.6844932331039923, "learning_rate": 1.487251522230137e-05, "loss": 0.8034, "step": 4349 }, { "epoch": 0.3579510388808887, "grad_norm": 1.7566380889578541, "learning_rate": 1.4870187542291751e-05, "loss": 0.8053, "step": 4350 }, { "epoch": 0.35803332647603375, "grad_norm": 2.078558288380098, "learning_rate": 1.4867859516311803e-05, "loss": 0.7937, "step": 4351 }, { "epoch": 0.35811561407117876, "grad_norm": 1.6595737343227377, "learning_rate": 1.4865531144526894e-05, "loss": 0.8424, "step": 4352 }, { "epoch": 0.3581979016663238, "grad_norm": 1.9147501230715447, "learning_rate": 1.4863202427102437e-05, "loss": 0.7919, "step": 4353 }, { "epoch": 0.3582801892614688, "grad_norm": 1.84351545251438, "learning_rate": 1.4860873364203855e-05, "loss": 0.8086, "step": 4354 }, { "epoch": 0.3583624768566139, "grad_norm": 2.037047198191343, "learning_rate": 1.4858543955996605e-05, "loss": 0.7899, "step": 4355 }, { "epoch": 0.3584447644517589, "grad_norm": 1.9936858446786507, "learning_rate": 1.4856214202646161e-05, "loss": 0.8084, "step": 4356 }, { "epoch": 0.35852705204690394, "grad_norm": 1.9081258550978737, "learning_rate": 1.4853884104318028e-05, "loss": 0.7796, "step": 4357 }, { "epoch": 0.35860933964204894, "grad_norm": 2.090683290882742, "learning_rate": 1.4851553661177728e-05, "loss": 0.7852, "step": 4358 }, { "epoch": 0.358691627237194, "grad_norm": 1.8781498017728904, "learning_rate": 1.4849222873390815e-05, "loss": 0.7805, "step": 4359 }, { "epoch": 0.358773914832339, "grad_norm": 1.7386884741520234, "learning_rate": 1.4846891741122869e-05, "loss": 0.7679, "step": 4360 }, { "epoch": 0.35885620242748406, "grad_norm": 1.7603079590151627, "learning_rate": 1.4844560264539483e-05, "loss": 0.7481, "step": 4361 }, { "epoch": 0.35893849002262906, "grad_norm": 2.4484258611817618, "learning_rate": 1.4842228443806282e-05, "loss": 0.8061, "step": 4362 }, { "epoch": 0.3590207776177741, "grad_norm": 2.0869440215336836, "learning_rate": 1.4839896279088917e-05, "loss": 0.8011, "step": 4363 }, { "epoch": 0.3591030652129191, "grad_norm": 0.43753689917699273, "learning_rate": 1.483756377055306e-05, "loss": 0.5496, "step": 4364 }, { "epoch": 0.3591853528080642, "grad_norm": 2.031649607822119, "learning_rate": 1.483523091836441e-05, "loss": 0.8227, "step": 4365 }, { "epoch": 0.35926764040320924, "grad_norm": 1.8553219815666742, "learning_rate": 1.4832897722688688e-05, "loss": 0.7724, "step": 4366 }, { "epoch": 0.35934992799835425, "grad_norm": 3.9583170664767717, "learning_rate": 1.4830564183691642e-05, "loss": 0.7842, "step": 4367 }, { "epoch": 0.3594322155934993, "grad_norm": 1.7655995488989444, "learning_rate": 1.4828230301539042e-05, "loss": 0.7717, "step": 4368 }, { "epoch": 0.3595145031886443, "grad_norm": 2.4827230111740852, "learning_rate": 1.482589607639668e-05, "loss": 0.7668, "step": 4369 }, { "epoch": 0.35959679078378937, "grad_norm": 2.1463383584658353, "learning_rate": 1.482356150843038e-05, "loss": 0.7585, "step": 4370 }, { "epoch": 0.35967907837893437, "grad_norm": 1.869372283109839, "learning_rate": 1.4821226597805987e-05, "loss": 0.7641, "step": 4371 }, { "epoch": 0.35976136597407943, "grad_norm": 0.4499207892352961, "learning_rate": 1.4818891344689363e-05, "loss": 0.5478, "step": 4372 }, { "epoch": 0.35984365356922443, "grad_norm": 2.0440067055615048, "learning_rate": 1.4816555749246407e-05, "loss": 0.7762, "step": 4373 }, { "epoch": 0.3599259411643695, "grad_norm": 3.147207113013275, "learning_rate": 1.4814219811643033e-05, "loss": 0.8085, "step": 4374 }, { "epoch": 0.3600082287595145, "grad_norm": 2.216160077603176, "learning_rate": 1.4811883532045184e-05, "loss": 0.8197, "step": 4375 }, { "epoch": 0.36009051635465955, "grad_norm": 1.9417645543266515, "learning_rate": 1.4809546910618821e-05, "loss": 0.7747, "step": 4376 }, { "epoch": 0.36017280394980455, "grad_norm": 2.58679860017351, "learning_rate": 1.4807209947529941e-05, "loss": 0.7903, "step": 4377 }, { "epoch": 0.3602550915449496, "grad_norm": 0.4442965451755858, "learning_rate": 1.4804872642944553e-05, "loss": 0.5569, "step": 4378 }, { "epoch": 0.3603373791400946, "grad_norm": 2.085092584435455, "learning_rate": 1.4802534997028695e-05, "loss": 0.7963, "step": 4379 }, { "epoch": 0.3604196667352397, "grad_norm": 3.3901320322156514, "learning_rate": 1.4800197009948434e-05, "loss": 0.7941, "step": 4380 }, { "epoch": 0.3605019543303847, "grad_norm": 2.064303540863521, "learning_rate": 1.4797858681869852e-05, "loss": 0.7987, "step": 4381 }, { "epoch": 0.36058424192552974, "grad_norm": 2.485014757376644, "learning_rate": 1.4795520012959064e-05, "loss": 0.7748, "step": 4382 }, { "epoch": 0.36066652952067474, "grad_norm": 1.914798579701594, "learning_rate": 1.4793181003382201e-05, "loss": 0.7639, "step": 4383 }, { "epoch": 0.3607488171158198, "grad_norm": 2.838719737626025, "learning_rate": 1.4790841653305428e-05, "loss": 0.7505, "step": 4384 }, { "epoch": 0.3608311047109648, "grad_norm": 6.260918360797801, "learning_rate": 1.4788501962894923e-05, "loss": 0.7675, "step": 4385 }, { "epoch": 0.36091339230610986, "grad_norm": 2.5501043836126875, "learning_rate": 1.47861619323169e-05, "loss": 0.7504, "step": 4386 }, { "epoch": 0.36099567990125486, "grad_norm": 2.1505803851911667, "learning_rate": 1.4783821561737587e-05, "loss": 0.8255, "step": 4387 }, { "epoch": 0.3610779674963999, "grad_norm": 1.9949662428415138, "learning_rate": 1.4781480851323238e-05, "loss": 0.7895, "step": 4388 }, { "epoch": 0.3611602550915449, "grad_norm": 2.3932676443907117, "learning_rate": 1.477913980124014e-05, "loss": 0.7882, "step": 4389 }, { "epoch": 0.36124254268669, "grad_norm": 3.4156281292965858, "learning_rate": 1.4776798411654589e-05, "loss": 0.7787, "step": 4390 }, { "epoch": 0.36132483028183504, "grad_norm": 7.616137893545026, "learning_rate": 1.4774456682732923e-05, "loss": 0.7754, "step": 4391 }, { "epoch": 0.36140711787698004, "grad_norm": 2.1515663233240767, "learning_rate": 1.4772114614641488e-05, "loss": 0.8253, "step": 4392 }, { "epoch": 0.3614894054721251, "grad_norm": 2.2023060458768993, "learning_rate": 1.4769772207546659e-05, "loss": 0.7728, "step": 4393 }, { "epoch": 0.3615716930672701, "grad_norm": 2.066421862032711, "learning_rate": 1.4767429461614846e-05, "loss": 0.8279, "step": 4394 }, { "epoch": 0.36165398066241516, "grad_norm": 1.9163198370520074, "learning_rate": 1.4765086377012466e-05, "loss": 0.7756, "step": 4395 }, { "epoch": 0.36173626825756017, "grad_norm": 2.615582751637595, "learning_rate": 1.476274295390597e-05, "loss": 0.7961, "step": 4396 }, { "epoch": 0.3618185558527052, "grad_norm": 4.41644033257218, "learning_rate": 1.4760399192461831e-05, "loss": 0.7601, "step": 4397 }, { "epoch": 0.36190084344785023, "grad_norm": 2.2109711420310236, "learning_rate": 1.475805509284655e-05, "loss": 0.7722, "step": 4398 }, { "epoch": 0.3619831310429953, "grad_norm": 2.275007390439845, "learning_rate": 1.475571065522664e-05, "loss": 0.8105, "step": 4399 }, { "epoch": 0.3620654186381403, "grad_norm": 2.6731814569907346, "learning_rate": 1.4753365879768656e-05, "loss": 0.761, "step": 4400 }, { "epoch": 0.36214770623328535, "grad_norm": 2.2806188231209954, "learning_rate": 1.4751020766639158e-05, "loss": 0.803, "step": 4401 }, { "epoch": 0.36222999382843035, "grad_norm": 2.6625844562523073, "learning_rate": 1.4748675316004741e-05, "loss": 0.7826, "step": 4402 }, { "epoch": 0.3623122814235754, "grad_norm": 2.9353270418499036, "learning_rate": 1.4746329528032029e-05, "loss": 0.807, "step": 4403 }, { "epoch": 0.3623945690187204, "grad_norm": 2.1595877463416455, "learning_rate": 1.4743983402887654e-05, "loss": 0.8089, "step": 4404 }, { "epoch": 0.3624768566138655, "grad_norm": 0.4436264102071975, "learning_rate": 1.4741636940738286e-05, "loss": 0.5838, "step": 4405 }, { "epoch": 0.3625591442090105, "grad_norm": 2.1020565999096643, "learning_rate": 1.4739290141750615e-05, "loss": 0.7764, "step": 4406 }, { "epoch": 0.36264143180415553, "grad_norm": 2.2256976473440324, "learning_rate": 1.4736943006091348e-05, "loss": 0.7884, "step": 4407 }, { "epoch": 0.36272371939930054, "grad_norm": 1.9858803972230128, "learning_rate": 1.4734595533927228e-05, "loss": 0.8152, "step": 4408 }, { "epoch": 0.3628060069944456, "grad_norm": 3.1016590160338855, "learning_rate": 1.4732247725425013e-05, "loss": 0.7771, "step": 4409 }, { "epoch": 0.3628882945895906, "grad_norm": 2.341868715880278, "learning_rate": 1.4729899580751488e-05, "loss": 0.7542, "step": 4410 }, { "epoch": 0.36297058218473566, "grad_norm": 3.4637181984694623, "learning_rate": 1.4727551100073458e-05, "loss": 0.7601, "step": 4411 }, { "epoch": 0.36305286977988066, "grad_norm": 2.014929654564701, "learning_rate": 1.4725202283557762e-05, "loss": 0.781, "step": 4412 }, { "epoch": 0.3631351573750257, "grad_norm": 0.436093194203851, "learning_rate": 1.4722853131371252e-05, "loss": 0.5101, "step": 4413 }, { "epoch": 0.3632174449701707, "grad_norm": 2.956083725742205, "learning_rate": 1.4720503643680805e-05, "loss": 0.767, "step": 4414 }, { "epoch": 0.3632997325653158, "grad_norm": 1.9646441399269536, "learning_rate": 1.4718153820653337e-05, "loss": 0.7914, "step": 4415 }, { "epoch": 0.3633820201604608, "grad_norm": 0.39786861770836607, "learning_rate": 1.471580366245576e-05, "loss": 0.5067, "step": 4416 }, { "epoch": 0.36346430775560584, "grad_norm": 6.216926529114422, "learning_rate": 1.4713453169255032e-05, "loss": 0.7755, "step": 4417 }, { "epoch": 0.3635465953507509, "grad_norm": 1.8278689159508343, "learning_rate": 1.4711102341218133e-05, "loss": 0.7711, "step": 4418 }, { "epoch": 0.3636288829458959, "grad_norm": 2.1161362522194045, "learning_rate": 1.4708751178512055e-05, "loss": 0.793, "step": 4419 }, { "epoch": 0.36371117054104096, "grad_norm": 2.213773823337977, "learning_rate": 1.4706399681303825e-05, "loss": 0.7536, "step": 4420 }, { "epoch": 0.36379345813618597, "grad_norm": 2.653453023061755, "learning_rate": 1.470404784976049e-05, "loss": 0.765, "step": 4421 }, { "epoch": 0.363875745731331, "grad_norm": 2.4982987604135567, "learning_rate": 1.4701695684049115e-05, "loss": 0.7616, "step": 4422 }, { "epoch": 0.363958033326476, "grad_norm": 2.5408162463108006, "learning_rate": 1.4699343184336801e-05, "loss": 0.7701, "step": 4423 }, { "epoch": 0.3640403209216211, "grad_norm": 2.3664164761512345, "learning_rate": 1.4696990350790663e-05, "loss": 0.7694, "step": 4424 }, { "epoch": 0.3641226085167661, "grad_norm": 2.1764983944899647, "learning_rate": 1.469463718357784e-05, "loss": 0.7738, "step": 4425 }, { "epoch": 0.36420489611191115, "grad_norm": 3.528121792473789, "learning_rate": 1.46922836828655e-05, "loss": 0.8011, "step": 4426 }, { "epoch": 0.36428718370705615, "grad_norm": 2.715815677172619, "learning_rate": 1.4689929848820831e-05, "loss": 0.7594, "step": 4427 }, { "epoch": 0.3643694713022012, "grad_norm": 1.9804015653569753, "learning_rate": 1.4687575681611048e-05, "loss": 0.7785, "step": 4428 }, { "epoch": 0.3644517588973462, "grad_norm": 1.9600468292254916, "learning_rate": 1.4685221181403382e-05, "loss": 0.7706, "step": 4429 }, { "epoch": 0.36453404649249127, "grad_norm": 2.8777421364458555, "learning_rate": 1.4682866348365102e-05, "loss": 0.7803, "step": 4430 }, { "epoch": 0.3646163340876363, "grad_norm": 2.176178063953276, "learning_rate": 1.468051118266348e-05, "loss": 0.7708, "step": 4431 }, { "epoch": 0.36469862168278133, "grad_norm": 1.8795084006154403, "learning_rate": 1.4678155684465828e-05, "loss": 0.7682, "step": 4432 }, { "epoch": 0.36478090927792634, "grad_norm": 1.7924640517850554, "learning_rate": 1.4675799853939483e-05, "loss": 0.7844, "step": 4433 }, { "epoch": 0.3648631968730714, "grad_norm": 2.49611047395207, "learning_rate": 1.4673443691251793e-05, "loss": 0.7956, "step": 4434 }, { "epoch": 0.3649454844682164, "grad_norm": 2.7584007784580247, "learning_rate": 1.4671087196570137e-05, "loss": 0.7933, "step": 4435 }, { "epoch": 0.36502777206336146, "grad_norm": 2.841840168495128, "learning_rate": 1.4668730370061914e-05, "loss": 0.7921, "step": 4436 }, { "epoch": 0.36511005965850646, "grad_norm": 1.954551730501201, "learning_rate": 1.4666373211894553e-05, "loss": 0.7576, "step": 4437 }, { "epoch": 0.3651923472536515, "grad_norm": 0.4457840230578507, "learning_rate": 1.4664015722235505e-05, "loss": 0.5342, "step": 4438 }, { "epoch": 0.3652746348487965, "grad_norm": 2.272082365086708, "learning_rate": 1.4661657901252236e-05, "loss": 0.7938, "step": 4439 }, { "epoch": 0.3653569224439416, "grad_norm": 1.8497286525207204, "learning_rate": 1.4659299749112243e-05, "loss": 0.7751, "step": 4440 }, { "epoch": 0.3654392100390866, "grad_norm": 0.4713257525773104, "learning_rate": 1.4656941265983054e-05, "loss": 0.5644, "step": 4441 }, { "epoch": 0.36552149763423164, "grad_norm": 1.9767974354616729, "learning_rate": 1.46545824520322e-05, "loss": 0.7883, "step": 4442 }, { "epoch": 0.3656037852293767, "grad_norm": 2.1481195870810703, "learning_rate": 1.4652223307427254e-05, "loss": 0.771, "step": 4443 }, { "epoch": 0.3656860728245217, "grad_norm": 1.8051795359148761, "learning_rate": 1.4649863832335805e-05, "loss": 0.7909, "step": 4444 }, { "epoch": 0.36576836041966676, "grad_norm": 2.3149191020200597, "learning_rate": 1.4647504026925464e-05, "loss": 0.7772, "step": 4445 }, { "epoch": 0.36585064801481176, "grad_norm": 2.255789293611326, "learning_rate": 1.4645143891363869e-05, "loss": 0.7948, "step": 4446 }, { "epoch": 0.3659329356099568, "grad_norm": 2.169344677139653, "learning_rate": 1.4642783425818684e-05, "loss": 0.8179, "step": 4447 }, { "epoch": 0.3660152232051018, "grad_norm": 1.8152605627814558, "learning_rate": 1.4640422630457586e-05, "loss": 0.7903, "step": 4448 }, { "epoch": 0.3660975108002469, "grad_norm": 2.206827823284778, "learning_rate": 1.4638061505448286e-05, "loss": 0.7686, "step": 4449 }, { "epoch": 0.3661797983953919, "grad_norm": 2.101686778623947, "learning_rate": 1.4635700050958516e-05, "loss": 0.7737, "step": 4450 }, { "epoch": 0.36626208599053695, "grad_norm": 1.9607020489875775, "learning_rate": 1.4633338267156028e-05, "loss": 0.7568, "step": 4451 }, { "epoch": 0.36634437358568195, "grad_norm": 2.0592409137920873, "learning_rate": 1.4630976154208598e-05, "loss": 0.7869, "step": 4452 }, { "epoch": 0.366426661180827, "grad_norm": 2.6252630769120087, "learning_rate": 1.462861371228403e-05, "loss": 0.7687, "step": 4453 }, { "epoch": 0.366508948775972, "grad_norm": 2.1773211757346003, "learning_rate": 1.4626250941550144e-05, "loss": 0.7811, "step": 4454 }, { "epoch": 0.36659123637111707, "grad_norm": 2.180626335955112, "learning_rate": 1.4623887842174792e-05, "loss": 0.7553, "step": 4455 }, { "epoch": 0.36667352396626207, "grad_norm": 2.320887199378988, "learning_rate": 1.462152441432584e-05, "loss": 0.765, "step": 4456 }, { "epoch": 0.36675581156140713, "grad_norm": 1.8838337805008327, "learning_rate": 1.4619160658171186e-05, "loss": 0.767, "step": 4457 }, { "epoch": 0.36683809915655213, "grad_norm": 1.8138251759224935, "learning_rate": 1.4616796573878746e-05, "loss": 0.7722, "step": 4458 }, { "epoch": 0.3669203867516972, "grad_norm": 0.4544553362135176, "learning_rate": 1.4614432161616462e-05, "loss": 0.5241, "step": 4459 }, { "epoch": 0.3670026743468422, "grad_norm": 2.4727369174406193, "learning_rate": 1.4612067421552296e-05, "loss": 0.7727, "step": 4460 }, { "epoch": 0.36708496194198725, "grad_norm": 2.0979961178815763, "learning_rate": 1.4609702353854237e-05, "loss": 0.7751, "step": 4461 }, { "epoch": 0.36716724953713226, "grad_norm": 0.4150156175350144, "learning_rate": 1.4607336958690294e-05, "loss": 0.5245, "step": 4462 }, { "epoch": 0.3672495371322773, "grad_norm": 0.42416759569725526, "learning_rate": 1.4604971236228501e-05, "loss": 0.5297, "step": 4463 }, { "epoch": 0.3673318247274223, "grad_norm": 1.8600715550814397, "learning_rate": 1.4602605186636915e-05, "loss": 0.782, "step": 4464 }, { "epoch": 0.3674141123225674, "grad_norm": 2.264156513094946, "learning_rate": 1.4600238810083622e-05, "loss": 0.7976, "step": 4465 }, { "epoch": 0.3674963999177124, "grad_norm": 2.466816640608851, "learning_rate": 1.4597872106736717e-05, "loss": 0.769, "step": 4466 }, { "epoch": 0.36757868751285744, "grad_norm": 2.4140449926533583, "learning_rate": 1.459550507676433e-05, "loss": 0.7374, "step": 4467 }, { "epoch": 0.36766097510800244, "grad_norm": 1.9744437590118025, "learning_rate": 1.4593137720334617e-05, "loss": 0.8271, "step": 4468 }, { "epoch": 0.3677432627031475, "grad_norm": 2.0861707911384118, "learning_rate": 1.459077003761574e-05, "loss": 0.7556, "step": 4469 }, { "epoch": 0.36782555029829256, "grad_norm": 2.225586480349321, "learning_rate": 1.4588402028775908e-05, "loss": 0.8122, "step": 4470 }, { "epoch": 0.36790783789343756, "grad_norm": 2.3460756177803863, "learning_rate": 1.4586033693983327e-05, "loss": 0.7529, "step": 4471 }, { "epoch": 0.3679901254885826, "grad_norm": 2.02700075708636, "learning_rate": 1.458366503340625e-05, "loss": 0.759, "step": 4472 }, { "epoch": 0.3680724130837276, "grad_norm": 2.1164599563780713, "learning_rate": 1.458129604721294e-05, "loss": 0.7755, "step": 4473 }, { "epoch": 0.3681547006788727, "grad_norm": 2.1320393238304827, "learning_rate": 1.4578926735571683e-05, "loss": 0.8045, "step": 4474 }, { "epoch": 0.3682369882740177, "grad_norm": 0.48695037172098554, "learning_rate": 1.4576557098650796e-05, "loss": 0.5391, "step": 4475 }, { "epoch": 0.36831927586916274, "grad_norm": 1.5776679410827645, "learning_rate": 1.4574187136618611e-05, "loss": 0.7711, "step": 4476 }, { "epoch": 0.36840156346430775, "grad_norm": 1.983467140644944, "learning_rate": 1.4571816849643488e-05, "loss": 0.7422, "step": 4477 }, { "epoch": 0.3684838510594528, "grad_norm": 2.7952470061780335, "learning_rate": 1.4569446237893805e-05, "loss": 0.7629, "step": 4478 }, { "epoch": 0.3685661386545978, "grad_norm": 1.8951929397468077, "learning_rate": 1.4567075301537973e-05, "loss": 0.7608, "step": 4479 }, { "epoch": 0.36864842624974287, "grad_norm": 2.2644032786330666, "learning_rate": 1.4564704040744413e-05, "loss": 0.7731, "step": 4480 }, { "epoch": 0.36873071384488787, "grad_norm": 1.6308147304056042, "learning_rate": 1.4562332455681576e-05, "loss": 0.7718, "step": 4481 }, { "epoch": 0.36881300144003293, "grad_norm": 1.9187417357013663, "learning_rate": 1.4559960546517941e-05, "loss": 0.7499, "step": 4482 }, { "epoch": 0.36889528903517793, "grad_norm": 2.4970892054774243, "learning_rate": 1.4557588313422002e-05, "loss": 0.7722, "step": 4483 }, { "epoch": 0.368977576630323, "grad_norm": 2.035220067729166, "learning_rate": 1.4555215756562275e-05, "loss": 0.783, "step": 4484 }, { "epoch": 0.369059864225468, "grad_norm": 1.5326810812478324, "learning_rate": 1.455284287610731e-05, "loss": 0.7957, "step": 4485 }, { "epoch": 0.36914215182061305, "grad_norm": 9.804159766806016, "learning_rate": 1.4550469672225665e-05, "loss": 0.7803, "step": 4486 }, { "epoch": 0.36922443941575805, "grad_norm": 1.7776938498064865, "learning_rate": 1.454809614508593e-05, "loss": 0.7409, "step": 4487 }, { "epoch": 0.3693067270109031, "grad_norm": 1.4513063578619894, "learning_rate": 1.4545722294856721e-05, "loss": 0.7513, "step": 4488 }, { "epoch": 0.3693890146060481, "grad_norm": 3.5972151914030492, "learning_rate": 1.454334812170667e-05, "loss": 0.7692, "step": 4489 }, { "epoch": 0.3694713022011932, "grad_norm": 1.6586163446187947, "learning_rate": 1.4540973625804433e-05, "loss": 0.7604, "step": 4490 }, { "epoch": 0.3695535897963382, "grad_norm": 0.4313549997543267, "learning_rate": 1.4538598807318696e-05, "loss": 0.5467, "step": 4491 }, { "epoch": 0.36963587739148324, "grad_norm": 4.145955650814357, "learning_rate": 1.4536223666418155e-05, "loss": 0.7478, "step": 4492 }, { "epoch": 0.36971816498662824, "grad_norm": 1.7442126531231734, "learning_rate": 1.4533848203271537e-05, "loss": 0.8121, "step": 4493 }, { "epoch": 0.3698004525817733, "grad_norm": 1.6286417796899968, "learning_rate": 1.4531472418047598e-05, "loss": 0.7719, "step": 4494 }, { "epoch": 0.36988274017691836, "grad_norm": 1.6615117096187888, "learning_rate": 1.4529096310915102e-05, "loss": 0.7699, "step": 4495 }, { "epoch": 0.36996502777206336, "grad_norm": 1.4117787685747312, "learning_rate": 1.4526719882042848e-05, "loss": 0.7878, "step": 4496 }, { "epoch": 0.3700473153672084, "grad_norm": 1.638714013161808, "learning_rate": 1.4524343131599653e-05, "loss": 0.7376, "step": 4497 }, { "epoch": 0.3701296029623534, "grad_norm": 1.5965406796796069, "learning_rate": 1.452196605975436e-05, "loss": 0.7777, "step": 4498 }, { "epoch": 0.3702118905574985, "grad_norm": 1.5482838017669318, "learning_rate": 1.4519588666675827e-05, "loss": 0.7855, "step": 4499 }, { "epoch": 0.3702941781526435, "grad_norm": 1.8858369520853138, "learning_rate": 1.4517210952532947e-05, "loss": 0.7805, "step": 4500 }, { "epoch": 0.37037646574778854, "grad_norm": 1.4375744990660175, "learning_rate": 1.4514832917494621e-05, "loss": 0.7696, "step": 4501 }, { "epoch": 0.37045875334293354, "grad_norm": 2.347801549265926, "learning_rate": 1.4512454561729785e-05, "loss": 0.7445, "step": 4502 }, { "epoch": 0.3705410409380786, "grad_norm": 1.5609272987977516, "learning_rate": 1.4510075885407397e-05, "loss": 0.7722, "step": 4503 }, { "epoch": 0.3706233285332236, "grad_norm": 0.4317774207142754, "learning_rate": 1.4507696888696427e-05, "loss": 0.5321, "step": 4504 }, { "epoch": 0.37070561612836866, "grad_norm": 0.44896736781993724, "learning_rate": 1.4505317571765884e-05, "loss": 0.5395, "step": 4505 }, { "epoch": 0.37078790372351367, "grad_norm": 1.8858777948103396, "learning_rate": 1.4502937934784782e-05, "loss": 0.7858, "step": 4506 }, { "epoch": 0.3708701913186587, "grad_norm": 2.0328093636829405, "learning_rate": 1.4500557977922169e-05, "loss": 0.7787, "step": 4507 }, { "epoch": 0.37095247891380373, "grad_norm": 1.4327084093712712, "learning_rate": 1.449817770134712e-05, "loss": 0.7672, "step": 4508 }, { "epoch": 0.3710347665089488, "grad_norm": 0.43904804316605966, "learning_rate": 1.4495797105228717e-05, "loss": 0.5272, "step": 4509 }, { "epoch": 0.3711170541040938, "grad_norm": 1.8640721897885113, "learning_rate": 1.4493416189736078e-05, "loss": 0.7821, "step": 4510 }, { "epoch": 0.37119934169923885, "grad_norm": 2.1193011716107697, "learning_rate": 1.449103495503834e-05, "loss": 0.7917, "step": 4511 }, { "epoch": 0.37128162929438385, "grad_norm": 1.7976609236363605, "learning_rate": 1.4488653401304661e-05, "loss": 0.767, "step": 4512 }, { "epoch": 0.3713639168895289, "grad_norm": 2.129476666394086, "learning_rate": 1.4486271528704221e-05, "loss": 0.7577, "step": 4513 }, { "epoch": 0.3714462044846739, "grad_norm": 1.750353382913871, "learning_rate": 1.4483889337406229e-05, "loss": 0.7854, "step": 4514 }, { "epoch": 0.371528492079819, "grad_norm": 1.7298980126758945, "learning_rate": 1.4481506827579907e-05, "loss": 0.7681, "step": 4515 }, { "epoch": 0.371610779674964, "grad_norm": 1.8153783425434746, "learning_rate": 1.4479123999394511e-05, "loss": 0.7803, "step": 4516 }, { "epoch": 0.37169306727010903, "grad_norm": 1.7403068790709029, "learning_rate": 1.4476740853019306e-05, "loss": 0.7721, "step": 4517 }, { "epoch": 0.37177535486525404, "grad_norm": 0.4399208383339162, "learning_rate": 1.447435738862359e-05, "loss": 0.5243, "step": 4518 }, { "epoch": 0.3718576424603991, "grad_norm": 1.6715068973685927, "learning_rate": 1.4471973606376683e-05, "loss": 0.7726, "step": 4519 }, { "epoch": 0.3719399300555441, "grad_norm": 2.024714093194338, "learning_rate": 1.446958950644792e-05, "loss": 0.754, "step": 4520 }, { "epoch": 0.37202221765068916, "grad_norm": 1.4834201143071126, "learning_rate": 1.4467205089006669e-05, "loss": 0.7723, "step": 4521 }, { "epoch": 0.3721045052458342, "grad_norm": 1.8700387404842578, "learning_rate": 1.4464820354222313e-05, "loss": 0.7761, "step": 4522 }, { "epoch": 0.3721867928409792, "grad_norm": 1.6717258091113558, "learning_rate": 1.4462435302264258e-05, "loss": 0.7549, "step": 4523 }, { "epoch": 0.3722690804361243, "grad_norm": 1.4988403446389595, "learning_rate": 1.4460049933301936e-05, "loss": 0.7658, "step": 4524 }, { "epoch": 0.3723513680312693, "grad_norm": 1.9011931159728093, "learning_rate": 1.4457664247504801e-05, "loss": 0.7386, "step": 4525 }, { "epoch": 0.37243365562641434, "grad_norm": 1.5673821015214395, "learning_rate": 1.4455278245042324e-05, "loss": 0.7549, "step": 4526 }, { "epoch": 0.37251594322155934, "grad_norm": 7.012481079996853, "learning_rate": 1.4452891926084007e-05, "loss": 0.7986, "step": 4527 }, { "epoch": 0.3725982308167044, "grad_norm": 2.2057183131549345, "learning_rate": 1.445050529079937e-05, "loss": 0.782, "step": 4528 }, { "epoch": 0.3726805184118494, "grad_norm": 1.6008460838370742, "learning_rate": 1.4448118339357952e-05, "loss": 0.791, "step": 4529 }, { "epoch": 0.37276280600699446, "grad_norm": 1.505605241363267, "learning_rate": 1.4445731071929322e-05, "loss": 0.7968, "step": 4530 }, { "epoch": 0.37284509360213947, "grad_norm": 1.6138313139826725, "learning_rate": 1.444334348868307e-05, "loss": 0.7937, "step": 4531 }, { "epoch": 0.3729273811972845, "grad_norm": 0.4458667608434429, "learning_rate": 1.4440955589788799e-05, "loss": 0.5406, "step": 4532 }, { "epoch": 0.3730096687924295, "grad_norm": 0.432796137745458, "learning_rate": 1.4438567375416146e-05, "loss": 0.5296, "step": 4533 }, { "epoch": 0.3730919563875746, "grad_norm": 0.40766863777022705, "learning_rate": 1.4436178845734765e-05, "loss": 0.5185, "step": 4534 }, { "epoch": 0.3731742439827196, "grad_norm": 1.6234333459586188, "learning_rate": 1.4433790000914335e-05, "loss": 0.766, "step": 4535 }, { "epoch": 0.37325653157786465, "grad_norm": 1.589246175659433, "learning_rate": 1.443140084112455e-05, "loss": 0.8056, "step": 4536 }, { "epoch": 0.37333881917300965, "grad_norm": 0.4631045822690777, "learning_rate": 1.4429011366535141e-05, "loss": 0.5251, "step": 4537 }, { "epoch": 0.3734211067681547, "grad_norm": 2.483003333811014, "learning_rate": 1.4426621577315845e-05, "loss": 0.768, "step": 4538 }, { "epoch": 0.3735033943632997, "grad_norm": 1.5338530280671414, "learning_rate": 1.4424231473636433e-05, "loss": 0.7825, "step": 4539 }, { "epoch": 0.37358568195844477, "grad_norm": 0.45025629813446305, "learning_rate": 1.4421841055666692e-05, "loss": 0.5369, "step": 4540 }, { "epoch": 0.3736679695535898, "grad_norm": 1.4208958817575046, "learning_rate": 1.4419450323576433e-05, "loss": 0.7747, "step": 4541 }, { "epoch": 0.37375025714873483, "grad_norm": 1.5654795359049074, "learning_rate": 1.441705927753549e-05, "loss": 0.8072, "step": 4542 }, { "epoch": 0.37383254474387984, "grad_norm": 1.4134230661246265, "learning_rate": 1.4414667917713722e-05, "loss": 0.7878, "step": 4543 }, { "epoch": 0.3739148323390249, "grad_norm": 1.6379887262014758, "learning_rate": 1.4412276244281007e-05, "loss": 0.7817, "step": 4544 }, { "epoch": 0.3739971199341699, "grad_norm": 1.4450553066957816, "learning_rate": 1.4409884257407241e-05, "loss": 0.7891, "step": 4545 }, { "epoch": 0.37407940752931496, "grad_norm": 1.7623252183283458, "learning_rate": 1.4407491957262352e-05, "loss": 0.7584, "step": 4546 }, { "epoch": 0.37416169512446, "grad_norm": 2.3308199124710796, "learning_rate": 1.4405099344016283e-05, "loss": 0.7739, "step": 4547 }, { "epoch": 0.374243982719605, "grad_norm": 1.4816028141881152, "learning_rate": 1.4402706417838998e-05, "loss": 0.8105, "step": 4548 }, { "epoch": 0.3743262703147501, "grad_norm": 1.9583308216181488, "learning_rate": 1.4400313178900493e-05, "loss": 0.7764, "step": 4549 }, { "epoch": 0.3744085579098951, "grad_norm": 5.005498485553217, "learning_rate": 1.4397919627370778e-05, "loss": 0.7797, "step": 4550 }, { "epoch": 0.37449084550504014, "grad_norm": 1.8625426911477554, "learning_rate": 1.4395525763419887e-05, "loss": 0.7413, "step": 4551 }, { "epoch": 0.37457313310018514, "grad_norm": 0.43797451334491677, "learning_rate": 1.4393131587217872e-05, "loss": 0.5325, "step": 4552 }, { "epoch": 0.3746554206953302, "grad_norm": 1.7261389962629246, "learning_rate": 1.4390737098934814e-05, "loss": 0.7817, "step": 4553 }, { "epoch": 0.3747377082904752, "grad_norm": 0.432483155996718, "learning_rate": 1.4388342298740818e-05, "loss": 0.5266, "step": 4554 }, { "epoch": 0.37481999588562026, "grad_norm": 2.0048308136300412, "learning_rate": 1.4385947186806002e-05, "loss": 0.8069, "step": 4555 }, { "epoch": 0.37490228348076526, "grad_norm": 0.4266240039230243, "learning_rate": 1.4383551763300511e-05, "loss": 0.5414, "step": 4556 }, { "epoch": 0.3749845710759103, "grad_norm": 0.40765210161391374, "learning_rate": 1.4381156028394516e-05, "loss": 0.5189, "step": 4557 }, { "epoch": 0.3750668586710553, "grad_norm": 1.5829942825994772, "learning_rate": 1.43787599822582e-05, "loss": 0.7935, "step": 4558 }, { "epoch": 0.3751491462662004, "grad_norm": 1.8774020709371115, "learning_rate": 1.4376363625061777e-05, "loss": 0.7774, "step": 4559 }, { "epoch": 0.3752314338613454, "grad_norm": 2.0550537940410543, "learning_rate": 1.4373966956975485e-05, "loss": 0.7709, "step": 4560 }, { "epoch": 0.37531372145649045, "grad_norm": 1.8249255325108416, "learning_rate": 1.4371569978169573e-05, "loss": 0.7987, "step": 4561 }, { "epoch": 0.37539600905163545, "grad_norm": 1.541752537939895, "learning_rate": 1.4369172688814321e-05, "loss": 0.7597, "step": 4562 }, { "epoch": 0.3754782966467805, "grad_norm": 1.3961789137334686, "learning_rate": 1.4366775089080032e-05, "loss": 0.7543, "step": 4563 }, { "epoch": 0.3755605842419255, "grad_norm": 2.1105305033114408, "learning_rate": 1.4364377179137019e-05, "loss": 0.7893, "step": 4564 }, { "epoch": 0.37564287183707057, "grad_norm": 1.495692999681622, "learning_rate": 1.4361978959155634e-05, "loss": 0.7719, "step": 4565 }, { "epoch": 0.37572515943221557, "grad_norm": 1.4691891017741983, "learning_rate": 1.435958042930624e-05, "loss": 0.7936, "step": 4566 }, { "epoch": 0.37580744702736063, "grad_norm": 1.6181827872012398, "learning_rate": 1.4357181589759224e-05, "loss": 0.7743, "step": 4567 }, { "epoch": 0.37588973462250563, "grad_norm": 2.3181204703282168, "learning_rate": 1.4354782440684996e-05, "loss": 0.7959, "step": 4568 }, { "epoch": 0.3759720222176507, "grad_norm": 1.7137548161273988, "learning_rate": 1.4352382982253987e-05, "loss": 0.7952, "step": 4569 }, { "epoch": 0.3760543098127957, "grad_norm": 1.4865403220549387, "learning_rate": 1.4349983214636651e-05, "loss": 0.7595, "step": 4570 }, { "epoch": 0.37613659740794075, "grad_norm": 1.6103471768909214, "learning_rate": 1.4347583138003466e-05, "loss": 0.7806, "step": 4571 }, { "epoch": 0.3762188850030858, "grad_norm": 1.5131674019559864, "learning_rate": 1.4345182752524928e-05, "loss": 0.7636, "step": 4572 }, { "epoch": 0.3763011725982308, "grad_norm": 1.63268871631576, "learning_rate": 1.4342782058371556e-05, "loss": 0.7492, "step": 4573 }, { "epoch": 0.3763834601933759, "grad_norm": 2.061844552068988, "learning_rate": 1.434038105571389e-05, "loss": 0.7729, "step": 4574 }, { "epoch": 0.3764657477885209, "grad_norm": 0.5066430113274296, "learning_rate": 1.4337979744722499e-05, "loss": 0.5516, "step": 4575 }, { "epoch": 0.37654803538366594, "grad_norm": 1.6736737523868574, "learning_rate": 1.433557812556796e-05, "loss": 0.7661, "step": 4576 }, { "epoch": 0.37663032297881094, "grad_norm": 2.105534033243132, "learning_rate": 1.4333176198420886e-05, "loss": 0.7938, "step": 4577 }, { "epoch": 0.376712610573956, "grad_norm": 2.1355003778351414, "learning_rate": 1.4330773963451908e-05, "loss": 0.7819, "step": 4578 }, { "epoch": 0.376794898169101, "grad_norm": 1.4741029506595988, "learning_rate": 1.4328371420831671e-05, "loss": 0.7853, "step": 4579 }, { "epoch": 0.37687718576424606, "grad_norm": 1.5835573949313149, "learning_rate": 1.4325968570730848e-05, "loss": 0.7749, "step": 4580 }, { "epoch": 0.37695947335939106, "grad_norm": 0.44083094397088896, "learning_rate": 1.4323565413320142e-05, "loss": 0.5391, "step": 4581 }, { "epoch": 0.3770417609545361, "grad_norm": 1.6347567812423156, "learning_rate": 1.4321161948770259e-05, "loss": 0.7737, "step": 4582 }, { "epoch": 0.3771240485496811, "grad_norm": 1.8441033262743296, "learning_rate": 1.4318758177251942e-05, "loss": 0.7192, "step": 4583 }, { "epoch": 0.3772063361448262, "grad_norm": 0.4289571956899822, "learning_rate": 1.4316354098935954e-05, "loss": 0.5118, "step": 4584 }, { "epoch": 0.3772886237399712, "grad_norm": 1.7899605781408279, "learning_rate": 1.4313949713993071e-05, "loss": 0.7726, "step": 4585 }, { "epoch": 0.37737091133511624, "grad_norm": 1.56666651510149, "learning_rate": 1.4311545022594102e-05, "loss": 0.7793, "step": 4586 }, { "epoch": 0.37745319893026125, "grad_norm": 1.737668997898733, "learning_rate": 1.4309140024909866e-05, "loss": 0.8099, "step": 4587 }, { "epoch": 0.3775354865254063, "grad_norm": 1.589919808762366, "learning_rate": 1.4306734721111218e-05, "loss": 0.7804, "step": 4588 }, { "epoch": 0.3776177741205513, "grad_norm": 0.4244706060066968, "learning_rate": 1.4304329111369022e-05, "loss": 0.4932, "step": 4589 }, { "epoch": 0.37770006171569637, "grad_norm": 2.4329554443936816, "learning_rate": 1.4301923195854169e-05, "loss": 0.7891, "step": 4590 }, { "epoch": 0.37778234931084137, "grad_norm": 0.43773898887519175, "learning_rate": 1.429951697473757e-05, "loss": 0.5401, "step": 4591 }, { "epoch": 0.37786463690598643, "grad_norm": 1.7246753161721682, "learning_rate": 1.4297110448190165e-05, "loss": 0.8075, "step": 4592 }, { "epoch": 0.37794692450113143, "grad_norm": 2.01452624675174, "learning_rate": 1.4294703616382903e-05, "loss": 0.7609, "step": 4593 }, { "epoch": 0.3780292120962765, "grad_norm": 1.6966931260351583, "learning_rate": 1.4292296479486767e-05, "loss": 0.7696, "step": 4594 }, { "epoch": 0.3781114996914215, "grad_norm": 1.6082916245483474, "learning_rate": 1.4289889037672753e-05, "loss": 0.782, "step": 4595 }, { "epoch": 0.37819378728656655, "grad_norm": 0.43571892849028787, "learning_rate": 1.4287481291111883e-05, "loss": 0.5085, "step": 4596 }, { "epoch": 0.37827607488171155, "grad_norm": 1.6687021531709416, "learning_rate": 1.4285073239975196e-05, "loss": 0.791, "step": 4597 }, { "epoch": 0.3783583624768566, "grad_norm": 1.4674920894090417, "learning_rate": 1.4282664884433761e-05, "loss": 0.7724, "step": 4598 }, { "epoch": 0.37844065007200167, "grad_norm": 1.8231309511495206, "learning_rate": 1.4280256224658661e-05, "loss": 0.7819, "step": 4599 }, { "epoch": 0.3785229376671467, "grad_norm": 1.5851674784462597, "learning_rate": 1.4277847260821005e-05, "loss": 0.7643, "step": 4600 }, { "epoch": 0.37860522526229173, "grad_norm": 1.8373627682166949, "learning_rate": 1.427543799309192e-05, "loss": 0.8036, "step": 4601 }, { "epoch": 0.37868751285743674, "grad_norm": 1.8604288802938558, "learning_rate": 1.427302842164256e-05, "loss": 0.7754, "step": 4602 }, { "epoch": 0.3787698004525818, "grad_norm": 1.656448028035198, "learning_rate": 1.4270618546644091e-05, "loss": 0.8092, "step": 4603 }, { "epoch": 0.3788520880477268, "grad_norm": 0.43009165612102307, "learning_rate": 1.4268208368267713e-05, "loss": 0.5195, "step": 4604 }, { "epoch": 0.37893437564287186, "grad_norm": 1.584713294524119, "learning_rate": 1.4265797886684636e-05, "loss": 0.7956, "step": 4605 }, { "epoch": 0.37901666323801686, "grad_norm": 1.4611823928271321, "learning_rate": 1.42633871020661e-05, "loss": 0.7752, "step": 4606 }, { "epoch": 0.3790989508331619, "grad_norm": 1.5306387548194744, "learning_rate": 1.4260976014583365e-05, "loss": 0.7766, "step": 4607 }, { "epoch": 0.3791812384283069, "grad_norm": 1.6347143847078773, "learning_rate": 1.4258564624407707e-05, "loss": 0.7735, "step": 4608 }, { "epoch": 0.379263526023452, "grad_norm": 2.1087785704595596, "learning_rate": 1.4256152931710427e-05, "loss": 0.7548, "step": 4609 }, { "epoch": 0.379345813618597, "grad_norm": 0.4586510286645557, "learning_rate": 1.4253740936662851e-05, "loss": 0.542, "step": 4610 }, { "epoch": 0.37942810121374204, "grad_norm": 1.5829988483837198, "learning_rate": 1.425132863943632e-05, "loss": 0.7983, "step": 4611 }, { "epoch": 0.37951038880888704, "grad_norm": 0.42329751953786027, "learning_rate": 1.4248916040202204e-05, "loss": 0.527, "step": 4612 }, { "epoch": 0.3795926764040321, "grad_norm": 1.7726799396306343, "learning_rate": 1.4246503139131887e-05, "loss": 0.7798, "step": 4613 }, { "epoch": 0.3796749639991771, "grad_norm": 1.8816783912453918, "learning_rate": 1.4244089936396776e-05, "loss": 0.807, "step": 4614 }, { "epoch": 0.37975725159432216, "grad_norm": 1.548165423221472, "learning_rate": 1.4241676432168306e-05, "loss": 0.7617, "step": 4615 }, { "epoch": 0.37983953918946717, "grad_norm": 1.7317113706167404, "learning_rate": 1.4239262626617927e-05, "loss": 0.7868, "step": 4616 }, { "epoch": 0.3799218267846122, "grad_norm": 1.4057493354349178, "learning_rate": 1.4236848519917107e-05, "loss": 0.7612, "step": 4617 }, { "epoch": 0.38000411437975723, "grad_norm": 1.7705982163378193, "learning_rate": 1.4234434112237346e-05, "loss": 0.8089, "step": 4618 }, { "epoch": 0.3800864019749023, "grad_norm": 2.853516435647564, "learning_rate": 1.4232019403750157e-05, "loss": 0.7749, "step": 4619 }, { "epoch": 0.3801686895700473, "grad_norm": 1.559043015888748, "learning_rate": 1.422960439462708e-05, "loss": 0.772, "step": 4620 }, { "epoch": 0.38025097716519235, "grad_norm": 1.8879800923469299, "learning_rate": 1.4227189085039668e-05, "loss": 0.7691, "step": 4621 }, { "epoch": 0.38033326476033735, "grad_norm": 0.4256614959361152, "learning_rate": 1.4224773475159504e-05, "loss": 0.5531, "step": 4622 }, { "epoch": 0.3804155523554824, "grad_norm": 0.4200057556792914, "learning_rate": 1.4222357565158189e-05, "loss": 0.5464, "step": 4623 }, { "epoch": 0.38049783995062747, "grad_norm": 1.917363164103099, "learning_rate": 1.4219941355207347e-05, "loss": 0.7407, "step": 4624 }, { "epoch": 0.3805801275457725, "grad_norm": 1.4810251000145112, "learning_rate": 1.4217524845478618e-05, "loss": 0.8127, "step": 4625 }, { "epoch": 0.38066241514091753, "grad_norm": 1.4435143298392117, "learning_rate": 1.421510803614367e-05, "loss": 0.7905, "step": 4626 }, { "epoch": 0.38074470273606253, "grad_norm": 2.0985491595215184, "learning_rate": 1.4212690927374188e-05, "loss": 0.7816, "step": 4627 }, { "epoch": 0.3808269903312076, "grad_norm": 1.36587917935779, "learning_rate": 1.421027351934188e-05, "loss": 0.7809, "step": 4628 }, { "epoch": 0.3809092779263526, "grad_norm": 1.7137264620781636, "learning_rate": 1.4207855812218472e-05, "loss": 0.7994, "step": 4629 }, { "epoch": 0.38099156552149765, "grad_norm": 0.441510461339159, "learning_rate": 1.4205437806175721e-05, "loss": 0.4994, "step": 4630 }, { "epoch": 0.38107385311664266, "grad_norm": 1.3562536795580937, "learning_rate": 1.4203019501385391e-05, "loss": 0.7499, "step": 4631 }, { "epoch": 0.3811561407117877, "grad_norm": 0.42632945270690537, "learning_rate": 1.4200600898019276e-05, "loss": 0.5156, "step": 4632 }, { "epoch": 0.3812384283069327, "grad_norm": 1.6573555699915479, "learning_rate": 1.4198181996249196e-05, "loss": 0.817, "step": 4633 }, { "epoch": 0.3813207159020778, "grad_norm": 1.90762606509077, "learning_rate": 1.4195762796246976e-05, "loss": 0.7941, "step": 4634 }, { "epoch": 0.3814030034972228, "grad_norm": 1.459358643776078, "learning_rate": 1.4193343298184479e-05, "loss": 0.8, "step": 4635 }, { "epoch": 0.38148529109236784, "grad_norm": 1.5081872207107532, "learning_rate": 1.4190923502233583e-05, "loss": 0.7832, "step": 4636 }, { "epoch": 0.38156757868751284, "grad_norm": 1.430695030854491, "learning_rate": 1.4188503408566179e-05, "loss": 0.7656, "step": 4637 }, { "epoch": 0.3816498662826579, "grad_norm": 1.49802076909413, "learning_rate": 1.4186083017354194e-05, "loss": 0.7822, "step": 4638 }, { "epoch": 0.3817321538778029, "grad_norm": 1.4754487085953332, "learning_rate": 1.4183662328769568e-05, "loss": 0.7757, "step": 4639 }, { "epoch": 0.38181444147294796, "grad_norm": 1.5712279624439285, "learning_rate": 1.4181241342984255e-05, "loss": 0.7508, "step": 4640 }, { "epoch": 0.38189672906809297, "grad_norm": 2.4526469485793667, "learning_rate": 1.417882006017025e-05, "loss": 0.7915, "step": 4641 }, { "epoch": 0.381979016663238, "grad_norm": 2.1003872646327926, "learning_rate": 1.4176398480499548e-05, "loss": 0.7621, "step": 4642 }, { "epoch": 0.382061304258383, "grad_norm": 1.2032367702521385, "learning_rate": 1.4173976604144177e-05, "loss": 0.7571, "step": 4643 }, { "epoch": 0.3821435918535281, "grad_norm": 1.4285144802391894, "learning_rate": 1.4171554431276184e-05, "loss": 0.8014, "step": 4644 }, { "epoch": 0.3822258794486731, "grad_norm": 1.4220750148782317, "learning_rate": 1.4169131962067636e-05, "loss": 0.7578, "step": 4645 }, { "epoch": 0.38230816704381815, "grad_norm": 0.4455145286244249, "learning_rate": 1.416670919669062e-05, "loss": 0.5599, "step": 4646 }, { "epoch": 0.38239045463896315, "grad_norm": 1.3999354856030055, "learning_rate": 1.4164286135317246e-05, "loss": 0.8006, "step": 4647 }, { "epoch": 0.3824727422341082, "grad_norm": 1.616631495560293, "learning_rate": 1.4161862778119648e-05, "loss": 0.7697, "step": 4648 }, { "epoch": 0.3825550298292532, "grad_norm": 1.5018858547569893, "learning_rate": 1.4159439125269971e-05, "loss": 0.7915, "step": 4649 }, { "epoch": 0.38263731742439827, "grad_norm": 0.45941714058641875, "learning_rate": 1.415701517694039e-05, "loss": 0.5196, "step": 4650 }, { "epoch": 0.38271960501954333, "grad_norm": 1.6684908069043642, "learning_rate": 1.4154590933303101e-05, "loss": 0.7726, "step": 4651 }, { "epoch": 0.38280189261468833, "grad_norm": 0.43372533774078476, "learning_rate": 1.4152166394530315e-05, "loss": 0.5096, "step": 4652 }, { "epoch": 0.3828841802098334, "grad_norm": 0.4234684263881324, "learning_rate": 1.414974156079427e-05, "loss": 0.5302, "step": 4653 }, { "epoch": 0.3829664678049784, "grad_norm": 1.1404665622376478, "learning_rate": 1.4147316432267221e-05, "loss": 0.7692, "step": 4654 }, { "epoch": 0.38304875540012345, "grad_norm": 1.3798957686446907, "learning_rate": 1.4144891009121445e-05, "loss": 0.7961, "step": 4655 }, { "epoch": 0.38313104299526846, "grad_norm": 0.40309752757542106, "learning_rate": 1.4142465291529242e-05, "loss": 0.5341, "step": 4656 }, { "epoch": 0.3832133305904135, "grad_norm": 1.375260305651529, "learning_rate": 1.4140039279662925e-05, "loss": 0.7648, "step": 4657 }, { "epoch": 0.3832956181855585, "grad_norm": 1.4117972630068905, "learning_rate": 1.4137612973694843e-05, "loss": 0.7884, "step": 4658 }, { "epoch": 0.3833779057807036, "grad_norm": 1.2109296707377057, "learning_rate": 1.4135186373797352e-05, "loss": 0.7326, "step": 4659 }, { "epoch": 0.3834601933758486, "grad_norm": 1.4551816177875234, "learning_rate": 1.4132759480142833e-05, "loss": 0.7792, "step": 4660 }, { "epoch": 0.38354248097099364, "grad_norm": 1.7120630918341764, "learning_rate": 1.4130332292903688e-05, "loss": 0.7886, "step": 4661 }, { "epoch": 0.38362476856613864, "grad_norm": 1.4394662645048832, "learning_rate": 1.4127904812252346e-05, "loss": 0.8079, "step": 4662 }, { "epoch": 0.3837070561612837, "grad_norm": 1.503221658017532, "learning_rate": 1.4125477038361246e-05, "loss": 0.7873, "step": 4663 }, { "epoch": 0.3837893437564287, "grad_norm": 1.4878693585300635, "learning_rate": 1.4123048971402856e-05, "loss": 0.7936, "step": 4664 }, { "epoch": 0.38387163135157376, "grad_norm": 1.3540421146847554, "learning_rate": 1.4120620611549658e-05, "loss": 0.8116, "step": 4665 }, { "epoch": 0.38395391894671876, "grad_norm": 0.442982182343593, "learning_rate": 1.4118191958974165e-05, "loss": 0.5275, "step": 4666 }, { "epoch": 0.3840362065418638, "grad_norm": 0.4257828491973008, "learning_rate": 1.4115763013848897e-05, "loss": 0.5258, "step": 4667 }, { "epoch": 0.3841184941370088, "grad_norm": 0.42064364777225555, "learning_rate": 1.4113333776346414e-05, "loss": 0.5117, "step": 4668 }, { "epoch": 0.3842007817321539, "grad_norm": 1.4756658546731998, "learning_rate": 1.4110904246639272e-05, "loss": 0.7734, "step": 4669 }, { "epoch": 0.3842830693272989, "grad_norm": 1.5890994209434663, "learning_rate": 1.4108474424900067e-05, "loss": 0.7915, "step": 4670 }, { "epoch": 0.38436535692244395, "grad_norm": 1.3176833024362846, "learning_rate": 1.4106044311301412e-05, "loss": 0.7731, "step": 4671 }, { "epoch": 0.38444764451758895, "grad_norm": 1.448650646202771, "learning_rate": 1.4103613906015935e-05, "loss": 0.7932, "step": 4672 }, { "epoch": 0.384529932112734, "grad_norm": 0.45435234039854683, "learning_rate": 1.4101183209216287e-05, "loss": 0.5304, "step": 4673 }, { "epoch": 0.384612219707879, "grad_norm": 1.419300408601491, "learning_rate": 1.4098752221075147e-05, "loss": 0.7779, "step": 4674 }, { "epoch": 0.38469450730302407, "grad_norm": 1.7050156449118206, "learning_rate": 1.4096320941765197e-05, "loss": 0.7375, "step": 4675 }, { "epoch": 0.3847767948981691, "grad_norm": 1.3631532898791363, "learning_rate": 1.4093889371459164e-05, "loss": 0.8061, "step": 4676 }, { "epoch": 0.38485908249331413, "grad_norm": 1.3952806505126574, "learning_rate": 1.4091457510329778e-05, "loss": 0.7942, "step": 4677 }, { "epoch": 0.3849413700884592, "grad_norm": 1.54950606552771, "learning_rate": 1.408902535854979e-05, "loss": 0.7743, "step": 4678 }, { "epoch": 0.3850236576836042, "grad_norm": 1.5343388971524679, "learning_rate": 1.4086592916291982e-05, "loss": 0.7701, "step": 4679 }, { "epoch": 0.38510594527874925, "grad_norm": 1.4484528580185516, "learning_rate": 1.408416018372915e-05, "loss": 0.7603, "step": 4680 }, { "epoch": 0.38518823287389425, "grad_norm": 0.4376941943244993, "learning_rate": 1.4081727161034109e-05, "loss": 0.5332, "step": 4681 }, { "epoch": 0.3852705204690393, "grad_norm": 1.3099462046946182, "learning_rate": 1.4079293848379696e-05, "loss": 0.7864, "step": 4682 }, { "epoch": 0.3853528080641843, "grad_norm": 1.391529083314694, "learning_rate": 1.4076860245938775e-05, "loss": 0.8025, "step": 4683 }, { "epoch": 0.3854350956593294, "grad_norm": 0.43217526617783636, "learning_rate": 1.407442635388422e-05, "loss": 0.5265, "step": 4684 }, { "epoch": 0.3855173832544744, "grad_norm": 2.4922892569291593, "learning_rate": 1.4071992172388933e-05, "loss": 0.7863, "step": 4685 }, { "epoch": 0.38559967084961944, "grad_norm": 1.5080506927939803, "learning_rate": 1.4069557701625836e-05, "loss": 0.7598, "step": 4686 }, { "epoch": 0.38568195844476444, "grad_norm": 0.43969057217864205, "learning_rate": 1.4067122941767868e-05, "loss": 0.5443, "step": 4687 }, { "epoch": 0.3857642460399095, "grad_norm": 1.5561662928790831, "learning_rate": 1.4064687892987987e-05, "loss": 0.7576, "step": 4688 }, { "epoch": 0.3858465336350545, "grad_norm": 1.6693651171520447, "learning_rate": 1.4062252555459183e-05, "loss": 0.7869, "step": 4689 }, { "epoch": 0.38592882123019956, "grad_norm": 1.4932354265644647, "learning_rate": 1.4059816929354452e-05, "loss": 0.7757, "step": 4690 }, { "epoch": 0.38601110882534456, "grad_norm": 1.6801554967119523, "learning_rate": 1.405738101484682e-05, "loss": 0.7544, "step": 4691 }, { "epoch": 0.3860933964204896, "grad_norm": 0.40610245923903565, "learning_rate": 1.405494481210933e-05, "loss": 0.4992, "step": 4692 }, { "epoch": 0.3861756840156346, "grad_norm": 1.4279030928712846, "learning_rate": 1.4052508321315043e-05, "loss": 0.7706, "step": 4693 }, { "epoch": 0.3862579716107797, "grad_norm": 1.5691049745726078, "learning_rate": 1.405007154263705e-05, "loss": 0.7752, "step": 4694 }, { "epoch": 0.3863402592059247, "grad_norm": 1.3813306660772908, "learning_rate": 1.404763447624845e-05, "loss": 0.7697, "step": 4695 }, { "epoch": 0.38642254680106974, "grad_norm": 1.7524155707894806, "learning_rate": 1.4045197122322366e-05, "loss": 0.7711, "step": 4696 }, { "epoch": 0.38650483439621475, "grad_norm": 1.618016021200021, "learning_rate": 1.4042759481031954e-05, "loss": 0.7865, "step": 4697 }, { "epoch": 0.3865871219913598, "grad_norm": 1.3578284028285228, "learning_rate": 1.4040321552550368e-05, "loss": 0.7681, "step": 4698 }, { "epoch": 0.3866694095865048, "grad_norm": 4.092460781356398, "learning_rate": 1.4037883337050803e-05, "loss": 0.7714, "step": 4699 }, { "epoch": 0.38675169718164987, "grad_norm": 1.4719471939488527, "learning_rate": 1.4035444834706466e-05, "loss": 0.7892, "step": 4700 }, { "epoch": 0.3868339847767949, "grad_norm": 1.4608554046728215, "learning_rate": 1.4033006045690577e-05, "loss": 0.7578, "step": 4701 }, { "epoch": 0.38691627237193993, "grad_norm": 1.223396737139375, "learning_rate": 1.403056697017639e-05, "loss": 0.7707, "step": 4702 }, { "epoch": 0.386998559967085, "grad_norm": 1.317923428108782, "learning_rate": 1.4028127608337175e-05, "loss": 0.7717, "step": 4703 }, { "epoch": 0.38708084756223, "grad_norm": 1.3819884068481956, "learning_rate": 1.4025687960346214e-05, "loss": 0.7872, "step": 4704 }, { "epoch": 0.38716313515737505, "grad_norm": 1.96519181229529, "learning_rate": 1.4023248026376817e-05, "loss": 0.785, "step": 4705 }, { "epoch": 0.38724542275252005, "grad_norm": 1.5293915817900865, "learning_rate": 1.4020807806602317e-05, "loss": 0.782, "step": 4706 }, { "epoch": 0.3873277103476651, "grad_norm": 1.3503938845969048, "learning_rate": 1.4018367301196059e-05, "loss": 0.7628, "step": 4707 }, { "epoch": 0.3874099979428101, "grad_norm": 2.46831478516891, "learning_rate": 1.4015926510331415e-05, "loss": 0.7582, "step": 4708 }, { "epoch": 0.3874922855379552, "grad_norm": 1.512639308351756, "learning_rate": 1.4013485434181775e-05, "loss": 0.8096, "step": 4709 }, { "epoch": 0.3875745731331002, "grad_norm": 0.4787808359261194, "learning_rate": 1.4011044072920545e-05, "loss": 0.5421, "step": 4710 }, { "epoch": 0.38765686072824523, "grad_norm": 1.841445502457464, "learning_rate": 1.4008602426721162e-05, "loss": 0.7505, "step": 4711 }, { "epoch": 0.38773914832339024, "grad_norm": 1.9793074512100168, "learning_rate": 1.4006160495757075e-05, "loss": 0.8026, "step": 4712 }, { "epoch": 0.3878214359185353, "grad_norm": 0.41820306154431286, "learning_rate": 1.4003718280201749e-05, "loss": 0.529, "step": 4713 }, { "epoch": 0.3879037235136803, "grad_norm": 2.1922234268617027, "learning_rate": 1.4001275780228681e-05, "loss": 0.7875, "step": 4714 }, { "epoch": 0.38798601110882536, "grad_norm": 1.71731266852407, "learning_rate": 1.399883299601138e-05, "loss": 0.7809, "step": 4715 }, { "epoch": 0.38806829870397036, "grad_norm": 1.6436657501427487, "learning_rate": 1.399638992772338e-05, "loss": 0.7546, "step": 4716 }, { "epoch": 0.3881505862991154, "grad_norm": 2.0600468316771163, "learning_rate": 1.3993946575538231e-05, "loss": 0.7787, "step": 4717 }, { "epoch": 0.3882328738942604, "grad_norm": 1.7074996919331196, "learning_rate": 1.3991502939629502e-05, "loss": 0.7736, "step": 4718 }, { "epoch": 0.3883151614894055, "grad_norm": 0.44896731408123147, "learning_rate": 1.398905902017079e-05, "loss": 0.5334, "step": 4719 }, { "epoch": 0.3883974490845505, "grad_norm": 2.2707396136393876, "learning_rate": 1.3986614817335704e-05, "loss": 0.7572, "step": 4720 }, { "epoch": 0.38847973667969554, "grad_norm": 1.7804022965363049, "learning_rate": 1.3984170331297878e-05, "loss": 0.7947, "step": 4721 }, { "epoch": 0.38856202427484055, "grad_norm": 2.0217448209881135, "learning_rate": 1.3981725562230958e-05, "loss": 0.7654, "step": 4722 }, { "epoch": 0.3886443118699856, "grad_norm": 2.394829419690257, "learning_rate": 1.397928051030863e-05, "loss": 0.7892, "step": 4723 }, { "epoch": 0.3887265994651306, "grad_norm": 2.176981887296413, "learning_rate": 1.3976835175704575e-05, "loss": 0.7702, "step": 4724 }, { "epoch": 0.38880888706027567, "grad_norm": 1.8281059093883796, "learning_rate": 1.3974389558592507e-05, "loss": 0.7785, "step": 4725 }, { "epoch": 0.38889117465542067, "grad_norm": 1.667697252251999, "learning_rate": 1.3971943659146162e-05, "loss": 0.798, "step": 4726 }, { "epoch": 0.3889734622505657, "grad_norm": 3.194609710321003, "learning_rate": 1.3969497477539294e-05, "loss": 0.7882, "step": 4727 }, { "epoch": 0.3890557498457108, "grad_norm": 2.0144735792488864, "learning_rate": 1.3967051013945672e-05, "loss": 0.7874, "step": 4728 }, { "epoch": 0.3891380374408558, "grad_norm": 0.4336243517562767, "learning_rate": 1.396460426853909e-05, "loss": 0.5518, "step": 4729 }, { "epoch": 0.38922032503600085, "grad_norm": 1.8817602468966521, "learning_rate": 1.3962157241493361e-05, "loss": 0.7682, "step": 4730 }, { "epoch": 0.38930261263114585, "grad_norm": 1.867001888964424, "learning_rate": 1.395970993298232e-05, "loss": 0.8407, "step": 4731 }, { "epoch": 0.3893849002262909, "grad_norm": 1.849096345409507, "learning_rate": 1.3957262343179815e-05, "loss": 0.7818, "step": 4732 }, { "epoch": 0.3894671878214359, "grad_norm": 1.747856622988127, "learning_rate": 1.3954814472259724e-05, "loss": 0.7259, "step": 4733 }, { "epoch": 0.38954947541658097, "grad_norm": 1.5758884339815993, "learning_rate": 1.3952366320395936e-05, "loss": 0.7638, "step": 4734 }, { "epoch": 0.389631763011726, "grad_norm": 2.3860119040884213, "learning_rate": 1.3949917887762367e-05, "loss": 0.7582, "step": 4735 }, { "epoch": 0.38971405060687103, "grad_norm": 1.7462878572225637, "learning_rate": 1.3947469174532948e-05, "loss": 0.7564, "step": 4736 }, { "epoch": 0.38979633820201604, "grad_norm": 1.6596576039588808, "learning_rate": 1.3945020180881632e-05, "loss": 0.7733, "step": 4737 }, { "epoch": 0.3898786257971611, "grad_norm": 1.62926660004808, "learning_rate": 1.394257090698239e-05, "loss": 0.8072, "step": 4738 }, { "epoch": 0.3899609133923061, "grad_norm": 1.809049284714984, "learning_rate": 1.3940121353009217e-05, "loss": 0.7642, "step": 4739 }, { "epoch": 0.39004320098745116, "grad_norm": 1.775395529870585, "learning_rate": 1.3937671519136127e-05, "loss": 0.7894, "step": 4740 }, { "epoch": 0.39012548858259616, "grad_norm": 1.6312728683199293, "learning_rate": 1.3935221405537145e-05, "loss": 0.8085, "step": 4741 }, { "epoch": 0.3902077761777412, "grad_norm": 2.3350774301403154, "learning_rate": 1.3932771012386331e-05, "loss": 0.8029, "step": 4742 }, { "epoch": 0.3902900637728862, "grad_norm": 1.460942446929902, "learning_rate": 1.3930320339857753e-05, "loss": 0.7773, "step": 4743 }, { "epoch": 0.3903723513680313, "grad_norm": 1.7378370895545514, "learning_rate": 1.3927869388125504e-05, "loss": 0.7826, "step": 4744 }, { "epoch": 0.3904546389631763, "grad_norm": 1.731763812909668, "learning_rate": 1.3925418157363693e-05, "loss": 0.7758, "step": 4745 }, { "epoch": 0.39053692655832134, "grad_norm": 1.9791135260178654, "learning_rate": 1.3922966647746456e-05, "loss": 0.7604, "step": 4746 }, { "epoch": 0.39061921415346634, "grad_norm": 1.9208066343833679, "learning_rate": 1.3920514859447943e-05, "loss": 0.8044, "step": 4747 }, { "epoch": 0.3907015017486114, "grad_norm": 1.8190125542396773, "learning_rate": 1.3918062792642322e-05, "loss": 0.7723, "step": 4748 }, { "epoch": 0.3907837893437564, "grad_norm": 1.6378758273832892, "learning_rate": 1.391561044750379e-05, "loss": 0.7932, "step": 4749 }, { "epoch": 0.39086607693890146, "grad_norm": 1.5833266449769714, "learning_rate": 1.391315782420655e-05, "loss": 0.7398, "step": 4750 }, { "epoch": 0.39094836453404647, "grad_norm": 3.6267223924593806, "learning_rate": 1.3910704922924836e-05, "loss": 0.7919, "step": 4751 }, { "epoch": 0.3910306521291915, "grad_norm": 1.8648929003549561, "learning_rate": 1.39082517438329e-05, "loss": 0.7559, "step": 4752 }, { "epoch": 0.3911129397243366, "grad_norm": 1.7769186522984872, "learning_rate": 1.390579828710501e-05, "loss": 0.7853, "step": 4753 }, { "epoch": 0.3911952273194816, "grad_norm": 1.7015576774330343, "learning_rate": 1.3903344552915457e-05, "loss": 0.768, "step": 4754 }, { "epoch": 0.39127751491462665, "grad_norm": 1.633530944077053, "learning_rate": 1.390089054143855e-05, "loss": 0.7719, "step": 4755 }, { "epoch": 0.39135980250977165, "grad_norm": 1.75285757945078, "learning_rate": 1.3898436252848617e-05, "loss": 0.7993, "step": 4756 }, { "epoch": 0.3914420901049167, "grad_norm": 1.6549874434880767, "learning_rate": 1.3895981687320006e-05, "loss": 0.7753, "step": 4757 }, { "epoch": 0.3915243777000617, "grad_norm": 1.8798099701099278, "learning_rate": 1.389352684502709e-05, "loss": 0.7499, "step": 4758 }, { "epoch": 0.39160666529520677, "grad_norm": 1.8933438995047365, "learning_rate": 1.389107172614425e-05, "loss": 0.7842, "step": 4759 }, { "epoch": 0.39168895289035177, "grad_norm": 1.5117186427168021, "learning_rate": 1.3888616330845897e-05, "loss": 0.7415, "step": 4760 }, { "epoch": 0.39177124048549683, "grad_norm": 4.377633633758451, "learning_rate": 1.3886160659306463e-05, "loss": 0.7919, "step": 4761 }, { "epoch": 0.39185352808064183, "grad_norm": 1.7648356133107233, "learning_rate": 1.3883704711700387e-05, "loss": 0.7718, "step": 4762 }, { "epoch": 0.3919358156757869, "grad_norm": 1.332711715229098, "learning_rate": 1.3881248488202138e-05, "loss": 0.7658, "step": 4763 }, { "epoch": 0.3920181032709319, "grad_norm": 2.8899492462415983, "learning_rate": 1.3878791988986208e-05, "loss": 0.7515, "step": 4764 }, { "epoch": 0.39210039086607695, "grad_norm": 1.5841886588796068, "learning_rate": 1.3876335214227098e-05, "loss": 0.7801, "step": 4765 }, { "epoch": 0.39218267846122196, "grad_norm": 0.4287991207533434, "learning_rate": 1.3873878164099331e-05, "loss": 0.5361, "step": 4766 }, { "epoch": 0.392264966056367, "grad_norm": 1.7828845456243445, "learning_rate": 1.3871420838777456e-05, "loss": 0.7549, "step": 4767 }, { "epoch": 0.392347253651512, "grad_norm": 1.883930819084228, "learning_rate": 1.3868963238436035e-05, "loss": 0.7893, "step": 4768 }, { "epoch": 0.3924295412466571, "grad_norm": 1.7613580118582424, "learning_rate": 1.3866505363249651e-05, "loss": 0.7761, "step": 4769 }, { "epoch": 0.3925118288418021, "grad_norm": 0.42030830073722, "learning_rate": 1.3864047213392916e-05, "loss": 0.4859, "step": 4770 }, { "epoch": 0.39259411643694714, "grad_norm": 3.439379003095375, "learning_rate": 1.3861588789040442e-05, "loss": 0.7925, "step": 4771 }, { "epoch": 0.39267640403209214, "grad_norm": 2.911765158971951, "learning_rate": 1.3859130090366877e-05, "loss": 0.7733, "step": 4772 }, { "epoch": 0.3927586916272372, "grad_norm": 1.846601442536393, "learning_rate": 1.385667111754688e-05, "loss": 0.7616, "step": 4773 }, { "epoch": 0.3928409792223822, "grad_norm": 1.747058906293054, "learning_rate": 1.3854211870755139e-05, "loss": 0.7706, "step": 4774 }, { "epoch": 0.39292326681752726, "grad_norm": 1.6299233156124149, "learning_rate": 1.385175235016635e-05, "loss": 0.7786, "step": 4775 }, { "epoch": 0.39300555441267226, "grad_norm": 1.4978035079756256, "learning_rate": 1.384929255595523e-05, "loss": 0.7926, "step": 4776 }, { "epoch": 0.3930878420078173, "grad_norm": 1.6306867114241461, "learning_rate": 1.3846832488296524e-05, "loss": 0.7695, "step": 4777 }, { "epoch": 0.3931701296029623, "grad_norm": 0.4536405273178374, "learning_rate": 1.3844372147364992e-05, "loss": 0.5241, "step": 4778 }, { "epoch": 0.3932524171981074, "grad_norm": 1.7178068525189172, "learning_rate": 1.384191153333541e-05, "loss": 0.7742, "step": 4779 }, { "epoch": 0.39333470479325244, "grad_norm": 1.9137628042469057, "learning_rate": 1.3839450646382577e-05, "loss": 0.7531, "step": 4780 }, { "epoch": 0.39341699238839745, "grad_norm": 1.810913371509303, "learning_rate": 1.3836989486681311e-05, "loss": 0.7618, "step": 4781 }, { "epoch": 0.3934992799835425, "grad_norm": 1.7017193906277954, "learning_rate": 1.3834528054406447e-05, "loss": 0.7683, "step": 4782 }, { "epoch": 0.3935815675786875, "grad_norm": 1.9713949746257746, "learning_rate": 1.3832066349732843e-05, "loss": 0.7761, "step": 4783 }, { "epoch": 0.39366385517383257, "grad_norm": 1.6119526302159937, "learning_rate": 1.3829604372835377e-05, "loss": 0.7604, "step": 4784 }, { "epoch": 0.39374614276897757, "grad_norm": 1.8727639953963953, "learning_rate": 1.3827142123888936e-05, "loss": 0.8005, "step": 4785 }, { "epoch": 0.39382843036412263, "grad_norm": 0.43477428222626374, "learning_rate": 1.382467960306844e-05, "loss": 0.5363, "step": 4786 }, { "epoch": 0.39391071795926763, "grad_norm": 1.4108350338929352, "learning_rate": 1.3822216810548822e-05, "loss": 0.7755, "step": 4787 }, { "epoch": 0.3939930055544127, "grad_norm": 0.4476186386981923, "learning_rate": 1.3819753746505038e-05, "loss": 0.492, "step": 4788 }, { "epoch": 0.3940752931495577, "grad_norm": 1.5801613370282777, "learning_rate": 1.3817290411112052e-05, "loss": 0.8049, "step": 4789 }, { "epoch": 0.39415758074470275, "grad_norm": 1.5624580377664357, "learning_rate": 1.3814826804544863e-05, "loss": 0.7701, "step": 4790 }, { "epoch": 0.39423986833984775, "grad_norm": 1.5718463792858444, "learning_rate": 1.3812362926978478e-05, "loss": 0.8046, "step": 4791 }, { "epoch": 0.3943221559349928, "grad_norm": 1.5522539374335746, "learning_rate": 1.3809898778587927e-05, "loss": 0.7409, "step": 4792 }, { "epoch": 0.3944044435301378, "grad_norm": 2.0762750291167875, "learning_rate": 1.380743435954826e-05, "loss": 0.7785, "step": 4793 }, { "epoch": 0.3944867311252829, "grad_norm": 1.8873388178236636, "learning_rate": 1.3804969670034545e-05, "loss": 0.7728, "step": 4794 }, { "epoch": 0.3945690187204279, "grad_norm": 1.7509046034453075, "learning_rate": 1.380250471022187e-05, "loss": 0.765, "step": 4795 }, { "epoch": 0.39465130631557294, "grad_norm": 1.3241241903003864, "learning_rate": 1.3800039480285343e-05, "loss": 0.7607, "step": 4796 }, { "epoch": 0.39473359391071794, "grad_norm": 1.4133930458860324, "learning_rate": 1.3797573980400088e-05, "loss": 0.7739, "step": 4797 }, { "epoch": 0.394815881505863, "grad_norm": 0.42795374263489266, "learning_rate": 1.3795108210741248e-05, "loss": 0.5007, "step": 4798 }, { "epoch": 0.394898169101008, "grad_norm": 1.527365234078468, "learning_rate": 1.3792642171483994e-05, "loss": 0.7856, "step": 4799 }, { "epoch": 0.39498045669615306, "grad_norm": 1.6491814501007112, "learning_rate": 1.3790175862803504e-05, "loss": 0.7829, "step": 4800 }, { "epoch": 0.39506274429129806, "grad_norm": 2.122180018875669, "learning_rate": 1.378770928487498e-05, "loss": 0.7627, "step": 4801 }, { "epoch": 0.3951450318864431, "grad_norm": 1.8021052844293737, "learning_rate": 1.378524243787365e-05, "loss": 0.7754, "step": 4802 }, { "epoch": 0.3952273194815881, "grad_norm": 1.9893859162041378, "learning_rate": 1.3782775321974746e-05, "loss": 0.7531, "step": 4803 }, { "epoch": 0.3953096070767332, "grad_norm": 2.9921955103282523, "learning_rate": 1.378030793735354e-05, "loss": 0.7578, "step": 4804 }, { "epoch": 0.39539189467187824, "grad_norm": 1.748968558557209, "learning_rate": 1.3777840284185295e-05, "loss": 0.7807, "step": 4805 }, { "epoch": 0.39547418226702324, "grad_norm": 1.4623425341945417, "learning_rate": 1.3775372362645324e-05, "loss": 0.7607, "step": 4806 }, { "epoch": 0.3955564698621683, "grad_norm": 0.4242403740438736, "learning_rate": 1.3772904172908936e-05, "loss": 0.5264, "step": 4807 }, { "epoch": 0.3956387574573133, "grad_norm": 1.921682656195057, "learning_rate": 1.377043571515147e-05, "loss": 0.7799, "step": 4808 }, { "epoch": 0.39572104505245836, "grad_norm": 0.4151638734851008, "learning_rate": 1.376796698954828e-05, "loss": 0.5419, "step": 4809 }, { "epoch": 0.39580333264760337, "grad_norm": 1.7046813432671464, "learning_rate": 1.3765497996274744e-05, "loss": 0.801, "step": 4810 }, { "epoch": 0.3958856202427484, "grad_norm": 1.739406298158605, "learning_rate": 1.3763028735506247e-05, "loss": 0.7916, "step": 4811 }, { "epoch": 0.39596790783789343, "grad_norm": 2.2464481204144375, "learning_rate": 1.3760559207418209e-05, "loss": 0.773, "step": 4812 }, { "epoch": 0.3960501954330385, "grad_norm": 1.996458080137954, "learning_rate": 1.3758089412186062e-05, "loss": 0.7685, "step": 4813 }, { "epoch": 0.3961324830281835, "grad_norm": 0.43619925736944076, "learning_rate": 1.375561934998525e-05, "loss": 0.5418, "step": 4814 }, { "epoch": 0.39621477062332855, "grad_norm": 2.0310487198193887, "learning_rate": 1.3753149020991248e-05, "loss": 0.7547, "step": 4815 }, { "epoch": 0.39629705821847355, "grad_norm": 1.765975567326183, "learning_rate": 1.375067842537954e-05, "loss": 0.7551, "step": 4816 }, { "epoch": 0.3963793458136186, "grad_norm": 1.6822865641220783, "learning_rate": 1.3748207563325635e-05, "loss": 0.8024, "step": 4817 }, { "epoch": 0.3964616334087636, "grad_norm": 1.5848718906882158, "learning_rate": 1.3745736435005059e-05, "loss": 0.7821, "step": 4818 }, { "epoch": 0.3965439210039087, "grad_norm": 1.9770200734068921, "learning_rate": 1.3743265040593358e-05, "loss": 0.7795, "step": 4819 }, { "epoch": 0.3966262085990537, "grad_norm": 2.105318678727579, "learning_rate": 1.3740793380266095e-05, "loss": 0.7769, "step": 4820 }, { "epoch": 0.39670849619419873, "grad_norm": 1.9266919948005554, "learning_rate": 1.373832145419885e-05, "loss": 0.7715, "step": 4821 }, { "epoch": 0.39679078378934374, "grad_norm": 2.383660140396743, "learning_rate": 1.3735849262567231e-05, "loss": 0.785, "step": 4822 }, { "epoch": 0.3968730713844888, "grad_norm": 1.8308391472082335, "learning_rate": 1.3733376805546855e-05, "loss": 0.7703, "step": 4823 }, { "epoch": 0.3969553589796338, "grad_norm": 0.42636535497538935, "learning_rate": 1.373090408331336e-05, "loss": 0.5094, "step": 4824 }, { "epoch": 0.39703764657477886, "grad_norm": 1.944343003241766, "learning_rate": 1.3728431096042407e-05, "loss": 0.7707, "step": 4825 }, { "epoch": 0.39711993416992386, "grad_norm": 1.7768131165944197, "learning_rate": 1.372595784390967e-05, "loss": 0.7884, "step": 4826 }, { "epoch": 0.3972022217650689, "grad_norm": 1.9144595023422326, "learning_rate": 1.3723484327090846e-05, "loss": 0.7583, "step": 4827 }, { "epoch": 0.3972845093602139, "grad_norm": 0.41638232983062673, "learning_rate": 1.3721010545761653e-05, "loss": 0.5149, "step": 4828 }, { "epoch": 0.397366796955359, "grad_norm": 2.1243611290257998, "learning_rate": 1.371853650009782e-05, "loss": 0.7848, "step": 4829 }, { "epoch": 0.397449084550504, "grad_norm": 0.41768502231270427, "learning_rate": 1.37160621902751e-05, "loss": 0.5374, "step": 4830 }, { "epoch": 0.39753137214564904, "grad_norm": 1.7962254209278277, "learning_rate": 1.3713587616469266e-05, "loss": 0.767, "step": 4831 }, { "epoch": 0.3976136597407941, "grad_norm": 1.7898996121973307, "learning_rate": 1.3711112778856107e-05, "loss": 0.7682, "step": 4832 }, { "epoch": 0.3976959473359391, "grad_norm": 2.781789426512144, "learning_rate": 1.3708637677611429e-05, "loss": 0.7873, "step": 4833 }, { "epoch": 0.39777823493108416, "grad_norm": 1.8111725457097099, "learning_rate": 1.3706162312911064e-05, "loss": 0.7987, "step": 4834 }, { "epoch": 0.39786052252622917, "grad_norm": 2.0199120742026353, "learning_rate": 1.3703686684930855e-05, "loss": 0.7897, "step": 4835 }, { "epoch": 0.3979428101213742, "grad_norm": 0.4221006521753932, "learning_rate": 1.3701210793846667e-05, "loss": 0.5703, "step": 4836 }, { "epoch": 0.3980250977165192, "grad_norm": 0.4192743796138713, "learning_rate": 1.3698734639834385e-05, "loss": 0.5413, "step": 4837 }, { "epoch": 0.3981073853116643, "grad_norm": 0.43264439140179317, "learning_rate": 1.3696258223069908e-05, "loss": 0.5525, "step": 4838 }, { "epoch": 0.3981896729068093, "grad_norm": 1.907492323372713, "learning_rate": 1.3693781543729157e-05, "loss": 0.7543, "step": 4839 }, { "epoch": 0.39827196050195435, "grad_norm": 1.858901544893876, "learning_rate": 1.3691304601988074e-05, "loss": 0.7705, "step": 4840 }, { "epoch": 0.39835424809709935, "grad_norm": 1.6933598530478624, "learning_rate": 1.3688827398022612e-05, "loss": 0.781, "step": 4841 }, { "epoch": 0.3984365356922444, "grad_norm": 2.068591134858905, "learning_rate": 1.3686349932008755e-05, "loss": 0.7469, "step": 4842 }, { "epoch": 0.3985188232873894, "grad_norm": 1.6224698284144443, "learning_rate": 1.3683872204122495e-05, "loss": 0.758, "step": 4843 }, { "epoch": 0.39860111088253447, "grad_norm": 1.7594490655620967, "learning_rate": 1.368139421453984e-05, "loss": 0.7857, "step": 4844 }, { "epoch": 0.3986833984776795, "grad_norm": 1.481693328560124, "learning_rate": 1.3678915963436834e-05, "loss": 0.7524, "step": 4845 }, { "epoch": 0.39876568607282453, "grad_norm": 2.1486729334941774, "learning_rate": 1.3676437450989518e-05, "loss": 0.752, "step": 4846 }, { "epoch": 0.39884797366796954, "grad_norm": 2.1799912946799878, "learning_rate": 1.3673958677373964e-05, "loss": 0.7819, "step": 4847 }, { "epoch": 0.3989302612631146, "grad_norm": 2.1180983344081348, "learning_rate": 1.3671479642766263e-05, "loss": 0.8214, "step": 4848 }, { "epoch": 0.3990125488582596, "grad_norm": 2.7757867868868846, "learning_rate": 1.3669000347342519e-05, "loss": 0.7644, "step": 4849 }, { "epoch": 0.39909483645340466, "grad_norm": 1.6900340191126524, "learning_rate": 1.3666520791278859e-05, "loss": 0.7946, "step": 4850 }, { "epoch": 0.39917712404854966, "grad_norm": 1.7298253546304831, "learning_rate": 1.3664040974751424e-05, "loss": 0.7745, "step": 4851 }, { "epoch": 0.3992594116436947, "grad_norm": 0.46359746310117156, "learning_rate": 1.3661560897936379e-05, "loss": 0.5404, "step": 4852 }, { "epoch": 0.3993416992388397, "grad_norm": 1.6835608035272525, "learning_rate": 1.3659080561009904e-05, "loss": 0.7727, "step": 4853 }, { "epoch": 0.3994239868339848, "grad_norm": 1.5474459085552865, "learning_rate": 1.3656599964148198e-05, "loss": 0.7466, "step": 4854 }, { "epoch": 0.3995062744291298, "grad_norm": 1.8375199925729886, "learning_rate": 1.3654119107527477e-05, "loss": 0.7441, "step": 4855 }, { "epoch": 0.39958856202427484, "grad_norm": 2.9603745816889036, "learning_rate": 1.3651637991323981e-05, "loss": 0.7595, "step": 4856 }, { "epoch": 0.3996708496194199, "grad_norm": 1.8405677376439393, "learning_rate": 1.364915661571396e-05, "loss": 0.768, "step": 4857 }, { "epoch": 0.3997531372145649, "grad_norm": 0.44053315994172615, "learning_rate": 1.3646674980873689e-05, "loss": 0.547, "step": 4858 }, { "epoch": 0.39983542480970996, "grad_norm": 0.4214633069703209, "learning_rate": 1.3644193086979458e-05, "loss": 0.5257, "step": 4859 }, { "epoch": 0.39991771240485496, "grad_norm": 2.3436845812867064, "learning_rate": 1.3641710934207582e-05, "loss": 0.7553, "step": 4860 }, { "epoch": 0.4, "grad_norm": 1.7970209908615555, "learning_rate": 1.3639228522734382e-05, "loss": 0.7962, "step": 4861 }, { "epoch": 0.400082287595145, "grad_norm": 0.4087400009461183, "learning_rate": 1.3636745852736209e-05, "loss": 0.4983, "step": 4862 }, { "epoch": 0.4001645751902901, "grad_norm": 1.960318064560911, "learning_rate": 1.3634262924389427e-05, "loss": 0.7403, "step": 4863 }, { "epoch": 0.4002468627854351, "grad_norm": 1.6788656523589116, "learning_rate": 1.3631779737870419e-05, "loss": 0.7595, "step": 4864 } ], "logging_steps": 1.0, "max_steps": 12152, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 608, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1381394073714688e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }