diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.4502724420716605, + "epoch": 0.5503329847542517, "eval_steps": 500, - "global_step": 8181, + "global_step": 9999, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -57274,6 +57274,12732 @@ "learning_rate": 8.822378335411856e-06, "loss": 0.765, "step": 8181 + }, + { + "epoch": 0.4503274808740162, + "grad_norm": 0.740364670753479, + "learning_rate": 8.822098885673346e-06, + "loss": 0.6354, + "step": 8182 + }, + { + "epoch": 0.45038251967637183, + "grad_norm": 0.8049225807189941, + "learning_rate": 8.821819407208963e-06, + "loss": 0.7023, + "step": 8183 + }, + { + "epoch": 0.4504375584787275, + "grad_norm": 0.7320911288261414, + "learning_rate": 8.821539900020808e-06, + "loss": 0.8429, + "step": 8184 + }, + { + "epoch": 0.45049259728108315, + "grad_norm": 0.7065376043319702, + "learning_rate": 8.821260364110984e-06, + "loss": 0.7283, + "step": 8185 + }, + { + "epoch": 0.45054763608343884, + "grad_norm": 0.7172972559928894, + "learning_rate": 8.820980799481588e-06, + "loss": 0.7673, + "step": 8186 + }, + { + "epoch": 0.4506026748857945, + "grad_norm": 0.712273895740509, + "learning_rate": 8.820701206134724e-06, + "loss": 0.7317, + "step": 8187 + }, + { + "epoch": 0.45065771368815016, + "grad_norm": 0.6954227685928345, + "learning_rate": 8.820421584072492e-06, + "loss": 0.7037, + "step": 8188 + }, + { + "epoch": 0.4507127524905058, + "grad_norm": 0.6790304780006409, + "learning_rate": 8.820141933296994e-06, + "loss": 0.7544, + "step": 8189 + }, + { + "epoch": 0.4507677912928615, + "grad_norm": 0.7483745813369751, + "learning_rate": 8.819862253810332e-06, + "loss": 0.7894, + "step": 8190 + }, + { + "epoch": 0.4508228300952171, + "grad_norm": 0.7926133871078491, + "learning_rate": 8.819582545614608e-06, + "loss": 0.8085, + "step": 8191 + }, + { + "epoch": 0.4508778688975728, + "grad_norm": 0.8442840576171875, + "learning_rate": 8.819302808711924e-06, + "loss": 0.8252, + "step": 8192 + }, + { + "epoch": 0.45093290769992844, + "grad_norm": 0.8359581232070923, + "learning_rate": 8.819023043104383e-06, + "loss": 0.8187, + "step": 8193 + }, + { + "epoch": 0.45098794650228413, + "grad_norm": 0.7793936133384705, + "learning_rate": 8.818743248794085e-06, + "loss": 0.8425, + "step": 8194 + }, + { + "epoch": 0.45104298530463977, + "grad_norm": 0.735509991645813, + "learning_rate": 8.818463425783136e-06, + "loss": 0.7781, + "step": 8195 + }, + { + "epoch": 0.45109802410699545, + "grad_norm": 0.6735361814498901, + "learning_rate": 8.818183574073639e-06, + "loss": 0.6987, + "step": 8196 + }, + { + "epoch": 0.4511530629093511, + "grad_norm": 0.7780157923698425, + "learning_rate": 8.817903693667695e-06, + "loss": 0.8474, + "step": 8197 + }, + { + "epoch": 0.4512081017117068, + "grad_norm": 0.6714445948600769, + "learning_rate": 8.817623784567411e-06, + "loss": 0.7216, + "step": 8198 + }, + { + "epoch": 0.4512631405140624, + "grad_norm": 0.6311395168304443, + "learning_rate": 8.817343846774886e-06, + "loss": 0.5724, + "step": 8199 + }, + { + "epoch": 0.4513181793164181, + "grad_norm": 0.7446333169937134, + "learning_rate": 8.817063880292227e-06, + "loss": 0.7867, + "step": 8200 + }, + { + "epoch": 0.45137321811877373, + "grad_norm": 0.7684246301651001, + "learning_rate": 8.816783885121539e-06, + "loss": 0.8141, + "step": 8201 + }, + { + "epoch": 0.4514282569211294, + "grad_norm": 0.754781186580658, + "learning_rate": 8.816503861264925e-06, + "loss": 0.8438, + "step": 8202 + }, + { + "epoch": 0.45148329572348506, + "grad_norm": 0.7705762982368469, + "learning_rate": 8.816223808724488e-06, + "loss": 0.8948, + "step": 8203 + }, + { + "epoch": 0.4515383345258407, + "grad_norm": 0.7731552720069885, + "learning_rate": 8.815943727502333e-06, + "loss": 0.7462, + "step": 8204 + }, + { + "epoch": 0.4515933733281964, + "grad_norm": 0.6615393757820129, + "learning_rate": 8.81566361760057e-06, + "loss": 0.7499, + "step": 8205 + }, + { + "epoch": 0.451648412130552, + "grad_norm": 0.724453866481781, + "learning_rate": 8.8153834790213e-06, + "loss": 0.7382, + "step": 8206 + }, + { + "epoch": 0.4517034509329077, + "grad_norm": 0.6369735598564148, + "learning_rate": 8.815103311766629e-06, + "loss": 0.7452, + "step": 8207 + }, + { + "epoch": 0.45175848973526334, + "grad_norm": 0.686000406742096, + "learning_rate": 8.814823115838659e-06, + "loss": 0.6971, + "step": 8208 + }, + { + "epoch": 0.451813528537619, + "grad_norm": 0.7372714281082153, + "learning_rate": 8.814542891239505e-06, + "loss": 0.8553, + "step": 8209 + }, + { + "epoch": 0.45186856733997466, + "grad_norm": 0.8348672986030579, + "learning_rate": 8.814262637971264e-06, + "loss": 0.7135, + "step": 8210 + }, + { + "epoch": 0.45192360614233035, + "grad_norm": 0.7829258441925049, + "learning_rate": 8.813982356036049e-06, + "loss": 0.7974, + "step": 8211 + }, + { + "epoch": 0.451978644944686, + "grad_norm": 0.7013983726501465, + "learning_rate": 8.81370204543596e-06, + "loss": 0.7531, + "step": 8212 + }, + { + "epoch": 0.45203368374704167, + "grad_norm": 0.8424196243286133, + "learning_rate": 8.81342170617311e-06, + "loss": 0.8217, + "step": 8213 + }, + { + "epoch": 0.4520887225493973, + "grad_norm": 0.7113365530967712, + "learning_rate": 8.813141338249603e-06, + "loss": 0.7728, + "step": 8214 + }, + { + "epoch": 0.452143761351753, + "grad_norm": 0.958642303943634, + "learning_rate": 8.812860941667545e-06, + "loss": 0.7234, + "step": 8215 + }, + { + "epoch": 0.4521988001541086, + "grad_norm": 0.6712706685066223, + "learning_rate": 8.812580516429045e-06, + "loss": 0.6998, + "step": 8216 + }, + { + "epoch": 0.4522538389564643, + "grad_norm": 0.7258469462394714, + "learning_rate": 8.812300062536212e-06, + "loss": 0.6758, + "step": 8217 + }, + { + "epoch": 0.45230887775881995, + "grad_norm": 0.735047459602356, + "learning_rate": 8.812019579991152e-06, + "loss": 0.7045, + "step": 8218 + }, + { + "epoch": 0.45236391656117564, + "grad_norm": 0.8339886665344238, + "learning_rate": 8.811739068795971e-06, + "loss": 0.8069, + "step": 8219 + }, + { + "epoch": 0.45241895536353127, + "grad_norm": 0.7170082926750183, + "learning_rate": 8.81145852895278e-06, + "loss": 0.6345, + "step": 8220 + }, + { + "epoch": 0.45247399416588696, + "grad_norm": 0.6892569661140442, + "learning_rate": 8.81117796046369e-06, + "loss": 0.712, + "step": 8221 + }, + { + "epoch": 0.4525290329682426, + "grad_norm": 0.6837140321731567, + "learning_rate": 8.810897363330804e-06, + "loss": 0.7184, + "step": 8222 + }, + { + "epoch": 0.4525840717705983, + "grad_norm": 0.7410069108009338, + "learning_rate": 8.810616737556235e-06, + "loss": 0.8265, + "step": 8223 + }, + { + "epoch": 0.4526391105729539, + "grad_norm": 0.6945875883102417, + "learning_rate": 8.810336083142089e-06, + "loss": 0.7163, + "step": 8224 + }, + { + "epoch": 0.4526941493753096, + "grad_norm": 0.6978884339332581, + "learning_rate": 8.810055400090477e-06, + "loss": 0.795, + "step": 8225 + }, + { + "epoch": 0.45274918817766524, + "grad_norm": 0.7209095358848572, + "learning_rate": 8.809774688403509e-06, + "loss": 0.7317, + "step": 8226 + }, + { + "epoch": 0.45280422698002093, + "grad_norm": 0.7279626727104187, + "learning_rate": 8.809493948083294e-06, + "loss": 0.7699, + "step": 8227 + }, + { + "epoch": 0.45285926578237656, + "grad_norm": 0.7642556428909302, + "learning_rate": 8.809213179131943e-06, + "loss": 0.8518, + "step": 8228 + }, + { + "epoch": 0.45291430458473225, + "grad_norm": 0.6868709325790405, + "learning_rate": 8.808932381551565e-06, + "loss": 0.737, + "step": 8229 + }, + { + "epoch": 0.4529693433870879, + "grad_norm": 0.7012789845466614, + "learning_rate": 8.80865155534427e-06, + "loss": 0.8146, + "step": 8230 + }, + { + "epoch": 0.4530243821894436, + "grad_norm": 0.678683340549469, + "learning_rate": 8.808370700512171e-06, + "loss": 0.7531, + "step": 8231 + }, + { + "epoch": 0.4530794209917992, + "grad_norm": 0.690559983253479, + "learning_rate": 8.808089817057377e-06, + "loss": 0.6779, + "step": 8232 + }, + { + "epoch": 0.4531344597941549, + "grad_norm": 0.7179763317108154, + "learning_rate": 8.807808904981997e-06, + "loss": 0.8815, + "step": 8233 + }, + { + "epoch": 0.45318949859651053, + "grad_norm": 0.7708277702331543, + "learning_rate": 8.807527964288147e-06, + "loss": 0.8084, + "step": 8234 + }, + { + "epoch": 0.4532445373988662, + "grad_norm": 0.6828494071960449, + "learning_rate": 8.807246994977936e-06, + "loss": 0.7587, + "step": 8235 + }, + { + "epoch": 0.45329957620122185, + "grad_norm": 0.7085250616073608, + "learning_rate": 8.806965997053475e-06, + "loss": 0.7894, + "step": 8236 + }, + { + "epoch": 0.45335461500357754, + "grad_norm": 0.7723467946052551, + "learning_rate": 8.806684970516876e-06, + "loss": 0.7408, + "step": 8237 + }, + { + "epoch": 0.4534096538059332, + "grad_norm": 0.8887566328048706, + "learning_rate": 8.806403915370253e-06, + "loss": 0.9022, + "step": 8238 + }, + { + "epoch": 0.45346469260828887, + "grad_norm": 0.7379833459854126, + "learning_rate": 8.806122831615718e-06, + "loss": 0.8264, + "step": 8239 + }, + { + "epoch": 0.4535197314106445, + "grad_norm": 0.903279721736908, + "learning_rate": 8.80584171925538e-06, + "loss": 0.7432, + "step": 8240 + }, + { + "epoch": 0.4535747702130002, + "grad_norm": 0.7671363353729248, + "learning_rate": 8.805560578291356e-06, + "loss": 0.8109, + "step": 8241 + }, + { + "epoch": 0.4536298090153558, + "grad_norm": 0.6047827005386353, + "learning_rate": 8.805279408725755e-06, + "loss": 0.6628, + "step": 8242 + }, + { + "epoch": 0.4536848478177115, + "grad_norm": 1.0570796728134155, + "learning_rate": 8.804998210560696e-06, + "loss": 0.7981, + "step": 8243 + }, + { + "epoch": 0.45373988662006715, + "grad_norm": 0.7116600871086121, + "learning_rate": 8.804716983798288e-06, + "loss": 0.7601, + "step": 8244 + }, + { + "epoch": 0.45379492542242283, + "grad_norm": 0.7162767648696899, + "learning_rate": 8.804435728440644e-06, + "loss": 0.8389, + "step": 8245 + }, + { + "epoch": 0.45384996422477847, + "grad_norm": 0.6715626120567322, + "learning_rate": 8.80415444448988e-06, + "loss": 0.6377, + "step": 8246 + }, + { + "epoch": 0.4539050030271341, + "grad_norm": 0.7168908715248108, + "learning_rate": 8.80387313194811e-06, + "loss": 0.7946, + "step": 8247 + }, + { + "epoch": 0.4539600418294898, + "grad_norm": 0.7497992515563965, + "learning_rate": 8.803591790817448e-06, + "loss": 0.8026, + "step": 8248 + }, + { + "epoch": 0.4540150806318454, + "grad_norm": 0.6665049195289612, + "learning_rate": 8.803310421100009e-06, + "loss": 0.779, + "step": 8249 + }, + { + "epoch": 0.4540701194342011, + "grad_norm": 0.766674280166626, + "learning_rate": 8.803029022797905e-06, + "loss": 0.7467, + "step": 8250 + }, + { + "epoch": 0.45412515823655675, + "grad_norm": 0.7306104302406311, + "learning_rate": 8.802747595913255e-06, + "loss": 0.8323, + "step": 8251 + }, + { + "epoch": 0.45418019703891244, + "grad_norm": 0.6425766944885254, + "learning_rate": 8.802466140448169e-06, + "loss": 0.7226, + "step": 8252 + }, + { + "epoch": 0.45423523584126807, + "grad_norm": 0.7992560267448425, + "learning_rate": 8.802184656404769e-06, + "loss": 0.7285, + "step": 8253 + }, + { + "epoch": 0.45429027464362376, + "grad_norm": 0.6935924887657166, + "learning_rate": 8.801903143785164e-06, + "loss": 0.5757, + "step": 8254 + }, + { + "epoch": 0.4543453134459794, + "grad_norm": 0.7091512084007263, + "learning_rate": 8.801621602591473e-06, + "loss": 0.7719, + "step": 8255 + }, + { + "epoch": 0.4544003522483351, + "grad_norm": 0.851231038570404, + "learning_rate": 8.801340032825814e-06, + "loss": 0.7804, + "step": 8256 + }, + { + "epoch": 0.4544553910506907, + "grad_norm": 0.7443445920944214, + "learning_rate": 8.801058434490298e-06, + "loss": 0.7172, + "step": 8257 + }, + { + "epoch": 0.4545104298530464, + "grad_norm": 0.7156546115875244, + "learning_rate": 8.800776807587046e-06, + "loss": 0.7756, + "step": 8258 + }, + { + "epoch": 0.45456546865540204, + "grad_norm": 0.8027580380439758, + "learning_rate": 8.800495152118172e-06, + "loss": 0.8035, + "step": 8259 + }, + { + "epoch": 0.4546205074577577, + "grad_norm": 0.6868240833282471, + "learning_rate": 8.800213468085794e-06, + "loss": 0.7159, + "step": 8260 + }, + { + "epoch": 0.45467554626011336, + "grad_norm": 0.9127504229545593, + "learning_rate": 8.79993175549203e-06, + "loss": 0.7705, + "step": 8261 + }, + { + "epoch": 0.45473058506246905, + "grad_norm": 0.7074575424194336, + "learning_rate": 8.799650014338994e-06, + "loss": 0.7841, + "step": 8262 + }, + { + "epoch": 0.4547856238648247, + "grad_norm": 0.7462378740310669, + "learning_rate": 8.799368244628807e-06, + "loss": 0.8125, + "step": 8263 + }, + { + "epoch": 0.4548406626671804, + "grad_norm": 0.7510300874710083, + "learning_rate": 8.799086446363585e-06, + "loss": 0.8354, + "step": 8264 + }, + { + "epoch": 0.454895701469536, + "grad_norm": 0.7134591937065125, + "learning_rate": 8.798804619545446e-06, + "loss": 0.7968, + "step": 8265 + }, + { + "epoch": 0.4549507402718917, + "grad_norm": 1.0424071550369263, + "learning_rate": 8.798522764176509e-06, + "loss": 0.8638, + "step": 8266 + }, + { + "epoch": 0.45500577907424733, + "grad_norm": 0.6805267930030823, + "learning_rate": 8.79824088025889e-06, + "loss": 0.757, + "step": 8267 + }, + { + "epoch": 0.455060817876603, + "grad_norm": 0.8145313262939453, + "learning_rate": 8.79795896779471e-06, + "loss": 0.7589, + "step": 8268 + }, + { + "epoch": 0.45511585667895865, + "grad_norm": 0.7611781358718872, + "learning_rate": 8.79767702678609e-06, + "loss": 0.8426, + "step": 8269 + }, + { + "epoch": 0.45517089548131434, + "grad_norm": 0.7639568448066711, + "learning_rate": 8.797395057235142e-06, + "loss": 0.6609, + "step": 8270 + }, + { + "epoch": 0.45522593428367, + "grad_norm": 0.8577544093132019, + "learning_rate": 8.79711305914399e-06, + "loss": 0.8085, + "step": 8271 + }, + { + "epoch": 0.45528097308602566, + "grad_norm": 0.7740383148193359, + "learning_rate": 8.796831032514754e-06, + "loss": 0.8689, + "step": 8272 + }, + { + "epoch": 0.4553360118883813, + "grad_norm": 0.7300885915756226, + "learning_rate": 8.796548977349553e-06, + "loss": 0.8303, + "step": 8273 + }, + { + "epoch": 0.455391050690737, + "grad_norm": 0.6677057147026062, + "learning_rate": 8.796266893650504e-06, + "loss": 0.7449, + "step": 8274 + }, + { + "epoch": 0.4554460894930926, + "grad_norm": 0.7269144058227539, + "learning_rate": 8.79598478141973e-06, + "loss": 0.8744, + "step": 8275 + }, + { + "epoch": 0.4555011282954483, + "grad_norm": 0.7458559274673462, + "learning_rate": 8.795702640659351e-06, + "loss": 0.8036, + "step": 8276 + }, + { + "epoch": 0.45555616709780394, + "grad_norm": 0.7693114280700684, + "learning_rate": 8.795420471371487e-06, + "loss": 0.7617, + "step": 8277 + }, + { + "epoch": 0.45561120590015963, + "grad_norm": 0.7594510316848755, + "learning_rate": 8.79513827355826e-06, + "loss": 0.7049, + "step": 8278 + }, + { + "epoch": 0.45566624470251527, + "grad_norm": 0.7481217980384827, + "learning_rate": 8.794856047221786e-06, + "loss": 0.804, + "step": 8279 + }, + { + "epoch": 0.45572128350487096, + "grad_norm": 0.726859986782074, + "learning_rate": 8.794573792364192e-06, + "loss": 0.7322, + "step": 8280 + }, + { + "epoch": 0.4557763223072266, + "grad_norm": 0.6800149083137512, + "learning_rate": 8.794291508987597e-06, + "loss": 0.8467, + "step": 8281 + }, + { + "epoch": 0.4558313611095823, + "grad_norm": 0.6264217495918274, + "learning_rate": 8.794009197094122e-06, + "loss": 0.6203, + "step": 8282 + }, + { + "epoch": 0.4558863999119379, + "grad_norm": 0.6973850131034851, + "learning_rate": 8.79372685668589e-06, + "loss": 0.8211, + "step": 8283 + }, + { + "epoch": 0.4559414387142936, + "grad_norm": 0.6992879509925842, + "learning_rate": 8.793444487765022e-06, + "loss": 0.7831, + "step": 8284 + }, + { + "epoch": 0.45599647751664923, + "grad_norm": 0.7641519904136658, + "learning_rate": 8.793162090333643e-06, + "loss": 0.7519, + "step": 8285 + }, + { + "epoch": 0.4560515163190049, + "grad_norm": 0.7296152710914612, + "learning_rate": 8.79287966439387e-06, + "loss": 0.8738, + "step": 8286 + }, + { + "epoch": 0.45610655512136056, + "grad_norm": 0.7549383044242859, + "learning_rate": 8.79259720994783e-06, + "loss": 0.7868, + "step": 8287 + }, + { + "epoch": 0.45616159392371625, + "grad_norm": 0.7932083606719971, + "learning_rate": 8.792314726997644e-06, + "loss": 0.8443, + "step": 8288 + }, + { + "epoch": 0.4562166327260719, + "grad_norm": 0.7999894022941589, + "learning_rate": 8.792032215545437e-06, + "loss": 0.852, + "step": 8289 + }, + { + "epoch": 0.4562716715284275, + "grad_norm": 0.8092383742332458, + "learning_rate": 8.79174967559333e-06, + "loss": 0.7922, + "step": 8290 + }, + { + "epoch": 0.4563267103307832, + "grad_norm": 0.7481340169906616, + "learning_rate": 8.791467107143447e-06, + "loss": 0.7086, + "step": 8291 + }, + { + "epoch": 0.45638174913313884, + "grad_norm": 0.8096129298210144, + "learning_rate": 8.791184510197912e-06, + "loss": 0.6645, + "step": 8292 + }, + { + "epoch": 0.4564367879354945, + "grad_norm": 0.7276492118835449, + "learning_rate": 8.79090188475885e-06, + "loss": 0.7174, + "step": 8293 + }, + { + "epoch": 0.45649182673785016, + "grad_norm": 0.815535843372345, + "learning_rate": 8.790619230828385e-06, + "loss": 0.8622, + "step": 8294 + }, + { + "epoch": 0.45654686554020585, + "grad_norm": 0.8191169500350952, + "learning_rate": 8.790336548408637e-06, + "loss": 0.8666, + "step": 8295 + }, + { + "epoch": 0.4566019043425615, + "grad_norm": 0.7449167966842651, + "learning_rate": 8.790053837501737e-06, + "loss": 0.7728, + "step": 8296 + }, + { + "epoch": 0.45665694314491717, + "grad_norm": 0.7311065196990967, + "learning_rate": 8.789771098109808e-06, + "loss": 0.8059, + "step": 8297 + }, + { + "epoch": 0.4567119819472728, + "grad_norm": 0.7381907105445862, + "learning_rate": 8.789488330234971e-06, + "loss": 0.7722, + "step": 8298 + }, + { + "epoch": 0.4567670207496285, + "grad_norm": 0.8180661201477051, + "learning_rate": 8.789205533879355e-06, + "loss": 0.9032, + "step": 8299 + }, + { + "epoch": 0.4568220595519841, + "grad_norm": 0.7993118762969971, + "learning_rate": 8.788922709045087e-06, + "loss": 0.8065, + "step": 8300 + }, + { + "epoch": 0.4568770983543398, + "grad_norm": 0.8449206948280334, + "learning_rate": 8.788639855734287e-06, + "loss": 0.7895, + "step": 8301 + }, + { + "epoch": 0.45693213715669545, + "grad_norm": 0.9224583506584167, + "learning_rate": 8.788356973949084e-06, + "loss": 0.78, + "step": 8302 + }, + { + "epoch": 0.45698717595905114, + "grad_norm": 0.7109915614128113, + "learning_rate": 8.788074063691604e-06, + "loss": 0.8029, + "step": 8303 + }, + { + "epoch": 0.4570422147614068, + "grad_norm": 0.7372310757637024, + "learning_rate": 8.787791124963976e-06, + "loss": 0.8118, + "step": 8304 + }, + { + "epoch": 0.45709725356376246, + "grad_norm": 0.8127168416976929, + "learning_rate": 8.787508157768323e-06, + "loss": 0.8665, + "step": 8305 + }, + { + "epoch": 0.4571522923661181, + "grad_norm": 0.7193050980567932, + "learning_rate": 8.787225162106771e-06, + "loss": 0.749, + "step": 8306 + }, + { + "epoch": 0.4572073311684738, + "grad_norm": 0.8825041651725769, + "learning_rate": 8.786942137981449e-06, + "loss": 0.9651, + "step": 8307 + }, + { + "epoch": 0.4572623699708294, + "grad_norm": 0.6854885816574097, + "learning_rate": 8.786659085394485e-06, + "loss": 0.8259, + "step": 8308 + }, + { + "epoch": 0.4573174087731851, + "grad_norm": 0.6698010563850403, + "learning_rate": 8.786376004348004e-06, + "loss": 0.7212, + "step": 8309 + }, + { + "epoch": 0.45737244757554074, + "grad_norm": 0.7706398963928223, + "learning_rate": 8.786092894844132e-06, + "loss": 0.719, + "step": 8310 + }, + { + "epoch": 0.45742748637789643, + "grad_norm": 0.8905620574951172, + "learning_rate": 8.785809756885002e-06, + "loss": 0.7518, + "step": 8311 + }, + { + "epoch": 0.45748252518025206, + "grad_norm": 0.7537117004394531, + "learning_rate": 8.78552659047274e-06, + "loss": 0.8267, + "step": 8312 + }, + { + "epoch": 0.45753756398260775, + "grad_norm": 0.7840754985809326, + "learning_rate": 8.78524339560947e-06, + "loss": 0.8417, + "step": 8313 + }, + { + "epoch": 0.4575926027849634, + "grad_norm": 0.7373713254928589, + "learning_rate": 8.784960172297327e-06, + "loss": 0.784, + "step": 8314 + }, + { + "epoch": 0.4576476415873191, + "grad_norm": 0.6648432016372681, + "learning_rate": 8.784676920538436e-06, + "loss": 0.7252, + "step": 8315 + }, + { + "epoch": 0.4577026803896747, + "grad_norm": 0.7904912829399109, + "learning_rate": 8.784393640334925e-06, + "loss": 0.7777, + "step": 8316 + }, + { + "epoch": 0.4577577191920304, + "grad_norm": 0.7691501379013062, + "learning_rate": 8.784110331688927e-06, + "loss": 0.733, + "step": 8317 + }, + { + "epoch": 0.45781275799438603, + "grad_norm": 0.6054617762565613, + "learning_rate": 8.783826994602566e-06, + "loss": 0.6367, + "step": 8318 + }, + { + "epoch": 0.4578677967967417, + "grad_norm": 0.7495457530021667, + "learning_rate": 8.783543629077976e-06, + "loss": 0.8672, + "step": 8319 + }, + { + "epoch": 0.45792283559909736, + "grad_norm": 0.6979867815971375, + "learning_rate": 8.783260235117283e-06, + "loss": 0.7338, + "step": 8320 + }, + { + "epoch": 0.45797787440145304, + "grad_norm": 0.6927759647369385, + "learning_rate": 8.78297681272262e-06, + "loss": 0.6925, + "step": 8321 + }, + { + "epoch": 0.4580329132038087, + "grad_norm": 0.9076687097549438, + "learning_rate": 8.782693361896115e-06, + "loss": 0.8225, + "step": 8322 + }, + { + "epoch": 0.45808795200616437, + "grad_norm": 0.7990893721580505, + "learning_rate": 8.782409882639902e-06, + "loss": 0.8144, + "step": 8323 + }, + { + "epoch": 0.45814299080852, + "grad_norm": 0.7958230376243591, + "learning_rate": 8.782126374956107e-06, + "loss": 0.7717, + "step": 8324 + }, + { + "epoch": 0.4581980296108757, + "grad_norm": 0.7694645524024963, + "learning_rate": 8.781842838846861e-06, + "loss": 0.8314, + "step": 8325 + }, + { + "epoch": 0.4582530684132313, + "grad_norm": 0.8653621077537537, + "learning_rate": 8.781559274314297e-06, + "loss": 0.7567, + "step": 8326 + }, + { + "epoch": 0.458308107215587, + "grad_norm": 0.7834668755531311, + "learning_rate": 8.781275681360548e-06, + "loss": 0.7431, + "step": 8327 + }, + { + "epoch": 0.45836314601794265, + "grad_norm": 0.6800104975700378, + "learning_rate": 8.780992059987742e-06, + "loss": 0.8266, + "step": 8328 + }, + { + "epoch": 0.45841818482029834, + "grad_norm": 0.7274910807609558, + "learning_rate": 8.780708410198011e-06, + "loss": 0.7358, + "step": 8329 + }, + { + "epoch": 0.45847322362265397, + "grad_norm": 0.8102344870567322, + "learning_rate": 8.780424731993488e-06, + "loss": 0.7397, + "step": 8330 + }, + { + "epoch": 0.45852826242500966, + "grad_norm": 0.7536956071853638, + "learning_rate": 8.780141025376305e-06, + "loss": 0.7053, + "step": 8331 + }, + { + "epoch": 0.4585833012273653, + "grad_norm": 0.678535521030426, + "learning_rate": 8.779857290348594e-06, + "loss": 0.792, + "step": 8332 + }, + { + "epoch": 0.4586383400297209, + "grad_norm": 0.8847216963768005, + "learning_rate": 8.779573526912487e-06, + "loss": 0.8117, + "step": 8333 + }, + { + "epoch": 0.4586933788320766, + "grad_norm": 0.6997288465499878, + "learning_rate": 8.779289735070117e-06, + "loss": 0.7797, + "step": 8334 + }, + { + "epoch": 0.45874841763443225, + "grad_norm": 0.7445441484451294, + "learning_rate": 8.779005914823617e-06, + "loss": 0.7505, + "step": 8335 + }, + { + "epoch": 0.45880345643678794, + "grad_norm": 0.618844211101532, + "learning_rate": 8.778722066175121e-06, + "loss": 0.661, + "step": 8336 + }, + { + "epoch": 0.45885849523914357, + "grad_norm": 0.6810492873191833, + "learning_rate": 8.778438189126761e-06, + "loss": 0.6819, + "step": 8337 + }, + { + "epoch": 0.45891353404149926, + "grad_norm": 0.6785591244697571, + "learning_rate": 8.778154283680671e-06, + "loss": 0.7808, + "step": 8338 + }, + { + "epoch": 0.4589685728438549, + "grad_norm": 0.7461212873458862, + "learning_rate": 8.777870349838984e-06, + "loss": 0.8566, + "step": 8339 + }, + { + "epoch": 0.4590236116462106, + "grad_norm": 0.6731496453285217, + "learning_rate": 8.777586387603836e-06, + "loss": 0.823, + "step": 8340 + }, + { + "epoch": 0.4590786504485662, + "grad_norm": 0.7295553684234619, + "learning_rate": 8.77730239697736e-06, + "loss": 0.9229, + "step": 8341 + }, + { + "epoch": 0.4591336892509219, + "grad_norm": 0.783275842666626, + "learning_rate": 8.77701837796169e-06, + "loss": 0.782, + "step": 8342 + }, + { + "epoch": 0.45918872805327754, + "grad_norm": 0.6952852606773376, + "learning_rate": 8.77673433055896e-06, + "loss": 0.7977, + "step": 8343 + }, + { + "epoch": 0.45924376685563323, + "grad_norm": 0.7381969094276428, + "learning_rate": 8.776450254771305e-06, + "loss": 0.768, + "step": 8344 + }, + { + "epoch": 0.45929880565798886, + "grad_norm": 0.7911093831062317, + "learning_rate": 8.776166150600862e-06, + "loss": 0.8284, + "step": 8345 + }, + { + "epoch": 0.45935384446034455, + "grad_norm": 0.7319246530532837, + "learning_rate": 8.775882018049765e-06, + "loss": 0.8135, + "step": 8346 + }, + { + "epoch": 0.4594088832627002, + "grad_norm": 0.7888429760932922, + "learning_rate": 8.77559785712015e-06, + "loss": 0.9001, + "step": 8347 + }, + { + "epoch": 0.4594639220650559, + "grad_norm": 0.6983326077461243, + "learning_rate": 8.775313667814151e-06, + "loss": 0.7537, + "step": 8348 + }, + { + "epoch": 0.4595189608674115, + "grad_norm": 0.7532416582107544, + "learning_rate": 8.775029450133905e-06, + "loss": 0.8307, + "step": 8349 + }, + { + "epoch": 0.4595739996697672, + "grad_norm": 0.7159993052482605, + "learning_rate": 8.774745204081549e-06, + "loss": 0.7874, + "step": 8350 + }, + { + "epoch": 0.45962903847212283, + "grad_norm": 0.6898767352104187, + "learning_rate": 8.774460929659218e-06, + "loss": 0.7453, + "step": 8351 + }, + { + "epoch": 0.4596840772744785, + "grad_norm": 0.6833236813545227, + "learning_rate": 8.774176626869051e-06, + "loss": 0.7281, + "step": 8352 + }, + { + "epoch": 0.45973911607683415, + "grad_norm": 0.7840244770050049, + "learning_rate": 8.77389229571318e-06, + "loss": 0.7194, + "step": 8353 + }, + { + "epoch": 0.45979415487918984, + "grad_norm": 0.7920441627502441, + "learning_rate": 8.773607936193747e-06, + "loss": 0.7135, + "step": 8354 + }, + { + "epoch": 0.4598491936815455, + "grad_norm": 0.7395668625831604, + "learning_rate": 8.773323548312884e-06, + "loss": 0.8162, + "step": 8355 + }, + { + "epoch": 0.45990423248390117, + "grad_norm": 0.7854128479957581, + "learning_rate": 8.773039132072734e-06, + "loss": 0.8252, + "step": 8356 + }, + { + "epoch": 0.4599592712862568, + "grad_norm": 0.694997251033783, + "learning_rate": 8.772754687475431e-06, + "loss": 0.6627, + "step": 8357 + }, + { + "epoch": 0.4600143100886125, + "grad_norm": 0.7698866724967957, + "learning_rate": 8.772470214523112e-06, + "loss": 0.8814, + "step": 8358 + }, + { + "epoch": 0.4600693488909681, + "grad_norm": 0.7323407530784607, + "learning_rate": 8.77218571321792e-06, + "loss": 0.7769, + "step": 8359 + }, + { + "epoch": 0.4601243876933238, + "grad_norm": 0.6637027263641357, + "learning_rate": 8.771901183561986e-06, + "loss": 0.6741, + "step": 8360 + }, + { + "epoch": 0.46017942649567944, + "grad_norm": 0.7423702478408813, + "learning_rate": 8.771616625557455e-06, + "loss": 0.7303, + "step": 8361 + }, + { + "epoch": 0.46023446529803513, + "grad_norm": 0.7599568367004395, + "learning_rate": 8.771332039206463e-06, + "loss": 0.8161, + "step": 8362 + }, + { + "epoch": 0.46028950410039077, + "grad_norm": 0.9063183069229126, + "learning_rate": 8.771047424511148e-06, + "loss": 0.8098, + "step": 8363 + }, + { + "epoch": 0.46034454290274646, + "grad_norm": 0.658210813999176, + "learning_rate": 8.770762781473651e-06, + "loss": 0.7097, + "step": 8364 + }, + { + "epoch": 0.4603995817051021, + "grad_norm": 0.8396975994110107, + "learning_rate": 8.770478110096111e-06, + "loss": 0.8731, + "step": 8365 + }, + { + "epoch": 0.4604546205074578, + "grad_norm": 0.7334815263748169, + "learning_rate": 8.770193410380663e-06, + "loss": 0.7689, + "step": 8366 + }, + { + "epoch": 0.4605096593098134, + "grad_norm": 0.8220386505126953, + "learning_rate": 8.769908682329453e-06, + "loss": 0.8139, + "step": 8367 + }, + { + "epoch": 0.4605646981121691, + "grad_norm": 0.8077995181083679, + "learning_rate": 8.76962392594462e-06, + "loss": 0.7379, + "step": 8368 + }, + { + "epoch": 0.46061973691452474, + "grad_norm": 0.8007730841636658, + "learning_rate": 8.7693391412283e-06, + "loss": 0.7835, + "step": 8369 + }, + { + "epoch": 0.4606747757168804, + "grad_norm": 0.7108187079429626, + "learning_rate": 8.769054328182637e-06, + "loss": 0.6787, + "step": 8370 + }, + { + "epoch": 0.46072981451923606, + "grad_norm": 0.7623056173324585, + "learning_rate": 8.768769486809772e-06, + "loss": 0.8056, + "step": 8371 + }, + { + "epoch": 0.46078485332159175, + "grad_norm": 0.6991614103317261, + "learning_rate": 8.768484617111843e-06, + "loss": 0.7404, + "step": 8372 + }, + { + "epoch": 0.4608398921239474, + "grad_norm": 0.7531471848487854, + "learning_rate": 8.768199719090991e-06, + "loss": 0.8104, + "step": 8373 + }, + { + "epoch": 0.46089493092630307, + "grad_norm": 1.0271111726760864, + "learning_rate": 8.76791479274936e-06, + "loss": 0.9028, + "step": 8374 + }, + { + "epoch": 0.4609499697286587, + "grad_norm": 0.7346897125244141, + "learning_rate": 8.76762983808909e-06, + "loss": 0.8179, + "step": 8375 + }, + { + "epoch": 0.46100500853101434, + "grad_norm": 0.6413559913635254, + "learning_rate": 8.767344855112324e-06, + "loss": 0.7995, + "step": 8376 + }, + { + "epoch": 0.46106004733337, + "grad_norm": 0.7187537550926208, + "learning_rate": 8.767059843821199e-06, + "loss": 0.7973, + "step": 8377 + }, + { + "epoch": 0.46111508613572566, + "grad_norm": 0.6819092035293579, + "learning_rate": 8.766774804217864e-06, + "loss": 0.8255, + "step": 8378 + }, + { + "epoch": 0.46117012493808135, + "grad_norm": 0.683318018913269, + "learning_rate": 8.766489736304457e-06, + "loss": 0.6794, + "step": 8379 + }, + { + "epoch": 0.461225163740437, + "grad_norm": 0.7345470786094666, + "learning_rate": 8.76620464008312e-06, + "loss": 0.8741, + "step": 8380 + }, + { + "epoch": 0.46128020254279267, + "grad_norm": 0.7369397282600403, + "learning_rate": 8.765919515556e-06, + "loss": 0.8301, + "step": 8381 + }, + { + "epoch": 0.4613352413451483, + "grad_norm": 0.7304979562759399, + "learning_rate": 8.765634362725233e-06, + "loss": 0.7507, + "step": 8382 + }, + { + "epoch": 0.461390280147504, + "grad_norm": 0.7968454957008362, + "learning_rate": 8.765349181592969e-06, + "loss": 0.7396, + "step": 8383 + }, + { + "epoch": 0.46144531894985963, + "grad_norm": 0.691439151763916, + "learning_rate": 8.765063972161347e-06, + "loss": 0.7199, + "step": 8384 + }, + { + "epoch": 0.4615003577522153, + "grad_norm": 0.8355879187583923, + "learning_rate": 8.764778734432513e-06, + "loss": 0.7369, + "step": 8385 + }, + { + "epoch": 0.46155539655457095, + "grad_norm": 0.908017098903656, + "learning_rate": 8.76449346840861e-06, + "loss": 0.8271, + "step": 8386 + }, + { + "epoch": 0.46161043535692664, + "grad_norm": 0.6426172852516174, + "learning_rate": 8.764208174091781e-06, + "loss": 0.6646, + "step": 8387 + }, + { + "epoch": 0.4616654741592823, + "grad_norm": 0.7003652453422546, + "learning_rate": 8.763922851484171e-06, + "loss": 0.7272, + "step": 8388 + }, + { + "epoch": 0.46172051296163796, + "grad_norm": 0.7470494508743286, + "learning_rate": 8.763637500587925e-06, + "loss": 0.8333, + "step": 8389 + }, + { + "epoch": 0.4617755517639936, + "grad_norm": 0.6974903345108032, + "learning_rate": 8.763352121405187e-06, + "loss": 0.834, + "step": 8390 + }, + { + "epoch": 0.4618305905663493, + "grad_norm": 0.8146659135818481, + "learning_rate": 8.7630667139381e-06, + "loss": 0.724, + "step": 8391 + }, + { + "epoch": 0.4618856293687049, + "grad_norm": 0.6614096164703369, + "learning_rate": 8.762781278188813e-06, + "loss": 0.6822, + "step": 8392 + }, + { + "epoch": 0.4619406681710606, + "grad_norm": 0.712944746017456, + "learning_rate": 8.762495814159469e-06, + "loss": 0.7864, + "step": 8393 + }, + { + "epoch": 0.46199570697341624, + "grad_norm": 0.7531552910804749, + "learning_rate": 8.762210321852213e-06, + "loss": 0.7494, + "step": 8394 + }, + { + "epoch": 0.46205074577577193, + "grad_norm": 0.8150199055671692, + "learning_rate": 8.761924801269191e-06, + "loss": 0.7869, + "step": 8395 + }, + { + "epoch": 0.46210578457812757, + "grad_norm": 0.8586462736129761, + "learning_rate": 8.76163925241255e-06, + "loss": 0.7647, + "step": 8396 + }, + { + "epoch": 0.46216082338048325, + "grad_norm": 0.7258061766624451, + "learning_rate": 8.761353675284434e-06, + "loss": 0.7672, + "step": 8397 + }, + { + "epoch": 0.4622158621828389, + "grad_norm": 0.6592851281166077, + "learning_rate": 8.761068069886992e-06, + "loss": 0.7488, + "step": 8398 + }, + { + "epoch": 0.4622709009851946, + "grad_norm": 0.7410836219787598, + "learning_rate": 8.760782436222368e-06, + "loss": 0.6669, + "step": 8399 + }, + { + "epoch": 0.4623259397875502, + "grad_norm": 0.7121642231941223, + "learning_rate": 8.76049677429271e-06, + "loss": 0.7005, + "step": 8400 + }, + { + "epoch": 0.4623809785899059, + "grad_norm": 0.7170663475990295, + "learning_rate": 8.760211084100166e-06, + "loss": 0.8154, + "step": 8401 + }, + { + "epoch": 0.46243601739226153, + "grad_norm": 0.6851769685745239, + "learning_rate": 8.759925365646882e-06, + "loss": 0.7948, + "step": 8402 + }, + { + "epoch": 0.4624910561946172, + "grad_norm": 0.7728533744812012, + "learning_rate": 8.759639618935006e-06, + "loss": 0.8263, + "step": 8403 + }, + { + "epoch": 0.46254609499697286, + "grad_norm": 0.7276784777641296, + "learning_rate": 8.759353843966682e-06, + "loss": 0.6992, + "step": 8404 + }, + { + "epoch": 0.46260113379932855, + "grad_norm": 0.7533649802207947, + "learning_rate": 8.759068040744063e-06, + "loss": 0.7744, + "step": 8405 + }, + { + "epoch": 0.4626561726016842, + "grad_norm": 0.6911979913711548, + "learning_rate": 8.758782209269294e-06, + "loss": 0.6977, + "step": 8406 + }, + { + "epoch": 0.46271121140403987, + "grad_norm": 0.6723766922950745, + "learning_rate": 8.758496349544526e-06, + "loss": 0.7286, + "step": 8407 + }, + { + "epoch": 0.4627662502063955, + "grad_norm": 0.7327921390533447, + "learning_rate": 8.758210461571903e-06, + "loss": 0.7708, + "step": 8408 + }, + { + "epoch": 0.4628212890087512, + "grad_norm": 0.7498626708984375, + "learning_rate": 8.757924545353578e-06, + "loss": 0.7476, + "step": 8409 + }, + { + "epoch": 0.4628763278111068, + "grad_norm": 0.8944914937019348, + "learning_rate": 8.757638600891696e-06, + "loss": 0.7814, + "step": 8410 + }, + { + "epoch": 0.4629313666134625, + "grad_norm": 0.7242841124534607, + "learning_rate": 8.757352628188411e-06, + "loss": 0.7564, + "step": 8411 + }, + { + "epoch": 0.46298640541581815, + "grad_norm": 0.6706324815750122, + "learning_rate": 8.757066627245866e-06, + "loss": 0.7792, + "step": 8412 + }, + { + "epoch": 0.46304144421817384, + "grad_norm": 0.8044155836105347, + "learning_rate": 8.756780598066218e-06, + "loss": 0.7873, + "step": 8413 + }, + { + "epoch": 0.46309648302052947, + "grad_norm": 0.9265295267105103, + "learning_rate": 8.75649454065161e-06, + "loss": 0.878, + "step": 8414 + }, + { + "epoch": 0.46315152182288516, + "grad_norm": 0.8162378668785095, + "learning_rate": 8.756208455004194e-06, + "loss": 0.8758, + "step": 8415 + }, + { + "epoch": 0.4632065606252408, + "grad_norm": 0.7081401348114014, + "learning_rate": 8.755922341126121e-06, + "loss": 0.8053, + "step": 8416 + }, + { + "epoch": 0.4632615994275965, + "grad_norm": 0.663885235786438, + "learning_rate": 8.755636199019544e-06, + "loss": 0.7456, + "step": 8417 + }, + { + "epoch": 0.4633166382299521, + "grad_norm": 0.6934974193572998, + "learning_rate": 8.755350028686608e-06, + "loss": 0.7316, + "step": 8418 + }, + { + "epoch": 0.46337167703230775, + "grad_norm": 0.7162168025970459, + "learning_rate": 8.755063830129467e-06, + "loss": 0.8566, + "step": 8419 + }, + { + "epoch": 0.46342671583466344, + "grad_norm": 0.7507640719413757, + "learning_rate": 8.75477760335027e-06, + "loss": 0.8141, + "step": 8420 + }, + { + "epoch": 0.46348175463701907, + "grad_norm": 0.6853382587432861, + "learning_rate": 8.754491348351172e-06, + "loss": 0.6995, + "step": 8421 + }, + { + "epoch": 0.46353679343937476, + "grad_norm": 0.6421381831169128, + "learning_rate": 8.75420506513432e-06, + "loss": 0.6344, + "step": 8422 + }, + { + "epoch": 0.4635918322417304, + "grad_norm": 0.8042624592781067, + "learning_rate": 8.753918753701868e-06, + "loss": 0.7506, + "step": 8423 + }, + { + "epoch": 0.4636468710440861, + "grad_norm": 0.7184088230133057, + "learning_rate": 8.753632414055969e-06, + "loss": 0.7997, + "step": 8424 + }, + { + "epoch": 0.4637019098464417, + "grad_norm": 0.749919593334198, + "learning_rate": 8.753346046198773e-06, + "loss": 0.8168, + "step": 8425 + }, + { + "epoch": 0.4637569486487974, + "grad_norm": 0.6583670973777771, + "learning_rate": 8.753059650132433e-06, + "loss": 0.6615, + "step": 8426 + }, + { + "epoch": 0.46381198745115304, + "grad_norm": 0.7560496926307678, + "learning_rate": 8.7527732258591e-06, + "loss": 0.7221, + "step": 8427 + }, + { + "epoch": 0.46386702625350873, + "grad_norm": 0.7031972408294678, + "learning_rate": 8.752486773380928e-06, + "loss": 0.8124, + "step": 8428 + }, + { + "epoch": 0.46392206505586436, + "grad_norm": 0.684124767780304, + "learning_rate": 8.752200292700072e-06, + "loss": 0.6862, + "step": 8429 + }, + { + "epoch": 0.46397710385822005, + "grad_norm": 0.8015589118003845, + "learning_rate": 8.751913783818682e-06, + "loss": 0.7863, + "step": 8430 + }, + { + "epoch": 0.4640321426605757, + "grad_norm": 0.6815705299377441, + "learning_rate": 8.751627246738912e-06, + "loss": 0.8116, + "step": 8431 + }, + { + "epoch": 0.4640871814629314, + "grad_norm": 0.7402058839797974, + "learning_rate": 8.751340681462914e-06, + "loss": 0.7341, + "step": 8432 + }, + { + "epoch": 0.464142220265287, + "grad_norm": 0.7484470009803772, + "learning_rate": 8.751054087992848e-06, + "loss": 0.8103, + "step": 8433 + }, + { + "epoch": 0.4641972590676427, + "grad_norm": 0.8148707151412964, + "learning_rate": 8.75076746633086e-06, + "loss": 0.8995, + "step": 8434 + }, + { + "epoch": 0.46425229786999833, + "grad_norm": 0.6403086185455322, + "learning_rate": 8.750480816479107e-06, + "loss": 0.6705, + "step": 8435 + }, + { + "epoch": 0.464307336672354, + "grad_norm": 0.7787690758705139, + "learning_rate": 8.750194138439748e-06, + "loss": 0.854, + "step": 8436 + }, + { + "epoch": 0.46436237547470965, + "grad_norm": 0.6975393891334534, + "learning_rate": 8.749907432214931e-06, + "loss": 0.7588, + "step": 8437 + }, + { + "epoch": 0.46441741427706534, + "grad_norm": 0.8002430200576782, + "learning_rate": 8.749620697806812e-06, + "loss": 0.8244, + "step": 8438 + }, + { + "epoch": 0.464472453079421, + "grad_norm": 0.8049100637435913, + "learning_rate": 8.74933393521755e-06, + "loss": 0.7686, + "step": 8439 + }, + { + "epoch": 0.46452749188177667, + "grad_norm": 0.6716971397399902, + "learning_rate": 8.749047144449298e-06, + "loss": 0.7823, + "step": 8440 + }, + { + "epoch": 0.4645825306841323, + "grad_norm": 0.7292011380195618, + "learning_rate": 8.748760325504212e-06, + "loss": 0.7643, + "step": 8441 + }, + { + "epoch": 0.464637569486488, + "grad_norm": 0.6823335886001587, + "learning_rate": 8.748473478384444e-06, + "loss": 0.7539, + "step": 8442 + }, + { + "epoch": 0.4646926082888436, + "grad_norm": 0.761730968952179, + "learning_rate": 8.748186603092155e-06, + "loss": 0.7279, + "step": 8443 + }, + { + "epoch": 0.4647476470911993, + "grad_norm": 0.694007933139801, + "learning_rate": 8.747899699629498e-06, + "loss": 0.7907, + "step": 8444 + }, + { + "epoch": 0.46480268589355495, + "grad_norm": 0.7638683319091797, + "learning_rate": 8.74761276799863e-06, + "loss": 0.7278, + "step": 8445 + }, + { + "epoch": 0.46485772469591063, + "grad_norm": 0.6281229853630066, + "learning_rate": 8.747325808201708e-06, + "loss": 0.6609, + "step": 8446 + }, + { + "epoch": 0.46491276349826627, + "grad_norm": 0.7273259162902832, + "learning_rate": 8.747038820240887e-06, + "loss": 0.7553, + "step": 8447 + }, + { + "epoch": 0.46496780230062196, + "grad_norm": 0.807482898235321, + "learning_rate": 8.746751804118326e-06, + "loss": 0.7783, + "step": 8448 + }, + { + "epoch": 0.4650228411029776, + "grad_norm": 0.7088230848312378, + "learning_rate": 8.746464759836182e-06, + "loss": 0.762, + "step": 8449 + }, + { + "epoch": 0.4650778799053333, + "grad_norm": 0.7039850354194641, + "learning_rate": 8.746177687396612e-06, + "loss": 0.7811, + "step": 8450 + }, + { + "epoch": 0.4651329187076889, + "grad_norm": 0.7154161334037781, + "learning_rate": 8.745890586801773e-06, + "loss": 0.76, + "step": 8451 + }, + { + "epoch": 0.4651879575100446, + "grad_norm": 0.6738846302032471, + "learning_rate": 8.745603458053822e-06, + "loss": 0.7119, + "step": 8452 + }, + { + "epoch": 0.46524299631240024, + "grad_norm": 0.6615753173828125, + "learning_rate": 8.745316301154919e-06, + "loss": 0.8061, + "step": 8453 + }, + { + "epoch": 0.4652980351147559, + "grad_norm": 0.7285076379776001, + "learning_rate": 8.74502911610722e-06, + "loss": 0.7522, + "step": 8454 + }, + { + "epoch": 0.46535307391711156, + "grad_norm": 0.7100732922554016, + "learning_rate": 8.744741902912886e-06, + "loss": 0.7665, + "step": 8455 + }, + { + "epoch": 0.46540811271946725, + "grad_norm": 0.6564487814903259, + "learning_rate": 8.744454661574074e-06, + "loss": 0.7352, + "step": 8456 + }, + { + "epoch": 0.4654631515218229, + "grad_norm": 0.689549446105957, + "learning_rate": 8.744167392092944e-06, + "loss": 0.7011, + "step": 8457 + }, + { + "epoch": 0.46551819032417857, + "grad_norm": 0.6660958528518677, + "learning_rate": 8.743880094471651e-06, + "loss": 0.7074, + "step": 8458 + }, + { + "epoch": 0.4655732291265342, + "grad_norm": 0.7470804452896118, + "learning_rate": 8.743592768712361e-06, + "loss": 0.6684, + "step": 8459 + }, + { + "epoch": 0.4656282679288899, + "grad_norm": 0.8058002591133118, + "learning_rate": 8.743305414817227e-06, + "loss": 0.7945, + "step": 8460 + }, + { + "epoch": 0.4656833067312455, + "grad_norm": 0.7756261825561523, + "learning_rate": 8.743018032788413e-06, + "loss": 0.8442, + "step": 8461 + }, + { + "epoch": 0.46573834553360116, + "grad_norm": 0.9267478585243225, + "learning_rate": 8.742730622628077e-06, + "loss": 0.8721, + "step": 8462 + }, + { + "epoch": 0.46579338433595685, + "grad_norm": 0.8684219121932983, + "learning_rate": 8.74244318433838e-06, + "loss": 0.7833, + "step": 8463 + }, + { + "epoch": 0.4658484231383125, + "grad_norm": 0.7060475945472717, + "learning_rate": 8.742155717921481e-06, + "loss": 0.7724, + "step": 8464 + }, + { + "epoch": 0.4659034619406682, + "grad_norm": 0.7316318154335022, + "learning_rate": 8.741868223379543e-06, + "loss": 0.7489, + "step": 8465 + }, + { + "epoch": 0.4659585007430238, + "grad_norm": 0.8131282925605774, + "learning_rate": 8.741580700714724e-06, + "loss": 0.7453, + "step": 8466 + }, + { + "epoch": 0.4660135395453795, + "grad_norm": 0.6985850930213928, + "learning_rate": 8.741293149929187e-06, + "loss": 0.7083, + "step": 8467 + }, + { + "epoch": 0.46606857834773513, + "grad_norm": 0.7512301206588745, + "learning_rate": 8.74100557102509e-06, + "loss": 0.7343, + "step": 8468 + }, + { + "epoch": 0.4661236171500908, + "grad_norm": 0.7547290921211243, + "learning_rate": 8.740717964004596e-06, + "loss": 0.8358, + "step": 8469 + }, + { + "epoch": 0.46617865595244645, + "grad_norm": 0.9091271758079529, + "learning_rate": 8.740430328869868e-06, + "loss": 0.762, + "step": 8470 + }, + { + "epoch": 0.46623369475480214, + "grad_norm": 0.6960130333900452, + "learning_rate": 8.740142665623069e-06, + "loss": 0.7317, + "step": 8471 + }, + { + "epoch": 0.4662887335571578, + "grad_norm": 0.684309184551239, + "learning_rate": 8.739854974266357e-06, + "loss": 0.7653, + "step": 8472 + }, + { + "epoch": 0.46634377235951346, + "grad_norm": 0.7669411301612854, + "learning_rate": 8.739567254801898e-06, + "loss": 0.7152, + "step": 8473 + }, + { + "epoch": 0.4663988111618691, + "grad_norm": 0.7072784900665283, + "learning_rate": 8.73927950723185e-06, + "loss": 0.7508, + "step": 8474 + }, + { + "epoch": 0.4664538499642248, + "grad_norm": 0.7249277234077454, + "learning_rate": 8.73899173155838e-06, + "loss": 0.7469, + "step": 8475 + }, + { + "epoch": 0.4665088887665804, + "grad_norm": 0.7664750218391418, + "learning_rate": 8.738703927783647e-06, + "loss": 0.8692, + "step": 8476 + }, + { + "epoch": 0.4665639275689361, + "grad_norm": 0.7579765319824219, + "learning_rate": 8.738416095909818e-06, + "loss": 0.8283, + "step": 8477 + }, + { + "epoch": 0.46661896637129174, + "grad_norm": 0.7066456079483032, + "learning_rate": 8.738128235939054e-06, + "loss": 0.7125, + "step": 8478 + }, + { + "epoch": 0.46667400517364743, + "grad_norm": 0.766106367111206, + "learning_rate": 8.737840347873518e-06, + "loss": 0.7683, + "step": 8479 + }, + { + "epoch": 0.46672904397600307, + "grad_norm": 0.7599226236343384, + "learning_rate": 8.737552431715374e-06, + "loss": 0.8375, + "step": 8480 + }, + { + "epoch": 0.46678408277835876, + "grad_norm": 0.6955341100692749, + "learning_rate": 8.737264487466789e-06, + "loss": 0.7012, + "step": 8481 + }, + { + "epoch": 0.4668391215807144, + "grad_norm": 0.6096246242523193, + "learning_rate": 8.736976515129923e-06, + "loss": 0.6126, + "step": 8482 + }, + { + "epoch": 0.4668941603830701, + "grad_norm": 0.7469536066055298, + "learning_rate": 8.73668851470694e-06, + "loss": 0.7675, + "step": 8483 + }, + { + "epoch": 0.4669491991854257, + "grad_norm": 0.8018775582313538, + "learning_rate": 8.73640048620001e-06, + "loss": 0.7372, + "step": 8484 + }, + { + "epoch": 0.4670042379877814, + "grad_norm": 0.7446827292442322, + "learning_rate": 8.736112429611293e-06, + "loss": 0.7277, + "step": 8485 + }, + { + "epoch": 0.46705927679013703, + "grad_norm": 0.6292026042938232, + "learning_rate": 8.735824344942954e-06, + "loss": 0.6172, + "step": 8486 + }, + { + "epoch": 0.4671143155924927, + "grad_norm": 0.7207980751991272, + "learning_rate": 8.735536232197159e-06, + "loss": 0.8363, + "step": 8487 + }, + { + "epoch": 0.46716935439484836, + "grad_norm": 0.8585891127586365, + "learning_rate": 8.735248091376073e-06, + "loss": 0.8006, + "step": 8488 + }, + { + "epoch": 0.46722439319720405, + "grad_norm": 0.8149702548980713, + "learning_rate": 8.734959922481863e-06, + "loss": 0.7869, + "step": 8489 + }, + { + "epoch": 0.4672794319995597, + "grad_norm": 0.7113268971443176, + "learning_rate": 8.734671725516695e-06, + "loss": 0.7774, + "step": 8490 + }, + { + "epoch": 0.46733447080191537, + "grad_norm": 0.6940683722496033, + "learning_rate": 8.734383500482733e-06, + "loss": 0.7157, + "step": 8491 + }, + { + "epoch": 0.467389509604271, + "grad_norm": 0.7823536396026611, + "learning_rate": 8.734095247382145e-06, + "loss": 0.8161, + "step": 8492 + }, + { + "epoch": 0.4674445484066267, + "grad_norm": 0.7094922065734863, + "learning_rate": 8.733806966217096e-06, + "loss": 0.7593, + "step": 8493 + }, + { + "epoch": 0.4674995872089823, + "grad_norm": 0.656432569026947, + "learning_rate": 8.733518656989753e-06, + "loss": 0.7853, + "step": 8494 + }, + { + "epoch": 0.467554626011338, + "grad_norm": 0.6715715527534485, + "learning_rate": 8.733230319702284e-06, + "loss": 0.839, + "step": 8495 + }, + { + "epoch": 0.46760966481369365, + "grad_norm": 0.7496705055236816, + "learning_rate": 8.732941954356854e-06, + "loss": 0.8231, + "step": 8496 + }, + { + "epoch": 0.46766470361604934, + "grad_norm": 0.7728047370910645, + "learning_rate": 8.732653560955635e-06, + "loss": 0.7852, + "step": 8497 + }, + { + "epoch": 0.46771974241840497, + "grad_norm": 1.5637458562850952, + "learning_rate": 8.732365139500787e-06, + "loss": 0.7749, + "step": 8498 + }, + { + "epoch": 0.46777478122076066, + "grad_norm": 0.6603190898895264, + "learning_rate": 8.732076689994484e-06, + "loss": 0.6628, + "step": 8499 + }, + { + "epoch": 0.4678298200231163, + "grad_norm": 0.7170974612236023, + "learning_rate": 8.73178821243889e-06, + "loss": 0.7855, + "step": 8500 + }, + { + "epoch": 0.467884858825472, + "grad_norm": 0.7220103740692139, + "learning_rate": 8.731499706836175e-06, + "loss": 0.7035, + "step": 8501 + }, + { + "epoch": 0.4679398976278276, + "grad_norm": 0.6940942406654358, + "learning_rate": 8.731211173188507e-06, + "loss": 0.7857, + "step": 8502 + }, + { + "epoch": 0.4679949364301833, + "grad_norm": 2.441596508026123, + "learning_rate": 8.730922611498057e-06, + "loss": 0.695, + "step": 8503 + }, + { + "epoch": 0.46804997523253894, + "grad_norm": 0.7654910087585449, + "learning_rate": 8.730634021766989e-06, + "loss": 0.788, + "step": 8504 + }, + { + "epoch": 0.4681050140348946, + "grad_norm": 0.791824996471405, + "learning_rate": 8.730345403997475e-06, + "loss": 0.7899, + "step": 8505 + }, + { + "epoch": 0.46816005283725026, + "grad_norm": 0.6863934993743896, + "learning_rate": 8.730056758191682e-06, + "loss": 0.7402, + "step": 8506 + }, + { + "epoch": 0.4682150916396059, + "grad_norm": 0.7920359373092651, + "learning_rate": 8.729768084351783e-06, + "loss": 0.7835, + "step": 8507 + }, + { + "epoch": 0.4682701304419616, + "grad_norm": 0.7077129483222961, + "learning_rate": 8.729479382479944e-06, + "loss": 0.7761, + "step": 8508 + }, + { + "epoch": 0.4683251692443172, + "grad_norm": 0.6870049238204956, + "learning_rate": 8.729190652578337e-06, + "loss": 0.8169, + "step": 8509 + }, + { + "epoch": 0.4683802080466729, + "grad_norm": 0.6802713871002197, + "learning_rate": 8.728901894649131e-06, + "loss": 0.7914, + "step": 8510 + }, + { + "epoch": 0.46843524684902854, + "grad_norm": 0.6645112633705139, + "learning_rate": 8.728613108694497e-06, + "loss": 0.7543, + "step": 8511 + }, + { + "epoch": 0.46849028565138423, + "grad_norm": 0.708292543888092, + "learning_rate": 8.728324294716604e-06, + "loss": 0.7015, + "step": 8512 + }, + { + "epoch": 0.46854532445373986, + "grad_norm": 0.7444465160369873, + "learning_rate": 8.728035452717625e-06, + "loss": 0.7999, + "step": 8513 + }, + { + "epoch": 0.46860036325609555, + "grad_norm": 0.7028616666793823, + "learning_rate": 8.727746582699728e-06, + "loss": 0.8094, + "step": 8514 + }, + { + "epoch": 0.4686554020584512, + "grad_norm": 0.7063208222389221, + "learning_rate": 8.727457684665088e-06, + "loss": 0.8028, + "step": 8515 + }, + { + "epoch": 0.4687104408608069, + "grad_norm": 0.8455138802528381, + "learning_rate": 8.727168758615871e-06, + "loss": 0.7691, + "step": 8516 + }, + { + "epoch": 0.4687654796631625, + "grad_norm": 1.0325778722763062, + "learning_rate": 8.726879804554252e-06, + "loss": 0.7042, + "step": 8517 + }, + { + "epoch": 0.4688205184655182, + "grad_norm": 0.7352754473686218, + "learning_rate": 8.726590822482402e-06, + "loss": 0.8467, + "step": 8518 + }, + { + "epoch": 0.46887555726787383, + "grad_norm": 0.7247193455696106, + "learning_rate": 8.726301812402494e-06, + "loss": 0.8034, + "step": 8519 + }, + { + "epoch": 0.4689305960702295, + "grad_norm": 0.6876820921897888, + "learning_rate": 8.726012774316699e-06, + "loss": 0.7308, + "step": 8520 + }, + { + "epoch": 0.46898563487258516, + "grad_norm": 0.6987231969833374, + "learning_rate": 8.725723708227188e-06, + "loss": 0.7655, + "step": 8521 + }, + { + "epoch": 0.46904067367494084, + "grad_norm": 0.7471843361854553, + "learning_rate": 8.725434614136135e-06, + "loss": 0.7271, + "step": 8522 + }, + { + "epoch": 0.4690957124772965, + "grad_norm": 0.7564642429351807, + "learning_rate": 8.725145492045715e-06, + "loss": 0.7335, + "step": 8523 + }, + { + "epoch": 0.46915075127965217, + "grad_norm": 0.7488992214202881, + "learning_rate": 8.724856341958095e-06, + "loss": 0.8815, + "step": 8524 + }, + { + "epoch": 0.4692057900820078, + "grad_norm": 0.6776759028434753, + "learning_rate": 8.724567163875455e-06, + "loss": 0.7452, + "step": 8525 + }, + { + "epoch": 0.4692608288843635, + "grad_norm": 0.6905981302261353, + "learning_rate": 8.724277957799963e-06, + "loss": 0.6815, + "step": 8526 + }, + { + "epoch": 0.4693158676867191, + "grad_norm": 0.7392297983169556, + "learning_rate": 8.723988723733795e-06, + "loss": 0.7546, + "step": 8527 + }, + { + "epoch": 0.4693709064890748, + "grad_norm": 0.7479110360145569, + "learning_rate": 8.723699461679128e-06, + "loss": 0.7455, + "step": 8528 + }, + { + "epoch": 0.46942594529143045, + "grad_norm": 0.7231360673904419, + "learning_rate": 8.723410171638129e-06, + "loss": 0.7611, + "step": 8529 + }, + { + "epoch": 0.46948098409378614, + "grad_norm": 0.7493714690208435, + "learning_rate": 8.723120853612976e-06, + "loss": 0.6997, + "step": 8530 + }, + { + "epoch": 0.46953602289614177, + "grad_norm": 0.8056793808937073, + "learning_rate": 8.722831507605844e-06, + "loss": 0.7431, + "step": 8531 + }, + { + "epoch": 0.46959106169849746, + "grad_norm": 0.7528547048568726, + "learning_rate": 8.722542133618907e-06, + "loss": 0.8798, + "step": 8532 + }, + { + "epoch": 0.4696461005008531, + "grad_norm": 0.6964863538742065, + "learning_rate": 8.72225273165434e-06, + "loss": 0.8462, + "step": 8533 + }, + { + "epoch": 0.4697011393032088, + "grad_norm": 0.7354302406311035, + "learning_rate": 8.721963301714318e-06, + "loss": 0.7882, + "step": 8534 + }, + { + "epoch": 0.4697561781055644, + "grad_norm": 0.7365205883979797, + "learning_rate": 8.721673843801014e-06, + "loss": 0.7483, + "step": 8535 + }, + { + "epoch": 0.4698112169079201, + "grad_norm": 0.7485378384590149, + "learning_rate": 8.72138435791661e-06, + "loss": 0.8539, + "step": 8536 + }, + { + "epoch": 0.46986625571027574, + "grad_norm": 0.7674353718757629, + "learning_rate": 8.721094844063274e-06, + "loss": 0.834, + "step": 8537 + }, + { + "epoch": 0.4699212945126314, + "grad_norm": 0.7054184079170227, + "learning_rate": 8.720805302243185e-06, + "loss": 0.7938, + "step": 8538 + }, + { + "epoch": 0.46997633331498706, + "grad_norm": 0.7414574027061462, + "learning_rate": 8.72051573245852e-06, + "loss": 0.7932, + "step": 8539 + }, + { + "epoch": 0.47003137211734275, + "grad_norm": 0.6734428405761719, + "learning_rate": 8.720226134711455e-06, + "loss": 0.8775, + "step": 8540 + }, + { + "epoch": 0.4700864109196984, + "grad_norm": 0.6588559150695801, + "learning_rate": 8.719936509004166e-06, + "loss": 0.6985, + "step": 8541 + }, + { + "epoch": 0.4701414497220541, + "grad_norm": 0.6557223200798035, + "learning_rate": 8.71964685533883e-06, + "loss": 0.7243, + "step": 8542 + }, + { + "epoch": 0.4701964885244097, + "grad_norm": 0.7876269221305847, + "learning_rate": 8.719357173717624e-06, + "loss": 0.8075, + "step": 8543 + }, + { + "epoch": 0.4702515273267654, + "grad_norm": 0.8346554040908813, + "learning_rate": 8.719067464142726e-06, + "loss": 0.8427, + "step": 8544 + }, + { + "epoch": 0.47030656612912103, + "grad_norm": 0.7190483808517456, + "learning_rate": 8.718777726616311e-06, + "loss": 0.7689, + "step": 8545 + }, + { + "epoch": 0.4703616049314767, + "grad_norm": 1.303118109703064, + "learning_rate": 8.718487961140558e-06, + "loss": 0.7537, + "step": 8546 + }, + { + "epoch": 0.47041664373383235, + "grad_norm": 0.7733024954795837, + "learning_rate": 8.718198167717647e-06, + "loss": 0.747, + "step": 8547 + }, + { + "epoch": 0.470471682536188, + "grad_norm": 0.6692484617233276, + "learning_rate": 8.717908346349751e-06, + "loss": 0.725, + "step": 8548 + }, + { + "epoch": 0.4705267213385437, + "grad_norm": 0.9639461636543274, + "learning_rate": 8.717618497039054e-06, + "loss": 0.8642, + "step": 8549 + }, + { + "epoch": 0.4705817601408993, + "grad_norm": 0.7584646344184875, + "learning_rate": 8.717328619787728e-06, + "loss": 0.8174, + "step": 8550 + }, + { + "epoch": 0.470636798943255, + "grad_norm": 0.7051709890365601, + "learning_rate": 8.717038714597957e-06, + "loss": 0.7962, + "step": 8551 + }, + { + "epoch": 0.47069183774561063, + "grad_norm": 0.738913893699646, + "learning_rate": 8.716748781471918e-06, + "loss": 0.7367, + "step": 8552 + }, + { + "epoch": 0.4707468765479663, + "grad_norm": 0.7027214169502258, + "learning_rate": 8.716458820411791e-06, + "loss": 0.7613, + "step": 8553 + }, + { + "epoch": 0.47080191535032195, + "grad_norm": 0.6701993346214294, + "learning_rate": 8.716168831419754e-06, + "loss": 0.638, + "step": 8554 + }, + { + "epoch": 0.47085695415267764, + "grad_norm": 0.7422072887420654, + "learning_rate": 8.715878814497984e-06, + "loss": 0.8338, + "step": 8555 + }, + { + "epoch": 0.4709119929550333, + "grad_norm": 0.985992968082428, + "learning_rate": 8.715588769648667e-06, + "loss": 0.7765, + "step": 8556 + }, + { + "epoch": 0.47096703175738897, + "grad_norm": 0.6937553882598877, + "learning_rate": 8.715298696873978e-06, + "loss": 0.7306, + "step": 8557 + }, + { + "epoch": 0.4710220705597446, + "grad_norm": 1.1683214902877808, + "learning_rate": 8.715008596176099e-06, + "loss": 0.7782, + "step": 8558 + }, + { + "epoch": 0.4710771093621003, + "grad_norm": 0.7493681907653809, + "learning_rate": 8.714718467557209e-06, + "loss": 0.9166, + "step": 8559 + }, + { + "epoch": 0.4711321481644559, + "grad_norm": 0.7562084794044495, + "learning_rate": 8.71442831101949e-06, + "loss": 0.7999, + "step": 8560 + }, + { + "epoch": 0.4711871869668116, + "grad_norm": 0.7950266003608704, + "learning_rate": 8.71413812656512e-06, + "loss": 0.8094, + "step": 8561 + }, + { + "epoch": 0.47124222576916724, + "grad_norm": 1.1411044597625732, + "learning_rate": 8.713847914196287e-06, + "loss": 0.7631, + "step": 8562 + }, + { + "epoch": 0.47129726457152293, + "grad_norm": 0.7270122170448303, + "learning_rate": 8.713557673915162e-06, + "loss": 0.7529, + "step": 8563 + }, + { + "epoch": 0.47135230337387857, + "grad_norm": 0.8138573169708252, + "learning_rate": 8.713267405723935e-06, + "loss": 0.8215, + "step": 8564 + }, + { + "epoch": 0.47140734217623426, + "grad_norm": 0.732982873916626, + "learning_rate": 8.712977109624783e-06, + "loss": 0.7099, + "step": 8565 + }, + { + "epoch": 0.4714623809785899, + "grad_norm": 0.7307591438293457, + "learning_rate": 8.712686785619888e-06, + "loss": 0.7035, + "step": 8566 + }, + { + "epoch": 0.4715174197809456, + "grad_norm": 0.8684857487678528, + "learning_rate": 8.712396433711434e-06, + "loss": 0.8605, + "step": 8567 + }, + { + "epoch": 0.4715724585833012, + "grad_norm": 0.7490718364715576, + "learning_rate": 8.712106053901603e-06, + "loss": 0.7439, + "step": 8568 + }, + { + "epoch": 0.4716274973856569, + "grad_norm": 0.8572973012924194, + "learning_rate": 8.711815646192575e-06, + "loss": 0.8187, + "step": 8569 + }, + { + "epoch": 0.47168253618801254, + "grad_norm": 0.785270094871521, + "learning_rate": 8.711525210586536e-06, + "loss": 0.7812, + "step": 8570 + }, + { + "epoch": 0.4717375749903682, + "grad_norm": 0.683651864528656, + "learning_rate": 8.711234747085663e-06, + "loss": 0.7682, + "step": 8571 + }, + { + "epoch": 0.47179261379272386, + "grad_norm": 0.7990714907646179, + "learning_rate": 8.710944255692147e-06, + "loss": 0.8114, + "step": 8572 + }, + { + "epoch": 0.47184765259507955, + "grad_norm": 0.9354856610298157, + "learning_rate": 8.710653736408165e-06, + "loss": 0.7353, + "step": 8573 + }, + { + "epoch": 0.4719026913974352, + "grad_norm": 0.8309356570243835, + "learning_rate": 8.710363189235904e-06, + "loss": 0.8635, + "step": 8574 + }, + { + "epoch": 0.47195773019979087, + "grad_norm": 0.7018463015556335, + "learning_rate": 8.710072614177547e-06, + "loss": 0.6372, + "step": 8575 + }, + { + "epoch": 0.4720127690021465, + "grad_norm": 0.7626469135284424, + "learning_rate": 8.709782011235277e-06, + "loss": 0.7684, + "step": 8576 + }, + { + "epoch": 0.4720678078045022, + "grad_norm": 0.6995826959609985, + "learning_rate": 8.70949138041128e-06, + "loss": 0.7301, + "step": 8577 + }, + { + "epoch": 0.4721228466068578, + "grad_norm": 0.719307541847229, + "learning_rate": 8.709200721707736e-06, + "loss": 0.7437, + "step": 8578 + }, + { + "epoch": 0.4721778854092135, + "grad_norm": 0.7355539202690125, + "learning_rate": 8.708910035126832e-06, + "loss": 0.7926, + "step": 8579 + }, + { + "epoch": 0.47223292421156915, + "grad_norm": 0.7262680530548096, + "learning_rate": 8.708619320670755e-06, + "loss": 0.7641, + "step": 8580 + }, + { + "epoch": 0.47228796301392484, + "grad_norm": 0.844745934009552, + "learning_rate": 8.708328578341687e-06, + "loss": 0.7228, + "step": 8581 + }, + { + "epoch": 0.47234300181628047, + "grad_norm": 0.8169287443161011, + "learning_rate": 8.708037808141814e-06, + "loss": 0.7076, + "step": 8582 + }, + { + "epoch": 0.47239804061863616, + "grad_norm": 0.7342209219932556, + "learning_rate": 8.707747010073322e-06, + "loss": 0.7997, + "step": 8583 + }, + { + "epoch": 0.4724530794209918, + "grad_norm": 0.7138200402259827, + "learning_rate": 8.707456184138394e-06, + "loss": 0.7796, + "step": 8584 + }, + { + "epoch": 0.4725081182233475, + "grad_norm": 0.7168061137199402, + "learning_rate": 8.70716533033922e-06, + "loss": 0.6876, + "step": 8585 + }, + { + "epoch": 0.4725631570257031, + "grad_norm": 0.7256397604942322, + "learning_rate": 8.706874448677982e-06, + "loss": 0.8296, + "step": 8586 + }, + { + "epoch": 0.4726181958280588, + "grad_norm": 0.8232730627059937, + "learning_rate": 8.70658353915687e-06, + "loss": 0.8001, + "step": 8587 + }, + { + "epoch": 0.47267323463041444, + "grad_norm": 0.7110162973403931, + "learning_rate": 8.706292601778067e-06, + "loss": 0.7061, + "step": 8588 + }, + { + "epoch": 0.47272827343277013, + "grad_norm": 0.9466721415519714, + "learning_rate": 8.706001636543761e-06, + "loss": 0.8713, + "step": 8589 + }, + { + "epoch": 0.47278331223512576, + "grad_norm": 0.7017776370048523, + "learning_rate": 8.705710643456138e-06, + "loss": 0.759, + "step": 8590 + }, + { + "epoch": 0.4728383510374814, + "grad_norm": 0.7140772938728333, + "learning_rate": 8.705419622517386e-06, + "loss": 0.6962, + "step": 8591 + }, + { + "epoch": 0.4728933898398371, + "grad_norm": 1.1076452732086182, + "learning_rate": 8.705128573729694e-06, + "loss": 0.8264, + "step": 8592 + }, + { + "epoch": 0.4729484286421927, + "grad_norm": 0.7308200597763062, + "learning_rate": 8.704837497095247e-06, + "loss": 0.6243, + "step": 8593 + }, + { + "epoch": 0.4730034674445484, + "grad_norm": 0.9445781111717224, + "learning_rate": 8.704546392616231e-06, + "loss": 0.6676, + "step": 8594 + }, + { + "epoch": 0.47305850624690404, + "grad_norm": 0.6527873277664185, + "learning_rate": 8.704255260294837e-06, + "loss": 0.6979, + "step": 8595 + }, + { + "epoch": 0.47311354504925973, + "grad_norm": 0.6732963919639587, + "learning_rate": 8.703964100133252e-06, + "loss": 0.7724, + "step": 8596 + }, + { + "epoch": 0.47316858385161537, + "grad_norm": 0.7661726474761963, + "learning_rate": 8.703672912133665e-06, + "loss": 0.7988, + "step": 8597 + }, + { + "epoch": 0.47322362265397105, + "grad_norm": 0.7006877660751343, + "learning_rate": 8.703381696298262e-06, + "loss": 0.6765, + "step": 8598 + }, + { + "epoch": 0.4732786614563267, + "grad_norm": 0.7195086479187012, + "learning_rate": 8.703090452629236e-06, + "loss": 0.6676, + "step": 8599 + }, + { + "epoch": 0.4733337002586824, + "grad_norm": 0.6692042350769043, + "learning_rate": 8.702799181128771e-06, + "loss": 0.7882, + "step": 8600 + }, + { + "epoch": 0.473388739061038, + "grad_norm": 0.7736524343490601, + "learning_rate": 8.70250788179906e-06, + "loss": 0.7977, + "step": 8601 + }, + { + "epoch": 0.4734437778633937, + "grad_norm": 0.8821607828140259, + "learning_rate": 8.70221655464229e-06, + "loss": 0.7465, + "step": 8602 + }, + { + "epoch": 0.47349881666574933, + "grad_norm": 0.7565156817436218, + "learning_rate": 8.701925199660652e-06, + "loss": 0.831, + "step": 8603 + }, + { + "epoch": 0.473553855468105, + "grad_norm": 0.8542304039001465, + "learning_rate": 8.701633816856335e-06, + "loss": 0.7538, + "step": 8604 + }, + { + "epoch": 0.47360889427046066, + "grad_norm": 0.6891050338745117, + "learning_rate": 8.701342406231529e-06, + "loss": 0.7687, + "step": 8605 + }, + { + "epoch": 0.47366393307281635, + "grad_norm": 0.8570719361305237, + "learning_rate": 8.701050967788424e-06, + "loss": 0.7236, + "step": 8606 + }, + { + "epoch": 0.473718971875172, + "grad_norm": 0.7921456098556519, + "learning_rate": 8.700759501529212e-06, + "loss": 0.8214, + "step": 8607 + }, + { + "epoch": 0.47377401067752767, + "grad_norm": 0.7584527730941772, + "learning_rate": 8.70046800745608e-06, + "loss": 0.8204, + "step": 8608 + }, + { + "epoch": 0.4738290494798833, + "grad_norm": 0.8033978343009949, + "learning_rate": 8.700176485571222e-06, + "loss": 0.8278, + "step": 8609 + }, + { + "epoch": 0.473884088282239, + "grad_norm": 0.9950750470161438, + "learning_rate": 8.699884935876828e-06, + "loss": 0.8181, + "step": 8610 + }, + { + "epoch": 0.4739391270845946, + "grad_norm": 0.7213684916496277, + "learning_rate": 8.69959335837509e-06, + "loss": 0.7099, + "step": 8611 + }, + { + "epoch": 0.4739941658869503, + "grad_norm": 0.7847200632095337, + "learning_rate": 8.699301753068199e-06, + "loss": 0.8272, + "step": 8612 + }, + { + "epoch": 0.47404920468930595, + "grad_norm": 0.7075058221817017, + "learning_rate": 8.699010119958344e-06, + "loss": 0.7127, + "step": 8613 + }, + { + "epoch": 0.47410424349166164, + "grad_norm": 0.682741641998291, + "learning_rate": 8.69871845904772e-06, + "loss": 0.8446, + "step": 8614 + }, + { + "epoch": 0.47415928229401727, + "grad_norm": 0.7120605111122131, + "learning_rate": 8.69842677033852e-06, + "loss": 0.7776, + "step": 8615 + }, + { + "epoch": 0.47421432109637296, + "grad_norm": 0.822405219078064, + "learning_rate": 8.698135053832933e-06, + "loss": 0.8018, + "step": 8616 + }, + { + "epoch": 0.4742693598987286, + "grad_norm": 0.6815186738967896, + "learning_rate": 8.697843309533152e-06, + "loss": 0.7413, + "step": 8617 + }, + { + "epoch": 0.4743243987010843, + "grad_norm": 0.7587849497795105, + "learning_rate": 8.69755153744137e-06, + "loss": 0.7809, + "step": 8618 + }, + { + "epoch": 0.4743794375034399, + "grad_norm": 0.7092488408088684, + "learning_rate": 8.697259737559782e-06, + "loss": 0.7921, + "step": 8619 + }, + { + "epoch": 0.4744344763057956, + "grad_norm": 0.7396836280822754, + "learning_rate": 8.69696790989058e-06, + "loss": 0.7946, + "step": 8620 + }, + { + "epoch": 0.47448951510815124, + "grad_norm": 0.6760729551315308, + "learning_rate": 8.696676054435955e-06, + "loss": 0.7389, + "step": 8621 + }, + { + "epoch": 0.4745445539105069, + "grad_norm": 1.1640692949295044, + "learning_rate": 8.696384171198105e-06, + "loss": 0.8291, + "step": 8622 + }, + { + "epoch": 0.47459959271286256, + "grad_norm": 0.7415158152580261, + "learning_rate": 8.696092260179219e-06, + "loss": 0.7534, + "step": 8623 + }, + { + "epoch": 0.47465463151521825, + "grad_norm": 0.7730052471160889, + "learning_rate": 8.695800321381492e-06, + "loss": 0.8447, + "step": 8624 + }, + { + "epoch": 0.4747096703175739, + "grad_norm": 0.811522364616394, + "learning_rate": 8.695508354807121e-06, + "loss": 0.7466, + "step": 8625 + }, + { + "epoch": 0.4747647091199296, + "grad_norm": 0.7908332347869873, + "learning_rate": 8.695216360458298e-06, + "loss": 0.7769, + "step": 8626 + }, + { + "epoch": 0.4748197479222852, + "grad_norm": 0.744971752166748, + "learning_rate": 8.694924338337217e-06, + "loss": 0.7651, + "step": 8627 + }, + { + "epoch": 0.4748747867246409, + "grad_norm": 0.705565869808197, + "learning_rate": 8.694632288446075e-06, + "loss": 0.8258, + "step": 8628 + }, + { + "epoch": 0.47492982552699653, + "grad_norm": 0.8199328780174255, + "learning_rate": 8.694340210787065e-06, + "loss": 0.733, + "step": 8629 + }, + { + "epoch": 0.4749848643293522, + "grad_norm": 0.6965511441230774, + "learning_rate": 8.694048105362382e-06, + "loss": 0.7548, + "step": 8630 + }, + { + "epoch": 0.47503990313170785, + "grad_norm": 0.7943055629730225, + "learning_rate": 8.693755972174225e-06, + "loss": 0.7518, + "step": 8631 + }, + { + "epoch": 0.47509494193406354, + "grad_norm": 0.6277437806129456, + "learning_rate": 8.693463811224785e-06, + "loss": 0.6941, + "step": 8632 + }, + { + "epoch": 0.4751499807364192, + "grad_norm": 1.0745574235916138, + "learning_rate": 8.693171622516259e-06, + "loss": 0.8056, + "step": 8633 + }, + { + "epoch": 0.4752050195387748, + "grad_norm": 0.7005153894424438, + "learning_rate": 8.692879406050844e-06, + "loss": 0.757, + "step": 8634 + }, + { + "epoch": 0.4752600583411305, + "grad_norm": 0.6971127986907959, + "learning_rate": 8.692587161830737e-06, + "loss": 0.7509, + "step": 8635 + }, + { + "epoch": 0.47531509714348613, + "grad_norm": 0.7583497762680054, + "learning_rate": 8.692294889858133e-06, + "loss": 0.7895, + "step": 8636 + }, + { + "epoch": 0.4753701359458418, + "grad_norm": 0.719932496547699, + "learning_rate": 8.692002590135228e-06, + "loss": 0.762, + "step": 8637 + }, + { + "epoch": 0.47542517474819745, + "grad_norm": 0.7041804790496826, + "learning_rate": 8.691710262664222e-06, + "loss": 0.7101, + "step": 8638 + }, + { + "epoch": 0.47548021355055314, + "grad_norm": 0.7395016551017761, + "learning_rate": 8.691417907447309e-06, + "loss": 0.723, + "step": 8639 + }, + { + "epoch": 0.4755352523529088, + "grad_norm": 0.6605637073516846, + "learning_rate": 8.691125524486686e-06, + "loss": 0.644, + "step": 8640 + }, + { + "epoch": 0.47559029115526447, + "grad_norm": 0.694732129573822, + "learning_rate": 8.690833113784552e-06, + "loss": 0.7162, + "step": 8641 + }, + { + "epoch": 0.4756453299576201, + "grad_norm": 0.7622451186180115, + "learning_rate": 8.690540675343105e-06, + "loss": 0.6995, + "step": 8642 + }, + { + "epoch": 0.4757003687599758, + "grad_norm": 0.6961628794670105, + "learning_rate": 8.69024820916454e-06, + "loss": 0.7955, + "step": 8643 + }, + { + "epoch": 0.4757554075623314, + "grad_norm": 0.706266462802887, + "learning_rate": 8.68995571525106e-06, + "loss": 0.7237, + "step": 8644 + }, + { + "epoch": 0.4758104463646871, + "grad_norm": 0.7727495431900024, + "learning_rate": 8.689663193604858e-06, + "loss": 0.7215, + "step": 8645 + }, + { + "epoch": 0.47586548516704275, + "grad_norm": 0.7320648431777954, + "learning_rate": 8.689370644228136e-06, + "loss": 0.7592, + "step": 8646 + }, + { + "epoch": 0.47592052396939843, + "grad_norm": 0.8149487376213074, + "learning_rate": 8.689078067123093e-06, + "loss": 0.7666, + "step": 8647 + }, + { + "epoch": 0.47597556277175407, + "grad_norm": 0.6584552526473999, + "learning_rate": 8.688785462291927e-06, + "loss": 0.7497, + "step": 8648 + }, + { + "epoch": 0.47603060157410976, + "grad_norm": 0.7197825312614441, + "learning_rate": 8.688492829736836e-06, + "loss": 0.7559, + "step": 8649 + }, + { + "epoch": 0.4760856403764654, + "grad_norm": 0.8116913437843323, + "learning_rate": 8.68820016946002e-06, + "loss": 0.7029, + "step": 8650 + }, + { + "epoch": 0.4761406791788211, + "grad_norm": 0.6733378171920776, + "learning_rate": 8.68790748146368e-06, + "loss": 0.7242, + "step": 8651 + }, + { + "epoch": 0.4761957179811767, + "grad_norm": 0.690464437007904, + "learning_rate": 8.687614765750012e-06, + "loss": 0.6668, + "step": 8652 + }, + { + "epoch": 0.4762507567835324, + "grad_norm": 0.7901185154914856, + "learning_rate": 8.687322022321221e-06, + "loss": 0.7436, + "step": 8653 + }, + { + "epoch": 0.47630579558588804, + "grad_norm": 0.7608267068862915, + "learning_rate": 8.687029251179504e-06, + "loss": 0.8292, + "step": 8654 + }, + { + "epoch": 0.4763608343882437, + "grad_norm": 0.6851119995117188, + "learning_rate": 8.686736452327062e-06, + "loss": 0.7974, + "step": 8655 + }, + { + "epoch": 0.47641587319059936, + "grad_norm": 0.6946395635604858, + "learning_rate": 8.686443625766094e-06, + "loss": 0.6745, + "step": 8656 + }, + { + "epoch": 0.47647091199295505, + "grad_norm": 0.7403521537780762, + "learning_rate": 8.686150771498804e-06, + "loss": 0.7759, + "step": 8657 + }, + { + "epoch": 0.4765259507953107, + "grad_norm": 0.8415689468383789, + "learning_rate": 8.685857889527393e-06, + "loss": 0.7911, + "step": 8658 + }, + { + "epoch": 0.47658098959766637, + "grad_norm": 0.6947778463363647, + "learning_rate": 8.68556497985406e-06, + "loss": 0.8026, + "step": 8659 + }, + { + "epoch": 0.476636028400022, + "grad_norm": 0.6807059645652771, + "learning_rate": 8.685272042481006e-06, + "loss": 0.7194, + "step": 8660 + }, + { + "epoch": 0.4766910672023777, + "grad_norm": 0.8948639631271362, + "learning_rate": 8.684979077410434e-06, + "loss": 0.8017, + "step": 8661 + }, + { + "epoch": 0.4767461060047333, + "grad_norm": 0.6697849035263062, + "learning_rate": 8.684686084644546e-06, + "loss": 0.7653, + "step": 8662 + }, + { + "epoch": 0.476801144807089, + "grad_norm": 0.7303311228752136, + "learning_rate": 8.684393064185543e-06, + "loss": 0.8287, + "step": 8663 + }, + { + "epoch": 0.47685618360944465, + "grad_norm": 0.6545100808143616, + "learning_rate": 8.68410001603563e-06, + "loss": 0.7438, + "step": 8664 + }, + { + "epoch": 0.47691122241180034, + "grad_norm": 0.8757766485214233, + "learning_rate": 8.683806940197006e-06, + "loss": 0.8343, + "step": 8665 + }, + { + "epoch": 0.476966261214156, + "grad_norm": 0.6414330005645752, + "learning_rate": 8.683513836671876e-06, + "loss": 0.7201, + "step": 8666 + }, + { + "epoch": 0.47702130001651166, + "grad_norm": 0.6736441850662231, + "learning_rate": 8.68322070546244e-06, + "loss": 0.7365, + "step": 8667 + }, + { + "epoch": 0.4770763388188673, + "grad_norm": 0.780491054058075, + "learning_rate": 8.682927546570905e-06, + "loss": 0.924, + "step": 8668 + }, + { + "epoch": 0.477131377621223, + "grad_norm": 0.6913807988166809, + "learning_rate": 8.68263435999947e-06, + "loss": 0.8269, + "step": 8669 + }, + { + "epoch": 0.4771864164235786, + "grad_norm": 0.7264360189437866, + "learning_rate": 8.682341145750344e-06, + "loss": 0.788, + "step": 8670 + }, + { + "epoch": 0.4772414552259343, + "grad_norm": 0.7777243852615356, + "learning_rate": 8.682047903825725e-06, + "loss": 0.8691, + "step": 8671 + }, + { + "epoch": 0.47729649402828994, + "grad_norm": 0.7590457797050476, + "learning_rate": 8.681754634227821e-06, + "loss": 0.8249, + "step": 8672 + }, + { + "epoch": 0.47735153283064563, + "grad_norm": 0.7672324776649475, + "learning_rate": 8.681461336958836e-06, + "loss": 0.8334, + "step": 8673 + }, + { + "epoch": 0.47740657163300126, + "grad_norm": 0.7181395888328552, + "learning_rate": 8.681168012020971e-06, + "loss": 0.8089, + "step": 8674 + }, + { + "epoch": 0.47746161043535695, + "grad_norm": 0.7671428918838501, + "learning_rate": 8.680874659416433e-06, + "loss": 0.7634, + "step": 8675 + }, + { + "epoch": 0.4775166492377126, + "grad_norm": 0.73219895362854, + "learning_rate": 8.680581279147427e-06, + "loss": 0.7013, + "step": 8676 + }, + { + "epoch": 0.4775716880400682, + "grad_norm": 0.8050867319107056, + "learning_rate": 8.680287871216158e-06, + "loss": 0.7524, + "step": 8677 + }, + { + "epoch": 0.4776267268424239, + "grad_norm": 0.7154340744018555, + "learning_rate": 8.679994435624828e-06, + "loss": 0.802, + "step": 8678 + }, + { + "epoch": 0.47768176564477954, + "grad_norm": 0.7005884051322937, + "learning_rate": 8.679700972375647e-06, + "loss": 0.7633, + "step": 8679 + }, + { + "epoch": 0.47773680444713523, + "grad_norm": 0.8203871846199036, + "learning_rate": 8.679407481470818e-06, + "loss": 0.7782, + "step": 8680 + }, + { + "epoch": 0.47779184324949087, + "grad_norm": 0.6582844853401184, + "learning_rate": 8.679113962912547e-06, + "loss": 0.6799, + "step": 8681 + }, + { + "epoch": 0.47784688205184656, + "grad_norm": 0.7052889466285706, + "learning_rate": 8.67882041670304e-06, + "loss": 0.7814, + "step": 8682 + }, + { + "epoch": 0.4779019208542022, + "grad_norm": 0.7533165812492371, + "learning_rate": 8.678526842844504e-06, + "loss": 0.7983, + "step": 8683 + }, + { + "epoch": 0.4779569596565579, + "grad_norm": 0.7335212230682373, + "learning_rate": 8.678233241339144e-06, + "loss": 0.8023, + "step": 8684 + }, + { + "epoch": 0.4780119984589135, + "grad_norm": 0.7824274897575378, + "learning_rate": 8.67793961218917e-06, + "loss": 0.8219, + "step": 8685 + }, + { + "epoch": 0.4780670372612692, + "grad_norm": 0.6547996401786804, + "learning_rate": 8.677645955396784e-06, + "loss": 0.715, + "step": 8686 + }, + { + "epoch": 0.47812207606362483, + "grad_norm": 0.7507368326187134, + "learning_rate": 8.677352270964196e-06, + "loss": 0.9379, + "step": 8687 + }, + { + "epoch": 0.4781771148659805, + "grad_norm": 0.6403020620346069, + "learning_rate": 8.677058558893613e-06, + "loss": 0.659, + "step": 8688 + }, + { + "epoch": 0.47823215366833616, + "grad_norm": 0.7075803279876709, + "learning_rate": 8.676764819187242e-06, + "loss": 0.7515, + "step": 8689 + }, + { + "epoch": 0.47828719247069185, + "grad_norm": 0.6899601817131042, + "learning_rate": 8.676471051847291e-06, + "loss": 0.8398, + "step": 8690 + }, + { + "epoch": 0.4783422312730475, + "grad_norm": 0.7145645618438721, + "learning_rate": 8.676177256875969e-06, + "loss": 0.7711, + "step": 8691 + }, + { + "epoch": 0.47839727007540317, + "grad_norm": 0.7139655351638794, + "learning_rate": 8.675883434275479e-06, + "loss": 0.8664, + "step": 8692 + }, + { + "epoch": 0.4784523088777588, + "grad_norm": 0.7100433111190796, + "learning_rate": 8.675589584048037e-06, + "loss": 0.7812, + "step": 8693 + }, + { + "epoch": 0.4785073476801145, + "grad_norm": 0.6103882789611816, + "learning_rate": 8.675295706195845e-06, + "loss": 0.6565, + "step": 8694 + }, + { + "epoch": 0.4785623864824701, + "grad_norm": 0.7236714959144592, + "learning_rate": 8.675001800721114e-06, + "loss": 0.6849, + "step": 8695 + }, + { + "epoch": 0.4786174252848258, + "grad_norm": 0.7567160129547119, + "learning_rate": 8.674707867626056e-06, + "loss": 0.8289, + "step": 8696 + }, + { + "epoch": 0.47867246408718145, + "grad_norm": 0.7004136443138123, + "learning_rate": 8.674413906912876e-06, + "loss": 0.7466, + "step": 8697 + }, + { + "epoch": 0.47872750288953714, + "grad_norm": 0.713835597038269, + "learning_rate": 8.674119918583783e-06, + "loss": 0.7875, + "step": 8698 + }, + { + "epoch": 0.47878254169189277, + "grad_norm": 0.8476874232292175, + "learning_rate": 8.67382590264099e-06, + "loss": 0.8028, + "step": 8699 + }, + { + "epoch": 0.47883758049424846, + "grad_norm": 0.720273494720459, + "learning_rate": 8.673531859086706e-06, + "loss": 0.7829, + "step": 8700 + }, + { + "epoch": 0.4788926192966041, + "grad_norm": 0.8042417168617249, + "learning_rate": 8.673237787923137e-06, + "loss": 0.7914, + "step": 8701 + }, + { + "epoch": 0.4789476580989598, + "grad_norm": 0.7779260277748108, + "learning_rate": 8.672943689152498e-06, + "loss": 0.6921, + "step": 8702 + }, + { + "epoch": 0.4790026969013154, + "grad_norm": 0.7957637906074524, + "learning_rate": 8.672649562776997e-06, + "loss": 0.8761, + "step": 8703 + }, + { + "epoch": 0.4790577357036711, + "grad_norm": 0.7467649579048157, + "learning_rate": 8.672355408798845e-06, + "loss": 0.7984, + "step": 8704 + }, + { + "epoch": 0.47911277450602674, + "grad_norm": 0.6746538877487183, + "learning_rate": 8.672061227220252e-06, + "loss": 0.7392, + "step": 8705 + }, + { + "epoch": 0.47916781330838243, + "grad_norm": 0.7331795692443848, + "learning_rate": 8.671767018043432e-06, + "loss": 0.7171, + "step": 8706 + }, + { + "epoch": 0.47922285211073806, + "grad_norm": 0.7879608273506165, + "learning_rate": 8.671472781270592e-06, + "loss": 0.8497, + "step": 8707 + }, + { + "epoch": 0.47927789091309375, + "grad_norm": 0.8659428358078003, + "learning_rate": 8.671178516903946e-06, + "loss": 0.8102, + "step": 8708 + }, + { + "epoch": 0.4793329297154494, + "grad_norm": 0.6489408612251282, + "learning_rate": 8.670884224945704e-06, + "loss": 0.6752, + "step": 8709 + }, + { + "epoch": 0.4793879685178051, + "grad_norm": 0.8182825446128845, + "learning_rate": 8.670589905398079e-06, + "loss": 0.7972, + "step": 8710 + }, + { + "epoch": 0.4794430073201607, + "grad_norm": 0.7759343981742859, + "learning_rate": 8.670295558263285e-06, + "loss": 0.7856, + "step": 8711 + }, + { + "epoch": 0.4794980461225164, + "grad_norm": 0.7421835064888, + "learning_rate": 8.670001183543528e-06, + "loss": 0.8165, + "step": 8712 + }, + { + "epoch": 0.47955308492487203, + "grad_norm": 0.6498512625694275, + "learning_rate": 8.669706781241028e-06, + "loss": 0.7212, + "step": 8713 + }, + { + "epoch": 0.4796081237272277, + "grad_norm": 0.8493219614028931, + "learning_rate": 8.669412351357993e-06, + "loss": 0.8036, + "step": 8714 + }, + { + "epoch": 0.47966316252958335, + "grad_norm": 0.6834331750869751, + "learning_rate": 8.669117893896637e-06, + "loss": 0.8127, + "step": 8715 + }, + { + "epoch": 0.47971820133193904, + "grad_norm": 0.7793670296669006, + "learning_rate": 8.668823408859172e-06, + "loss": 0.7276, + "step": 8716 + }, + { + "epoch": 0.4797732401342947, + "grad_norm": 0.7108075022697449, + "learning_rate": 8.668528896247815e-06, + "loss": 0.8328, + "step": 8717 + }, + { + "epoch": 0.47982827893665037, + "grad_norm": 0.6662433743476868, + "learning_rate": 8.668234356064774e-06, + "loss": 0.6751, + "step": 8718 + }, + { + "epoch": 0.479883317739006, + "grad_norm": 0.6595591902732849, + "learning_rate": 8.667939788312267e-06, + "loss": 0.707, + "step": 8719 + }, + { + "epoch": 0.47993835654136163, + "grad_norm": 0.7435836791992188, + "learning_rate": 8.667645192992506e-06, + "loss": 0.7885, + "step": 8720 + }, + { + "epoch": 0.4799933953437173, + "grad_norm": 0.6999356746673584, + "learning_rate": 8.667350570107706e-06, + "loss": 0.7538, + "step": 8721 + }, + { + "epoch": 0.48004843414607296, + "grad_norm": 0.7111191749572754, + "learning_rate": 8.66705591966008e-06, + "loss": 0.6814, + "step": 8722 + }, + { + "epoch": 0.48010347294842864, + "grad_norm": 0.6752734780311584, + "learning_rate": 8.666761241651844e-06, + "loss": 0.7221, + "step": 8723 + }, + { + "epoch": 0.4801585117507843, + "grad_norm": 0.7432951331138611, + "learning_rate": 8.666466536085212e-06, + "loss": 0.7689, + "step": 8724 + }, + { + "epoch": 0.48021355055313997, + "grad_norm": 0.7384392023086548, + "learning_rate": 8.666171802962398e-06, + "loss": 0.7862, + "step": 8725 + }, + { + "epoch": 0.4802685893554956, + "grad_norm": 0.6878762245178223, + "learning_rate": 8.66587704228562e-06, + "loss": 0.7246, + "step": 8726 + }, + { + "epoch": 0.4803236281578513, + "grad_norm": 0.6640586853027344, + "learning_rate": 8.66558225405709e-06, + "loss": 0.7181, + "step": 8727 + }, + { + "epoch": 0.4803786669602069, + "grad_norm": 0.6808595061302185, + "learning_rate": 8.665287438279024e-06, + "loss": 0.7866, + "step": 8728 + }, + { + "epoch": 0.4804337057625626, + "grad_norm": 0.5966268181800842, + "learning_rate": 8.66499259495364e-06, + "loss": 0.6755, + "step": 8729 + }, + { + "epoch": 0.48048874456491825, + "grad_norm": 0.742016077041626, + "learning_rate": 8.664697724083152e-06, + "loss": 0.8682, + "step": 8730 + }, + { + "epoch": 0.48054378336727394, + "grad_norm": 0.6621154546737671, + "learning_rate": 8.66440282566978e-06, + "loss": 0.7525, + "step": 8731 + }, + { + "epoch": 0.48059882216962957, + "grad_norm": 0.7347434759140015, + "learning_rate": 8.664107899715733e-06, + "loss": 0.7919, + "step": 8732 + }, + { + "epoch": 0.48065386097198526, + "grad_norm": 0.7564681172370911, + "learning_rate": 8.663812946223234e-06, + "loss": 0.9172, + "step": 8733 + }, + { + "epoch": 0.4807088997743409, + "grad_norm": 0.7193084359169006, + "learning_rate": 8.663517965194497e-06, + "loss": 0.7931, + "step": 8734 + }, + { + "epoch": 0.4807639385766966, + "grad_norm": 0.6882064938545227, + "learning_rate": 8.66322295663174e-06, + "loss": 0.7678, + "step": 8735 + }, + { + "epoch": 0.4808189773790522, + "grad_norm": 0.7954713106155396, + "learning_rate": 8.662927920537179e-06, + "loss": 0.6357, + "step": 8736 + }, + { + "epoch": 0.4808740161814079, + "grad_norm": 0.7123041749000549, + "learning_rate": 8.662632856913034e-06, + "loss": 0.7234, + "step": 8737 + }, + { + "epoch": 0.48092905498376354, + "grad_norm": 0.745145320892334, + "learning_rate": 8.66233776576152e-06, + "loss": 0.7516, + "step": 8738 + }, + { + "epoch": 0.4809840937861192, + "grad_norm": 0.6904219388961792, + "learning_rate": 8.662042647084856e-06, + "loss": 0.7995, + "step": 8739 + }, + { + "epoch": 0.48103913258847486, + "grad_norm": 0.71831214427948, + "learning_rate": 8.661747500885258e-06, + "loss": 0.7965, + "step": 8740 + }, + { + "epoch": 0.48109417139083055, + "grad_norm": 0.8514378666877747, + "learning_rate": 8.661452327164948e-06, + "loss": 0.8023, + "step": 8741 + }, + { + "epoch": 0.4811492101931862, + "grad_norm": 0.7411143779754639, + "learning_rate": 8.66115712592614e-06, + "loss": 0.797, + "step": 8742 + }, + { + "epoch": 0.4812042489955419, + "grad_norm": 0.737178385257721, + "learning_rate": 8.660861897171057e-06, + "loss": 0.7286, + "step": 8743 + }, + { + "epoch": 0.4812592877978975, + "grad_norm": 0.6823513507843018, + "learning_rate": 8.660566640901918e-06, + "loss": 0.7482, + "step": 8744 + }, + { + "epoch": 0.4813143266002532, + "grad_norm": 0.7205879092216492, + "learning_rate": 8.660271357120937e-06, + "loss": 0.8294, + "step": 8745 + }, + { + "epoch": 0.48136936540260883, + "grad_norm": 0.6887338757514954, + "learning_rate": 8.659976045830337e-06, + "loss": 0.7711, + "step": 8746 + }, + { + "epoch": 0.4814244042049645, + "grad_norm": 0.7498533129692078, + "learning_rate": 8.659680707032336e-06, + "loss": 0.7296, + "step": 8747 + }, + { + "epoch": 0.48147944300732015, + "grad_norm": 0.8041636943817139, + "learning_rate": 8.659385340729155e-06, + "loss": 0.9213, + "step": 8748 + }, + { + "epoch": 0.48153448180967584, + "grad_norm": 0.8623721599578857, + "learning_rate": 8.659089946923014e-06, + "loss": 0.8024, + "step": 8749 + }, + { + "epoch": 0.4815895206120315, + "grad_norm": 0.7212050557136536, + "learning_rate": 8.658794525616132e-06, + "loss": 0.732, + "step": 8750 + }, + { + "epoch": 0.48164455941438716, + "grad_norm": 0.7141492366790771, + "learning_rate": 8.658499076810729e-06, + "loss": 0.8062, + "step": 8751 + }, + { + "epoch": 0.4816995982167428, + "grad_norm": 0.7191516160964966, + "learning_rate": 8.658203600509027e-06, + "loss": 0.805, + "step": 8752 + }, + { + "epoch": 0.4817546370190985, + "grad_norm": 0.71059650182724, + "learning_rate": 8.657908096713245e-06, + "loss": 0.6755, + "step": 8753 + }, + { + "epoch": 0.4818096758214541, + "grad_norm": 0.6715459823608398, + "learning_rate": 8.657612565425607e-06, + "loss": 0.8093, + "step": 8754 + }, + { + "epoch": 0.4818647146238098, + "grad_norm": 0.7438814640045166, + "learning_rate": 8.65731700664833e-06, + "loss": 0.8059, + "step": 8755 + }, + { + "epoch": 0.48191975342616544, + "grad_norm": 0.7295387387275696, + "learning_rate": 8.657021420383637e-06, + "loss": 0.8437, + "step": 8756 + }, + { + "epoch": 0.48197479222852113, + "grad_norm": 0.7053797245025635, + "learning_rate": 8.656725806633753e-06, + "loss": 0.8424, + "step": 8757 + }, + { + "epoch": 0.48202983103087677, + "grad_norm": 0.6902007460594177, + "learning_rate": 8.656430165400894e-06, + "loss": 0.6967, + "step": 8758 + }, + { + "epoch": 0.48208486983323245, + "grad_norm": 0.66749507188797, + "learning_rate": 8.656134496687286e-06, + "loss": 0.7858, + "step": 8759 + }, + { + "epoch": 0.4821399086355881, + "grad_norm": 0.6755428314208984, + "learning_rate": 8.65583880049515e-06, + "loss": 0.6669, + "step": 8760 + }, + { + "epoch": 0.4821949474379438, + "grad_norm": 0.921096920967102, + "learning_rate": 8.655543076826706e-06, + "loss": 0.8545, + "step": 8761 + }, + { + "epoch": 0.4822499862402994, + "grad_norm": 0.7931553721427917, + "learning_rate": 8.65524732568418e-06, + "loss": 0.8708, + "step": 8762 + }, + { + "epoch": 0.48230502504265504, + "grad_norm": 0.7891780734062195, + "learning_rate": 8.654951547069794e-06, + "loss": 0.687, + "step": 8763 + }, + { + "epoch": 0.48236006384501073, + "grad_norm": 0.747662365436554, + "learning_rate": 8.65465574098577e-06, + "loss": 0.8153, + "step": 8764 + }, + { + "epoch": 0.48241510264736637, + "grad_norm": 0.7758497595787048, + "learning_rate": 8.65435990743433e-06, + "loss": 0.8018, + "step": 8765 + }, + { + "epoch": 0.48247014144972206, + "grad_norm": 0.6997805237770081, + "learning_rate": 8.654064046417703e-06, + "loss": 0.7845, + "step": 8766 + }, + { + "epoch": 0.4825251802520777, + "grad_norm": 0.7188366651535034, + "learning_rate": 8.653768157938106e-06, + "loss": 0.7528, + "step": 8767 + }, + { + "epoch": 0.4825802190544334, + "grad_norm": 0.6848055124282837, + "learning_rate": 8.653472241997767e-06, + "loss": 0.7658, + "step": 8768 + }, + { + "epoch": 0.482635257856789, + "grad_norm": 1.0603824853897095, + "learning_rate": 8.653176298598907e-06, + "loss": 0.7692, + "step": 8769 + }, + { + "epoch": 0.4826902966591447, + "grad_norm": 0.8191514611244202, + "learning_rate": 8.652880327743753e-06, + "loss": 0.7706, + "step": 8770 + }, + { + "epoch": 0.48274533546150034, + "grad_norm": 0.6318503618240356, + "learning_rate": 8.652584329434527e-06, + "loss": 0.6635, + "step": 8771 + }, + { + "epoch": 0.482800374263856, + "grad_norm": 0.6860769391059875, + "learning_rate": 8.652288303673457e-06, + "loss": 0.739, + "step": 8772 + }, + { + "epoch": 0.48285541306621166, + "grad_norm": 0.7414761185646057, + "learning_rate": 8.651992250462765e-06, + "loss": 0.7949, + "step": 8773 + }, + { + "epoch": 0.48291045186856735, + "grad_norm": 0.7255183458328247, + "learning_rate": 8.651696169804676e-06, + "loss": 0.8569, + "step": 8774 + }, + { + "epoch": 0.482965490670923, + "grad_norm": 0.7034135460853577, + "learning_rate": 8.651400061701417e-06, + "loss": 0.7562, + "step": 8775 + }, + { + "epoch": 0.48302052947327867, + "grad_norm": 0.7041038274765015, + "learning_rate": 8.651103926155212e-06, + "loss": 0.7194, + "step": 8776 + }, + { + "epoch": 0.4830755682756343, + "grad_norm": 1.0965619087219238, + "learning_rate": 8.650807763168287e-06, + "loss": 0.9033, + "step": 8777 + }, + { + "epoch": 0.48313060707799, + "grad_norm": 0.7400044798851013, + "learning_rate": 8.650511572742869e-06, + "loss": 0.7626, + "step": 8778 + }, + { + "epoch": 0.4831856458803456, + "grad_norm": 0.6957885026931763, + "learning_rate": 8.650215354881182e-06, + "loss": 0.7283, + "step": 8779 + }, + { + "epoch": 0.4832406846827013, + "grad_norm": 0.7992473840713501, + "learning_rate": 8.649919109585454e-06, + "loss": 0.8376, + "step": 8780 + }, + { + "epoch": 0.48329572348505695, + "grad_norm": 0.8556981086730957, + "learning_rate": 8.649622836857911e-06, + "loss": 0.7737, + "step": 8781 + }, + { + "epoch": 0.48335076228741264, + "grad_norm": 0.8476192355155945, + "learning_rate": 8.64932653670078e-06, + "loss": 0.8926, + "step": 8782 + }, + { + "epoch": 0.48340580108976827, + "grad_norm": 0.6461093425750732, + "learning_rate": 8.649030209116289e-06, + "loss": 0.7452, + "step": 8783 + }, + { + "epoch": 0.48346083989212396, + "grad_norm": 0.6997528076171875, + "learning_rate": 8.648733854106661e-06, + "loss": 0.7962, + "step": 8784 + }, + { + "epoch": 0.4835158786944796, + "grad_norm": 0.7606356739997864, + "learning_rate": 8.648437471674128e-06, + "loss": 0.6517, + "step": 8785 + }, + { + "epoch": 0.4835709174968353, + "grad_norm": 0.8118630051612854, + "learning_rate": 8.648141061820913e-06, + "loss": 0.7539, + "step": 8786 + }, + { + "epoch": 0.4836259562991909, + "grad_norm": 0.8778805136680603, + "learning_rate": 8.64784462454925e-06, + "loss": 0.763, + "step": 8787 + }, + { + "epoch": 0.4836809951015466, + "grad_norm": 0.7741022706031799, + "learning_rate": 8.647548159861361e-06, + "loss": 0.7749, + "step": 8788 + }, + { + "epoch": 0.48373603390390224, + "grad_norm": 0.76578688621521, + "learning_rate": 8.647251667759478e-06, + "loss": 0.6968, + "step": 8789 + }, + { + "epoch": 0.48379107270625793, + "grad_norm": 0.8477250933647156, + "learning_rate": 8.646955148245827e-06, + "loss": 0.8364, + "step": 8790 + }, + { + "epoch": 0.48384611150861356, + "grad_norm": 0.9105041027069092, + "learning_rate": 8.646658601322635e-06, + "loss": 0.823, + "step": 8791 + }, + { + "epoch": 0.48390115031096925, + "grad_norm": 0.7642726898193359, + "learning_rate": 8.646362026992135e-06, + "loss": 0.721, + "step": 8792 + }, + { + "epoch": 0.4839561891133249, + "grad_norm": 0.7567259669303894, + "learning_rate": 8.646065425256555e-06, + "loss": 0.7876, + "step": 8793 + }, + { + "epoch": 0.4840112279156806, + "grad_norm": 0.7691231966018677, + "learning_rate": 8.64576879611812e-06, + "loss": 0.8308, + "step": 8794 + }, + { + "epoch": 0.4840662667180362, + "grad_norm": 1.0769426822662354, + "learning_rate": 8.645472139579067e-06, + "loss": 0.892, + "step": 8795 + }, + { + "epoch": 0.4841213055203919, + "grad_norm": 0.6987955570220947, + "learning_rate": 8.64517545564162e-06, + "loss": 0.8254, + "step": 8796 + }, + { + "epoch": 0.48417634432274753, + "grad_norm": 0.7736005783081055, + "learning_rate": 8.644878744308007e-06, + "loss": 0.7666, + "step": 8797 + }, + { + "epoch": 0.4842313831251032, + "grad_norm": 0.6233380436897278, + "learning_rate": 8.644582005580464e-06, + "loss": 0.6443, + "step": 8798 + }, + { + "epoch": 0.48428642192745885, + "grad_norm": 0.7343530654907227, + "learning_rate": 8.644285239461217e-06, + "loss": 0.724, + "step": 8799 + }, + { + "epoch": 0.48434146072981454, + "grad_norm": 0.725321352481842, + "learning_rate": 8.643988445952499e-06, + "loss": 0.7249, + "step": 8800 + }, + { + "epoch": 0.4843964995321702, + "grad_norm": 0.7256256341934204, + "learning_rate": 8.643691625056539e-06, + "loss": 0.8656, + "step": 8801 + }, + { + "epoch": 0.48445153833452587, + "grad_norm": 0.8559528589248657, + "learning_rate": 8.643394776775567e-06, + "loss": 0.9186, + "step": 8802 + }, + { + "epoch": 0.4845065771368815, + "grad_norm": 0.6735692024230957, + "learning_rate": 8.643097901111815e-06, + "loss": 0.7007, + "step": 8803 + }, + { + "epoch": 0.4845616159392372, + "grad_norm": 0.8373280167579651, + "learning_rate": 8.642800998067515e-06, + "loss": 0.8774, + "step": 8804 + }, + { + "epoch": 0.4846166547415928, + "grad_norm": 0.731311023235321, + "learning_rate": 8.642504067644898e-06, + "loss": 0.7102, + "step": 8805 + }, + { + "epoch": 0.48467169354394846, + "grad_norm": 0.7259742617607117, + "learning_rate": 8.642207109846195e-06, + "loss": 0.7174, + "step": 8806 + }, + { + "epoch": 0.48472673234630415, + "grad_norm": 0.6454386115074158, + "learning_rate": 8.641910124673638e-06, + "loss": 0.7656, + "step": 8807 + }, + { + "epoch": 0.4847817711486598, + "grad_norm": 0.7701624631881714, + "learning_rate": 8.641613112129462e-06, + "loss": 0.7926, + "step": 8808 + }, + { + "epoch": 0.48483680995101547, + "grad_norm": 0.6812854409217834, + "learning_rate": 8.641316072215893e-06, + "loss": 0.7072, + "step": 8809 + }, + { + "epoch": 0.4848918487533711, + "grad_norm": 0.8180119395256042, + "learning_rate": 8.641019004935169e-06, + "loss": 0.8621, + "step": 8810 + }, + { + "epoch": 0.4849468875557268, + "grad_norm": 0.6346331834793091, + "learning_rate": 8.64072191028952e-06, + "loss": 0.6907, + "step": 8811 + }, + { + "epoch": 0.4850019263580824, + "grad_norm": 0.6819741129875183, + "learning_rate": 8.64042478828118e-06, + "loss": 0.77, + "step": 8812 + }, + { + "epoch": 0.4850569651604381, + "grad_norm": 0.9074214100837708, + "learning_rate": 8.640127638912383e-06, + "loss": 0.7799, + "step": 8813 + }, + { + "epoch": 0.48511200396279375, + "grad_norm": 0.8065158724784851, + "learning_rate": 8.63983046218536e-06, + "loss": 0.8033, + "step": 8814 + }, + { + "epoch": 0.48516704276514944, + "grad_norm": 0.6241241097450256, + "learning_rate": 8.639533258102345e-06, + "loss": 0.6936, + "step": 8815 + }, + { + "epoch": 0.48522208156750507, + "grad_norm": 0.6928265690803528, + "learning_rate": 8.639236026665573e-06, + "loss": 0.7526, + "step": 8816 + }, + { + "epoch": 0.48527712036986076, + "grad_norm": 0.8171425461769104, + "learning_rate": 8.638938767877276e-06, + "loss": 0.8227, + "step": 8817 + }, + { + "epoch": 0.4853321591722164, + "grad_norm": 0.7007083296775818, + "learning_rate": 8.638641481739692e-06, + "loss": 0.7439, + "step": 8818 + }, + { + "epoch": 0.4853871979745721, + "grad_norm": 0.8905115127563477, + "learning_rate": 8.63834416825505e-06, + "loss": 0.6873, + "step": 8819 + }, + { + "epoch": 0.4854422367769277, + "grad_norm": 0.702198326587677, + "learning_rate": 8.638046827425588e-06, + "loss": 0.7999, + "step": 8820 + }, + { + "epoch": 0.4854972755792834, + "grad_norm": 0.7280104160308838, + "learning_rate": 8.63774945925354e-06, + "loss": 0.8562, + "step": 8821 + }, + { + "epoch": 0.48555231438163904, + "grad_norm": 0.9803630113601685, + "learning_rate": 8.63745206374114e-06, + "loss": 0.8347, + "step": 8822 + }, + { + "epoch": 0.4856073531839947, + "grad_norm": 0.6781168580055237, + "learning_rate": 8.637154640890625e-06, + "loss": 0.8124, + "step": 8823 + }, + { + "epoch": 0.48566239198635036, + "grad_norm": 0.7219669222831726, + "learning_rate": 8.63685719070423e-06, + "loss": 0.8053, + "step": 8824 + }, + { + "epoch": 0.48571743078870605, + "grad_norm": 0.7077241539955139, + "learning_rate": 8.636559713184187e-06, + "loss": 0.7534, + "step": 8825 + }, + { + "epoch": 0.4857724695910617, + "grad_norm": 0.70063316822052, + "learning_rate": 8.636262208332737e-06, + "loss": 0.7509, + "step": 8826 + }, + { + "epoch": 0.4858275083934174, + "grad_norm": 0.7292184233665466, + "learning_rate": 8.635964676152114e-06, + "loss": 0.7485, + "step": 8827 + }, + { + "epoch": 0.485882547195773, + "grad_norm": 0.7970258593559265, + "learning_rate": 8.635667116644552e-06, + "loss": 0.8874, + "step": 8828 + }, + { + "epoch": 0.4859375859981287, + "grad_norm": 0.7090024352073669, + "learning_rate": 8.63536952981229e-06, + "loss": 0.7665, + "step": 8829 + }, + { + "epoch": 0.48599262480048433, + "grad_norm": 0.761409342288971, + "learning_rate": 8.635071915657565e-06, + "loss": 0.7977, + "step": 8830 + }, + { + "epoch": 0.48604766360284, + "grad_norm": 0.724896252155304, + "learning_rate": 8.634774274182611e-06, + "loss": 0.8591, + "step": 8831 + }, + { + "epoch": 0.48610270240519565, + "grad_norm": 0.737424910068512, + "learning_rate": 8.634476605389666e-06, + "loss": 0.8256, + "step": 8832 + }, + { + "epoch": 0.48615774120755134, + "grad_norm": 0.8261227607727051, + "learning_rate": 8.63417890928097e-06, + "loss": 0.8089, + "step": 8833 + }, + { + "epoch": 0.486212780009907, + "grad_norm": 0.6744595766067505, + "learning_rate": 8.633881185858756e-06, + "loss": 0.7821, + "step": 8834 + }, + { + "epoch": 0.48626781881226266, + "grad_norm": 0.6717672944068909, + "learning_rate": 8.633583435125263e-06, + "loss": 0.7823, + "step": 8835 + }, + { + "epoch": 0.4863228576146183, + "grad_norm": 0.753616213798523, + "learning_rate": 8.633285657082732e-06, + "loss": 0.8044, + "step": 8836 + }, + { + "epoch": 0.486377896416974, + "grad_norm": 0.6910914182662964, + "learning_rate": 8.632987851733397e-06, + "loss": 0.8244, + "step": 8837 + }, + { + "epoch": 0.4864329352193296, + "grad_norm": 0.9127064347267151, + "learning_rate": 8.632690019079499e-06, + "loss": 0.7918, + "step": 8838 + }, + { + "epoch": 0.4864879740216853, + "grad_norm": 0.715918779373169, + "learning_rate": 8.632392159123274e-06, + "loss": 0.744, + "step": 8839 + }, + { + "epoch": 0.48654301282404094, + "grad_norm": 0.8206684589385986, + "learning_rate": 8.632094271866963e-06, + "loss": 0.7852, + "step": 8840 + }, + { + "epoch": 0.48659805162639663, + "grad_norm": 0.6502171158790588, + "learning_rate": 8.631796357312802e-06, + "loss": 0.7653, + "step": 8841 + }, + { + "epoch": 0.48665309042875227, + "grad_norm": 0.6987786889076233, + "learning_rate": 8.631498415463033e-06, + "loss": 0.7669, + "step": 8842 + }, + { + "epoch": 0.48670812923110796, + "grad_norm": 0.7902390360832214, + "learning_rate": 8.631200446319894e-06, + "loss": 0.8438, + "step": 8843 + }, + { + "epoch": 0.4867631680334636, + "grad_norm": 0.7464659810066223, + "learning_rate": 8.630902449885625e-06, + "loss": 0.8276, + "step": 8844 + }, + { + "epoch": 0.4868182068358193, + "grad_norm": 0.7375630736351013, + "learning_rate": 8.630604426162465e-06, + "loss": 0.7921, + "step": 8845 + }, + { + "epoch": 0.4868732456381749, + "grad_norm": 0.7206295728683472, + "learning_rate": 8.630306375152653e-06, + "loss": 0.8424, + "step": 8846 + }, + { + "epoch": 0.4869282844405306, + "grad_norm": 0.7384368181228638, + "learning_rate": 8.63000829685843e-06, + "loss": 0.8702, + "step": 8847 + }, + { + "epoch": 0.48698332324288623, + "grad_norm": 0.7839015126228333, + "learning_rate": 8.629710191282037e-06, + "loss": 0.7064, + "step": 8848 + }, + { + "epoch": 0.48703836204524187, + "grad_norm": 0.6909724473953247, + "learning_rate": 8.629412058425712e-06, + "loss": 0.6924, + "step": 8849 + }, + { + "epoch": 0.48709340084759756, + "grad_norm": 0.6553036570549011, + "learning_rate": 8.6291138982917e-06, + "loss": 0.6526, + "step": 8850 + }, + { + "epoch": 0.4871484396499532, + "grad_norm": 0.7202072143554688, + "learning_rate": 8.628815710882239e-06, + "loss": 0.7272, + "step": 8851 + }, + { + "epoch": 0.4872034784523089, + "grad_norm": 0.6898619532585144, + "learning_rate": 8.62851749619957e-06, + "loss": 0.7687, + "step": 8852 + }, + { + "epoch": 0.4872585172546645, + "grad_norm": 0.7888908386230469, + "learning_rate": 8.628219254245935e-06, + "loss": 0.7654, + "step": 8853 + }, + { + "epoch": 0.4873135560570202, + "grad_norm": 0.7312424778938293, + "learning_rate": 8.627920985023575e-06, + "loss": 0.8053, + "step": 8854 + }, + { + "epoch": 0.48736859485937584, + "grad_norm": 0.6588439345359802, + "learning_rate": 8.627622688534731e-06, + "loss": 0.7229, + "step": 8855 + }, + { + "epoch": 0.4874236336617315, + "grad_norm": 0.8292293548583984, + "learning_rate": 8.627324364781647e-06, + "loss": 0.8482, + "step": 8856 + }, + { + "epoch": 0.48747867246408716, + "grad_norm": 0.7573973536491394, + "learning_rate": 8.627026013766564e-06, + "loss": 0.7282, + "step": 8857 + }, + { + "epoch": 0.48753371126644285, + "grad_norm": 1.2215768098831177, + "learning_rate": 8.626727635491726e-06, + "loss": 0.7771, + "step": 8858 + }, + { + "epoch": 0.4875887500687985, + "grad_norm": 0.7324759364128113, + "learning_rate": 8.626429229959369e-06, + "loss": 0.781, + "step": 8859 + }, + { + "epoch": 0.48764378887115417, + "grad_norm": 0.6995676159858704, + "learning_rate": 8.626130797171745e-06, + "loss": 0.6907, + "step": 8860 + }, + { + "epoch": 0.4876988276735098, + "grad_norm": 0.7400509119033813, + "learning_rate": 8.625832337131092e-06, + "loss": 0.6572, + "step": 8861 + }, + { + "epoch": 0.4877538664758655, + "grad_norm": 0.6634842753410339, + "learning_rate": 8.625533849839653e-06, + "loss": 0.7229, + "step": 8862 + }, + { + "epoch": 0.4878089052782211, + "grad_norm": 0.7357299327850342, + "learning_rate": 8.625235335299673e-06, + "loss": 0.6418, + "step": 8863 + }, + { + "epoch": 0.4878639440805768, + "grad_norm": 0.6473466157913208, + "learning_rate": 8.624936793513394e-06, + "loss": 0.6796, + "step": 8864 + }, + { + "epoch": 0.48791898288293245, + "grad_norm": 0.9110734462738037, + "learning_rate": 8.62463822448306e-06, + "loss": 0.8143, + "step": 8865 + }, + { + "epoch": 0.48797402168528814, + "grad_norm": 0.7932308316230774, + "learning_rate": 8.624339628210916e-06, + "loss": 0.9103, + "step": 8866 + }, + { + "epoch": 0.4880290604876438, + "grad_norm": 0.6677752137184143, + "learning_rate": 8.624041004699205e-06, + "loss": 0.8073, + "step": 8867 + }, + { + "epoch": 0.48808409928999946, + "grad_norm": 0.7379121780395508, + "learning_rate": 8.623742353950171e-06, + "loss": 0.8643, + "step": 8868 + }, + { + "epoch": 0.4881391380923551, + "grad_norm": 0.7479479312896729, + "learning_rate": 8.623443675966062e-06, + "loss": 0.6117, + "step": 8869 + }, + { + "epoch": 0.4881941768947108, + "grad_norm": 0.7822794914245605, + "learning_rate": 8.623144970749118e-06, + "loss": 0.8629, + "step": 8870 + }, + { + "epoch": 0.4882492156970664, + "grad_norm": 0.7040950655937195, + "learning_rate": 8.622846238301587e-06, + "loss": 0.7519, + "step": 8871 + }, + { + "epoch": 0.4883042544994221, + "grad_norm": 0.747368574142456, + "learning_rate": 8.622547478625714e-06, + "loss": 0.7459, + "step": 8872 + }, + { + "epoch": 0.48835929330177774, + "grad_norm": 0.6755948066711426, + "learning_rate": 8.622248691723742e-06, + "loss": 0.7515, + "step": 8873 + }, + { + "epoch": 0.48841433210413343, + "grad_norm": 0.7265586256980896, + "learning_rate": 8.62194987759792e-06, + "loss": 0.7691, + "step": 8874 + }, + { + "epoch": 0.48846937090648906, + "grad_norm": 0.6696380972862244, + "learning_rate": 8.621651036250493e-06, + "loss": 0.778, + "step": 8875 + }, + { + "epoch": 0.48852440970884475, + "grad_norm": 0.7666454911231995, + "learning_rate": 8.621352167683705e-06, + "loss": 0.7396, + "step": 8876 + }, + { + "epoch": 0.4885794485112004, + "grad_norm": 0.7079235315322876, + "learning_rate": 8.621053271899803e-06, + "loss": 0.7917, + "step": 8877 + }, + { + "epoch": 0.4886344873135561, + "grad_norm": 0.6888919472694397, + "learning_rate": 8.620754348901034e-06, + "loss": 0.605, + "step": 8878 + }, + { + "epoch": 0.4886895261159117, + "grad_norm": 0.7177572250366211, + "learning_rate": 8.620455398689645e-06, + "loss": 0.7534, + "step": 8879 + }, + { + "epoch": 0.4887445649182674, + "grad_norm": 0.7268772721290588, + "learning_rate": 8.620156421267883e-06, + "loss": 0.7748, + "step": 8880 + }, + { + "epoch": 0.48879960372062303, + "grad_norm": 0.8015080690383911, + "learning_rate": 8.619857416637993e-06, + "loss": 0.6716, + "step": 8881 + }, + { + "epoch": 0.4888546425229787, + "grad_norm": 0.7464118599891663, + "learning_rate": 8.619558384802226e-06, + "loss": 0.796, + "step": 8882 + }, + { + "epoch": 0.48890968132533436, + "grad_norm": 0.6829718351364136, + "learning_rate": 8.619259325762826e-06, + "loss": 0.788, + "step": 8883 + }, + { + "epoch": 0.48896472012769004, + "grad_norm": 0.6553084850311279, + "learning_rate": 8.618960239522041e-06, + "loss": 0.7215, + "step": 8884 + }, + { + "epoch": 0.4890197589300457, + "grad_norm": 0.8056252598762512, + "learning_rate": 8.618661126082119e-06, + "loss": 0.8588, + "step": 8885 + }, + { + "epoch": 0.48907479773240137, + "grad_norm": 0.8145674467086792, + "learning_rate": 8.618361985445309e-06, + "loss": 0.8095, + "step": 8886 + }, + { + "epoch": 0.489129836534757, + "grad_norm": 0.740031898021698, + "learning_rate": 8.61806281761386e-06, + "loss": 0.7029, + "step": 8887 + }, + { + "epoch": 0.4891848753371127, + "grad_norm": 0.7442640662193298, + "learning_rate": 8.617763622590019e-06, + "loss": 0.782, + "step": 8888 + }, + { + "epoch": 0.4892399141394683, + "grad_norm": 0.6992725133895874, + "learning_rate": 8.617464400376035e-06, + "loss": 0.7877, + "step": 8889 + }, + { + "epoch": 0.489294952941824, + "grad_norm": 1.19756281375885, + "learning_rate": 8.617165150974157e-06, + "loss": 0.6985, + "step": 8890 + }, + { + "epoch": 0.48934999174417965, + "grad_norm": 0.6418262720108032, + "learning_rate": 8.616865874386633e-06, + "loss": 0.7385, + "step": 8891 + }, + { + "epoch": 0.4894050305465353, + "grad_norm": 0.787406325340271, + "learning_rate": 8.616566570615714e-06, + "loss": 0.8686, + "step": 8892 + }, + { + "epoch": 0.48946006934889097, + "grad_norm": 0.6990430951118469, + "learning_rate": 8.616267239663648e-06, + "loss": 0.7683, + "step": 8893 + }, + { + "epoch": 0.4895151081512466, + "grad_norm": 0.7180235981941223, + "learning_rate": 8.615967881532687e-06, + "loss": 0.8337, + "step": 8894 + }, + { + "epoch": 0.4895701469536023, + "grad_norm": 0.7647475600242615, + "learning_rate": 8.615668496225077e-06, + "loss": 0.8668, + "step": 8895 + }, + { + "epoch": 0.4896251857559579, + "grad_norm": 0.843063473701477, + "learning_rate": 8.615369083743072e-06, + "loss": 0.7968, + "step": 8896 + }, + { + "epoch": 0.4896802245583136, + "grad_norm": 0.9526075124740601, + "learning_rate": 8.61506964408892e-06, + "loss": 0.8766, + "step": 8897 + }, + { + "epoch": 0.48973526336066925, + "grad_norm": 0.7850056290626526, + "learning_rate": 8.614770177264874e-06, + "loss": 0.8033, + "step": 8898 + }, + { + "epoch": 0.48979030216302494, + "grad_norm": 0.8658629655838013, + "learning_rate": 8.614470683273182e-06, + "loss": 0.8206, + "step": 8899 + }, + { + "epoch": 0.48984534096538057, + "grad_norm": 0.8060176968574524, + "learning_rate": 8.614171162116096e-06, + "loss": 0.7602, + "step": 8900 + }, + { + "epoch": 0.48990037976773626, + "grad_norm": 0.7398280501365662, + "learning_rate": 8.613871613795865e-06, + "loss": 0.8067, + "step": 8901 + }, + { + "epoch": 0.4899554185700919, + "grad_norm": 0.7341256141662598, + "learning_rate": 8.613572038314744e-06, + "loss": 0.7305, + "step": 8902 + }, + { + "epoch": 0.4900104573724476, + "grad_norm": 0.7832887172698975, + "learning_rate": 8.613272435674984e-06, + "loss": 0.7012, + "step": 8903 + }, + { + "epoch": 0.4900654961748032, + "grad_norm": 0.6536995768547058, + "learning_rate": 8.612972805878834e-06, + "loss": 0.745, + "step": 8904 + }, + { + "epoch": 0.4901205349771589, + "grad_norm": 0.7511856555938721, + "learning_rate": 8.612673148928547e-06, + "loss": 0.7741, + "step": 8905 + }, + { + "epoch": 0.49017557377951454, + "grad_norm": 0.6117261648178101, + "learning_rate": 8.612373464826377e-06, + "loss": 0.5813, + "step": 8906 + }, + { + "epoch": 0.49023061258187023, + "grad_norm": 0.7832254767417908, + "learning_rate": 8.612073753574574e-06, + "loss": 0.7426, + "step": 8907 + }, + { + "epoch": 0.49028565138422586, + "grad_norm": 0.7516622543334961, + "learning_rate": 8.611774015175393e-06, + "loss": 0.8205, + "step": 8908 + }, + { + "epoch": 0.49034069018658155, + "grad_norm": 0.7776936888694763, + "learning_rate": 8.611474249631085e-06, + "loss": 0.8457, + "step": 8909 + }, + { + "epoch": 0.4903957289889372, + "grad_norm": 0.9364853501319885, + "learning_rate": 8.6111744569439e-06, + "loss": 0.9114, + "step": 8910 + }, + { + "epoch": 0.4904507677912929, + "grad_norm": 0.7584181427955627, + "learning_rate": 8.610874637116099e-06, + "loss": 0.6852, + "step": 8911 + }, + { + "epoch": 0.4905058065936485, + "grad_norm": 0.7326254844665527, + "learning_rate": 8.610574790149929e-06, + "loss": 0.7843, + "step": 8912 + }, + { + "epoch": 0.4905608453960042, + "grad_norm": 0.918258547782898, + "learning_rate": 8.610274916047645e-06, + "loss": 0.766, + "step": 8913 + }, + { + "epoch": 0.49061588419835983, + "grad_norm": 1.0083420276641846, + "learning_rate": 8.609975014811502e-06, + "loss": 0.7436, + "step": 8914 + }, + { + "epoch": 0.4906709230007155, + "grad_norm": 0.712664783000946, + "learning_rate": 8.609675086443752e-06, + "loss": 0.7891, + "step": 8915 + }, + { + "epoch": 0.49072596180307115, + "grad_norm": 0.7635206580162048, + "learning_rate": 8.609375130946651e-06, + "loss": 0.7842, + "step": 8916 + }, + { + "epoch": 0.49078100060542684, + "grad_norm": 0.7567723989486694, + "learning_rate": 8.609075148322452e-06, + "loss": 0.8435, + "step": 8917 + }, + { + "epoch": 0.4908360394077825, + "grad_norm": 0.8918718099594116, + "learning_rate": 8.60877513857341e-06, + "loss": 0.8015, + "step": 8918 + }, + { + "epoch": 0.49089107821013817, + "grad_norm": 0.8701914548873901, + "learning_rate": 8.608475101701781e-06, + "loss": 0.7806, + "step": 8919 + }, + { + "epoch": 0.4909461170124938, + "grad_norm": 0.7528215646743774, + "learning_rate": 8.608175037709819e-06, + "loss": 0.7958, + "step": 8920 + }, + { + "epoch": 0.4910011558148495, + "grad_norm": 0.7277387380599976, + "learning_rate": 8.60787494659978e-06, + "loss": 0.7878, + "step": 8921 + }, + { + "epoch": 0.4910561946172051, + "grad_norm": 0.6739892959594727, + "learning_rate": 8.607574828373917e-06, + "loss": 0.7212, + "step": 8922 + }, + { + "epoch": 0.4911112334195608, + "grad_norm": 0.712480366230011, + "learning_rate": 8.607274683034487e-06, + "loss": 0.7966, + "step": 8923 + }, + { + "epoch": 0.49116627222191644, + "grad_norm": 0.7192126512527466, + "learning_rate": 8.606974510583747e-06, + "loss": 0.7032, + "step": 8924 + }, + { + "epoch": 0.49122131102427213, + "grad_norm": 0.7502614855766296, + "learning_rate": 8.606674311023953e-06, + "loss": 0.7465, + "step": 8925 + }, + { + "epoch": 0.49127634982662777, + "grad_norm": 0.8475236892700195, + "learning_rate": 8.606374084357361e-06, + "loss": 0.8083, + "step": 8926 + }, + { + "epoch": 0.49133138862898346, + "grad_norm": 0.6972761750221252, + "learning_rate": 8.606073830586224e-06, + "loss": 0.7206, + "step": 8927 + }, + { + "epoch": 0.4913864274313391, + "grad_norm": 0.6209561824798584, + "learning_rate": 8.605773549712803e-06, + "loss": 0.6664, + "step": 8928 + }, + { + "epoch": 0.4914414662336948, + "grad_norm": 0.7905771732330322, + "learning_rate": 8.605473241739353e-06, + "loss": 0.7243, + "step": 8929 + }, + { + "epoch": 0.4914965050360504, + "grad_norm": 0.762959897518158, + "learning_rate": 8.605172906668131e-06, + "loss": 0.7747, + "step": 8930 + }, + { + "epoch": 0.4915515438384061, + "grad_norm": 0.7297530174255371, + "learning_rate": 8.604872544501394e-06, + "loss": 0.7441, + "step": 8931 + }, + { + "epoch": 0.49160658264076174, + "grad_norm": 0.6732318997383118, + "learning_rate": 8.6045721552414e-06, + "loss": 0.7621, + "step": 8932 + }, + { + "epoch": 0.4916616214431174, + "grad_norm": 0.7010045647621155, + "learning_rate": 8.604271738890407e-06, + "loss": 0.7971, + "step": 8933 + }, + { + "epoch": 0.49171666024547306, + "grad_norm": 0.6996648907661438, + "learning_rate": 8.603971295450672e-06, + "loss": 0.8119, + "step": 8934 + }, + { + "epoch": 0.4917716990478287, + "grad_norm": 0.7679941058158875, + "learning_rate": 8.603670824924456e-06, + "loss": 0.8035, + "step": 8935 + }, + { + "epoch": 0.4918267378501844, + "grad_norm": 0.8009630441665649, + "learning_rate": 8.603370327314011e-06, + "loss": 0.7817, + "step": 8936 + }, + { + "epoch": 0.49188177665254, + "grad_norm": 0.7167709469795227, + "learning_rate": 8.603069802621601e-06, + "loss": 0.7621, + "step": 8937 + }, + { + "epoch": 0.4919368154548957, + "grad_norm": 0.7447960376739502, + "learning_rate": 8.602769250849483e-06, + "loss": 0.7664, + "step": 8938 + }, + { + "epoch": 0.49199185425725134, + "grad_norm": 0.653131365776062, + "learning_rate": 8.602468671999915e-06, + "loss": 0.6927, + "step": 8939 + }, + { + "epoch": 0.492046893059607, + "grad_norm": 0.6758691072463989, + "learning_rate": 8.602168066075158e-06, + "loss": 0.7519, + "step": 8940 + }, + { + "epoch": 0.49210193186196266, + "grad_norm": 0.9186220765113831, + "learning_rate": 8.60186743307747e-06, + "loss": 0.7265, + "step": 8941 + }, + { + "epoch": 0.49215697066431835, + "grad_norm": 0.6781855225563049, + "learning_rate": 8.60156677300911e-06, + "loss": 0.6719, + "step": 8942 + }, + { + "epoch": 0.492212009466674, + "grad_norm": 0.7262865304946899, + "learning_rate": 8.601266085872336e-06, + "loss": 0.6449, + "step": 8943 + }, + { + "epoch": 0.4922670482690297, + "grad_norm": 0.6877585053443909, + "learning_rate": 8.600965371669411e-06, + "loss": 0.6999, + "step": 8944 + }, + { + "epoch": 0.4923220870713853, + "grad_norm": 1.1133443117141724, + "learning_rate": 8.600664630402596e-06, + "loss": 0.7842, + "step": 8945 + }, + { + "epoch": 0.492377125873741, + "grad_norm": 0.643478274345398, + "learning_rate": 8.600363862074149e-06, + "loss": 0.7009, + "step": 8946 + }, + { + "epoch": 0.49243216467609663, + "grad_norm": 0.7692574262619019, + "learning_rate": 8.600063066686331e-06, + "loss": 0.7777, + "step": 8947 + }, + { + "epoch": 0.4924872034784523, + "grad_norm": 0.884963870048523, + "learning_rate": 8.599762244241403e-06, + "loss": 0.7789, + "step": 8948 + }, + { + "epoch": 0.49254224228080795, + "grad_norm": 0.6918813586235046, + "learning_rate": 8.599461394741624e-06, + "loss": 0.7769, + "step": 8949 + }, + { + "epoch": 0.49259728108316364, + "grad_norm": 0.7432044148445129, + "learning_rate": 8.599160518189258e-06, + "loss": 0.7972, + "step": 8950 + }, + { + "epoch": 0.4926523198855193, + "grad_norm": 0.7530491948127747, + "learning_rate": 8.598859614586564e-06, + "loss": 0.8812, + "step": 8951 + }, + { + "epoch": 0.49270735868787496, + "grad_norm": 0.8738592267036438, + "learning_rate": 8.598558683935806e-06, + "loss": 0.6967, + "step": 8952 + }, + { + "epoch": 0.4927623974902306, + "grad_norm": 1.032084584236145, + "learning_rate": 8.598257726239242e-06, + "loss": 0.8513, + "step": 8953 + }, + { + "epoch": 0.4928174362925863, + "grad_norm": 0.8717961311340332, + "learning_rate": 8.597956741499136e-06, + "loss": 0.7703, + "step": 8954 + }, + { + "epoch": 0.4928724750949419, + "grad_norm": 0.6788356900215149, + "learning_rate": 8.597655729717753e-06, + "loss": 0.7649, + "step": 8955 + }, + { + "epoch": 0.4929275138972976, + "grad_norm": 1.0595613718032837, + "learning_rate": 8.59735469089735e-06, + "loss": 0.6967, + "step": 8956 + }, + { + "epoch": 0.49298255269965324, + "grad_norm": 0.7583820819854736, + "learning_rate": 8.597053625040193e-06, + "loss": 0.8384, + "step": 8957 + }, + { + "epoch": 0.49303759150200893, + "grad_norm": 0.7232168912887573, + "learning_rate": 8.596752532148545e-06, + "loss": 0.7643, + "step": 8958 + }, + { + "epoch": 0.49309263030436457, + "grad_norm": 0.727190375328064, + "learning_rate": 8.596451412224666e-06, + "loss": 0.845, + "step": 8959 + }, + { + "epoch": 0.49314766910672025, + "grad_norm": 0.6844252347946167, + "learning_rate": 8.596150265270821e-06, + "loss": 0.7099, + "step": 8960 + }, + { + "epoch": 0.4932027079090759, + "grad_norm": 0.7379910945892334, + "learning_rate": 8.595849091289275e-06, + "loss": 0.8168, + "step": 8961 + }, + { + "epoch": 0.4932577467114316, + "grad_norm": 0.77718186378479, + "learning_rate": 8.595547890282288e-06, + "loss": 0.8457, + "step": 8962 + }, + { + "epoch": 0.4933127855137872, + "grad_norm": 0.686126172542572, + "learning_rate": 8.595246662252127e-06, + "loss": 0.7918, + "step": 8963 + }, + { + "epoch": 0.4933678243161429, + "grad_norm": 0.7406145930290222, + "learning_rate": 8.594945407201051e-06, + "loss": 0.6866, + "step": 8964 + }, + { + "epoch": 0.49342286311849853, + "grad_norm": 0.9543277025222778, + "learning_rate": 8.594644125131331e-06, + "loss": 0.8444, + "step": 8965 + }, + { + "epoch": 0.4934779019208542, + "grad_norm": 0.8659517765045166, + "learning_rate": 8.594342816045228e-06, + "loss": 0.7661, + "step": 8966 + }, + { + "epoch": 0.49353294072320986, + "grad_norm": 0.7289552092552185, + "learning_rate": 8.594041479945005e-06, + "loss": 0.7734, + "step": 8967 + }, + { + "epoch": 0.49358797952556555, + "grad_norm": 0.7232840657234192, + "learning_rate": 8.59374011683293e-06, + "loss": 0.8557, + "step": 8968 + }, + { + "epoch": 0.4936430183279212, + "grad_norm": 0.738684356212616, + "learning_rate": 8.593438726711265e-06, + "loss": 0.7779, + "step": 8969 + }, + { + "epoch": 0.49369805713027687, + "grad_norm": 0.7486668229103088, + "learning_rate": 8.593137309582276e-06, + "loss": 0.7326, + "step": 8970 + }, + { + "epoch": 0.4937530959326325, + "grad_norm": 0.6564297080039978, + "learning_rate": 8.59283586544823e-06, + "loss": 0.6927, + "step": 8971 + }, + { + "epoch": 0.4938081347349882, + "grad_norm": 0.722540557384491, + "learning_rate": 8.592534394311392e-06, + "loss": 0.7254, + "step": 8972 + }, + { + "epoch": 0.4938631735373438, + "grad_norm": 0.7466141581535339, + "learning_rate": 8.592232896174026e-06, + "loss": 0.8551, + "step": 8973 + }, + { + "epoch": 0.4939182123396995, + "grad_norm": 0.7819109559059143, + "learning_rate": 8.591931371038398e-06, + "loss": 0.7271, + "step": 8974 + }, + { + "epoch": 0.49397325114205515, + "grad_norm": 0.7847672700881958, + "learning_rate": 8.591629818906776e-06, + "loss": 0.8404, + "step": 8975 + }, + { + "epoch": 0.49402828994441084, + "grad_norm": 0.8167426586151123, + "learning_rate": 8.591328239781428e-06, + "loss": 0.7375, + "step": 8976 + }, + { + "epoch": 0.49408332874676647, + "grad_norm": 0.7894755005836487, + "learning_rate": 8.591026633664615e-06, + "loss": 0.7872, + "step": 8977 + }, + { + "epoch": 0.4941383675491221, + "grad_norm": 0.726204514503479, + "learning_rate": 8.590725000558609e-06, + "loss": 0.7289, + "step": 8978 + }, + { + "epoch": 0.4941934063514778, + "grad_norm": 0.7116577625274658, + "learning_rate": 8.590423340465675e-06, + "loss": 0.7379, + "step": 8979 + }, + { + "epoch": 0.4942484451538334, + "grad_norm": 0.7302193641662598, + "learning_rate": 8.59012165338808e-06, + "loss": 0.7951, + "step": 8980 + }, + { + "epoch": 0.4943034839561891, + "grad_norm": 0.680555522441864, + "learning_rate": 8.58981993932809e-06, + "loss": 0.7609, + "step": 8981 + }, + { + "epoch": 0.49435852275854475, + "grad_norm": 0.874546229839325, + "learning_rate": 8.589518198287976e-06, + "loss": 0.8025, + "step": 8982 + }, + { + "epoch": 0.49441356156090044, + "grad_norm": 0.7164583206176758, + "learning_rate": 8.589216430270004e-06, + "loss": 0.7466, + "step": 8983 + }, + { + "epoch": 0.49446860036325607, + "grad_norm": 0.9155141115188599, + "learning_rate": 8.588914635276442e-06, + "loss": 0.7896, + "step": 8984 + }, + { + "epoch": 0.49452363916561176, + "grad_norm": 0.6777059435844421, + "learning_rate": 8.588612813309558e-06, + "loss": 0.7468, + "step": 8985 + }, + { + "epoch": 0.4945786779679674, + "grad_norm": 0.7100371718406677, + "learning_rate": 8.58831096437162e-06, + "loss": 0.7216, + "step": 8986 + }, + { + "epoch": 0.4946337167703231, + "grad_norm": 0.6842584609985352, + "learning_rate": 8.5880090884649e-06, + "loss": 0.7103, + "step": 8987 + }, + { + "epoch": 0.4946887555726787, + "grad_norm": 0.6347573399543762, + "learning_rate": 8.587707185591661e-06, + "loss": 0.7103, + "step": 8988 + }, + { + "epoch": 0.4947437943750344, + "grad_norm": 0.7175829410552979, + "learning_rate": 8.587405255754177e-06, + "loss": 0.8375, + "step": 8989 + }, + { + "epoch": 0.49479883317739004, + "grad_norm": 0.8402735590934753, + "learning_rate": 8.587103298954715e-06, + "loss": 0.6841, + "step": 8990 + }, + { + "epoch": 0.49485387197974573, + "grad_norm": 0.6988743543624878, + "learning_rate": 8.586801315195545e-06, + "loss": 0.7637, + "step": 8991 + }, + { + "epoch": 0.49490891078210136, + "grad_norm": 0.6672561168670654, + "learning_rate": 8.586499304478934e-06, + "loss": 0.7103, + "step": 8992 + }, + { + "epoch": 0.49496394958445705, + "grad_norm": 0.6821330189704895, + "learning_rate": 8.586197266807158e-06, + "loss": 0.6881, + "step": 8993 + }, + { + "epoch": 0.4950189883868127, + "grad_norm": 0.7886170744895935, + "learning_rate": 8.585895202182482e-06, + "loss": 0.7892, + "step": 8994 + }, + { + "epoch": 0.4950740271891684, + "grad_norm": 0.7348074913024902, + "learning_rate": 8.585593110607177e-06, + "loss": 0.7835, + "step": 8995 + }, + { + "epoch": 0.495129065991524, + "grad_norm": 0.9375506639480591, + "learning_rate": 8.585290992083514e-06, + "loss": 0.8017, + "step": 8996 + }, + { + "epoch": 0.4951841047938797, + "grad_norm": 0.7442331910133362, + "learning_rate": 8.584988846613765e-06, + "loss": 0.72, + "step": 8997 + }, + { + "epoch": 0.49523914359623533, + "grad_norm": 0.7347918748855591, + "learning_rate": 8.584686674200197e-06, + "loss": 0.8229, + "step": 8998 + }, + { + "epoch": 0.495294182398591, + "grad_norm": 0.7168740630149841, + "learning_rate": 8.584384474845084e-06, + "loss": 0.7288, + "step": 8999 + }, + { + "epoch": 0.49534922120094665, + "grad_norm": 0.7834853529930115, + "learning_rate": 8.584082248550697e-06, + "loss": 0.8521, + "step": 9000 + }, + { + "epoch": 0.49540426000330234, + "grad_norm": 0.6499035358428955, + "learning_rate": 8.58377999531931e-06, + "loss": 0.6887, + "step": 9001 + }, + { + "epoch": 0.495459298805658, + "grad_norm": 0.8000181913375854, + "learning_rate": 8.583477715153189e-06, + "loss": 0.8688, + "step": 9002 + }, + { + "epoch": 0.49551433760801367, + "grad_norm": 0.7539342045783997, + "learning_rate": 8.58317540805461e-06, + "loss": 0.6151, + "step": 9003 + }, + { + "epoch": 0.4955693764103693, + "grad_norm": 0.7677812576293945, + "learning_rate": 8.582873074025841e-06, + "loss": 0.8168, + "step": 9004 + }, + { + "epoch": 0.495624415212725, + "grad_norm": 0.7679157853126526, + "learning_rate": 8.58257071306916e-06, + "loss": 0.7719, + "step": 9005 + }, + { + "epoch": 0.4956794540150806, + "grad_norm": 0.9745703935623169, + "learning_rate": 8.582268325186836e-06, + "loss": 0.8272, + "step": 9006 + }, + { + "epoch": 0.4957344928174363, + "grad_norm": 0.66932612657547, + "learning_rate": 8.581965910381143e-06, + "loss": 0.7256, + "step": 9007 + }, + { + "epoch": 0.49578953161979195, + "grad_norm": 0.7630981206893921, + "learning_rate": 8.581663468654351e-06, + "loss": 0.7594, + "step": 9008 + }, + { + "epoch": 0.49584457042214763, + "grad_norm": 0.7420778870582581, + "learning_rate": 8.581361000008737e-06, + "loss": 0.7834, + "step": 9009 + }, + { + "epoch": 0.49589960922450327, + "grad_norm": 0.6775205731391907, + "learning_rate": 8.58105850444657e-06, + "loss": 0.7609, + "step": 9010 + }, + { + "epoch": 0.49595464802685896, + "grad_norm": 0.6588264107704163, + "learning_rate": 8.580755981970128e-06, + "loss": 0.805, + "step": 9011 + }, + { + "epoch": 0.4960096868292146, + "grad_norm": 0.7325689196586609, + "learning_rate": 8.580453432581681e-06, + "loss": 0.8817, + "step": 9012 + }, + { + "epoch": 0.4960647256315703, + "grad_norm": 0.7319273948669434, + "learning_rate": 8.580150856283505e-06, + "loss": 0.8001, + "step": 9013 + }, + { + "epoch": 0.4961197644339259, + "grad_norm": 0.7841789126396179, + "learning_rate": 8.579848253077875e-06, + "loss": 0.8415, + "step": 9014 + }, + { + "epoch": 0.4961748032362816, + "grad_norm": 0.7593979239463806, + "learning_rate": 8.579545622967062e-06, + "loss": 0.8238, + "step": 9015 + }, + { + "epoch": 0.49622984203863724, + "grad_norm": 0.6938808560371399, + "learning_rate": 8.579242965953343e-06, + "loss": 0.7325, + "step": 9016 + }, + { + "epoch": 0.4962848808409929, + "grad_norm": 0.7907594442367554, + "learning_rate": 8.578940282038993e-06, + "loss": 0.6947, + "step": 9017 + }, + { + "epoch": 0.49633991964334856, + "grad_norm": 0.708703875541687, + "learning_rate": 8.578637571226283e-06, + "loss": 0.6712, + "step": 9018 + }, + { + "epoch": 0.49639495844570425, + "grad_norm": 0.6820377707481384, + "learning_rate": 8.578334833517492e-06, + "loss": 0.7269, + "step": 9019 + }, + { + "epoch": 0.4964499972480599, + "grad_norm": 0.6858653426170349, + "learning_rate": 8.578032068914896e-06, + "loss": 0.7325, + "step": 9020 + }, + { + "epoch": 0.4965050360504155, + "grad_norm": 0.8758736848831177, + "learning_rate": 8.577729277420768e-06, + "loss": 0.6652, + "step": 9021 + }, + { + "epoch": 0.4965600748527712, + "grad_norm": 0.731316328048706, + "learning_rate": 8.577426459037383e-06, + "loss": 0.7835, + "step": 9022 + }, + { + "epoch": 0.49661511365512684, + "grad_norm": 0.813778817653656, + "learning_rate": 8.57712361376702e-06, + "loss": 0.8025, + "step": 9023 + }, + { + "epoch": 0.4966701524574825, + "grad_norm": 0.7167351841926575, + "learning_rate": 8.576820741611952e-06, + "loss": 0.7483, + "step": 9024 + }, + { + "epoch": 0.49672519125983816, + "grad_norm": 0.7243192791938782, + "learning_rate": 8.576517842574457e-06, + "loss": 0.8411, + "step": 9025 + }, + { + "epoch": 0.49678023006219385, + "grad_norm": 0.5869036316871643, + "learning_rate": 8.576214916656814e-06, + "loss": 0.6661, + "step": 9026 + }, + { + "epoch": 0.4968352688645495, + "grad_norm": 0.7502203583717346, + "learning_rate": 8.575911963861293e-06, + "loss": 0.8838, + "step": 9027 + }, + { + "epoch": 0.4968903076669052, + "grad_norm": 0.687562108039856, + "learning_rate": 8.575608984190177e-06, + "loss": 0.7446, + "step": 9028 + }, + { + "epoch": 0.4969453464692608, + "grad_norm": 0.7735342383384705, + "learning_rate": 8.57530597764574e-06, + "loss": 0.8464, + "step": 9029 + }, + { + "epoch": 0.4970003852716165, + "grad_norm": 0.7828487753868103, + "learning_rate": 8.575002944230261e-06, + "loss": 0.7504, + "step": 9030 + }, + { + "epoch": 0.49705542407397213, + "grad_norm": 0.6359286904335022, + "learning_rate": 8.574699883946018e-06, + "loss": 0.6805, + "step": 9031 + }, + { + "epoch": 0.4971104628763278, + "grad_norm": 0.7462830543518066, + "learning_rate": 8.574396796795285e-06, + "loss": 0.8317, + "step": 9032 + }, + { + "epoch": 0.49716550167868345, + "grad_norm": 0.705115795135498, + "learning_rate": 8.574093682780344e-06, + "loss": 0.7401, + "step": 9033 + }, + { + "epoch": 0.49722054048103914, + "grad_norm": 0.6466538310050964, + "learning_rate": 8.573790541903472e-06, + "loss": 0.7761, + "step": 9034 + }, + { + "epoch": 0.4972755792833948, + "grad_norm": 0.7479867339134216, + "learning_rate": 8.573487374166946e-06, + "loss": 0.8394, + "step": 9035 + }, + { + "epoch": 0.49733061808575046, + "grad_norm": 0.7378019094467163, + "learning_rate": 8.573184179573046e-06, + "loss": 0.8215, + "step": 9036 + }, + { + "epoch": 0.4973856568881061, + "grad_norm": 0.6526094675064087, + "learning_rate": 8.57288095812405e-06, + "loss": 0.8055, + "step": 9037 + }, + { + "epoch": 0.4974406956904618, + "grad_norm": 0.679595947265625, + "learning_rate": 8.572577709822238e-06, + "loss": 0.8241, + "step": 9038 + }, + { + "epoch": 0.4974957344928174, + "grad_norm": 0.753466010093689, + "learning_rate": 8.572274434669886e-06, + "loss": 0.896, + "step": 9039 + }, + { + "epoch": 0.4975507732951731, + "grad_norm": 0.7068368792533875, + "learning_rate": 8.571971132669277e-06, + "loss": 0.778, + "step": 9040 + }, + { + "epoch": 0.49760581209752874, + "grad_norm": 0.7397973537445068, + "learning_rate": 8.571667803822689e-06, + "loss": 0.782, + "step": 9041 + }, + { + "epoch": 0.49766085089988443, + "grad_norm": 0.7837033271789551, + "learning_rate": 8.571364448132402e-06, + "loss": 0.7509, + "step": 9042 + }, + { + "epoch": 0.49771588970224007, + "grad_norm": 0.6808765530586243, + "learning_rate": 8.571061065600696e-06, + "loss": 0.672, + "step": 9043 + }, + { + "epoch": 0.49777092850459576, + "grad_norm": 0.6574100255966187, + "learning_rate": 8.570757656229852e-06, + "loss": 0.751, + "step": 9044 + }, + { + "epoch": 0.4978259673069514, + "grad_norm": 0.7357671856880188, + "learning_rate": 8.570454220022146e-06, + "loss": 0.7977, + "step": 9045 + }, + { + "epoch": 0.4978810061093071, + "grad_norm": 0.7937216758728027, + "learning_rate": 8.570150756979865e-06, + "loss": 0.8151, + "step": 9046 + }, + { + "epoch": 0.4979360449116627, + "grad_norm": 0.7050907611846924, + "learning_rate": 8.569847267105285e-06, + "loss": 0.7667, + "step": 9047 + }, + { + "epoch": 0.4979910837140184, + "grad_norm": 0.7105300426483154, + "learning_rate": 8.569543750400688e-06, + "loss": 0.7031, + "step": 9048 + }, + { + "epoch": 0.49804612251637403, + "grad_norm": 0.7174646854400635, + "learning_rate": 8.569240206868358e-06, + "loss": 0.7692, + "step": 9049 + }, + { + "epoch": 0.4981011613187297, + "grad_norm": 0.7525906562805176, + "learning_rate": 8.568936636510573e-06, + "loss": 0.7584, + "step": 9050 + }, + { + "epoch": 0.49815620012108536, + "grad_norm": 1.5518100261688232, + "learning_rate": 8.568633039329615e-06, + "loss": 0.7932, + "step": 9051 + }, + { + "epoch": 0.49821123892344105, + "grad_norm": 0.7037720084190369, + "learning_rate": 8.568329415327766e-06, + "loss": 0.8345, + "step": 9052 + }, + { + "epoch": 0.4982662777257967, + "grad_norm": 0.6422694325447083, + "learning_rate": 8.568025764507308e-06, + "loss": 0.7396, + "step": 9053 + }, + { + "epoch": 0.49832131652815237, + "grad_norm": 0.777306854724884, + "learning_rate": 8.567722086870525e-06, + "loss": 0.8605, + "step": 9054 + }, + { + "epoch": 0.498376355330508, + "grad_norm": 0.6619865298271179, + "learning_rate": 8.567418382419697e-06, + "loss": 0.7395, + "step": 9055 + }, + { + "epoch": 0.4984313941328637, + "grad_norm": 0.7214456796646118, + "learning_rate": 8.567114651157106e-06, + "loss": 0.7932, + "step": 9056 + }, + { + "epoch": 0.4984864329352193, + "grad_norm": 0.75806725025177, + "learning_rate": 8.566810893085037e-06, + "loss": 0.7998, + "step": 9057 + }, + { + "epoch": 0.498541471737575, + "grad_norm": 0.8089895844459534, + "learning_rate": 8.566507108205773e-06, + "loss": 0.7849, + "step": 9058 + }, + { + "epoch": 0.49859651053993065, + "grad_norm": 0.817814290523529, + "learning_rate": 8.566203296521597e-06, + "loss": 0.7261, + "step": 9059 + }, + { + "epoch": 0.49865154934228634, + "grad_norm": 0.7417539954185486, + "learning_rate": 8.56589945803479e-06, + "loss": 0.7087, + "step": 9060 + }, + { + "epoch": 0.49870658814464197, + "grad_norm": 0.7518000602722168, + "learning_rate": 8.565595592747639e-06, + "loss": 0.7245, + "step": 9061 + }, + { + "epoch": 0.49876162694699766, + "grad_norm": 0.9537304043769836, + "learning_rate": 8.565291700662423e-06, + "loss": 0.901, + "step": 9062 + }, + { + "epoch": 0.4988166657493533, + "grad_norm": 0.784545361995697, + "learning_rate": 8.56498778178143e-06, + "loss": 0.7813, + "step": 9063 + }, + { + "epoch": 0.4988717045517089, + "grad_norm": 0.9218429923057556, + "learning_rate": 8.564683836106945e-06, + "loss": 0.8452, + "step": 9064 + }, + { + "epoch": 0.4989267433540646, + "grad_norm": 0.6902065277099609, + "learning_rate": 8.56437986364125e-06, + "loss": 0.7527, + "step": 9065 + }, + { + "epoch": 0.49898178215642025, + "grad_norm": 0.7388677000999451, + "learning_rate": 8.56407586438663e-06, + "loss": 0.82, + "step": 9066 + }, + { + "epoch": 0.49903682095877594, + "grad_norm": 0.6959313154220581, + "learning_rate": 8.563771838345369e-06, + "loss": 0.7274, + "step": 9067 + }, + { + "epoch": 0.4990918597611316, + "grad_norm": 0.6582610607147217, + "learning_rate": 8.563467785519753e-06, + "loss": 0.6518, + "step": 9068 + }, + { + "epoch": 0.49914689856348726, + "grad_norm": 0.6525924801826477, + "learning_rate": 8.563163705912066e-06, + "loss": 0.7006, + "step": 9069 + }, + { + "epoch": 0.4992019373658429, + "grad_norm": 0.8092843890190125, + "learning_rate": 8.562859599524596e-06, + "loss": 0.6915, + "step": 9070 + }, + { + "epoch": 0.4992569761681986, + "grad_norm": 0.6540575623512268, + "learning_rate": 8.562555466359626e-06, + "loss": 0.6729, + "step": 9071 + }, + { + "epoch": 0.4993120149705542, + "grad_norm": 0.8220445513725281, + "learning_rate": 8.562251306419443e-06, + "loss": 0.8172, + "step": 9072 + }, + { + "epoch": 0.4993670537729099, + "grad_norm": 0.7461502552032471, + "learning_rate": 8.561947119706334e-06, + "loss": 0.6902, + "step": 9073 + }, + { + "epoch": 0.49942209257526554, + "grad_norm": 0.8166316151618958, + "learning_rate": 8.56164290622258e-06, + "loss": 0.8238, + "step": 9074 + }, + { + "epoch": 0.49947713137762123, + "grad_norm": 0.8453896641731262, + "learning_rate": 8.561338665970476e-06, + "loss": 0.7697, + "step": 9075 + }, + { + "epoch": 0.49953217017997686, + "grad_norm": 0.7606340050697327, + "learning_rate": 8.5610343989523e-06, + "loss": 0.6951, + "step": 9076 + }, + { + "epoch": 0.49958720898233255, + "grad_norm": 0.7408013343811035, + "learning_rate": 8.560730105170345e-06, + "loss": 0.8298, + "step": 9077 + }, + { + "epoch": 0.4996422477846882, + "grad_norm": 0.7625541090965271, + "learning_rate": 8.560425784626896e-06, + "loss": 0.6738, + "step": 9078 + }, + { + "epoch": 0.4996972865870439, + "grad_norm": 0.6940996646881104, + "learning_rate": 8.560121437324238e-06, + "loss": 0.78, + "step": 9079 + }, + { + "epoch": 0.4997523253893995, + "grad_norm": 0.8087461590766907, + "learning_rate": 8.559817063264661e-06, + "loss": 0.7831, + "step": 9080 + }, + { + "epoch": 0.4998073641917552, + "grad_norm": 0.7418510317802429, + "learning_rate": 8.559512662450452e-06, + "loss": 0.801, + "step": 9081 + }, + { + "epoch": 0.49986240299411083, + "grad_norm": 0.6793946027755737, + "learning_rate": 8.5592082348839e-06, + "loss": 0.7329, + "step": 9082 + }, + { + "epoch": 0.4999174417964665, + "grad_norm": 0.8197429180145264, + "learning_rate": 8.55890378056729e-06, + "loss": 0.804, + "step": 9083 + }, + { + "epoch": 0.49997248059882216, + "grad_norm": 0.7526460886001587, + "learning_rate": 8.558599299502912e-06, + "loss": 0.8378, + "step": 9084 + }, + { + "epoch": 0.5000275194011778, + "grad_norm": 0.8169133067131042, + "learning_rate": 8.558294791693055e-06, + "loss": 0.828, + "step": 9085 + }, + { + "epoch": 0.5000825582035335, + "grad_norm": 0.8386932015419006, + "learning_rate": 8.557990257140007e-06, + "loss": 0.7961, + "step": 9086 + }, + { + "epoch": 0.5001375970058891, + "grad_norm": 0.7183443903923035, + "learning_rate": 8.557685695846057e-06, + "loss": 0.6964, + "step": 9087 + }, + { + "epoch": 0.5001926358082448, + "grad_norm": 0.77079176902771, + "learning_rate": 8.557381107813491e-06, + "loss": 0.8222, + "step": 9088 + }, + { + "epoch": 0.5002476746106005, + "grad_norm": 0.6519342660903931, + "learning_rate": 8.557076493044603e-06, + "loss": 0.772, + "step": 9089 + }, + { + "epoch": 0.5003027134129562, + "grad_norm": 0.7039975523948669, + "learning_rate": 8.556771851541678e-06, + "loss": 0.7491, + "step": 9090 + }, + { + "epoch": 0.5003577522153118, + "grad_norm": 0.6459039449691772, + "learning_rate": 8.556467183307012e-06, + "loss": 0.7104, + "step": 9091 + }, + { + "epoch": 0.5004127910176674, + "grad_norm": 0.7359183430671692, + "learning_rate": 8.556162488342887e-06, + "loss": 0.829, + "step": 9092 + }, + { + "epoch": 0.5004678298200231, + "grad_norm": 0.7029602527618408, + "learning_rate": 8.555857766651599e-06, + "loss": 0.8163, + "step": 9093 + }, + { + "epoch": 0.5005228686223788, + "grad_norm": 0.6687049865722656, + "learning_rate": 8.555553018235435e-06, + "loss": 0.7589, + "step": 9094 + }, + { + "epoch": 0.5005779074247344, + "grad_norm": 0.7277147173881531, + "learning_rate": 8.555248243096686e-06, + "loss": 0.8334, + "step": 9095 + }, + { + "epoch": 0.5006329462270901, + "grad_norm": 0.6512065529823303, + "learning_rate": 8.554943441237642e-06, + "loss": 0.7174, + "step": 9096 + }, + { + "epoch": 0.5006879850294458, + "grad_norm": 0.725351095199585, + "learning_rate": 8.554638612660594e-06, + "loss": 0.6514, + "step": 9097 + }, + { + "epoch": 0.5007430238318015, + "grad_norm": 0.7983208894729614, + "learning_rate": 8.554333757367836e-06, + "loss": 0.8385, + "step": 9098 + }, + { + "epoch": 0.500798062634157, + "grad_norm": 0.6631388068199158, + "learning_rate": 8.554028875361657e-06, + "loss": 0.7103, + "step": 9099 + }, + { + "epoch": 0.5008531014365127, + "grad_norm": 0.730421245098114, + "learning_rate": 8.553723966644347e-06, + "loss": 0.8005, + "step": 9100 + }, + { + "epoch": 0.5009081402388684, + "grad_norm": 0.7385838627815247, + "learning_rate": 8.5534190312182e-06, + "loss": 0.7586, + "step": 9101 + }, + { + "epoch": 0.5009631790412241, + "grad_norm": 0.712458610534668, + "learning_rate": 8.553114069085506e-06, + "loss": 0.7587, + "step": 9102 + }, + { + "epoch": 0.5010182178435797, + "grad_norm": 0.7393542528152466, + "learning_rate": 8.552809080248559e-06, + "loss": 0.746, + "step": 9103 + }, + { + "epoch": 0.5010732566459354, + "grad_norm": 0.6596370935440063, + "learning_rate": 8.552504064709649e-06, + "loss": 0.6968, + "step": 9104 + }, + { + "epoch": 0.5011282954482911, + "grad_norm": 0.7340545654296875, + "learning_rate": 8.552199022471069e-06, + "loss": 0.8326, + "step": 9105 + }, + { + "epoch": 0.5011833342506467, + "grad_norm": 0.6586140990257263, + "learning_rate": 8.55189395353511e-06, + "loss": 0.7144, + "step": 9106 + }, + { + "epoch": 0.5012383730530023, + "grad_norm": 0.6875959038734436, + "learning_rate": 8.551588857904071e-06, + "loss": 0.721, + "step": 9107 + }, + { + "epoch": 0.501293411855358, + "grad_norm": 0.6754499077796936, + "learning_rate": 8.551283735580238e-06, + "loss": 0.6771, + "step": 9108 + }, + { + "epoch": 0.5013484506577137, + "grad_norm": 0.8027325868606567, + "learning_rate": 8.55097858656591e-06, + "loss": 0.8196, + "step": 9109 + }, + { + "epoch": 0.5014034894600693, + "grad_norm": 0.6992260217666626, + "learning_rate": 8.550673410863376e-06, + "loss": 0.7923, + "step": 9110 + }, + { + "epoch": 0.501458528262425, + "grad_norm": 0.741205632686615, + "learning_rate": 8.550368208474928e-06, + "loss": 0.7036, + "step": 9111 + }, + { + "epoch": 0.5015135670647807, + "grad_norm": 0.6485981345176697, + "learning_rate": 8.550062979402866e-06, + "loss": 0.6351, + "step": 9112 + }, + { + "epoch": 0.5015686058671364, + "grad_norm": 0.6984226703643799, + "learning_rate": 8.549757723649481e-06, + "loss": 0.7714, + "step": 9113 + }, + { + "epoch": 0.5016236446694919, + "grad_norm": 0.7773998975753784, + "learning_rate": 8.549452441217067e-06, + "loss": 0.8901, + "step": 9114 + }, + { + "epoch": 0.5016786834718476, + "grad_norm": 0.6912227272987366, + "learning_rate": 8.549147132107918e-06, + "loss": 0.7702, + "step": 9115 + }, + { + "epoch": 0.5017337222742033, + "grad_norm": 0.6742583513259888, + "learning_rate": 8.54884179632433e-06, + "loss": 0.7789, + "step": 9116 + }, + { + "epoch": 0.501788761076559, + "grad_norm": 0.7896195650100708, + "learning_rate": 8.548536433868595e-06, + "loss": 0.7358, + "step": 9117 + }, + { + "epoch": 0.5018437998789146, + "grad_norm": 0.7112523913383484, + "learning_rate": 8.548231044743011e-06, + "loss": 0.7286, + "step": 9118 + }, + { + "epoch": 0.5018988386812703, + "grad_norm": 0.9162774085998535, + "learning_rate": 8.547925628949873e-06, + "loss": 0.935, + "step": 9119 + }, + { + "epoch": 0.501953877483626, + "grad_norm": 0.6319599747657776, + "learning_rate": 8.547620186491477e-06, + "loss": 0.625, + "step": 9120 + }, + { + "epoch": 0.5020089162859817, + "grad_norm": 0.7074719667434692, + "learning_rate": 8.547314717370115e-06, + "loss": 0.6614, + "step": 9121 + }, + { + "epoch": 0.5020639550883372, + "grad_norm": 0.7417262196540833, + "learning_rate": 8.547009221588086e-06, + "loss": 0.8476, + "step": 9122 + }, + { + "epoch": 0.5021189938906929, + "grad_norm": 0.7057339549064636, + "learning_rate": 8.546703699147685e-06, + "loss": 0.805, + "step": 9123 + }, + { + "epoch": 0.5021740326930486, + "grad_norm": 0.7420887351036072, + "learning_rate": 8.546398150051207e-06, + "loss": 0.7331, + "step": 9124 + }, + { + "epoch": 0.5022290714954043, + "grad_norm": 0.9526195526123047, + "learning_rate": 8.546092574300953e-06, + "loss": 0.7803, + "step": 9125 + }, + { + "epoch": 0.5022841102977599, + "grad_norm": 0.748130202293396, + "learning_rate": 8.545786971899214e-06, + "loss": 0.7998, + "step": 9126 + }, + { + "epoch": 0.5023391491001156, + "grad_norm": 0.7266026139259338, + "learning_rate": 8.545481342848289e-06, + "loss": 0.8377, + "step": 9127 + }, + { + "epoch": 0.5023941879024713, + "grad_norm": 0.6762456893920898, + "learning_rate": 8.545175687150478e-06, + "loss": 0.7312, + "step": 9128 + }, + { + "epoch": 0.502449226704827, + "grad_norm": 0.7011429667472839, + "learning_rate": 8.544870004808072e-06, + "loss": 0.7666, + "step": 9129 + }, + { + "epoch": 0.5025042655071825, + "grad_norm": 0.6652229428291321, + "learning_rate": 8.544564295823375e-06, + "loss": 0.6904, + "step": 9130 + }, + { + "epoch": 0.5025593043095382, + "grad_norm": 0.8333765268325806, + "learning_rate": 8.54425856019868e-06, + "loss": 0.7318, + "step": 9131 + }, + { + "epoch": 0.5026143431118939, + "grad_norm": 0.6827245950698853, + "learning_rate": 8.543952797936285e-06, + "loss": 0.7692, + "step": 9132 + }, + { + "epoch": 0.5026693819142496, + "grad_norm": 0.8744323253631592, + "learning_rate": 8.543647009038491e-06, + "loss": 0.7316, + "step": 9133 + }, + { + "epoch": 0.5027244207166052, + "grad_norm": 0.7024276852607727, + "learning_rate": 8.543341193507594e-06, + "loss": 0.7008, + "step": 9134 + }, + { + "epoch": 0.5027794595189609, + "grad_norm": 0.8786055445671082, + "learning_rate": 8.543035351345895e-06, + "loss": 0.7054, + "step": 9135 + }, + { + "epoch": 0.5028344983213165, + "grad_norm": 0.727924108505249, + "learning_rate": 8.54272948255569e-06, + "loss": 0.8049, + "step": 9136 + }, + { + "epoch": 0.5028895371236722, + "grad_norm": 0.8366256356239319, + "learning_rate": 8.542423587139277e-06, + "loss": 0.7926, + "step": 9137 + }, + { + "epoch": 0.5029445759260278, + "grad_norm": 0.7657913565635681, + "learning_rate": 8.542117665098958e-06, + "loss": 0.8152, + "step": 9138 + }, + { + "epoch": 0.5029996147283835, + "grad_norm": 0.7543498277664185, + "learning_rate": 8.54181171643703e-06, + "loss": 0.7566, + "step": 9139 + }, + { + "epoch": 0.5030546535307392, + "grad_norm": 0.7771349549293518, + "learning_rate": 8.541505741155794e-06, + "loss": 0.7907, + "step": 9140 + }, + { + "epoch": 0.5031096923330949, + "grad_norm": 0.6661877632141113, + "learning_rate": 8.541199739257548e-06, + "loss": 0.7481, + "step": 9141 + }, + { + "epoch": 0.5031647311354505, + "grad_norm": 0.7700417637825012, + "learning_rate": 8.540893710744593e-06, + "loss": 0.7544, + "step": 9142 + }, + { + "epoch": 0.5032197699378061, + "grad_norm": 0.6476640105247498, + "learning_rate": 8.54058765561923e-06, + "loss": 0.7221, + "step": 9143 + }, + { + "epoch": 0.5032748087401618, + "grad_norm": 0.7098944187164307, + "learning_rate": 8.540281573883755e-06, + "loss": 0.8083, + "step": 9144 + }, + { + "epoch": 0.5033298475425175, + "grad_norm": 0.9733545184135437, + "learning_rate": 8.539975465540473e-06, + "loss": 0.7381, + "step": 9145 + }, + { + "epoch": 0.5033848863448731, + "grad_norm": 0.641211986541748, + "learning_rate": 8.539669330591685e-06, + "loss": 0.7511, + "step": 9146 + }, + { + "epoch": 0.5034399251472288, + "grad_norm": 0.626027524471283, + "learning_rate": 8.539363169039687e-06, + "loss": 0.7321, + "step": 9147 + }, + { + "epoch": 0.5034949639495845, + "grad_norm": 0.7627241611480713, + "learning_rate": 8.539056980886785e-06, + "loss": 0.7269, + "step": 9148 + }, + { + "epoch": 0.5035500027519401, + "grad_norm": 0.6711145639419556, + "learning_rate": 8.538750766135275e-06, + "loss": 0.8179, + "step": 9149 + }, + { + "epoch": 0.5036050415542958, + "grad_norm": 0.6981950998306274, + "learning_rate": 8.538444524787463e-06, + "loss": 0.8095, + "step": 9150 + }, + { + "epoch": 0.5036600803566514, + "grad_norm": 0.8869871497154236, + "learning_rate": 8.53813825684565e-06, + "loss": 0.8549, + "step": 9151 + }, + { + "epoch": 0.5037151191590071, + "grad_norm": 0.6461544036865234, + "learning_rate": 8.537831962312137e-06, + "loss": 0.7388, + "step": 9152 + }, + { + "epoch": 0.5037701579613627, + "grad_norm": 0.8279222249984741, + "learning_rate": 8.537525641189224e-06, + "loss": 0.8609, + "step": 9153 + }, + { + "epoch": 0.5038251967637184, + "grad_norm": 0.7117578387260437, + "learning_rate": 8.537219293479217e-06, + "loss": 0.802, + "step": 9154 + }, + { + "epoch": 0.5038802355660741, + "grad_norm": 0.6831860542297363, + "learning_rate": 8.536912919184416e-06, + "loss": 0.7821, + "step": 9155 + }, + { + "epoch": 0.5039352743684298, + "grad_norm": 1.1528539657592773, + "learning_rate": 8.536606518307125e-06, + "loss": 0.8578, + "step": 9156 + }, + { + "epoch": 0.5039903131707854, + "grad_norm": 0.6545060873031616, + "learning_rate": 8.536300090849645e-06, + "loss": 0.7744, + "step": 9157 + }, + { + "epoch": 0.504045351973141, + "grad_norm": 0.7176601886749268, + "learning_rate": 8.535993636814281e-06, + "loss": 0.8104, + "step": 9158 + }, + { + "epoch": 0.5041003907754967, + "grad_norm": 0.8458410501480103, + "learning_rate": 8.535687156203334e-06, + "loss": 0.8653, + "step": 9159 + }, + { + "epoch": 0.5041554295778524, + "grad_norm": 0.7500274777412415, + "learning_rate": 8.53538064901911e-06, + "loss": 0.8043, + "step": 9160 + }, + { + "epoch": 0.504210468380208, + "grad_norm": 0.6982965469360352, + "learning_rate": 8.535074115263911e-06, + "loss": 0.7564, + "step": 9161 + }, + { + "epoch": 0.5042655071825637, + "grad_norm": 0.8344218134880066, + "learning_rate": 8.534767554940042e-06, + "loss": 0.7575, + "step": 9162 + }, + { + "epoch": 0.5043205459849194, + "grad_norm": 0.7527137398719788, + "learning_rate": 8.534460968049806e-06, + "loss": 0.7757, + "step": 9163 + }, + { + "epoch": 0.5043755847872751, + "grad_norm": 0.7136969566345215, + "learning_rate": 8.534154354595508e-06, + "loss": 0.826, + "step": 9164 + }, + { + "epoch": 0.5044306235896306, + "grad_norm": 0.8102819919586182, + "learning_rate": 8.533847714579449e-06, + "loss": 0.7247, + "step": 9165 + }, + { + "epoch": 0.5044856623919863, + "grad_norm": 0.7568309903144836, + "learning_rate": 8.53354104800394e-06, + "loss": 0.8509, + "step": 9166 + }, + { + "epoch": 0.504540701194342, + "grad_norm": 0.7719592452049255, + "learning_rate": 8.53323435487128e-06, + "loss": 0.8039, + "step": 9167 + }, + { + "epoch": 0.5045957399966977, + "grad_norm": 0.7514411807060242, + "learning_rate": 8.532927635183778e-06, + "loss": 0.8759, + "step": 9168 + }, + { + "epoch": 0.5046507787990533, + "grad_norm": 0.9781903028488159, + "learning_rate": 8.532620888943736e-06, + "loss": 0.8022, + "step": 9169 + }, + { + "epoch": 0.504705817601409, + "grad_norm": 0.7713304758071899, + "learning_rate": 8.532314116153462e-06, + "loss": 0.8372, + "step": 9170 + }, + { + "epoch": 0.5047608564037647, + "grad_norm": 0.7519709467887878, + "learning_rate": 8.53200731681526e-06, + "loss": 0.7374, + "step": 9171 + }, + { + "epoch": 0.5048158952061204, + "grad_norm": 0.6923980712890625, + "learning_rate": 8.531700490931438e-06, + "loss": 0.7511, + "step": 9172 + }, + { + "epoch": 0.5048709340084759, + "grad_norm": 0.682357907295227, + "learning_rate": 8.5313936385043e-06, + "loss": 0.7647, + "step": 9173 + }, + { + "epoch": 0.5049259728108316, + "grad_norm": 0.8255659341812134, + "learning_rate": 8.531086759536152e-06, + "loss": 0.7533, + "step": 9174 + }, + { + "epoch": 0.5049810116131873, + "grad_norm": 0.6774975061416626, + "learning_rate": 8.530779854029301e-06, + "loss": 0.7019, + "step": 9175 + }, + { + "epoch": 0.505036050415543, + "grad_norm": 0.7973241209983826, + "learning_rate": 8.530472921986053e-06, + "loss": 0.7824, + "step": 9176 + }, + { + "epoch": 0.5050910892178986, + "grad_norm": 0.8216109275817871, + "learning_rate": 8.530165963408716e-06, + "loss": 0.8063, + "step": 9177 + }, + { + "epoch": 0.5051461280202543, + "grad_norm": 0.7277935743331909, + "learning_rate": 8.5298589782996e-06, + "loss": 0.7631, + "step": 9178 + }, + { + "epoch": 0.50520116682261, + "grad_norm": 0.6647855043411255, + "learning_rate": 8.529551966661004e-06, + "loss": 0.7462, + "step": 9179 + }, + { + "epoch": 0.5052562056249656, + "grad_norm": 0.766272783279419, + "learning_rate": 8.529244928495241e-06, + "loss": 0.8075, + "step": 9180 + }, + { + "epoch": 0.5053112444273212, + "grad_norm": 0.7276293635368347, + "learning_rate": 8.52893786380462e-06, + "loss": 0.7908, + "step": 9181 + }, + { + "epoch": 0.5053662832296769, + "grad_norm": 0.7864169478416443, + "learning_rate": 8.528630772591447e-06, + "loss": 0.8082, + "step": 9182 + }, + { + "epoch": 0.5054213220320326, + "grad_norm": 0.9106804132461548, + "learning_rate": 8.528323654858028e-06, + "loss": 0.8989, + "step": 9183 + }, + { + "epoch": 0.5054763608343883, + "grad_norm": 0.7288523316383362, + "learning_rate": 8.52801651060667e-06, + "loss": 0.7972, + "step": 9184 + }, + { + "epoch": 0.5055313996367439, + "grad_norm": 0.7149643301963806, + "learning_rate": 8.527709339839689e-06, + "loss": 0.8191, + "step": 9185 + }, + { + "epoch": 0.5055864384390996, + "grad_norm": 0.6661714911460876, + "learning_rate": 8.527402142559388e-06, + "loss": 0.6596, + "step": 9186 + }, + { + "epoch": 0.5056414772414553, + "grad_norm": 0.7071447372436523, + "learning_rate": 8.527094918768076e-06, + "loss": 0.7633, + "step": 9187 + }, + { + "epoch": 0.5056965160438109, + "grad_norm": 0.7314093112945557, + "learning_rate": 8.526787668468064e-06, + "loss": 0.7815, + "step": 9188 + }, + { + "epoch": 0.5057515548461665, + "grad_norm": 0.8200539946556091, + "learning_rate": 8.526480391661657e-06, + "loss": 0.8376, + "step": 9189 + }, + { + "epoch": 0.5058065936485222, + "grad_norm": 0.7422435283660889, + "learning_rate": 8.52617308835117e-06, + "loss": 0.8783, + "step": 9190 + }, + { + "epoch": 0.5058616324508779, + "grad_norm": 0.7845084071159363, + "learning_rate": 8.525865758538909e-06, + "loss": 0.8005, + "step": 9191 + }, + { + "epoch": 0.5059166712532335, + "grad_norm": 0.6854296922683716, + "learning_rate": 8.525558402227185e-06, + "loss": 0.8118, + "step": 9192 + }, + { + "epoch": 0.5059717100555892, + "grad_norm": 0.6805297136306763, + "learning_rate": 8.525251019418309e-06, + "loss": 0.6765, + "step": 9193 + }, + { + "epoch": 0.5060267488579449, + "grad_norm": 0.7194867134094238, + "learning_rate": 8.524943610114587e-06, + "loss": 0.6752, + "step": 9194 + }, + { + "epoch": 0.5060817876603005, + "grad_norm": 0.6935137510299683, + "learning_rate": 8.524636174318335e-06, + "loss": 0.7122, + "step": 9195 + }, + { + "epoch": 0.5061368264626561, + "grad_norm": 0.8652825951576233, + "learning_rate": 8.52432871203186e-06, + "loss": 0.7725, + "step": 9196 + }, + { + "epoch": 0.5061918652650118, + "grad_norm": 0.9104461669921875, + "learning_rate": 8.524021223257472e-06, + "loss": 0.8589, + "step": 9197 + }, + { + "epoch": 0.5062469040673675, + "grad_norm": 0.7680580019950867, + "learning_rate": 8.523713707997486e-06, + "loss": 0.842, + "step": 9198 + }, + { + "epoch": 0.5063019428697232, + "grad_norm": 0.7324872612953186, + "learning_rate": 8.52340616625421e-06, + "loss": 0.802, + "step": 9199 + }, + { + "epoch": 0.5063569816720788, + "grad_norm": 0.8812359571456909, + "learning_rate": 8.523098598029958e-06, + "loss": 0.8286, + "step": 9200 + }, + { + "epoch": 0.5064120204744345, + "grad_norm": 0.6992496848106384, + "learning_rate": 8.522791003327038e-06, + "loss": 0.811, + "step": 9201 + }, + { + "epoch": 0.5064670592767901, + "grad_norm": 0.8191942572593689, + "learning_rate": 8.522483382147766e-06, + "loss": 0.7192, + "step": 9202 + }, + { + "epoch": 0.5065220980791458, + "grad_norm": 0.9354501366615295, + "learning_rate": 8.522175734494452e-06, + "loss": 0.7424, + "step": 9203 + }, + { + "epoch": 0.5065771368815014, + "grad_norm": 0.6481999754905701, + "learning_rate": 8.521868060369405e-06, + "loss": 0.6385, + "step": 9204 + }, + { + "epoch": 0.5066321756838571, + "grad_norm": 0.7158499360084534, + "learning_rate": 8.521560359774943e-06, + "loss": 0.6116, + "step": 9205 + }, + { + "epoch": 0.5066872144862128, + "grad_norm": 0.8738408088684082, + "learning_rate": 8.521252632713376e-06, + "loss": 0.894, + "step": 9206 + }, + { + "epoch": 0.5067422532885685, + "grad_norm": 0.7037062644958496, + "learning_rate": 8.520944879187015e-06, + "loss": 0.6958, + "step": 9207 + }, + { + "epoch": 0.5067972920909241, + "grad_norm": 0.7205594778060913, + "learning_rate": 8.520637099198175e-06, + "loss": 0.7188, + "step": 9208 + }, + { + "epoch": 0.5068523308932797, + "grad_norm": 0.6761966347694397, + "learning_rate": 8.520329292749169e-06, + "loss": 0.7669, + "step": 9209 + }, + { + "epoch": 0.5069073696956354, + "grad_norm": 0.682556688785553, + "learning_rate": 8.520021459842312e-06, + "loss": 0.7745, + "step": 9210 + }, + { + "epoch": 0.5069624084979911, + "grad_norm": 0.6687794923782349, + "learning_rate": 8.519713600479913e-06, + "loss": 0.7814, + "step": 9211 + }, + { + "epoch": 0.5070174473003467, + "grad_norm": 0.6391967535018921, + "learning_rate": 8.51940571466429e-06, + "loss": 0.7331, + "step": 9212 + }, + { + "epoch": 0.5070724861027024, + "grad_norm": 0.8420151472091675, + "learning_rate": 8.519097802397758e-06, + "loss": 0.8257, + "step": 9213 + }, + { + "epoch": 0.5071275249050581, + "grad_norm": 0.692787230014801, + "learning_rate": 8.518789863682625e-06, + "loss": 0.7179, + "step": 9214 + }, + { + "epoch": 0.5071825637074138, + "grad_norm": 0.6874318718910217, + "learning_rate": 8.518481898521213e-06, + "loss": 0.6847, + "step": 9215 + }, + { + "epoch": 0.5072376025097693, + "grad_norm": 0.8107750415802002, + "learning_rate": 8.518173906915832e-06, + "loss": 0.8459, + "step": 9216 + }, + { + "epoch": 0.507292641312125, + "grad_norm": 0.7952812910079956, + "learning_rate": 8.517865888868797e-06, + "loss": 0.8503, + "step": 9217 + }, + { + "epoch": 0.5073476801144807, + "grad_norm": 0.6926921606063843, + "learning_rate": 8.517557844382424e-06, + "loss": 0.6713, + "step": 9218 + }, + { + "epoch": 0.5074027189168364, + "grad_norm": 0.8203585147857666, + "learning_rate": 8.517249773459026e-06, + "loss": 0.8483, + "step": 9219 + }, + { + "epoch": 0.507457757719192, + "grad_norm": 0.6788125038146973, + "learning_rate": 8.516941676100923e-06, + "loss": 0.7521, + "step": 9220 + }, + { + "epoch": 0.5075127965215477, + "grad_norm": 0.6439838409423828, + "learning_rate": 8.516633552310426e-06, + "loss": 0.7359, + "step": 9221 + }, + { + "epoch": 0.5075678353239034, + "grad_norm": 0.6872217655181885, + "learning_rate": 8.516325402089854e-06, + "loss": 0.73, + "step": 9222 + }, + { + "epoch": 0.5076228741262591, + "grad_norm": 0.6695985794067383, + "learning_rate": 8.51601722544152e-06, + "loss": 0.7519, + "step": 9223 + }, + { + "epoch": 0.5076779129286146, + "grad_norm": 0.7779402136802673, + "learning_rate": 8.515709022367741e-06, + "loss": 0.7325, + "step": 9224 + }, + { + "epoch": 0.5077329517309703, + "grad_norm": 0.9289746284484863, + "learning_rate": 8.515400792870836e-06, + "loss": 0.7839, + "step": 9225 + }, + { + "epoch": 0.507787990533326, + "grad_norm": 0.6949248313903809, + "learning_rate": 8.51509253695312e-06, + "loss": 0.7363, + "step": 9226 + }, + { + "epoch": 0.5078430293356817, + "grad_norm": 0.6463130116462708, + "learning_rate": 8.514784254616908e-06, + "loss": 0.7607, + "step": 9227 + }, + { + "epoch": 0.5078980681380373, + "grad_norm": 0.7332046031951904, + "learning_rate": 8.514475945864519e-06, + "loss": 0.6833, + "step": 9228 + }, + { + "epoch": 0.507953106940393, + "grad_norm": 0.8674100637435913, + "learning_rate": 8.51416761069827e-06, + "loss": 0.669, + "step": 9229 + }, + { + "epoch": 0.5080081457427487, + "grad_norm": 0.8073185682296753, + "learning_rate": 8.513859249120477e-06, + "loss": 0.7215, + "step": 9230 + }, + { + "epoch": 0.5080631845451044, + "grad_norm": 0.674117386341095, + "learning_rate": 8.51355086113346e-06, + "loss": 0.7813, + "step": 9231 + }, + { + "epoch": 0.5081182233474599, + "grad_norm": 0.8564596176147461, + "learning_rate": 8.513242446739534e-06, + "loss": 0.7393, + "step": 9232 + }, + { + "epoch": 0.5081732621498156, + "grad_norm": 0.684637188911438, + "learning_rate": 8.512934005941015e-06, + "loss": 0.781, + "step": 9233 + }, + { + "epoch": 0.5082283009521713, + "grad_norm": 0.816123902797699, + "learning_rate": 8.51262553874023e-06, + "loss": 0.8597, + "step": 9234 + }, + { + "epoch": 0.5082833397545269, + "grad_norm": 0.6582320332527161, + "learning_rate": 8.512317045139488e-06, + "loss": 0.6654, + "step": 9235 + }, + { + "epoch": 0.5083383785568826, + "grad_norm": 1.0153518915176392, + "learning_rate": 8.512008525141113e-06, + "loss": 0.7946, + "step": 9236 + }, + { + "epoch": 0.5083934173592383, + "grad_norm": 0.7455416917800903, + "learning_rate": 8.511699978747422e-06, + "loss": 0.8365, + "step": 9237 + }, + { + "epoch": 0.508448456161594, + "grad_norm": 0.6498221755027771, + "learning_rate": 8.511391405960733e-06, + "loss": 0.7252, + "step": 9238 + }, + { + "epoch": 0.5085034949639495, + "grad_norm": 0.6856792569160461, + "learning_rate": 8.511082806783368e-06, + "loss": 0.7282, + "step": 9239 + }, + { + "epoch": 0.5085585337663052, + "grad_norm": 0.6930065751075745, + "learning_rate": 8.510774181217643e-06, + "loss": 0.7404, + "step": 9240 + }, + { + "epoch": 0.5086135725686609, + "grad_norm": 0.6953150033950806, + "learning_rate": 8.51046552926588e-06, + "loss": 0.7684, + "step": 9241 + }, + { + "epoch": 0.5086686113710166, + "grad_norm": 0.7307711839675903, + "learning_rate": 8.510156850930395e-06, + "loss": 0.7557, + "step": 9242 + }, + { + "epoch": 0.5087236501733722, + "grad_norm": 0.7296478152275085, + "learning_rate": 8.509848146213513e-06, + "loss": 0.7469, + "step": 9243 + }, + { + "epoch": 0.5087786889757279, + "grad_norm": 0.7035672664642334, + "learning_rate": 8.509539415117553e-06, + "loss": 0.7151, + "step": 9244 + }, + { + "epoch": 0.5088337277780836, + "grad_norm": 0.7818698883056641, + "learning_rate": 8.509230657644832e-06, + "loss": 0.7134, + "step": 9245 + }, + { + "epoch": 0.5088887665804392, + "grad_norm": 0.7503119111061096, + "learning_rate": 8.508921873797674e-06, + "loss": 0.7028, + "step": 9246 + }, + { + "epoch": 0.5089438053827948, + "grad_norm": 0.7733498215675354, + "learning_rate": 8.508613063578397e-06, + "loss": 0.8159, + "step": 9247 + }, + { + "epoch": 0.5089988441851505, + "grad_norm": 0.9236353635787964, + "learning_rate": 8.508304226989326e-06, + "loss": 0.8013, + "step": 9248 + }, + { + "epoch": 0.5090538829875062, + "grad_norm": 0.6567198634147644, + "learning_rate": 8.507995364032777e-06, + "loss": 0.8285, + "step": 9249 + }, + { + "epoch": 0.5091089217898619, + "grad_norm": 0.6555445790290833, + "learning_rate": 8.507686474711074e-06, + "loss": 0.6917, + "step": 9250 + }, + { + "epoch": 0.5091639605922175, + "grad_norm": 0.8505375385284424, + "learning_rate": 8.507377559026539e-06, + "loss": 0.824, + "step": 9251 + }, + { + "epoch": 0.5092189993945732, + "grad_norm": 0.703413188457489, + "learning_rate": 8.507068616981493e-06, + "loss": 0.7162, + "step": 9252 + }, + { + "epoch": 0.5092740381969288, + "grad_norm": 0.7257823944091797, + "learning_rate": 8.50675964857826e-06, + "loss": 0.8031, + "step": 9253 + }, + { + "epoch": 0.5093290769992845, + "grad_norm": 0.6861198544502258, + "learning_rate": 8.506450653819159e-06, + "loss": 0.7724, + "step": 9254 + }, + { + "epoch": 0.5093841158016401, + "grad_norm": 0.7733107209205627, + "learning_rate": 8.506141632706512e-06, + "loss": 0.7834, + "step": 9255 + }, + { + "epoch": 0.5094391546039958, + "grad_norm": 0.7472217082977295, + "learning_rate": 8.505832585242644e-06, + "loss": 0.7594, + "step": 9256 + }, + { + "epoch": 0.5094941934063515, + "grad_norm": 0.6273325085639954, + "learning_rate": 8.505523511429876e-06, + "loss": 0.6798, + "step": 9257 + }, + { + "epoch": 0.5095492322087072, + "grad_norm": 0.7366517186164856, + "learning_rate": 8.505214411270533e-06, + "loss": 0.7916, + "step": 9258 + }, + { + "epoch": 0.5096042710110628, + "grad_norm": 0.6654453873634338, + "learning_rate": 8.504905284766936e-06, + "loss": 0.7228, + "step": 9259 + }, + { + "epoch": 0.5096593098134184, + "grad_norm": 0.7926275134086609, + "learning_rate": 8.50459613192141e-06, + "loss": 0.8303, + "step": 9260 + }, + { + "epoch": 0.5097143486157741, + "grad_norm": 0.7256377935409546, + "learning_rate": 8.504286952736277e-06, + "loss": 0.7977, + "step": 9261 + }, + { + "epoch": 0.5097693874181298, + "grad_norm": 0.7333946824073792, + "learning_rate": 8.50397774721386e-06, + "loss": 0.7978, + "step": 9262 + }, + { + "epoch": 0.5098244262204854, + "grad_norm": 0.6102882623672485, + "learning_rate": 8.503668515356485e-06, + "loss": 0.6386, + "step": 9263 + }, + { + "epoch": 0.5098794650228411, + "grad_norm": 0.7939823865890503, + "learning_rate": 8.503359257166477e-06, + "loss": 0.7328, + "step": 9264 + }, + { + "epoch": 0.5099345038251968, + "grad_norm": 0.7245013117790222, + "learning_rate": 8.503049972646157e-06, + "loss": 0.795, + "step": 9265 + }, + { + "epoch": 0.5099895426275525, + "grad_norm": 0.6722108125686646, + "learning_rate": 8.502740661797852e-06, + "loss": 0.7062, + "step": 9266 + }, + { + "epoch": 0.510044581429908, + "grad_norm": 0.6759012341499329, + "learning_rate": 8.502431324623884e-06, + "loss": 0.7427, + "step": 9267 + }, + { + "epoch": 0.5100996202322637, + "grad_norm": 0.6448835730552673, + "learning_rate": 8.502121961126581e-06, + "loss": 0.7381, + "step": 9268 + }, + { + "epoch": 0.5101546590346194, + "grad_norm": 0.6437426209449768, + "learning_rate": 8.501812571308266e-06, + "loss": 0.6733, + "step": 9269 + }, + { + "epoch": 0.5102096978369751, + "grad_norm": 0.6879013776779175, + "learning_rate": 8.501503155171267e-06, + "loss": 0.7227, + "step": 9270 + }, + { + "epoch": 0.5102647366393307, + "grad_norm": 0.6628512740135193, + "learning_rate": 8.501193712717906e-06, + "loss": 0.7151, + "step": 9271 + }, + { + "epoch": 0.5103197754416864, + "grad_norm": 0.7653747797012329, + "learning_rate": 8.500884243950511e-06, + "loss": 0.8189, + "step": 9272 + }, + { + "epoch": 0.5103748142440421, + "grad_norm": 0.7180060148239136, + "learning_rate": 8.500574748871407e-06, + "loss": 0.7633, + "step": 9273 + }, + { + "epoch": 0.5104298530463978, + "grad_norm": 0.7045086622238159, + "learning_rate": 8.50026522748292e-06, + "loss": 0.746, + "step": 9274 + }, + { + "epoch": 0.5104848918487533, + "grad_norm": 0.6224614381790161, + "learning_rate": 8.499955679787376e-06, + "loss": 0.7436, + "step": 9275 + }, + { + "epoch": 0.510539930651109, + "grad_norm": 0.6716495156288147, + "learning_rate": 8.499646105787103e-06, + "loss": 0.8006, + "step": 9276 + }, + { + "epoch": 0.5105949694534647, + "grad_norm": 0.83705735206604, + "learning_rate": 8.499336505484426e-06, + "loss": 0.886, + "step": 9277 + }, + { + "epoch": 0.5106500082558203, + "grad_norm": 0.7942199110984802, + "learning_rate": 8.499026878881673e-06, + "loss": 0.7709, + "step": 9278 + }, + { + "epoch": 0.510705047058176, + "grad_norm": 0.7500330209732056, + "learning_rate": 8.49871722598117e-06, + "loss": 0.7737, + "step": 9279 + }, + { + "epoch": 0.5107600858605317, + "grad_norm": 0.7283433675765991, + "learning_rate": 8.498407546785245e-06, + "loss": 0.8345, + "step": 9280 + }, + { + "epoch": 0.5108151246628874, + "grad_norm": 0.6970989108085632, + "learning_rate": 8.498097841296224e-06, + "loss": 0.7451, + "step": 9281 + }, + { + "epoch": 0.5108701634652429, + "grad_norm": 0.8338573575019836, + "learning_rate": 8.497788109516438e-06, + "loss": 0.8198, + "step": 9282 + }, + { + "epoch": 0.5109252022675986, + "grad_norm": 0.6544861197471619, + "learning_rate": 8.497478351448213e-06, + "loss": 0.7549, + "step": 9283 + }, + { + "epoch": 0.5109802410699543, + "grad_norm": 0.6627360582351685, + "learning_rate": 8.497168567093876e-06, + "loss": 0.7136, + "step": 9284 + }, + { + "epoch": 0.51103527987231, + "grad_norm": 0.7176669239997864, + "learning_rate": 8.496858756455755e-06, + "loss": 0.766, + "step": 9285 + }, + { + "epoch": 0.5110903186746656, + "grad_norm": 0.8260897397994995, + "learning_rate": 8.496548919536183e-06, + "loss": 0.8167, + "step": 9286 + }, + { + "epoch": 0.5111453574770213, + "grad_norm": 0.7077773809432983, + "learning_rate": 8.496239056337483e-06, + "loss": 0.776, + "step": 9287 + }, + { + "epoch": 0.511200396279377, + "grad_norm": 0.7609447836875916, + "learning_rate": 8.495929166861988e-06, + "loss": 0.7339, + "step": 9288 + }, + { + "epoch": 0.5112554350817327, + "grad_norm": 0.6896487474441528, + "learning_rate": 8.495619251112022e-06, + "loss": 0.7639, + "step": 9289 + }, + { + "epoch": 0.5113104738840882, + "grad_norm": 0.6946871280670166, + "learning_rate": 8.495309309089918e-06, + "loss": 0.8242, + "step": 9290 + }, + { + "epoch": 0.5113655126864439, + "grad_norm": 0.79847252368927, + "learning_rate": 8.494999340798007e-06, + "loss": 0.8226, + "step": 9291 + }, + { + "epoch": 0.5114205514887996, + "grad_norm": 0.7845447063446045, + "learning_rate": 8.494689346238615e-06, + "loss": 0.8593, + "step": 9292 + }, + { + "epoch": 0.5114755902911553, + "grad_norm": 1.1577119827270508, + "learning_rate": 8.494379325414074e-06, + "loss": 0.746, + "step": 9293 + }, + { + "epoch": 0.5115306290935109, + "grad_norm": 0.6720938682556152, + "learning_rate": 8.494069278326713e-06, + "loss": 0.6768, + "step": 9294 + }, + { + "epoch": 0.5115856678958666, + "grad_norm": 0.7389395833015442, + "learning_rate": 8.493759204978862e-06, + "loss": 0.8126, + "step": 9295 + }, + { + "epoch": 0.5116407066982223, + "grad_norm": 0.7629536986351013, + "learning_rate": 8.493449105372853e-06, + "loss": 0.7107, + "step": 9296 + }, + { + "epoch": 0.511695745500578, + "grad_norm": 0.7339474558830261, + "learning_rate": 8.493138979511015e-06, + "loss": 0.8144, + "step": 9297 + }, + { + "epoch": 0.5117507843029335, + "grad_norm": 0.7222825288772583, + "learning_rate": 8.49282882739568e-06, + "loss": 0.7512, + "step": 9298 + }, + { + "epoch": 0.5118058231052892, + "grad_norm": 0.676659107208252, + "learning_rate": 8.49251864902918e-06, + "loss": 0.6515, + "step": 9299 + }, + { + "epoch": 0.5118608619076449, + "grad_norm": 0.6336323618888855, + "learning_rate": 8.492208444413844e-06, + "loss": 0.719, + "step": 9300 + }, + { + "epoch": 0.5119159007100006, + "grad_norm": 0.701543927192688, + "learning_rate": 8.491898213552e-06, + "loss": 0.728, + "step": 9301 + }, + { + "epoch": 0.5119709395123562, + "grad_norm": 0.6809069514274597, + "learning_rate": 8.491587956445988e-06, + "loss": 0.8844, + "step": 9302 + }, + { + "epoch": 0.5120259783147119, + "grad_norm": 0.8046489357948303, + "learning_rate": 8.491277673098135e-06, + "loss": 0.817, + "step": 9303 + }, + { + "epoch": 0.5120810171170675, + "grad_norm": 0.8630616068840027, + "learning_rate": 8.490967363510774e-06, + "loss": 0.7745, + "step": 9304 + }, + { + "epoch": 0.5121360559194232, + "grad_norm": 0.7457678914070129, + "learning_rate": 8.490657027686235e-06, + "loss": 0.7956, + "step": 9305 + }, + { + "epoch": 0.5121910947217788, + "grad_norm": 0.6383466124534607, + "learning_rate": 8.490346665626854e-06, + "loss": 0.8046, + "step": 9306 + }, + { + "epoch": 0.5122461335241345, + "grad_norm": 0.7658202052116394, + "learning_rate": 8.49003627733496e-06, + "loss": 0.7905, + "step": 9307 + }, + { + "epoch": 0.5123011723264902, + "grad_norm": 0.6793283224105835, + "learning_rate": 8.48972586281289e-06, + "loss": 0.6646, + "step": 9308 + }, + { + "epoch": 0.5123562111288459, + "grad_norm": 0.7345246076583862, + "learning_rate": 8.489415422062972e-06, + "loss": 0.788, + "step": 9309 + }, + { + "epoch": 0.5124112499312015, + "grad_norm": 0.6665463447570801, + "learning_rate": 8.489104955087542e-06, + "loss": 0.706, + "step": 9310 + }, + { + "epoch": 0.5124662887335572, + "grad_norm": 0.7895458936691284, + "learning_rate": 8.488794461888934e-06, + "loss": 0.7464, + "step": 9311 + }, + { + "epoch": 0.5125213275359128, + "grad_norm": 0.7375221252441406, + "learning_rate": 8.488483942469481e-06, + "loss": 0.8029, + "step": 9312 + }, + { + "epoch": 0.5125763663382685, + "grad_norm": 0.792348325252533, + "learning_rate": 8.488173396831514e-06, + "loss": 0.7324, + "step": 9313 + }, + { + "epoch": 0.5126314051406241, + "grad_norm": 0.6500192880630493, + "learning_rate": 8.487862824977373e-06, + "loss": 0.7331, + "step": 9314 + }, + { + "epoch": 0.5126864439429798, + "grad_norm": 0.6607314348220825, + "learning_rate": 8.487552226909386e-06, + "loss": 0.7782, + "step": 9315 + }, + { + "epoch": 0.5127414827453355, + "grad_norm": 0.8261791467666626, + "learning_rate": 8.487241602629892e-06, + "loss": 0.8036, + "step": 9316 + }, + { + "epoch": 0.5127965215476912, + "grad_norm": 0.8301663994789124, + "learning_rate": 8.486930952141222e-06, + "loss": 0.7928, + "step": 9317 + }, + { + "epoch": 0.5128515603500468, + "grad_norm": 0.6957940459251404, + "learning_rate": 8.486620275445713e-06, + "loss": 0.7359, + "step": 9318 + }, + { + "epoch": 0.5129065991524024, + "grad_norm": 0.7562606334686279, + "learning_rate": 8.4863095725457e-06, + "loss": 0.7546, + "step": 9319 + }, + { + "epoch": 0.5129616379547581, + "grad_norm": 0.795886218547821, + "learning_rate": 8.485998843443517e-06, + "loss": 0.7558, + "step": 9320 + }, + { + "epoch": 0.5130166767571137, + "grad_norm": 0.6558147072792053, + "learning_rate": 8.4856880881415e-06, + "loss": 0.6832, + "step": 9321 + }, + { + "epoch": 0.5130717155594694, + "grad_norm": 0.7300151586532593, + "learning_rate": 8.485377306641984e-06, + "loss": 0.8018, + "step": 9322 + }, + { + "epoch": 0.5131267543618251, + "grad_norm": 0.7114105224609375, + "learning_rate": 8.485066498947305e-06, + "loss": 0.7374, + "step": 9323 + }, + { + "epoch": 0.5131817931641808, + "grad_norm": 0.7061085104942322, + "learning_rate": 8.484755665059798e-06, + "loss": 0.7905, + "step": 9324 + }, + { + "epoch": 0.5132368319665364, + "grad_norm": 0.8481647968292236, + "learning_rate": 8.484444804981802e-06, + "loss": 0.8518, + "step": 9325 + }, + { + "epoch": 0.513291870768892, + "grad_norm": 0.7583557367324829, + "learning_rate": 8.48413391871565e-06, + "loss": 0.8328, + "step": 9326 + }, + { + "epoch": 0.5133469095712477, + "grad_norm": 0.7381925582885742, + "learning_rate": 8.483823006263683e-06, + "loss": 0.76, + "step": 9327 + }, + { + "epoch": 0.5134019483736034, + "grad_norm": 0.8037852644920349, + "learning_rate": 8.483512067628232e-06, + "loss": 0.711, + "step": 9328 + }, + { + "epoch": 0.513456987175959, + "grad_norm": 0.6682618260383606, + "learning_rate": 8.483201102811637e-06, + "loss": 0.7479, + "step": 9329 + }, + { + "epoch": 0.5135120259783147, + "grad_norm": 0.662234365940094, + "learning_rate": 8.482890111816237e-06, + "loss": 0.7701, + "step": 9330 + }, + { + "epoch": 0.5135670647806704, + "grad_norm": 0.7081482410430908, + "learning_rate": 8.482579094644365e-06, + "loss": 0.8255, + "step": 9331 + }, + { + "epoch": 0.5136221035830261, + "grad_norm": 0.9659954905509949, + "learning_rate": 8.482268051298364e-06, + "loss": 0.8742, + "step": 9332 + }, + { + "epoch": 0.5136771423853816, + "grad_norm": 0.7837772369384766, + "learning_rate": 8.481956981780564e-06, + "loss": 0.7692, + "step": 9333 + }, + { + "epoch": 0.5137321811877373, + "grad_norm": 0.681918203830719, + "learning_rate": 8.481645886093311e-06, + "loss": 0.6952, + "step": 9334 + }, + { + "epoch": 0.513787219990093, + "grad_norm": 0.7253187894821167, + "learning_rate": 8.481334764238937e-06, + "loss": 0.7074, + "step": 9335 + }, + { + "epoch": 0.5138422587924487, + "grad_norm": 0.8845877051353455, + "learning_rate": 8.481023616219783e-06, + "loss": 0.675, + "step": 9336 + }, + { + "epoch": 0.5138972975948043, + "grad_norm": 0.6569344401359558, + "learning_rate": 8.480712442038188e-06, + "loss": 0.7181, + "step": 9337 + }, + { + "epoch": 0.51395233639716, + "grad_norm": 0.7372813820838928, + "learning_rate": 8.480401241696491e-06, + "loss": 0.8137, + "step": 9338 + }, + { + "epoch": 0.5140073751995157, + "grad_norm": 0.843099057674408, + "learning_rate": 8.48009001519703e-06, + "loss": 0.7648, + "step": 9339 + }, + { + "epoch": 0.5140624140018714, + "grad_norm": 0.7762032747268677, + "learning_rate": 8.479778762542142e-06, + "loss": 0.7805, + "step": 9340 + }, + { + "epoch": 0.5141174528042269, + "grad_norm": 0.739086925983429, + "learning_rate": 8.479467483734169e-06, + "loss": 0.7125, + "step": 9341 + }, + { + "epoch": 0.5141724916065826, + "grad_norm": 0.7351683974266052, + "learning_rate": 8.479156178775451e-06, + "loss": 0.7855, + "step": 9342 + }, + { + "epoch": 0.5142275304089383, + "grad_norm": 0.7601314187049866, + "learning_rate": 8.478844847668325e-06, + "loss": 0.8349, + "step": 9343 + }, + { + "epoch": 0.514282569211294, + "grad_norm": 0.6841638684272766, + "learning_rate": 8.478533490415133e-06, + "loss": 0.7986, + "step": 9344 + }, + { + "epoch": 0.5143376080136496, + "grad_norm": 0.6734872460365295, + "learning_rate": 8.478222107018213e-06, + "loss": 0.6941, + "step": 9345 + }, + { + "epoch": 0.5143926468160053, + "grad_norm": 0.801930844783783, + "learning_rate": 8.47791069747991e-06, + "loss": 0.8537, + "step": 9346 + }, + { + "epoch": 0.514447685618361, + "grad_norm": 0.6960629224777222, + "learning_rate": 8.477599261802558e-06, + "loss": 0.6629, + "step": 9347 + }, + { + "epoch": 0.5145027244207167, + "grad_norm": 0.7791358232498169, + "learning_rate": 8.477287799988502e-06, + "loss": 0.8777, + "step": 9348 + }, + { + "epoch": 0.5145577632230722, + "grad_norm": 0.7022722959518433, + "learning_rate": 8.476976312040082e-06, + "loss": 0.7116, + "step": 9349 + }, + { + "epoch": 0.5146128020254279, + "grad_norm": 0.7791306376457214, + "learning_rate": 8.476664797959639e-06, + "loss": 0.7262, + "step": 9350 + }, + { + "epoch": 0.5146678408277836, + "grad_norm": 0.7391177415847778, + "learning_rate": 8.476353257749514e-06, + "loss": 0.7308, + "step": 9351 + }, + { + "epoch": 0.5147228796301393, + "grad_norm": 0.6989552974700928, + "learning_rate": 8.476041691412046e-06, + "loss": 0.7754, + "step": 9352 + }, + { + "epoch": 0.5147779184324949, + "grad_norm": 0.7639930844306946, + "learning_rate": 8.475730098949582e-06, + "loss": 0.8385, + "step": 9353 + }, + { + "epoch": 0.5148329572348506, + "grad_norm": 0.7687931060791016, + "learning_rate": 8.47541848036446e-06, + "loss": 0.8118, + "step": 9354 + }, + { + "epoch": 0.5148879960372063, + "grad_norm": 0.8831589221954346, + "learning_rate": 8.475106835659024e-06, + "loss": 0.7705, + "step": 9355 + }, + { + "epoch": 0.5149430348395619, + "grad_norm": 0.7585502862930298, + "learning_rate": 8.474795164835614e-06, + "loss": 0.8167, + "step": 9356 + }, + { + "epoch": 0.5149980736419175, + "grad_norm": 0.7078690528869629, + "learning_rate": 8.474483467896572e-06, + "loss": 0.7412, + "step": 9357 + }, + { + "epoch": 0.5150531124442732, + "grad_norm": 0.8950889706611633, + "learning_rate": 8.474171744844246e-06, + "loss": 0.8132, + "step": 9358 + }, + { + "epoch": 0.5151081512466289, + "grad_norm": 0.7196077704429626, + "learning_rate": 8.473859995680973e-06, + "loss": 0.8041, + "step": 9359 + }, + { + "epoch": 0.5151631900489846, + "grad_norm": 0.7705141305923462, + "learning_rate": 8.473548220409099e-06, + "loss": 0.8437, + "step": 9360 + }, + { + "epoch": 0.5152182288513402, + "grad_norm": 0.6507467031478882, + "learning_rate": 8.473236419030966e-06, + "loss": 0.7713, + "step": 9361 + }, + { + "epoch": 0.5152732676536959, + "grad_norm": 0.7120817303657532, + "learning_rate": 8.472924591548917e-06, + "loss": 0.7688, + "step": 9362 + }, + { + "epoch": 0.5153283064560515, + "grad_norm": 0.7830487489700317, + "learning_rate": 8.472612737965297e-06, + "loss": 0.8875, + "step": 9363 + }, + { + "epoch": 0.5153833452584071, + "grad_norm": 0.8790529370307922, + "learning_rate": 8.47230085828245e-06, + "loss": 0.7648, + "step": 9364 + }, + { + "epoch": 0.5154383840607628, + "grad_norm": 0.8956806659698486, + "learning_rate": 8.471988952502718e-06, + "loss": 0.7891, + "step": 9365 + }, + { + "epoch": 0.5154934228631185, + "grad_norm": 0.7370011210441589, + "learning_rate": 8.471677020628448e-06, + "loss": 0.7609, + "step": 9366 + }, + { + "epoch": 0.5155484616654742, + "grad_norm": 0.6794238090515137, + "learning_rate": 8.471365062661982e-06, + "loss": 0.6679, + "step": 9367 + }, + { + "epoch": 0.5156035004678298, + "grad_norm": 0.7330273985862732, + "learning_rate": 8.471053078605664e-06, + "loss": 0.7276, + "step": 9368 + }, + { + "epoch": 0.5156585392701855, + "grad_norm": 0.7796601057052612, + "learning_rate": 8.470741068461843e-06, + "loss": 0.7897, + "step": 9369 + }, + { + "epoch": 0.5157135780725411, + "grad_norm": 0.6834099888801575, + "learning_rate": 8.470429032232858e-06, + "loss": 0.7924, + "step": 9370 + }, + { + "epoch": 0.5157686168748968, + "grad_norm": 0.6991616487503052, + "learning_rate": 8.47011696992106e-06, + "loss": 0.7901, + "step": 9371 + }, + { + "epoch": 0.5158236556772524, + "grad_norm": 0.7321401834487915, + "learning_rate": 8.469804881528792e-06, + "loss": 0.6718, + "step": 9372 + }, + { + "epoch": 0.5158786944796081, + "grad_norm": 0.7091043591499329, + "learning_rate": 8.469492767058398e-06, + "loss": 0.8204, + "step": 9373 + }, + { + "epoch": 0.5159337332819638, + "grad_norm": 0.8777012825012207, + "learning_rate": 8.469180626512223e-06, + "loss": 0.8045, + "step": 9374 + }, + { + "epoch": 0.5159887720843195, + "grad_norm": 0.6652738451957703, + "learning_rate": 8.468868459892619e-06, + "loss": 0.7248, + "step": 9375 + }, + { + "epoch": 0.5160438108866751, + "grad_norm": 0.7209659218788147, + "learning_rate": 8.468556267201925e-06, + "loss": 0.7508, + "step": 9376 + }, + { + "epoch": 0.5160988496890307, + "grad_norm": 0.7685441970825195, + "learning_rate": 8.468244048442494e-06, + "loss": 0.7501, + "step": 9377 + }, + { + "epoch": 0.5161538884913864, + "grad_norm": 0.6773725152015686, + "learning_rate": 8.467931803616665e-06, + "loss": 0.8036, + "step": 9378 + }, + { + "epoch": 0.5162089272937421, + "grad_norm": 0.7167890071868896, + "learning_rate": 8.467619532726792e-06, + "loss": 0.7229, + "step": 9379 + }, + { + "epoch": 0.5162639660960977, + "grad_norm": 0.7066929340362549, + "learning_rate": 8.467307235775218e-06, + "loss": 0.7433, + "step": 9380 + }, + { + "epoch": 0.5163190048984534, + "grad_norm": 0.7261828780174255, + "learning_rate": 8.46699491276429e-06, + "loss": 0.7873, + "step": 9381 + }, + { + "epoch": 0.5163740437008091, + "grad_norm": 0.7442463636398315, + "learning_rate": 8.466682563696356e-06, + "loss": 0.7953, + "step": 9382 + }, + { + "epoch": 0.5164290825031648, + "grad_norm": 0.5668768286705017, + "learning_rate": 8.466370188573765e-06, + "loss": 0.5602, + "step": 9383 + }, + { + "epoch": 0.5164841213055203, + "grad_norm": 0.7364997267723083, + "learning_rate": 8.466057787398864e-06, + "loss": 0.8274, + "step": 9384 + }, + { + "epoch": 0.516539160107876, + "grad_norm": 0.7793132066726685, + "learning_rate": 8.465745360174e-06, + "loss": 0.7832, + "step": 9385 + }, + { + "epoch": 0.5165941989102317, + "grad_norm": 0.6818128824234009, + "learning_rate": 8.46543290690152e-06, + "loss": 0.8314, + "step": 9386 + }, + { + "epoch": 0.5166492377125874, + "grad_norm": 0.7392195463180542, + "learning_rate": 8.465120427583778e-06, + "loss": 0.8124, + "step": 9387 + }, + { + "epoch": 0.516704276514943, + "grad_norm": 0.8582521677017212, + "learning_rate": 8.464807922223115e-06, + "loss": 0.7417, + "step": 9388 + }, + { + "epoch": 0.5167593153172987, + "grad_norm": 0.7322097420692444, + "learning_rate": 8.464495390821882e-06, + "loss": 0.7408, + "step": 9389 + }, + { + "epoch": 0.5168143541196544, + "grad_norm": 0.8177433013916016, + "learning_rate": 8.464182833382432e-06, + "loss": 0.87, + "step": 9390 + }, + { + "epoch": 0.5168693929220101, + "grad_norm": 0.7088115215301514, + "learning_rate": 8.46387024990711e-06, + "loss": 0.7748, + "step": 9391 + }, + { + "epoch": 0.5169244317243656, + "grad_norm": 0.6648650169372559, + "learning_rate": 8.463557640398268e-06, + "loss": 0.6302, + "step": 9392 + }, + { + "epoch": 0.5169794705267213, + "grad_norm": 0.6688859462738037, + "learning_rate": 8.463245004858251e-06, + "loss": 0.7252, + "step": 9393 + }, + { + "epoch": 0.517034509329077, + "grad_norm": 0.7231030464172363, + "learning_rate": 8.462932343289412e-06, + "loss": 0.8497, + "step": 9394 + }, + { + "epoch": 0.5170895481314327, + "grad_norm": 0.7142065763473511, + "learning_rate": 8.462619655694103e-06, + "loss": 0.7041, + "step": 9395 + }, + { + "epoch": 0.5171445869337883, + "grad_norm": 0.7197136878967285, + "learning_rate": 8.462306942074669e-06, + "loss": 0.7022, + "step": 9396 + }, + { + "epoch": 0.517199625736144, + "grad_norm": 0.7620192766189575, + "learning_rate": 8.461994202433463e-06, + "loss": 0.8243, + "step": 9397 + }, + { + "epoch": 0.5172546645384997, + "grad_norm": 0.7697533965110779, + "learning_rate": 8.461681436772836e-06, + "loss": 0.7861, + "step": 9398 + }, + { + "epoch": 0.5173097033408554, + "grad_norm": 0.7224711179733276, + "learning_rate": 8.461368645095138e-06, + "loss": 0.7588, + "step": 9399 + }, + { + "epoch": 0.5173647421432109, + "grad_norm": 0.9285979270935059, + "learning_rate": 8.46105582740272e-06, + "loss": 0.8113, + "step": 9400 + }, + { + "epoch": 0.5174197809455666, + "grad_norm": 0.7297842502593994, + "learning_rate": 8.460742983697934e-06, + "loss": 0.7115, + "step": 9401 + }, + { + "epoch": 0.5174748197479223, + "grad_norm": 0.6712872982025146, + "learning_rate": 8.460430113983126e-06, + "loss": 0.751, + "step": 9402 + }, + { + "epoch": 0.517529858550278, + "grad_norm": 0.7807186245918274, + "learning_rate": 8.460117218260657e-06, + "loss": 0.8375, + "step": 9403 + }, + { + "epoch": 0.5175848973526336, + "grad_norm": 0.621530294418335, + "learning_rate": 8.45980429653287e-06, + "loss": 0.638, + "step": 9404 + }, + { + "epoch": 0.5176399361549893, + "grad_norm": 0.7086256146430969, + "learning_rate": 8.45949134880212e-06, + "loss": 0.8304, + "step": 9405 + }, + { + "epoch": 0.517694974957345, + "grad_norm": 0.62705397605896, + "learning_rate": 8.45917837507076e-06, + "loss": 0.7008, + "step": 9406 + }, + { + "epoch": 0.5177500137597005, + "grad_norm": 0.9109121561050415, + "learning_rate": 8.458865375341142e-06, + "loss": 0.7529, + "step": 9407 + }, + { + "epoch": 0.5178050525620562, + "grad_norm": 0.6909900903701782, + "learning_rate": 8.458552349615615e-06, + "loss": 0.8453, + "step": 9408 + }, + { + "epoch": 0.5178600913644119, + "grad_norm": 0.7548434138298035, + "learning_rate": 8.458239297896536e-06, + "loss": 0.7516, + "step": 9409 + }, + { + "epoch": 0.5179151301667676, + "grad_norm": 0.7595730423927307, + "learning_rate": 8.457926220186257e-06, + "loss": 0.7599, + "step": 9410 + }, + { + "epoch": 0.5179701689691232, + "grad_norm": 0.7449337840080261, + "learning_rate": 8.45761311648713e-06, + "loss": 0.8236, + "step": 9411 + }, + { + "epoch": 0.5180252077714789, + "grad_norm": 0.7529160976409912, + "learning_rate": 8.457299986801507e-06, + "loss": 0.8655, + "step": 9412 + }, + { + "epoch": 0.5180802465738346, + "grad_norm": 0.6777701377868652, + "learning_rate": 8.456986831131742e-06, + "loss": 0.7737, + "step": 9413 + }, + { + "epoch": 0.5181352853761902, + "grad_norm": 0.9363510012626648, + "learning_rate": 8.456673649480191e-06, + "loss": 0.8227, + "step": 9414 + }, + { + "epoch": 0.5181903241785458, + "grad_norm": 0.798001229763031, + "learning_rate": 8.456360441849206e-06, + "loss": 0.8881, + "step": 9415 + }, + { + "epoch": 0.5182453629809015, + "grad_norm": 0.7212072610855103, + "learning_rate": 8.456047208241141e-06, + "loss": 0.8165, + "step": 9416 + }, + { + "epoch": 0.5183004017832572, + "grad_norm": 0.6918027997016907, + "learning_rate": 8.45573394865835e-06, + "loss": 0.8048, + "step": 9417 + }, + { + "epoch": 0.5183554405856129, + "grad_norm": 0.6474916338920593, + "learning_rate": 8.455420663103187e-06, + "loss": 0.6502, + "step": 9418 + }, + { + "epoch": 0.5184104793879685, + "grad_norm": 0.6592364311218262, + "learning_rate": 8.455107351578008e-06, + "loss": 0.7509, + "step": 9419 + }, + { + "epoch": 0.5184655181903242, + "grad_norm": 0.7658745646476746, + "learning_rate": 8.454794014085168e-06, + "loss": 0.8444, + "step": 9420 + }, + { + "epoch": 0.5185205569926798, + "grad_norm": 0.6814215183258057, + "learning_rate": 8.45448065062702e-06, + "loss": 0.7367, + "step": 9421 + }, + { + "epoch": 0.5185755957950355, + "grad_norm": 0.644740104675293, + "learning_rate": 8.45416726120592e-06, + "loss": 0.7456, + "step": 9422 + }, + { + "epoch": 0.5186306345973911, + "grad_norm": 0.8578751087188721, + "learning_rate": 8.453853845824225e-06, + "loss": 0.8481, + "step": 9423 + }, + { + "epoch": 0.5186856733997468, + "grad_norm": 0.6630389094352722, + "learning_rate": 8.453540404484288e-06, + "loss": 0.7487, + "step": 9424 + }, + { + "epoch": 0.5187407122021025, + "grad_norm": 0.7756431698799133, + "learning_rate": 8.453226937188466e-06, + "loss": 0.798, + "step": 9425 + }, + { + "epoch": 0.5187957510044582, + "grad_norm": 0.7856318354606628, + "learning_rate": 8.452913443939113e-06, + "loss": 0.785, + "step": 9426 + }, + { + "epoch": 0.5188507898068138, + "grad_norm": 0.7563977837562561, + "learning_rate": 8.45259992473859e-06, + "loss": 0.8182, + "step": 9427 + }, + { + "epoch": 0.5189058286091695, + "grad_norm": 0.6945043802261353, + "learning_rate": 8.452286379589247e-06, + "loss": 0.7262, + "step": 9428 + }, + { + "epoch": 0.5189608674115251, + "grad_norm": 0.6607717275619507, + "learning_rate": 8.451972808493444e-06, + "loss": 0.7257, + "step": 9429 + }, + { + "epoch": 0.5190159062138808, + "grad_norm": 0.6682843565940857, + "learning_rate": 8.451659211453539e-06, + "loss": 0.6775, + "step": 9430 + }, + { + "epoch": 0.5190709450162364, + "grad_norm": 0.7175559401512146, + "learning_rate": 8.451345588471886e-06, + "loss": 0.7154, + "step": 9431 + }, + { + "epoch": 0.5191259838185921, + "grad_norm": 0.7499119639396667, + "learning_rate": 8.451031939550845e-06, + "loss": 0.7537, + "step": 9432 + }, + { + "epoch": 0.5191810226209478, + "grad_norm": 0.65048748254776, + "learning_rate": 8.450718264692771e-06, + "loss": 0.7253, + "step": 9433 + }, + { + "epoch": 0.5192360614233035, + "grad_norm": 0.7067640423774719, + "learning_rate": 8.450404563900022e-06, + "loss": 0.7245, + "step": 9434 + }, + { + "epoch": 0.519291100225659, + "grad_norm": 0.7079932689666748, + "learning_rate": 8.450090837174956e-06, + "loss": 0.7776, + "step": 9435 + }, + { + "epoch": 0.5193461390280147, + "grad_norm": 0.8260107636451721, + "learning_rate": 8.44977708451993e-06, + "loss": 0.8529, + "step": 9436 + }, + { + "epoch": 0.5194011778303704, + "grad_norm": 0.6412167549133301, + "learning_rate": 8.449463305937304e-06, + "loss": 0.7371, + "step": 9437 + }, + { + "epoch": 0.5194562166327261, + "grad_norm": 0.7067576050758362, + "learning_rate": 8.449149501429435e-06, + "loss": 0.7161, + "step": 9438 + }, + { + "epoch": 0.5195112554350817, + "grad_norm": 0.6966904997825623, + "learning_rate": 8.448835670998681e-06, + "loss": 0.7285, + "step": 9439 + }, + { + "epoch": 0.5195662942374374, + "grad_norm": 0.8066132664680481, + "learning_rate": 8.448521814647401e-06, + "loss": 0.8265, + "step": 9440 + }, + { + "epoch": 0.5196213330397931, + "grad_norm": 0.7597149610519409, + "learning_rate": 8.448207932377957e-06, + "loss": 0.7721, + "step": 9441 + }, + { + "epoch": 0.5196763718421488, + "grad_norm": 0.6965302228927612, + "learning_rate": 8.447894024192702e-06, + "loss": 0.749, + "step": 9442 + }, + { + "epoch": 0.5197314106445043, + "grad_norm": 0.7032600045204163, + "learning_rate": 8.447580090094e-06, + "loss": 0.7923, + "step": 9443 + }, + { + "epoch": 0.51978644944686, + "grad_norm": 0.7255309820175171, + "learning_rate": 8.447266130084208e-06, + "loss": 0.6739, + "step": 9444 + }, + { + "epoch": 0.5198414882492157, + "grad_norm": 0.6602993011474609, + "learning_rate": 8.446952144165686e-06, + "loss": 0.7886, + "step": 9445 + }, + { + "epoch": 0.5198965270515714, + "grad_norm": 0.7017884850502014, + "learning_rate": 8.446638132340796e-06, + "loss": 0.7554, + "step": 9446 + }, + { + "epoch": 0.519951565853927, + "grad_norm": 0.7234843969345093, + "learning_rate": 8.446324094611894e-06, + "loss": 0.8294, + "step": 9447 + }, + { + "epoch": 0.5200066046562827, + "grad_norm": 0.6859332919120789, + "learning_rate": 8.446010030981347e-06, + "loss": 0.7563, + "step": 9448 + }, + { + "epoch": 0.5200616434586384, + "grad_norm": 0.7759458422660828, + "learning_rate": 8.445695941451507e-06, + "loss": 0.7577, + "step": 9449 + }, + { + "epoch": 0.520116682260994, + "grad_norm": 0.7852263450622559, + "learning_rate": 8.44538182602474e-06, + "loss": 0.7446, + "step": 9450 + }, + { + "epoch": 0.5201717210633496, + "grad_norm": 0.8143053650856018, + "learning_rate": 8.445067684703406e-06, + "loss": 0.7995, + "step": 9451 + }, + { + "epoch": 0.5202267598657053, + "grad_norm": 0.692738950252533, + "learning_rate": 8.444753517489865e-06, + "loss": 0.7185, + "step": 9452 + }, + { + "epoch": 0.520281798668061, + "grad_norm": 0.6615390181541443, + "learning_rate": 8.444439324386478e-06, + "loss": 0.7128, + "step": 9453 + }, + { + "epoch": 0.5203368374704166, + "grad_norm": 0.7360419034957886, + "learning_rate": 8.444125105395608e-06, + "loss": 0.6565, + "step": 9454 + }, + { + "epoch": 0.5203918762727723, + "grad_norm": 0.7280182838439941, + "learning_rate": 8.443810860519615e-06, + "loss": 0.7295, + "step": 9455 + }, + { + "epoch": 0.520446915075128, + "grad_norm": 0.787367582321167, + "learning_rate": 8.44349658976086e-06, + "loss": 0.7342, + "step": 9456 + }, + { + "epoch": 0.5205019538774837, + "grad_norm": 0.7496024966239929, + "learning_rate": 8.44318229312171e-06, + "loss": 0.7499, + "step": 9457 + }, + { + "epoch": 0.5205569926798392, + "grad_norm": 0.9167383909225464, + "learning_rate": 8.44286797060452e-06, + "loss": 0.7797, + "step": 9458 + }, + { + "epoch": 0.5206120314821949, + "grad_norm": 0.7032341957092285, + "learning_rate": 8.442553622211659e-06, + "loss": 0.7627, + "step": 9459 + }, + { + "epoch": 0.5206670702845506, + "grad_norm": 1.2905993461608887, + "learning_rate": 8.442239247945485e-06, + "loss": 0.7841, + "step": 9460 + }, + { + "epoch": 0.5207221090869063, + "grad_norm": 0.6909230351448059, + "learning_rate": 8.441924847808362e-06, + "loss": 0.7234, + "step": 9461 + }, + { + "epoch": 0.5207771478892619, + "grad_norm": 0.6632175445556641, + "learning_rate": 8.441610421802653e-06, + "loss": 0.6733, + "step": 9462 + }, + { + "epoch": 0.5208321866916176, + "grad_norm": 0.7838154435157776, + "learning_rate": 8.441295969930722e-06, + "loss": 0.7583, + "step": 9463 + }, + { + "epoch": 0.5208872254939733, + "grad_norm": 0.6380481123924255, + "learning_rate": 8.440981492194932e-06, + "loss": 0.7109, + "step": 9464 + }, + { + "epoch": 0.520942264296329, + "grad_norm": 0.6859052181243896, + "learning_rate": 8.440666988597646e-06, + "loss": 0.7387, + "step": 9465 + }, + { + "epoch": 0.5209973030986845, + "grad_norm": 0.7411379814147949, + "learning_rate": 8.440352459141226e-06, + "loss": 0.7852, + "step": 9466 + }, + { + "epoch": 0.5210523419010402, + "grad_norm": 0.6925216913223267, + "learning_rate": 8.44003790382804e-06, + "loss": 0.8228, + "step": 9467 + }, + { + "epoch": 0.5211073807033959, + "grad_norm": 0.7136396169662476, + "learning_rate": 8.43972332266045e-06, + "loss": 0.8168, + "step": 9468 + }, + { + "epoch": 0.5211624195057516, + "grad_norm": 0.719639003276825, + "learning_rate": 8.43940871564082e-06, + "loss": 0.6728, + "step": 9469 + }, + { + "epoch": 0.5212174583081072, + "grad_norm": 0.647861897945404, + "learning_rate": 8.439094082771513e-06, + "loss": 0.6986, + "step": 9470 + }, + { + "epoch": 0.5212724971104629, + "grad_norm": 0.6644579172134399, + "learning_rate": 8.438779424054897e-06, + "loss": 0.6263, + "step": 9471 + }, + { + "epoch": 0.5213275359128186, + "grad_norm": 0.7157352566719055, + "learning_rate": 8.438464739493335e-06, + "loss": 0.827, + "step": 9472 + }, + { + "epoch": 0.5213825747151742, + "grad_norm": 0.793765127658844, + "learning_rate": 8.438150029089193e-06, + "loss": 0.741, + "step": 9473 + }, + { + "epoch": 0.5214376135175298, + "grad_norm": 0.7078518867492676, + "learning_rate": 8.437835292844836e-06, + "loss": 0.7618, + "step": 9474 + }, + { + "epoch": 0.5214926523198855, + "grad_norm": 0.7492140531539917, + "learning_rate": 8.437520530762628e-06, + "loss": 0.7894, + "step": 9475 + }, + { + "epoch": 0.5215476911222412, + "grad_norm": 0.6534473299980164, + "learning_rate": 8.437205742844937e-06, + "loss": 0.7567, + "step": 9476 + }, + { + "epoch": 0.5216027299245969, + "grad_norm": 0.8745388984680176, + "learning_rate": 8.436890929094126e-06, + "loss": 0.8758, + "step": 9477 + }, + { + "epoch": 0.5216577687269525, + "grad_norm": 0.6804752349853516, + "learning_rate": 8.436576089512564e-06, + "loss": 0.7841, + "step": 9478 + }, + { + "epoch": 0.5217128075293082, + "grad_norm": 0.712065577507019, + "learning_rate": 8.436261224102615e-06, + "loss": 0.8079, + "step": 9479 + }, + { + "epoch": 0.5217678463316638, + "grad_norm": 0.8733783960342407, + "learning_rate": 8.435946332866648e-06, + "loss": 0.8295, + "step": 9480 + }, + { + "epoch": 0.5218228851340195, + "grad_norm": 0.6871289610862732, + "learning_rate": 8.435631415807028e-06, + "loss": 0.7087, + "step": 9481 + }, + { + "epoch": 0.5218779239363751, + "grad_norm": 0.8363185524940491, + "learning_rate": 8.43531647292612e-06, + "loss": 0.7329, + "step": 9482 + }, + { + "epoch": 0.5219329627387308, + "grad_norm": 0.6845195293426514, + "learning_rate": 8.435001504226295e-06, + "loss": 0.7651, + "step": 9483 + }, + { + "epoch": 0.5219880015410865, + "grad_norm": 0.7527645826339722, + "learning_rate": 8.434686509709917e-06, + "loss": 0.6856, + "step": 9484 + }, + { + "epoch": 0.5220430403434422, + "grad_norm": 0.6945710778236389, + "learning_rate": 8.434371489379356e-06, + "loss": 0.6875, + "step": 9485 + }, + { + "epoch": 0.5220980791457978, + "grad_norm": 0.7668873071670532, + "learning_rate": 8.434056443236977e-06, + "loss": 0.7662, + "step": 9486 + }, + { + "epoch": 0.5221531179481534, + "grad_norm": 0.9873473048210144, + "learning_rate": 8.433741371285148e-06, + "loss": 0.7662, + "step": 9487 + }, + { + "epoch": 0.5222081567505091, + "grad_norm": 0.8635447025299072, + "learning_rate": 8.43342627352624e-06, + "loss": 0.645, + "step": 9488 + }, + { + "epoch": 0.5222631955528648, + "grad_norm": 0.7836978435516357, + "learning_rate": 8.43311114996262e-06, + "loss": 0.7647, + "step": 9489 + }, + { + "epoch": 0.5223182343552204, + "grad_norm": 0.8370835185050964, + "learning_rate": 8.432796000596652e-06, + "loss": 0.8402, + "step": 9490 + }, + { + "epoch": 0.5223732731575761, + "grad_norm": 0.9627843499183655, + "learning_rate": 8.432480825430712e-06, + "loss": 0.6985, + "step": 9491 + }, + { + "epoch": 0.5224283119599318, + "grad_norm": 0.6774263978004456, + "learning_rate": 8.432165624467163e-06, + "loss": 0.7051, + "step": 9492 + }, + { + "epoch": 0.5224833507622874, + "grad_norm": 0.6590597033500671, + "learning_rate": 8.431850397708375e-06, + "loss": 0.7147, + "step": 9493 + }, + { + "epoch": 0.522538389564643, + "grad_norm": 0.8153522610664368, + "learning_rate": 8.43153514515672e-06, + "loss": 0.6759, + "step": 9494 + }, + { + "epoch": 0.5225934283669987, + "grad_norm": 0.7457708716392517, + "learning_rate": 8.431219866814563e-06, + "loss": 0.7168, + "step": 9495 + }, + { + "epoch": 0.5226484671693544, + "grad_norm": 0.6994161009788513, + "learning_rate": 8.430904562684278e-06, + "loss": 0.8393, + "step": 9496 + }, + { + "epoch": 0.52270350597171, + "grad_norm": 0.780337393283844, + "learning_rate": 8.430589232768232e-06, + "loss": 0.6528, + "step": 9497 + }, + { + "epoch": 0.5227585447740657, + "grad_norm": 0.6833232641220093, + "learning_rate": 8.430273877068796e-06, + "loss": 0.7545, + "step": 9498 + }, + { + "epoch": 0.5228135835764214, + "grad_norm": 0.7330057621002197, + "learning_rate": 8.42995849558834e-06, + "loss": 0.7932, + "step": 9499 + }, + { + "epoch": 0.5228686223787771, + "grad_norm": 0.8131541609764099, + "learning_rate": 8.429643088329233e-06, + "loss": 0.7546, + "step": 9500 + }, + { + "epoch": 0.5229236611811326, + "grad_norm": 0.7353833317756653, + "learning_rate": 8.42932765529385e-06, + "loss": 0.7508, + "step": 9501 + }, + { + "epoch": 0.5229786999834883, + "grad_norm": 0.7166246771812439, + "learning_rate": 8.429012196484554e-06, + "loss": 0.728, + "step": 9502 + }, + { + "epoch": 0.523033738785844, + "grad_norm": 0.732064962387085, + "learning_rate": 8.428696711903721e-06, + "loss": 0.8306, + "step": 9503 + }, + { + "epoch": 0.5230887775881997, + "grad_norm": 0.6858934164047241, + "learning_rate": 8.428381201553721e-06, + "loss": 0.7801, + "step": 9504 + }, + { + "epoch": 0.5231438163905553, + "grad_norm": 0.7046478986740112, + "learning_rate": 8.428065665436928e-06, + "loss": 0.7365, + "step": 9505 + }, + { + "epoch": 0.523198855192911, + "grad_norm": 0.6669325828552246, + "learning_rate": 8.42775010355571e-06, + "loss": 0.7764, + "step": 9506 + }, + { + "epoch": 0.5232538939952667, + "grad_norm": 0.655619740486145, + "learning_rate": 8.427434515912438e-06, + "loss": 0.7919, + "step": 9507 + }, + { + "epoch": 0.5233089327976224, + "grad_norm": 0.6236690878868103, + "learning_rate": 8.427118902509487e-06, + "loss": 0.6653, + "step": 9508 + }, + { + "epoch": 0.5233639715999779, + "grad_norm": 0.8233165740966797, + "learning_rate": 8.426803263349228e-06, + "loss": 0.8012, + "step": 9509 + }, + { + "epoch": 0.5234190104023336, + "grad_norm": 0.6626759171485901, + "learning_rate": 8.426487598434035e-06, + "loss": 0.7728, + "step": 9510 + }, + { + "epoch": 0.5234740492046893, + "grad_norm": 0.9209974408149719, + "learning_rate": 8.426171907766275e-06, + "loss": 0.769, + "step": 9511 + }, + { + "epoch": 0.523529088007045, + "grad_norm": 0.6297587156295776, + "learning_rate": 8.425856191348325e-06, + "loss": 0.7333, + "step": 9512 + }, + { + "epoch": 0.5235841268094006, + "grad_norm": 0.6995256543159485, + "learning_rate": 8.425540449182558e-06, + "loss": 0.7486, + "step": 9513 + }, + { + "epoch": 0.5236391656117563, + "grad_norm": 0.8076607584953308, + "learning_rate": 8.425224681271345e-06, + "loss": 0.8533, + "step": 9514 + }, + { + "epoch": 0.523694204414112, + "grad_norm": 1.2198601961135864, + "learning_rate": 8.42490888761706e-06, + "loss": 0.7291, + "step": 9515 + }, + { + "epoch": 0.5237492432164677, + "grad_norm": 0.7047159671783447, + "learning_rate": 8.424593068222076e-06, + "loss": 0.713, + "step": 9516 + }, + { + "epoch": 0.5238042820188232, + "grad_norm": 0.7652333378791809, + "learning_rate": 8.424277223088768e-06, + "loss": 0.8149, + "step": 9517 + }, + { + "epoch": 0.5238593208211789, + "grad_norm": 1.1311010122299194, + "learning_rate": 8.42396135221951e-06, + "loss": 0.8195, + "step": 9518 + }, + { + "epoch": 0.5239143596235346, + "grad_norm": 0.7855533957481384, + "learning_rate": 8.423645455616674e-06, + "loss": 0.7901, + "step": 9519 + }, + { + "epoch": 0.5239693984258903, + "grad_norm": 0.7028971314430237, + "learning_rate": 8.423329533282635e-06, + "loss": 0.8006, + "step": 9520 + }, + { + "epoch": 0.5240244372282459, + "grad_norm": 0.703809916973114, + "learning_rate": 8.423013585219769e-06, + "loss": 0.7581, + "step": 9521 + }, + { + "epoch": 0.5240794760306016, + "grad_norm": 0.94233238697052, + "learning_rate": 8.422697611430448e-06, + "loss": 0.7689, + "step": 9522 + }, + { + "epoch": 0.5241345148329573, + "grad_norm": 0.8164071440696716, + "learning_rate": 8.422381611917047e-06, + "loss": 0.8761, + "step": 9523 + }, + { + "epoch": 0.5241895536353129, + "grad_norm": 0.6242091059684753, + "learning_rate": 8.422065586681944e-06, + "loss": 0.6975, + "step": 9524 + }, + { + "epoch": 0.5242445924376685, + "grad_norm": 0.6607261300086975, + "learning_rate": 8.42174953572751e-06, + "loss": 0.6847, + "step": 9525 + }, + { + "epoch": 0.5242996312400242, + "grad_norm": 0.7174261212348938, + "learning_rate": 8.421433459056123e-06, + "loss": 0.7905, + "step": 9526 + }, + { + "epoch": 0.5243546700423799, + "grad_norm": 0.7414089441299438, + "learning_rate": 8.42111735667016e-06, + "loss": 0.7788, + "step": 9527 + }, + { + "epoch": 0.5244097088447356, + "grad_norm": 0.7347442507743835, + "learning_rate": 8.420801228571992e-06, + "loss": 0.7691, + "step": 9528 + }, + { + "epoch": 0.5244647476470912, + "grad_norm": 0.6947832107543945, + "learning_rate": 8.420485074763999e-06, + "loss": 0.6702, + "step": 9529 + }, + { + "epoch": 0.5245197864494469, + "grad_norm": 0.6865423321723938, + "learning_rate": 8.420168895248557e-06, + "loss": 0.7577, + "step": 9530 + }, + { + "epoch": 0.5245748252518025, + "grad_norm": 0.7023190855979919, + "learning_rate": 8.419852690028039e-06, + "loss": 0.7711, + "step": 9531 + }, + { + "epoch": 0.5246298640541582, + "grad_norm": 0.8312145471572876, + "learning_rate": 8.419536459104824e-06, + "loss": 0.7999, + "step": 9532 + }, + { + "epoch": 0.5246849028565138, + "grad_norm": 0.6700688600540161, + "learning_rate": 8.419220202481288e-06, + "loss": 0.7163, + "step": 9533 + }, + { + "epoch": 0.5247399416588695, + "grad_norm": 0.767062246799469, + "learning_rate": 8.418903920159809e-06, + "loss": 0.7451, + "step": 9534 + }, + { + "epoch": 0.5247949804612252, + "grad_norm": 0.6814010143280029, + "learning_rate": 8.418587612142763e-06, + "loss": 0.771, + "step": 9535 + }, + { + "epoch": 0.5248500192635808, + "grad_norm": 0.6728426218032837, + "learning_rate": 8.418271278432528e-06, + "loss": 0.8336, + "step": 9536 + }, + { + "epoch": 0.5249050580659365, + "grad_norm": 0.7112382650375366, + "learning_rate": 8.417954919031482e-06, + "loss": 0.7392, + "step": 9537 + }, + { + "epoch": 0.5249600968682921, + "grad_norm": 0.7371365427970886, + "learning_rate": 8.417638533942e-06, + "loss": 0.8233, + "step": 9538 + }, + { + "epoch": 0.5250151356706478, + "grad_norm": 0.6593502163887024, + "learning_rate": 8.41732212316646e-06, + "loss": 0.7455, + "step": 9539 + }, + { + "epoch": 0.5250701744730034, + "grad_norm": 0.685553252696991, + "learning_rate": 8.417005686707245e-06, + "loss": 0.7783, + "step": 9540 + }, + { + "epoch": 0.5251252132753591, + "grad_norm": 0.7003353238105774, + "learning_rate": 8.41668922456673e-06, + "loss": 0.7733, + "step": 9541 + }, + { + "epoch": 0.5251802520777148, + "grad_norm": 0.7602891325950623, + "learning_rate": 8.416372736747292e-06, + "loss": 0.7236, + "step": 9542 + }, + { + "epoch": 0.5252352908800705, + "grad_norm": 0.647531270980835, + "learning_rate": 8.41605622325131e-06, + "loss": 0.7388, + "step": 9543 + }, + { + "epoch": 0.5252903296824261, + "grad_norm": 0.7309756875038147, + "learning_rate": 8.415739684081165e-06, + "loss": 0.7178, + "step": 9544 + }, + { + "epoch": 0.5253453684847817, + "grad_norm": 0.6991532444953918, + "learning_rate": 8.415423119239236e-06, + "loss": 0.8078, + "step": 9545 + }, + { + "epoch": 0.5254004072871374, + "grad_norm": 0.7392330765724182, + "learning_rate": 8.4151065287279e-06, + "loss": 0.8452, + "step": 9546 + }, + { + "epoch": 0.5254554460894931, + "grad_norm": 0.7617329955101013, + "learning_rate": 8.414789912549537e-06, + "loss": 0.7885, + "step": 9547 + }, + { + "epoch": 0.5255104848918487, + "grad_norm": 1.160125732421875, + "learning_rate": 8.414473270706527e-06, + "loss": 0.9628, + "step": 9548 + }, + { + "epoch": 0.5255655236942044, + "grad_norm": 0.7578685879707336, + "learning_rate": 8.414156603201252e-06, + "loss": 0.7745, + "step": 9549 + }, + { + "epoch": 0.5256205624965601, + "grad_norm": 0.6963017582893372, + "learning_rate": 8.413839910036089e-06, + "loss": 0.7693, + "step": 9550 + }, + { + "epoch": 0.5256756012989158, + "grad_norm": 0.6631398797035217, + "learning_rate": 8.413523191213415e-06, + "loss": 0.6606, + "step": 9551 + }, + { + "epoch": 0.5257306401012714, + "grad_norm": 0.707343339920044, + "learning_rate": 8.41320644673562e-06, + "loss": 0.7161, + "step": 9552 + }, + { + "epoch": 0.525785678903627, + "grad_norm": 0.833448588848114, + "learning_rate": 8.412889676605075e-06, + "loss": 0.7509, + "step": 9553 + }, + { + "epoch": 0.5258407177059827, + "grad_norm": 0.6214264631271362, + "learning_rate": 8.412572880824168e-06, + "loss": 0.7436, + "step": 9554 + }, + { + "epoch": 0.5258957565083384, + "grad_norm": 0.6479233503341675, + "learning_rate": 8.412256059395274e-06, + "loss": 0.7359, + "step": 9555 + }, + { + "epoch": 0.525950795310694, + "grad_norm": 0.7596501111984253, + "learning_rate": 8.411939212320778e-06, + "loss": 0.7422, + "step": 9556 + }, + { + "epoch": 0.5260058341130497, + "grad_norm": 0.8040934205055237, + "learning_rate": 8.41162233960306e-06, + "loss": 0.7721, + "step": 9557 + }, + { + "epoch": 0.5260608729154054, + "grad_norm": 0.7190027832984924, + "learning_rate": 8.411305441244505e-06, + "loss": 0.8794, + "step": 9558 + }, + { + "epoch": 0.5261159117177611, + "grad_norm": 0.8002649545669556, + "learning_rate": 8.410988517247486e-06, + "loss": 0.7958, + "step": 9559 + }, + { + "epoch": 0.5261709505201166, + "grad_norm": 0.7151750326156616, + "learning_rate": 8.410671567614394e-06, + "loss": 0.7597, + "step": 9560 + }, + { + "epoch": 0.5262259893224723, + "grad_norm": 0.9718102812767029, + "learning_rate": 8.410354592347607e-06, + "loss": 0.8272, + "step": 9561 + }, + { + "epoch": 0.526281028124828, + "grad_norm": 0.701932966709137, + "learning_rate": 8.410037591449506e-06, + "loss": 0.808, + "step": 9562 + }, + { + "epoch": 0.5263360669271837, + "grad_norm": 0.8247585296630859, + "learning_rate": 8.409720564922476e-06, + "loss": 0.7598, + "step": 9563 + }, + { + "epoch": 0.5263911057295393, + "grad_norm": 0.7305104732513428, + "learning_rate": 8.409403512768899e-06, + "loss": 0.8161, + "step": 9564 + }, + { + "epoch": 0.526446144531895, + "grad_norm": 0.8726410865783691, + "learning_rate": 8.409086434991158e-06, + "loss": 0.8598, + "step": 9565 + }, + { + "epoch": 0.5265011833342507, + "grad_norm": 0.7329155802726746, + "learning_rate": 8.408769331591637e-06, + "loss": 0.7355, + "step": 9566 + }, + { + "epoch": 0.5265562221366064, + "grad_norm": 0.8227902054786682, + "learning_rate": 8.408452202572716e-06, + "loss": 0.7888, + "step": 9567 + }, + { + "epoch": 0.5266112609389619, + "grad_norm": 0.7190666794776917, + "learning_rate": 8.408135047936783e-06, + "loss": 0.669, + "step": 9568 + }, + { + "epoch": 0.5266662997413176, + "grad_norm": 0.6529938578605652, + "learning_rate": 8.407817867686217e-06, + "loss": 0.7345, + "step": 9569 + }, + { + "epoch": 0.5267213385436733, + "grad_norm": 0.6985379457473755, + "learning_rate": 8.407500661823407e-06, + "loss": 0.852, + "step": 9570 + }, + { + "epoch": 0.526776377346029, + "grad_norm": 0.7480047345161438, + "learning_rate": 8.407183430350732e-06, + "loss": 0.7422, + "step": 9571 + }, + { + "epoch": 0.5268314161483846, + "grad_norm": 0.7599420547485352, + "learning_rate": 8.406866173270579e-06, + "loss": 0.7499, + "step": 9572 + }, + { + "epoch": 0.5268864549507403, + "grad_norm": 0.813448965549469, + "learning_rate": 8.406548890585331e-06, + "loss": 0.7979, + "step": 9573 + }, + { + "epoch": 0.526941493753096, + "grad_norm": 0.6029278039932251, + "learning_rate": 8.406231582297374e-06, + "loss": 0.7289, + "step": 9574 + }, + { + "epoch": 0.5269965325554516, + "grad_norm": 0.656829297542572, + "learning_rate": 8.40591424840909e-06, + "loss": 0.6778, + "step": 9575 + }, + { + "epoch": 0.5270515713578072, + "grad_norm": 0.7147198915481567, + "learning_rate": 8.405596888922869e-06, + "loss": 0.7212, + "step": 9576 + }, + { + "epoch": 0.5271066101601629, + "grad_norm": 0.7722035050392151, + "learning_rate": 8.405279503841094e-06, + "loss": 0.8008, + "step": 9577 + }, + { + "epoch": 0.5271616489625186, + "grad_norm": 0.6828493475914001, + "learning_rate": 8.40496209316615e-06, + "loss": 0.787, + "step": 9578 + }, + { + "epoch": 0.5272166877648742, + "grad_norm": 0.6965187788009644, + "learning_rate": 8.40464465690042e-06, + "loss": 0.6803, + "step": 9579 + }, + { + "epoch": 0.5272717265672299, + "grad_norm": 0.7300547957420349, + "learning_rate": 8.404327195046293e-06, + "loss": 0.8165, + "step": 9580 + }, + { + "epoch": 0.5273267653695856, + "grad_norm": 0.7367526292800903, + "learning_rate": 8.404009707606153e-06, + "loss": 0.7709, + "step": 9581 + }, + { + "epoch": 0.5273818041719412, + "grad_norm": 0.6694689989089966, + "learning_rate": 8.40369219458239e-06, + "loss": 0.7971, + "step": 9582 + }, + { + "epoch": 0.5274368429742968, + "grad_norm": 0.6723141074180603, + "learning_rate": 8.403374655977384e-06, + "loss": 0.695, + "step": 9583 + }, + { + "epoch": 0.5274918817766525, + "grad_norm": 0.7737089395523071, + "learning_rate": 8.403057091793528e-06, + "loss": 0.7765, + "step": 9584 + }, + { + "epoch": 0.5275469205790082, + "grad_norm": 0.8378487825393677, + "learning_rate": 8.402739502033204e-06, + "loss": 0.7984, + "step": 9585 + }, + { + "epoch": 0.5276019593813639, + "grad_norm": 0.7496509552001953, + "learning_rate": 8.402421886698802e-06, + "loss": 0.7846, + "step": 9586 + }, + { + "epoch": 0.5276569981837195, + "grad_norm": 0.7020435929298401, + "learning_rate": 8.402104245792706e-06, + "loss": 0.8102, + "step": 9587 + }, + { + "epoch": 0.5277120369860752, + "grad_norm": 0.8877277374267578, + "learning_rate": 8.401786579317308e-06, + "loss": 0.6995, + "step": 9588 + }, + { + "epoch": 0.5277670757884309, + "grad_norm": 0.6975196599960327, + "learning_rate": 8.401468887274991e-06, + "loss": 0.7475, + "step": 9589 + }, + { + "epoch": 0.5278221145907865, + "grad_norm": 0.8267357349395752, + "learning_rate": 8.401151169668144e-06, + "loss": 0.7091, + "step": 9590 + }, + { + "epoch": 0.5278771533931421, + "grad_norm": 0.6778179407119751, + "learning_rate": 8.400833426499156e-06, + "loss": 0.8198, + "step": 9591 + }, + { + "epoch": 0.5279321921954978, + "grad_norm": 0.7343330979347229, + "learning_rate": 8.400515657770414e-06, + "loss": 0.7565, + "step": 9592 + }, + { + "epoch": 0.5279872309978535, + "grad_norm": 0.7745271325111389, + "learning_rate": 8.400197863484307e-06, + "loss": 0.7991, + "step": 9593 + }, + { + "epoch": 0.5280422698002092, + "grad_norm": 0.7652345895767212, + "learning_rate": 8.399880043643224e-06, + "loss": 0.7752, + "step": 9594 + }, + { + "epoch": 0.5280973086025648, + "grad_norm": 0.9764432311058044, + "learning_rate": 8.399562198249551e-06, + "loss": 0.784, + "step": 9595 + }, + { + "epoch": 0.5281523474049205, + "grad_norm": 0.6763052940368652, + "learning_rate": 8.399244327305678e-06, + "loss": 0.7695, + "step": 9596 + }, + { + "epoch": 0.5282073862072761, + "grad_norm": 0.7788934111595154, + "learning_rate": 8.398926430813996e-06, + "loss": 0.8152, + "step": 9597 + }, + { + "epoch": 0.5282624250096318, + "grad_norm": 0.8088317513465881, + "learning_rate": 8.398608508776894e-06, + "loss": 0.7751, + "step": 9598 + }, + { + "epoch": 0.5283174638119874, + "grad_norm": 0.6735319495201111, + "learning_rate": 8.398290561196756e-06, + "loss": 0.7305, + "step": 9599 + }, + { + "epoch": 0.5283725026143431, + "grad_norm": 0.7279297113418579, + "learning_rate": 8.39797258807598e-06, + "loss": 0.7381, + "step": 9600 + }, + { + "epoch": 0.5284275414166988, + "grad_norm": 0.74604332447052, + "learning_rate": 8.39765458941695e-06, + "loss": 0.8138, + "step": 9601 + }, + { + "epoch": 0.5284825802190545, + "grad_norm": 0.7735850214958191, + "learning_rate": 8.397336565222057e-06, + "loss": 0.7364, + "step": 9602 + }, + { + "epoch": 0.52853761902141, + "grad_norm": 0.7890003323554993, + "learning_rate": 8.397018515493693e-06, + "loss": 0.8301, + "step": 9603 + }, + { + "epoch": 0.5285926578237657, + "grad_norm": 0.739054262638092, + "learning_rate": 8.396700440234245e-06, + "loss": 0.7503, + "step": 9604 + }, + { + "epoch": 0.5286476966261214, + "grad_norm": 0.7611023783683777, + "learning_rate": 8.396382339446108e-06, + "loss": 0.7225, + "step": 9605 + }, + { + "epoch": 0.5287027354284771, + "grad_norm": 0.770602285861969, + "learning_rate": 8.39606421313167e-06, + "loss": 0.71, + "step": 9606 + }, + { + "epoch": 0.5287577742308327, + "grad_norm": 0.7495261430740356, + "learning_rate": 8.395746061293322e-06, + "loss": 0.7729, + "step": 9607 + }, + { + "epoch": 0.5288128130331884, + "grad_norm": 0.7159668207168579, + "learning_rate": 8.395427883933456e-06, + "loss": 0.8457, + "step": 9608 + }, + { + "epoch": 0.5288678518355441, + "grad_norm": 0.7663426399230957, + "learning_rate": 8.395109681054463e-06, + "loss": 0.784, + "step": 9609 + }, + { + "epoch": 0.5289228906378998, + "grad_norm": 0.7271933555603027, + "learning_rate": 8.394791452658732e-06, + "loss": 0.7981, + "step": 9610 + }, + { + "epoch": 0.5289779294402553, + "grad_norm": 0.7782096266746521, + "learning_rate": 8.394473198748661e-06, + "loss": 0.7953, + "step": 9611 + }, + { + "epoch": 0.529032968242611, + "grad_norm": 0.8318955302238464, + "learning_rate": 8.394154919326636e-06, + "loss": 0.6875, + "step": 9612 + }, + { + "epoch": 0.5290880070449667, + "grad_norm": 0.7402167916297913, + "learning_rate": 8.393836614395051e-06, + "loss": 0.7805, + "step": 9613 + }, + { + "epoch": 0.5291430458473224, + "grad_norm": 0.6314370632171631, + "learning_rate": 8.393518283956299e-06, + "loss": 0.6841, + "step": 9614 + }, + { + "epoch": 0.529198084649678, + "grad_norm": 0.8387365937232971, + "learning_rate": 8.393199928012772e-06, + "loss": 0.8503, + "step": 9615 + }, + { + "epoch": 0.5292531234520337, + "grad_norm": 0.7066243886947632, + "learning_rate": 8.392881546566863e-06, + "loss": 0.8494, + "step": 9616 + }, + { + "epoch": 0.5293081622543894, + "grad_norm": 0.7034226059913635, + "learning_rate": 8.392563139620964e-06, + "loss": 0.7335, + "step": 9617 + }, + { + "epoch": 0.5293632010567451, + "grad_norm": 0.6969622373580933, + "learning_rate": 8.392244707177468e-06, + "loss": 0.7203, + "step": 9618 + }, + { + "epoch": 0.5294182398591006, + "grad_norm": 0.7694050073623657, + "learning_rate": 8.391926249238768e-06, + "loss": 0.7864, + "step": 9619 + }, + { + "epoch": 0.5294732786614563, + "grad_norm": 0.7284281253814697, + "learning_rate": 8.391607765807262e-06, + "loss": 0.6704, + "step": 9620 + }, + { + "epoch": 0.529528317463812, + "grad_norm": 1.0466688871383667, + "learning_rate": 8.391289256885337e-06, + "loss": 0.7807, + "step": 9621 + }, + { + "epoch": 0.5295833562661676, + "grad_norm": 0.7118388414382935, + "learning_rate": 8.39097072247539e-06, + "loss": 0.738, + "step": 9622 + }, + { + "epoch": 0.5296383950685233, + "grad_norm": 0.794377863407135, + "learning_rate": 8.390652162579815e-06, + "loss": 0.6831, + "step": 9623 + }, + { + "epoch": 0.529693433870879, + "grad_norm": 0.6042492389678955, + "learning_rate": 8.390333577201007e-06, + "loss": 0.6773, + "step": 9624 + }, + { + "epoch": 0.5297484726732347, + "grad_norm": 0.6452521681785583, + "learning_rate": 8.390014966341357e-06, + "loss": 0.7168, + "step": 9625 + }, + { + "epoch": 0.5298035114755902, + "grad_norm": 0.7113651633262634, + "learning_rate": 8.389696330003265e-06, + "loss": 0.709, + "step": 9626 + }, + { + "epoch": 0.5298585502779459, + "grad_norm": 0.6469250917434692, + "learning_rate": 8.38937766818912e-06, + "loss": 0.6804, + "step": 9627 + }, + { + "epoch": 0.5299135890803016, + "grad_norm": 0.7529417872428894, + "learning_rate": 8.389058980901322e-06, + "loss": 0.8537, + "step": 9628 + }, + { + "epoch": 0.5299686278826573, + "grad_norm": 0.7681186199188232, + "learning_rate": 8.388740268142262e-06, + "loss": 0.7383, + "step": 9629 + }, + { + "epoch": 0.5300236666850129, + "grad_norm": 0.6585648655891418, + "learning_rate": 8.388421529914337e-06, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 0.5300787054873686, + "grad_norm": 0.7432085871696472, + "learning_rate": 8.388102766219943e-06, + "loss": 0.7391, + "step": 9631 + }, + { + "epoch": 0.5301337442897243, + "grad_norm": 0.6672815084457397, + "learning_rate": 8.387783977061476e-06, + "loss": 0.8056, + "step": 9632 + }, + { + "epoch": 0.53018878309208, + "grad_norm": 0.7566675543785095, + "learning_rate": 8.387465162441332e-06, + "loss": 0.7858, + "step": 9633 + }, + { + "epoch": 0.5302438218944355, + "grad_norm": 0.6522077322006226, + "learning_rate": 8.387146322361907e-06, + "loss": 0.759, + "step": 9634 + }, + { + "epoch": 0.5302988606967912, + "grad_norm": 0.7246397137641907, + "learning_rate": 8.386827456825597e-06, + "loss": 0.8158, + "step": 9635 + }, + { + "epoch": 0.5303538994991469, + "grad_norm": 0.7577807307243347, + "learning_rate": 8.386508565834797e-06, + "loss": 0.7495, + "step": 9636 + }, + { + "epoch": 0.5304089383015026, + "grad_norm": 0.7080703973770142, + "learning_rate": 8.386189649391906e-06, + "loss": 0.8086, + "step": 9637 + }, + { + "epoch": 0.5304639771038582, + "grad_norm": 0.7505277395248413, + "learning_rate": 8.385870707499321e-06, + "loss": 0.7206, + "step": 9638 + }, + { + "epoch": 0.5305190159062139, + "grad_norm": 0.7044165134429932, + "learning_rate": 8.385551740159437e-06, + "loss": 0.7838, + "step": 9639 + }, + { + "epoch": 0.5305740547085696, + "grad_norm": 0.7921645641326904, + "learning_rate": 8.385232747374652e-06, + "loss": 0.7604, + "step": 9640 + }, + { + "epoch": 0.5306290935109252, + "grad_norm": 0.9930111169815063, + "learning_rate": 8.384913729147364e-06, + "loss": 0.7839, + "step": 9641 + }, + { + "epoch": 0.5306841323132808, + "grad_norm": 0.7333244681358337, + "learning_rate": 8.38459468547997e-06, + "loss": 0.7941, + "step": 9642 + }, + { + "epoch": 0.5307391711156365, + "grad_norm": 0.7857590913772583, + "learning_rate": 8.384275616374868e-06, + "loss": 0.8535, + "step": 9643 + }, + { + "epoch": 0.5307942099179922, + "grad_norm": 0.8568746447563171, + "learning_rate": 8.383956521834459e-06, + "loss": 0.6586, + "step": 9644 + }, + { + "epoch": 0.5308492487203479, + "grad_norm": 0.7061276435852051, + "learning_rate": 8.383637401861136e-06, + "loss": 0.7288, + "step": 9645 + }, + { + "epoch": 0.5309042875227035, + "grad_norm": 0.7348940968513489, + "learning_rate": 8.383318256457303e-06, + "loss": 0.8099, + "step": 9646 + }, + { + "epoch": 0.5309593263250592, + "grad_norm": 0.6526725888252258, + "learning_rate": 8.382999085625353e-06, + "loss": 0.6702, + "step": 9647 + }, + { + "epoch": 0.5310143651274148, + "grad_norm": 0.8122747540473938, + "learning_rate": 8.382679889367687e-06, + "loss": 0.67, + "step": 9648 + }, + { + "epoch": 0.5310694039297705, + "grad_norm": 0.9145376682281494, + "learning_rate": 8.382360667686706e-06, + "loss": 0.7719, + "step": 9649 + }, + { + "epoch": 0.5311244427321261, + "grad_norm": 0.6659818887710571, + "learning_rate": 8.382041420584807e-06, + "loss": 0.806, + "step": 9650 + }, + { + "epoch": 0.5311794815344818, + "grad_norm": 0.7088539004325867, + "learning_rate": 8.381722148064391e-06, + "loss": 0.7046, + "step": 9651 + }, + { + "epoch": 0.5312345203368375, + "grad_norm": 0.8610590696334839, + "learning_rate": 8.381402850127854e-06, + "loss": 0.6998, + "step": 9652 + }, + { + "epoch": 0.5312895591391932, + "grad_norm": 0.775830864906311, + "learning_rate": 8.3810835267776e-06, + "loss": 0.8874, + "step": 9653 + }, + { + "epoch": 0.5313445979415488, + "grad_norm": 0.6871606707572937, + "learning_rate": 8.380764178016028e-06, + "loss": 0.7903, + "step": 9654 + }, + { + "epoch": 0.5313996367439044, + "grad_norm": 0.7005272507667542, + "learning_rate": 8.380444803845537e-06, + "loss": 0.6685, + "step": 9655 + }, + { + "epoch": 0.5314546755462601, + "grad_norm": 0.8922042846679688, + "learning_rate": 8.380125404268527e-06, + "loss": 0.7797, + "step": 9656 + }, + { + "epoch": 0.5315097143486158, + "grad_norm": 0.7242267727851868, + "learning_rate": 8.3798059792874e-06, + "loss": 0.863, + "step": 9657 + }, + { + "epoch": 0.5315647531509714, + "grad_norm": 0.6625328660011292, + "learning_rate": 8.379486528904555e-06, + "loss": 0.7, + "step": 9658 + }, + { + "epoch": 0.5316197919533271, + "grad_norm": 0.9882226586341858, + "learning_rate": 8.379167053122394e-06, + "loss": 0.7534, + "step": 9659 + }, + { + "epoch": 0.5316748307556828, + "grad_norm": 0.6894702911376953, + "learning_rate": 8.378847551943318e-06, + "loss": 0.7503, + "step": 9660 + }, + { + "epoch": 0.5317298695580385, + "grad_norm": 0.6820259690284729, + "learning_rate": 8.37852802536973e-06, + "loss": 0.7713, + "step": 9661 + }, + { + "epoch": 0.531784908360394, + "grad_norm": 0.667918860912323, + "learning_rate": 8.378208473404028e-06, + "loss": 0.7524, + "step": 9662 + }, + { + "epoch": 0.5318399471627497, + "grad_norm": 0.7789241075515747, + "learning_rate": 8.377888896048617e-06, + "loss": 0.6906, + "step": 9663 + }, + { + "epoch": 0.5318949859651054, + "grad_norm": 0.7264542579650879, + "learning_rate": 8.377569293305894e-06, + "loss": 0.7836, + "step": 9664 + }, + { + "epoch": 0.531950024767461, + "grad_norm": 0.6979835629463196, + "learning_rate": 8.377249665178267e-06, + "loss": 0.7739, + "step": 9665 + }, + { + "epoch": 0.5320050635698167, + "grad_norm": 0.8008072376251221, + "learning_rate": 8.376930011668136e-06, + "loss": 0.7853, + "step": 9666 + }, + { + "epoch": 0.5320601023721724, + "grad_norm": 0.7185621857643127, + "learning_rate": 8.376610332777901e-06, + "loss": 0.7311, + "step": 9667 + }, + { + "epoch": 0.5321151411745281, + "grad_norm": 0.7644047141075134, + "learning_rate": 8.376290628509969e-06, + "loss": 0.6919, + "step": 9668 + }, + { + "epoch": 0.5321701799768837, + "grad_norm": 0.7387600541114807, + "learning_rate": 8.37597089886674e-06, + "loss": 0.7285, + "step": 9669 + }, + { + "epoch": 0.5322252187792393, + "grad_norm": 0.7344895005226135, + "learning_rate": 8.375651143850614e-06, + "loss": 0.7514, + "step": 9670 + }, + { + "epoch": 0.532280257581595, + "grad_norm": 0.6930707097053528, + "learning_rate": 8.375331363464002e-06, + "loss": 0.8318, + "step": 9671 + }, + { + "epoch": 0.5323352963839507, + "grad_norm": 0.678162693977356, + "learning_rate": 8.3750115577093e-06, + "loss": 0.7123, + "step": 9672 + }, + { + "epoch": 0.5323903351863063, + "grad_norm": 0.7780481576919556, + "learning_rate": 8.374691726588914e-06, + "loss": 0.7672, + "step": 9673 + }, + { + "epoch": 0.532445373988662, + "grad_norm": 0.6664674282073975, + "learning_rate": 8.374371870105252e-06, + "loss": 0.6994, + "step": 9674 + }, + { + "epoch": 0.5325004127910177, + "grad_norm": 0.6952562928199768, + "learning_rate": 8.374051988260712e-06, + "loss": 0.8638, + "step": 9675 + }, + { + "epoch": 0.5325554515933734, + "grad_norm": 0.764005184173584, + "learning_rate": 8.373732081057699e-06, + "loss": 0.756, + "step": 9676 + }, + { + "epoch": 0.5326104903957289, + "grad_norm": 0.9434393048286438, + "learning_rate": 8.373412148498621e-06, + "loss": 0.8668, + "step": 9677 + }, + { + "epoch": 0.5326655291980846, + "grad_norm": 0.752609133720398, + "learning_rate": 8.373092190585878e-06, + "loss": 0.8078, + "step": 9678 + }, + { + "epoch": 0.5327205680004403, + "grad_norm": 0.671940803527832, + "learning_rate": 8.37277220732188e-06, + "loss": 0.7726, + "step": 9679 + }, + { + "epoch": 0.532775606802796, + "grad_norm": 0.7824863791465759, + "learning_rate": 8.372452198709027e-06, + "loss": 0.8246, + "step": 9680 + }, + { + "epoch": 0.5328306456051516, + "grad_norm": 0.7300587892532349, + "learning_rate": 8.372132164749726e-06, + "loss": 0.7953, + "step": 9681 + }, + { + "epoch": 0.5328856844075073, + "grad_norm": 0.7146018743515015, + "learning_rate": 8.371812105446384e-06, + "loss": 0.7409, + "step": 9682 + }, + { + "epoch": 0.532940723209863, + "grad_norm": 0.73857581615448, + "learning_rate": 8.371492020801404e-06, + "loss": 0.8067, + "step": 9683 + }, + { + "epoch": 0.5329957620122187, + "grad_norm": 0.6760877966880798, + "learning_rate": 8.37117191081719e-06, + "loss": 0.7363, + "step": 9684 + }, + { + "epoch": 0.5330508008145742, + "grad_norm": 0.766482412815094, + "learning_rate": 8.370851775496154e-06, + "loss": 0.7358, + "step": 9685 + }, + { + "epoch": 0.5331058396169299, + "grad_norm": 0.7230576276779175, + "learning_rate": 8.370531614840697e-06, + "loss": 0.8154, + "step": 9686 + }, + { + "epoch": 0.5331608784192856, + "grad_norm": 0.7357933521270752, + "learning_rate": 8.370211428853225e-06, + "loss": 0.7187, + "step": 9687 + }, + { + "epoch": 0.5332159172216413, + "grad_norm": 0.8208534121513367, + "learning_rate": 8.369891217536148e-06, + "loss": 0.8037, + "step": 9688 + }, + { + "epoch": 0.5332709560239969, + "grad_norm": 0.6771863698959351, + "learning_rate": 8.36957098089187e-06, + "loss": 0.733, + "step": 9689 + }, + { + "epoch": 0.5333259948263526, + "grad_norm": 0.6382480263710022, + "learning_rate": 8.369250718922798e-06, + "loss": 0.7391, + "step": 9690 + }, + { + "epoch": 0.5333810336287083, + "grad_norm": 0.6638994812965393, + "learning_rate": 8.368930431631342e-06, + "loss": 0.7176, + "step": 9691 + }, + { + "epoch": 0.533436072431064, + "grad_norm": 0.7599604725837708, + "learning_rate": 8.368610119019903e-06, + "loss": 0.8814, + "step": 9692 + }, + { + "epoch": 0.5334911112334195, + "grad_norm": 0.6896547079086304, + "learning_rate": 8.368289781090894e-06, + "loss": 0.7618, + "step": 9693 + }, + { + "epoch": 0.5335461500357752, + "grad_norm": 0.7081224918365479, + "learning_rate": 8.36796941784672e-06, + "loss": 0.656, + "step": 9694 + }, + { + "epoch": 0.5336011888381309, + "grad_norm": 0.8819646835327148, + "learning_rate": 8.367649029289791e-06, + "loss": 0.8946, + "step": 9695 + }, + { + "epoch": 0.5336562276404866, + "grad_norm": 0.6597925424575806, + "learning_rate": 8.367328615422512e-06, + "loss": 0.6891, + "step": 9696 + }, + { + "epoch": 0.5337112664428422, + "grad_norm": 0.6855770945549011, + "learning_rate": 8.367008176247294e-06, + "loss": 0.7158, + "step": 9697 + }, + { + "epoch": 0.5337663052451979, + "grad_norm": 0.6874905228614807, + "learning_rate": 8.366687711766541e-06, + "loss": 0.7445, + "step": 9698 + }, + { + "epoch": 0.5338213440475535, + "grad_norm": 0.6990895867347717, + "learning_rate": 8.366367221982666e-06, + "loss": 0.6189, + "step": 9699 + }, + { + "epoch": 0.5338763828499092, + "grad_norm": 0.7235365509986877, + "learning_rate": 8.366046706898075e-06, + "loss": 0.6406, + "step": 9700 + }, + { + "epoch": 0.5339314216522648, + "grad_norm": 0.7563154697418213, + "learning_rate": 8.36572616651518e-06, + "loss": 0.7798, + "step": 9701 + }, + { + "epoch": 0.5339864604546205, + "grad_norm": 0.6845980286598206, + "learning_rate": 8.365405600836387e-06, + "loss": 0.7665, + "step": 9702 + }, + { + "epoch": 0.5340414992569762, + "grad_norm": 0.6374378204345703, + "learning_rate": 8.365085009864106e-06, + "loss": 0.6935, + "step": 9703 + }, + { + "epoch": 0.5340965380593319, + "grad_norm": 0.726672887802124, + "learning_rate": 8.364764393600747e-06, + "loss": 0.7821, + "step": 9704 + }, + { + "epoch": 0.5341515768616875, + "grad_norm": 0.6784456372261047, + "learning_rate": 8.364443752048719e-06, + "loss": 0.7722, + "step": 9705 + }, + { + "epoch": 0.5342066156640431, + "grad_norm": 0.6344080567359924, + "learning_rate": 8.364123085210433e-06, + "loss": 0.7256, + "step": 9706 + }, + { + "epoch": 0.5342616544663988, + "grad_norm": 0.7913152575492859, + "learning_rate": 8.363802393088299e-06, + "loss": 0.7892, + "step": 9707 + }, + { + "epoch": 0.5343166932687544, + "grad_norm": 0.6792107820510864, + "learning_rate": 8.363481675684726e-06, + "loss": 0.7374, + "step": 9708 + }, + { + "epoch": 0.5343717320711101, + "grad_norm": 1.0153685808181763, + "learning_rate": 8.363160933002126e-06, + "loss": 0.7396, + "step": 9709 + }, + { + "epoch": 0.5344267708734658, + "grad_norm": 0.7655258774757385, + "learning_rate": 8.362840165042906e-06, + "loss": 0.7746, + "step": 9710 + }, + { + "epoch": 0.5344818096758215, + "grad_norm": 0.7830179929733276, + "learning_rate": 8.362519371809483e-06, + "loss": 0.7082, + "step": 9711 + }, + { + "epoch": 0.5345368484781771, + "grad_norm": 0.7410556674003601, + "learning_rate": 8.362198553304261e-06, + "loss": 0.7055, + "step": 9712 + }, + { + "epoch": 0.5345918872805328, + "grad_norm": 0.6542297005653381, + "learning_rate": 8.361877709529658e-06, + "loss": 0.7153, + "step": 9713 + }, + { + "epoch": 0.5346469260828884, + "grad_norm": 0.6752653121948242, + "learning_rate": 8.36155684048808e-06, + "loss": 0.6901, + "step": 9714 + }, + { + "epoch": 0.5347019648852441, + "grad_norm": 0.7158684134483337, + "learning_rate": 8.361235946181943e-06, + "loss": 0.7775, + "step": 9715 + }, + { + "epoch": 0.5347570036875997, + "grad_norm": 0.6174392700195312, + "learning_rate": 8.360915026613652e-06, + "loss": 0.6501, + "step": 9716 + }, + { + "epoch": 0.5348120424899554, + "grad_norm": 0.7110500931739807, + "learning_rate": 8.360594081785627e-06, + "loss": 0.742, + "step": 9717 + }, + { + "epoch": 0.5348670812923111, + "grad_norm": 0.8456488251686096, + "learning_rate": 8.360273111700276e-06, + "loss": 0.8237, + "step": 9718 + }, + { + "epoch": 0.5349221200946668, + "grad_norm": 0.6660711169242859, + "learning_rate": 8.359952116360011e-06, + "loss": 0.7856, + "step": 9719 + }, + { + "epoch": 0.5349771588970224, + "grad_norm": 0.7661204934120178, + "learning_rate": 8.359631095767244e-06, + "loss": 0.8336, + "step": 9720 + }, + { + "epoch": 0.535032197699378, + "grad_norm": 0.7747855186462402, + "learning_rate": 8.359310049924392e-06, + "loss": 0.7302, + "step": 9721 + }, + { + "epoch": 0.5350872365017337, + "grad_norm": 0.8156001567840576, + "learning_rate": 8.358988978833864e-06, + "loss": 0.7878, + "step": 9722 + }, + { + "epoch": 0.5351422753040894, + "grad_norm": 0.7371010780334473, + "learning_rate": 8.358667882498073e-06, + "loss": 0.803, + "step": 9723 + }, + { + "epoch": 0.535197314106445, + "grad_norm": 0.7141744494438171, + "learning_rate": 8.358346760919431e-06, + "loss": 0.687, + "step": 9724 + }, + { + "epoch": 0.5352523529088007, + "grad_norm": 0.6395956873893738, + "learning_rate": 8.358025614100358e-06, + "loss": 0.7052, + "step": 9725 + }, + { + "epoch": 0.5353073917111564, + "grad_norm": 0.7135289311408997, + "learning_rate": 8.35770444204326e-06, + "loss": 0.7882, + "step": 9726 + }, + { + "epoch": 0.5353624305135121, + "grad_norm": 0.702408492565155, + "learning_rate": 8.357383244750557e-06, + "loss": 0.6965, + "step": 9727 + }, + { + "epoch": 0.5354174693158676, + "grad_norm": 0.731193482875824, + "learning_rate": 8.357062022224658e-06, + "loss": 0.7525, + "step": 9728 + }, + { + "epoch": 0.5354725081182233, + "grad_norm": 0.8115057945251465, + "learning_rate": 8.356740774467982e-06, + "loss": 0.7466, + "step": 9729 + }, + { + "epoch": 0.535527546920579, + "grad_norm": 0.8644380569458008, + "learning_rate": 8.356419501482938e-06, + "loss": 0.7989, + "step": 9730 + }, + { + "epoch": 0.5355825857229347, + "grad_norm": 1.414620041847229, + "learning_rate": 8.356098203271945e-06, + "loss": 0.7782, + "step": 9731 + }, + { + "epoch": 0.5356376245252903, + "grad_norm": 0.7355421185493469, + "learning_rate": 8.355776879837417e-06, + "loss": 0.7163, + "step": 9732 + }, + { + "epoch": 0.535692663327646, + "grad_norm": 0.6556879281997681, + "learning_rate": 8.355455531181766e-06, + "loss": 0.7543, + "step": 9733 + }, + { + "epoch": 0.5357477021300017, + "grad_norm": 0.6632516980171204, + "learning_rate": 8.355134157307412e-06, + "loss": 0.7382, + "step": 9734 + }, + { + "epoch": 0.5358027409323574, + "grad_norm": 0.7096145153045654, + "learning_rate": 8.354812758216767e-06, + "loss": 0.7797, + "step": 9735 + }, + { + "epoch": 0.5358577797347129, + "grad_norm": 0.6404649019241333, + "learning_rate": 8.354491333912244e-06, + "loss": 0.6637, + "step": 9736 + }, + { + "epoch": 0.5359128185370686, + "grad_norm": 0.6987022757530212, + "learning_rate": 8.354169884396266e-06, + "loss": 0.7682, + "step": 9737 + }, + { + "epoch": 0.5359678573394243, + "grad_norm": 0.6593581438064575, + "learning_rate": 8.353848409671245e-06, + "loss": 0.6747, + "step": 9738 + }, + { + "epoch": 0.53602289614178, + "grad_norm": 0.6999880075454712, + "learning_rate": 8.353526909739596e-06, + "loss": 0.6659, + "step": 9739 + }, + { + "epoch": 0.5360779349441356, + "grad_norm": 0.6448989510536194, + "learning_rate": 8.353205384603735e-06, + "loss": 0.7297, + "step": 9740 + }, + { + "epoch": 0.5361329737464913, + "grad_norm": 0.6666765213012695, + "learning_rate": 8.352883834266082e-06, + "loss": 0.6459, + "step": 9741 + }, + { + "epoch": 0.536188012548847, + "grad_norm": 0.8020225763320923, + "learning_rate": 8.352562258729051e-06, + "loss": 0.8122, + "step": 9742 + }, + { + "epoch": 0.5362430513512026, + "grad_norm": 0.6883382201194763, + "learning_rate": 8.35224065799506e-06, + "loss": 0.7084, + "step": 9743 + }, + { + "epoch": 0.5362980901535582, + "grad_norm": 0.7366660237312317, + "learning_rate": 8.351919032066525e-06, + "loss": 0.848, + "step": 9744 + }, + { + "epoch": 0.5363531289559139, + "grad_norm": 0.7408311367034912, + "learning_rate": 8.351597380945863e-06, + "loss": 0.798, + "step": 9745 + }, + { + "epoch": 0.5364081677582696, + "grad_norm": 0.6841676235198975, + "learning_rate": 8.351275704635495e-06, + "loss": 0.7372, + "step": 9746 + }, + { + "epoch": 0.5364632065606253, + "grad_norm": 0.6903505325317383, + "learning_rate": 8.350954003137833e-06, + "loss": 0.7371, + "step": 9747 + }, + { + "epoch": 0.5365182453629809, + "grad_norm": 0.6444700956344604, + "learning_rate": 8.350632276455298e-06, + "loss": 0.6685, + "step": 9748 + }, + { + "epoch": 0.5365732841653366, + "grad_norm": 0.6821029186248779, + "learning_rate": 8.350310524590307e-06, + "loss": 0.8796, + "step": 9749 + }, + { + "epoch": 0.5366283229676923, + "grad_norm": 0.6733999848365784, + "learning_rate": 8.349988747545282e-06, + "loss": 0.6833, + "step": 9750 + }, + { + "epoch": 0.5366833617700478, + "grad_norm": 0.8097321391105652, + "learning_rate": 8.349666945322636e-06, + "loss": 0.834, + "step": 9751 + }, + { + "epoch": 0.5367384005724035, + "grad_norm": 0.7692395448684692, + "learning_rate": 8.34934511792479e-06, + "loss": 0.7866, + "step": 9752 + }, + { + "epoch": 0.5367934393747592, + "grad_norm": 0.7551112174987793, + "learning_rate": 8.349023265354164e-06, + "loss": 0.8378, + "step": 9753 + }, + { + "epoch": 0.5368484781771149, + "grad_norm": 0.5796393156051636, + "learning_rate": 8.348701387613176e-06, + "loss": 0.5995, + "step": 9754 + }, + { + "epoch": 0.5369035169794705, + "grad_norm": 0.6839799284934998, + "learning_rate": 8.348379484704244e-06, + "loss": 0.8262, + "step": 9755 + }, + { + "epoch": 0.5369585557818262, + "grad_norm": 0.7710869908332825, + "learning_rate": 8.348057556629786e-06, + "loss": 0.7796, + "step": 9756 + }, + { + "epoch": 0.5370135945841819, + "grad_norm": 0.733096718788147, + "learning_rate": 8.347735603392225e-06, + "loss": 0.8233, + "step": 9757 + }, + { + "epoch": 0.5370686333865375, + "grad_norm": 0.6438466906547546, + "learning_rate": 8.347413624993982e-06, + "loss": 0.7582, + "step": 9758 + }, + { + "epoch": 0.5371236721888931, + "grad_norm": 0.6877560615539551, + "learning_rate": 8.34709162143747e-06, + "loss": 0.7428, + "step": 9759 + }, + { + "epoch": 0.5371787109912488, + "grad_norm": 1.060831069946289, + "learning_rate": 8.346769592725115e-06, + "loss": 0.8636, + "step": 9760 + }, + { + "epoch": 0.5372337497936045, + "grad_norm": 0.6828434467315674, + "learning_rate": 8.346447538859334e-06, + "loss": 0.7801, + "step": 9761 + }, + { + "epoch": 0.5372887885959602, + "grad_norm": 0.6784753203392029, + "learning_rate": 8.346125459842552e-06, + "loss": 0.7356, + "step": 9762 + }, + { + "epoch": 0.5373438273983158, + "grad_norm": 0.6493560075759888, + "learning_rate": 8.345803355677185e-06, + "loss": 0.749, + "step": 9763 + }, + { + "epoch": 0.5373988662006715, + "grad_norm": 0.7109258770942688, + "learning_rate": 8.345481226365657e-06, + "loss": 0.7599, + "step": 9764 + }, + { + "epoch": 0.5374539050030271, + "grad_norm": 0.8526985049247742, + "learning_rate": 8.345159071910387e-06, + "loss": 0.6605, + "step": 9765 + }, + { + "epoch": 0.5375089438053828, + "grad_norm": 0.9194039702415466, + "learning_rate": 8.344836892313797e-06, + "loss": 0.794, + "step": 9766 + }, + { + "epoch": 0.5375639826077384, + "grad_norm": 0.7258954048156738, + "learning_rate": 8.344514687578307e-06, + "loss": 0.871, + "step": 9767 + }, + { + "epoch": 0.5376190214100941, + "grad_norm": 0.7099377512931824, + "learning_rate": 8.34419245770634e-06, + "loss": 0.8098, + "step": 9768 + }, + { + "epoch": 0.5376740602124498, + "grad_norm": 0.7883020639419556, + "learning_rate": 8.34387020270032e-06, + "loss": 0.8383, + "step": 9769 + }, + { + "epoch": 0.5377290990148055, + "grad_norm": 0.7009730339050293, + "learning_rate": 8.343547922562664e-06, + "loss": 0.7794, + "step": 9770 + }, + { + "epoch": 0.5377841378171611, + "grad_norm": 0.6569581031799316, + "learning_rate": 8.343225617295798e-06, + "loss": 0.7574, + "step": 9771 + }, + { + "epoch": 0.5378391766195167, + "grad_norm": 0.6159278154373169, + "learning_rate": 8.342903286902142e-06, + "loss": 0.7136, + "step": 9772 + }, + { + "epoch": 0.5378942154218724, + "grad_norm": 0.6594879627227783, + "learning_rate": 8.342580931384121e-06, + "loss": 0.6906, + "step": 9773 + }, + { + "epoch": 0.5379492542242281, + "grad_norm": 0.7002933025360107, + "learning_rate": 8.342258550744156e-06, + "loss": 0.7272, + "step": 9774 + }, + { + "epoch": 0.5380042930265837, + "grad_norm": 0.8243216276168823, + "learning_rate": 8.341936144984672e-06, + "loss": 0.8105, + "step": 9775 + }, + { + "epoch": 0.5380593318289394, + "grad_norm": 0.8358921408653259, + "learning_rate": 8.34161371410809e-06, + "loss": 0.7118, + "step": 9776 + }, + { + "epoch": 0.5381143706312951, + "grad_norm": 0.6339066028594971, + "learning_rate": 8.34129125811683e-06, + "loss": 0.7035, + "step": 9777 + }, + { + "epoch": 0.5381694094336508, + "grad_norm": 0.7407625317573547, + "learning_rate": 8.340968777013324e-06, + "loss": 0.7447, + "step": 9778 + }, + { + "epoch": 0.5382244482360063, + "grad_norm": 0.6876600384712219, + "learning_rate": 8.340646270799991e-06, + "loss": 0.7298, + "step": 9779 + }, + { + "epoch": 0.538279487038362, + "grad_norm": 0.7021264433860779, + "learning_rate": 8.340323739479251e-06, + "loss": 0.7869, + "step": 9780 + }, + { + "epoch": 0.5383345258407177, + "grad_norm": 0.7341023087501526, + "learning_rate": 8.340001183053535e-06, + "loss": 0.7447, + "step": 9781 + }, + { + "epoch": 0.5383895646430734, + "grad_norm": 0.6829406023025513, + "learning_rate": 8.339678601525263e-06, + "loss": 0.7438, + "step": 9782 + }, + { + "epoch": 0.538444603445429, + "grad_norm": 0.7671583294868469, + "learning_rate": 8.33935599489686e-06, + "loss": 0.8678, + "step": 9783 + }, + { + "epoch": 0.5384996422477847, + "grad_norm": 0.701797366142273, + "learning_rate": 8.339033363170753e-06, + "loss": 0.8431, + "step": 9784 + }, + { + "epoch": 0.5385546810501404, + "grad_norm": 0.748235285282135, + "learning_rate": 8.338710706349363e-06, + "loss": 0.7905, + "step": 9785 + }, + { + "epoch": 0.5386097198524961, + "grad_norm": 0.8202430605888367, + "learning_rate": 8.338388024435119e-06, + "loss": 0.7734, + "step": 9786 + }, + { + "epoch": 0.5386647586548516, + "grad_norm": 0.8218014240264893, + "learning_rate": 8.338065317430442e-06, + "loss": 0.846, + "step": 9787 + }, + { + "epoch": 0.5387197974572073, + "grad_norm": 0.6773214936256409, + "learning_rate": 8.337742585337762e-06, + "loss": 0.7692, + "step": 9788 + }, + { + "epoch": 0.538774836259563, + "grad_norm": 0.7011464834213257, + "learning_rate": 8.337419828159501e-06, + "loss": 0.7534, + "step": 9789 + }, + { + "epoch": 0.5388298750619187, + "grad_norm": 0.8299004435539246, + "learning_rate": 8.337097045898087e-06, + "loss": 0.7997, + "step": 9790 + }, + { + "epoch": 0.5388849138642743, + "grad_norm": 0.8600753545761108, + "learning_rate": 8.336774238555942e-06, + "loss": 0.8307, + "step": 9791 + }, + { + "epoch": 0.53893995266663, + "grad_norm": 0.676490843296051, + "learning_rate": 8.336451406135498e-06, + "loss": 0.7748, + "step": 9792 + }, + { + "epoch": 0.5389949914689857, + "grad_norm": 0.7094627618789673, + "learning_rate": 8.336128548639177e-06, + "loss": 0.7524, + "step": 9793 + }, + { + "epoch": 0.5390500302713412, + "grad_norm": 0.6804066896438599, + "learning_rate": 8.335805666069407e-06, + "loss": 0.8299, + "step": 9794 + }, + { + "epoch": 0.5391050690736969, + "grad_norm": 0.6992025971412659, + "learning_rate": 8.335482758428614e-06, + "loss": 0.7548, + "step": 9795 + }, + { + "epoch": 0.5391601078760526, + "grad_norm": 0.6649640798568726, + "learning_rate": 8.335159825719227e-06, + "loss": 0.6595, + "step": 9796 + }, + { + "epoch": 0.5392151466784083, + "grad_norm": 0.7292002439498901, + "learning_rate": 8.33483686794367e-06, + "loss": 0.7944, + "step": 9797 + }, + { + "epoch": 0.5392701854807639, + "grad_norm": 0.9124587178230286, + "learning_rate": 8.334513885104375e-06, + "loss": 0.8586, + "step": 9798 + }, + { + "epoch": 0.5393252242831196, + "grad_norm": 0.7091020941734314, + "learning_rate": 8.334190877203761e-06, + "loss": 0.7019, + "step": 9799 + }, + { + "epoch": 0.5393802630854753, + "grad_norm": 0.7470952272415161, + "learning_rate": 8.333867844244265e-06, + "loss": 0.7866, + "step": 9800 + }, + { + "epoch": 0.539435301887831, + "grad_norm": 0.7368966341018677, + "learning_rate": 8.333544786228309e-06, + "loss": 0.8135, + "step": 9801 + }, + { + "epoch": 0.5394903406901865, + "grad_norm": 0.668305516242981, + "learning_rate": 8.333221703158322e-06, + "loss": 0.7549, + "step": 9802 + }, + { + "epoch": 0.5395453794925422, + "grad_norm": 0.6788874268531799, + "learning_rate": 8.332898595036735e-06, + "loss": 0.8077, + "step": 9803 + }, + { + "epoch": 0.5396004182948979, + "grad_norm": 0.654863715171814, + "learning_rate": 8.332575461865972e-06, + "loss": 0.7695, + "step": 9804 + }, + { + "epoch": 0.5396554570972536, + "grad_norm": 0.7460314631462097, + "learning_rate": 8.332252303648464e-06, + "loss": 0.7711, + "step": 9805 + }, + { + "epoch": 0.5397104958996092, + "grad_norm": 0.7923582792282104, + "learning_rate": 8.331929120386643e-06, + "loss": 0.7348, + "step": 9806 + }, + { + "epoch": 0.5397655347019649, + "grad_norm": 0.6570843458175659, + "learning_rate": 8.331605912082932e-06, + "loss": 0.7029, + "step": 9807 + }, + { + "epoch": 0.5398205735043206, + "grad_norm": 0.7728865742683411, + "learning_rate": 8.331282678739762e-06, + "loss": 0.8249, + "step": 9808 + }, + { + "epoch": 0.5398756123066762, + "grad_norm": 0.7121468186378479, + "learning_rate": 8.330959420359565e-06, + "loss": 0.8698, + "step": 9809 + }, + { + "epoch": 0.5399306511090318, + "grad_norm": 0.7779444456100464, + "learning_rate": 8.330636136944768e-06, + "loss": 0.7448, + "step": 9810 + }, + { + "epoch": 0.5399856899113875, + "grad_norm": 0.7770833373069763, + "learning_rate": 8.330312828497801e-06, + "loss": 0.8489, + "step": 9811 + }, + { + "epoch": 0.5400407287137432, + "grad_norm": 0.6705769896507263, + "learning_rate": 8.329989495021096e-06, + "loss": 0.7349, + "step": 9812 + }, + { + "epoch": 0.5400957675160989, + "grad_norm": 0.6775381565093994, + "learning_rate": 8.329666136517079e-06, + "loss": 0.8093, + "step": 9813 + }, + { + "epoch": 0.5401508063184545, + "grad_norm": 0.6621832251548767, + "learning_rate": 8.329342752988183e-06, + "loss": 0.7877, + "step": 9814 + }, + { + "epoch": 0.5402058451208102, + "grad_norm": 0.704339861869812, + "learning_rate": 8.329019344436839e-06, + "loss": 0.7708, + "step": 9815 + }, + { + "epoch": 0.5402608839231658, + "grad_norm": 0.789944052696228, + "learning_rate": 8.328695910865476e-06, + "loss": 0.7563, + "step": 9816 + }, + { + "epoch": 0.5403159227255215, + "grad_norm": 0.6997420191764832, + "learning_rate": 8.328372452276525e-06, + "loss": 0.7023, + "step": 9817 + }, + { + "epoch": 0.5403709615278771, + "grad_norm": 0.6453180313110352, + "learning_rate": 8.328048968672418e-06, + "loss": 0.7193, + "step": 9818 + }, + { + "epoch": 0.5404260003302328, + "grad_norm": 0.7059640884399414, + "learning_rate": 8.327725460055586e-06, + "loss": 0.7875, + "step": 9819 + }, + { + "epoch": 0.5404810391325885, + "grad_norm": 0.7725005745887756, + "learning_rate": 8.327401926428461e-06, + "loss": 0.7503, + "step": 9820 + }, + { + "epoch": 0.5405360779349442, + "grad_norm": 0.7710940837860107, + "learning_rate": 8.327078367793473e-06, + "loss": 0.8314, + "step": 9821 + }, + { + "epoch": 0.5405911167372998, + "grad_norm": 0.9090666770935059, + "learning_rate": 8.326754784153055e-06, + "loss": 0.8021, + "step": 9822 + }, + { + "epoch": 0.5406461555396554, + "grad_norm": 0.7135322690010071, + "learning_rate": 8.326431175509638e-06, + "loss": 0.8084, + "step": 9823 + }, + { + "epoch": 0.5407011943420111, + "grad_norm": 0.9126102328300476, + "learning_rate": 8.326107541865656e-06, + "loss": 0.75, + "step": 9824 + }, + { + "epoch": 0.5407562331443668, + "grad_norm": 0.7263361215591431, + "learning_rate": 8.325783883223539e-06, + "loss": 0.6808, + "step": 9825 + }, + { + "epoch": 0.5408112719467224, + "grad_norm": 0.7234700918197632, + "learning_rate": 8.32546019958572e-06, + "loss": 0.7582, + "step": 9826 + }, + { + "epoch": 0.5408663107490781, + "grad_norm": 0.7043294310569763, + "learning_rate": 8.325136490954633e-06, + "loss": 0.8421, + "step": 9827 + }, + { + "epoch": 0.5409213495514338, + "grad_norm": 0.7947664856910706, + "learning_rate": 8.32481275733271e-06, + "loss": 0.8672, + "step": 9828 + }, + { + "epoch": 0.5409763883537895, + "grad_norm": 0.704590916633606, + "learning_rate": 8.324488998722384e-06, + "loss": 0.7356, + "step": 9829 + }, + { + "epoch": 0.541031427156145, + "grad_norm": 0.7630662322044373, + "learning_rate": 8.32416521512609e-06, + "loss": 0.7082, + "step": 9830 + }, + { + "epoch": 0.5410864659585007, + "grad_norm": 0.728721022605896, + "learning_rate": 8.323841406546259e-06, + "loss": 0.7987, + "step": 9831 + }, + { + "epoch": 0.5411415047608564, + "grad_norm": 0.7164294719696045, + "learning_rate": 8.323517572985326e-06, + "loss": 0.721, + "step": 9832 + }, + { + "epoch": 0.5411965435632121, + "grad_norm": 0.7555723190307617, + "learning_rate": 8.323193714445722e-06, + "loss": 0.814, + "step": 9833 + }, + { + "epoch": 0.5412515823655677, + "grad_norm": 0.827485978603363, + "learning_rate": 8.322869830929887e-06, + "loss": 0.8817, + "step": 9834 + }, + { + "epoch": 0.5413066211679234, + "grad_norm": 0.718950092792511, + "learning_rate": 8.322545922440252e-06, + "loss": 0.8648, + "step": 9835 + }, + { + "epoch": 0.5413616599702791, + "grad_norm": 0.7361611723899841, + "learning_rate": 8.32222198897925e-06, + "loss": 0.7392, + "step": 9836 + }, + { + "epoch": 0.5414166987726347, + "grad_norm": 0.6712168455123901, + "learning_rate": 8.321898030549316e-06, + "loss": 0.7505, + "step": 9837 + }, + { + "epoch": 0.5414717375749903, + "grad_norm": 0.7475710511207581, + "learning_rate": 8.321574047152887e-06, + "loss": 0.7969, + "step": 9838 + }, + { + "epoch": 0.541526776377346, + "grad_norm": 0.9751361608505249, + "learning_rate": 8.321250038792397e-06, + "loss": 0.8534, + "step": 9839 + }, + { + "epoch": 0.5415818151797017, + "grad_norm": 0.6858723163604736, + "learning_rate": 8.32092600547028e-06, + "loss": 0.8277, + "step": 9840 + }, + { + "epoch": 0.5416368539820573, + "grad_norm": 0.8899725675582886, + "learning_rate": 8.320601947188971e-06, + "loss": 0.8599, + "step": 9841 + }, + { + "epoch": 0.541691892784413, + "grad_norm": 0.7140665650367737, + "learning_rate": 8.320277863950907e-06, + "loss": 0.7429, + "step": 9842 + }, + { + "epoch": 0.5417469315867687, + "grad_norm": 0.7467615604400635, + "learning_rate": 8.319953755758525e-06, + "loss": 0.7826, + "step": 9843 + }, + { + "epoch": 0.5418019703891244, + "grad_norm": 0.6578202843666077, + "learning_rate": 8.319629622614258e-06, + "loss": 0.6833, + "step": 9844 + }, + { + "epoch": 0.5418570091914799, + "grad_norm": 0.9430698156356812, + "learning_rate": 8.319305464520543e-06, + "loss": 0.8243, + "step": 9845 + }, + { + "epoch": 0.5419120479938356, + "grad_norm": 0.8632097840309143, + "learning_rate": 8.318981281479817e-06, + "loss": 0.7975, + "step": 9846 + }, + { + "epoch": 0.5419670867961913, + "grad_norm": 0.7241839170455933, + "learning_rate": 8.318657073494517e-06, + "loss": 0.7226, + "step": 9847 + }, + { + "epoch": 0.542022125598547, + "grad_norm": 0.6927164196968079, + "learning_rate": 8.318332840567078e-06, + "loss": 0.7125, + "step": 9848 + }, + { + "epoch": 0.5420771644009026, + "grad_norm": 0.6414939761161804, + "learning_rate": 8.318008582699937e-06, + "loss": 0.7366, + "step": 9849 + }, + { + "epoch": 0.5421322032032583, + "grad_norm": 0.7584436535835266, + "learning_rate": 8.317684299895533e-06, + "loss": 0.8601, + "step": 9850 + }, + { + "epoch": 0.542187242005614, + "grad_norm": 0.6045856475830078, + "learning_rate": 8.317359992156302e-06, + "loss": 0.6697, + "step": 9851 + }, + { + "epoch": 0.5422422808079697, + "grad_norm": 0.715048611164093, + "learning_rate": 8.31703565948468e-06, + "loss": 0.7535, + "step": 9852 + }, + { + "epoch": 0.5422973196103252, + "grad_norm": 0.6925113201141357, + "learning_rate": 8.316711301883106e-06, + "loss": 0.8122, + "step": 9853 + }, + { + "epoch": 0.5423523584126809, + "grad_norm": 0.6787780523300171, + "learning_rate": 8.316386919354018e-06, + "loss": 0.7428, + "step": 9854 + }, + { + "epoch": 0.5424073972150366, + "grad_norm": 0.6831366419792175, + "learning_rate": 8.316062511899855e-06, + "loss": 0.767, + "step": 9855 + }, + { + "epoch": 0.5424624360173923, + "grad_norm": 0.6865691542625427, + "learning_rate": 8.315738079523053e-06, + "loss": 0.6549, + "step": 9856 + }, + { + "epoch": 0.5425174748197479, + "grad_norm": 0.7149406671524048, + "learning_rate": 8.31541362222605e-06, + "loss": 0.8127, + "step": 9857 + }, + { + "epoch": 0.5425725136221036, + "grad_norm": 0.6826779842376709, + "learning_rate": 8.315089140011286e-06, + "loss": 0.706, + "step": 9858 + }, + { + "epoch": 0.5426275524244593, + "grad_norm": 0.688204288482666, + "learning_rate": 8.3147646328812e-06, + "loss": 0.8675, + "step": 9859 + }, + { + "epoch": 0.542682591226815, + "grad_norm": 0.6659492254257202, + "learning_rate": 8.31444010083823e-06, + "loss": 0.7851, + "step": 9860 + }, + { + "epoch": 0.5427376300291705, + "grad_norm": 0.8049291372299194, + "learning_rate": 8.314115543884816e-06, + "loss": 0.7442, + "step": 9861 + }, + { + "epoch": 0.5427926688315262, + "grad_norm": 0.7505989670753479, + "learning_rate": 8.313790962023397e-06, + "loss": 0.8391, + "step": 9862 + }, + { + "epoch": 0.5428477076338819, + "grad_norm": 0.6810199618339539, + "learning_rate": 8.31346635525641e-06, + "loss": 0.8131, + "step": 9863 + }, + { + "epoch": 0.5429027464362376, + "grad_norm": 0.6724215745925903, + "learning_rate": 8.313141723586298e-06, + "loss": 0.75, + "step": 9864 + }, + { + "epoch": 0.5429577852385932, + "grad_norm": 0.7804376482963562, + "learning_rate": 8.3128170670155e-06, + "loss": 0.704, + "step": 9865 + }, + { + "epoch": 0.5430128240409489, + "grad_norm": 0.9494230151176453, + "learning_rate": 8.312492385546455e-06, + "loss": 0.8578, + "step": 9866 + }, + { + "epoch": 0.5430678628433045, + "grad_norm": 0.6780333518981934, + "learning_rate": 8.312167679181606e-06, + "loss": 0.701, + "step": 9867 + }, + { + "epoch": 0.5431229016456602, + "grad_norm": 0.7407701015472412, + "learning_rate": 8.31184294792339e-06, + "loss": 0.8505, + "step": 9868 + }, + { + "epoch": 0.5431779404480158, + "grad_norm": 0.680903434753418, + "learning_rate": 8.311518191774249e-06, + "loss": 0.7645, + "step": 9869 + }, + { + "epoch": 0.5432329792503715, + "grad_norm": 0.6695752143859863, + "learning_rate": 8.311193410736622e-06, + "loss": 0.816, + "step": 9870 + }, + { + "epoch": 0.5432880180527272, + "grad_norm": 0.6725142598152161, + "learning_rate": 8.310868604812954e-06, + "loss": 0.7044, + "step": 9871 + }, + { + "epoch": 0.5433430568550829, + "grad_norm": 0.922627866268158, + "learning_rate": 8.310543774005684e-06, + "loss": 0.7589, + "step": 9872 + }, + { + "epoch": 0.5433980956574385, + "grad_norm": 1.0136839151382446, + "learning_rate": 8.310218918317251e-06, + "loss": 0.7573, + "step": 9873 + }, + { + "epoch": 0.5434531344597942, + "grad_norm": 0.9053532481193542, + "learning_rate": 8.309894037750099e-06, + "loss": 0.8269, + "step": 9874 + }, + { + "epoch": 0.5435081732621498, + "grad_norm": 0.6800149083137512, + "learning_rate": 8.309569132306671e-06, + "loss": 0.716, + "step": 9875 + }, + { + "epoch": 0.5435632120645055, + "grad_norm": 0.7157679796218872, + "learning_rate": 8.309244201989408e-06, + "loss": 0.7433, + "step": 9876 + }, + { + "epoch": 0.5436182508668611, + "grad_norm": 0.9316089749336243, + "learning_rate": 8.308919246800748e-06, + "loss": 0.7499, + "step": 9877 + }, + { + "epoch": 0.5436732896692168, + "grad_norm": 0.6682490110397339, + "learning_rate": 8.308594266743139e-06, + "loss": 0.7286, + "step": 9878 + }, + { + "epoch": 0.5437283284715725, + "grad_norm": 0.7241143584251404, + "learning_rate": 8.308269261819022e-06, + "loss": 0.7934, + "step": 9879 + }, + { + "epoch": 0.5437833672739281, + "grad_norm": 0.7402396202087402, + "learning_rate": 8.307944232030838e-06, + "loss": 0.7361, + "step": 9880 + }, + { + "epoch": 0.5438384060762838, + "grad_norm": 0.6839993596076965, + "learning_rate": 8.307619177381029e-06, + "loss": 0.749, + "step": 9881 + }, + { + "epoch": 0.5438934448786394, + "grad_norm": 0.6536363363265991, + "learning_rate": 8.307294097872041e-06, + "loss": 0.706, + "step": 9882 + }, + { + "epoch": 0.5439484836809951, + "grad_norm": 0.602644681930542, + "learning_rate": 8.306968993506317e-06, + "loss": 0.6857, + "step": 9883 + }, + { + "epoch": 0.5440035224833507, + "grad_norm": 0.6567881107330322, + "learning_rate": 8.306643864286297e-06, + "loss": 0.6989, + "step": 9884 + }, + { + "epoch": 0.5440585612857064, + "grad_norm": 1.0013506412506104, + "learning_rate": 8.306318710214427e-06, + "loss": 0.7251, + "step": 9885 + }, + { + "epoch": 0.5441136000880621, + "grad_norm": 0.7016813158988953, + "learning_rate": 8.305993531293153e-06, + "loss": 0.7535, + "step": 9886 + }, + { + "epoch": 0.5441686388904178, + "grad_norm": 0.7345741391181946, + "learning_rate": 8.305668327524915e-06, + "loss": 0.887, + "step": 9887 + }, + { + "epoch": 0.5442236776927734, + "grad_norm": 1.0925754308700562, + "learning_rate": 8.305343098912158e-06, + "loss": 0.7779, + "step": 9888 + }, + { + "epoch": 0.544278716495129, + "grad_norm": 0.79815274477005, + "learning_rate": 8.305017845457328e-06, + "loss": 0.7736, + "step": 9889 + }, + { + "epoch": 0.5443337552974847, + "grad_norm": 0.6324154138565063, + "learning_rate": 8.304692567162868e-06, + "loss": 0.6823, + "step": 9890 + }, + { + "epoch": 0.5443887940998404, + "grad_norm": 0.6990262866020203, + "learning_rate": 8.304367264031223e-06, + "loss": 0.7804, + "step": 9891 + }, + { + "epoch": 0.544443832902196, + "grad_norm": 1.4203195571899414, + "learning_rate": 8.304041936064839e-06, + "loss": 0.8702, + "step": 9892 + }, + { + "epoch": 0.5444988717045517, + "grad_norm": 0.6986544132232666, + "learning_rate": 8.303716583266161e-06, + "loss": 0.7666, + "step": 9893 + }, + { + "epoch": 0.5445539105069074, + "grad_norm": 0.7037138938903809, + "learning_rate": 8.303391205637632e-06, + "loss": 0.7995, + "step": 9894 + }, + { + "epoch": 0.5446089493092631, + "grad_norm": 0.7101728320121765, + "learning_rate": 8.3030658031817e-06, + "loss": 0.8185, + "step": 9895 + }, + { + "epoch": 0.5446639881116186, + "grad_norm": 0.6571425795555115, + "learning_rate": 8.302740375900808e-06, + "loss": 0.6152, + "step": 9896 + }, + { + "epoch": 0.5447190269139743, + "grad_norm": 0.7560263276100159, + "learning_rate": 8.302414923797406e-06, + "loss": 0.9037, + "step": 9897 + }, + { + "epoch": 0.54477406571633, + "grad_norm": 0.8692007064819336, + "learning_rate": 8.302089446873935e-06, + "loss": 0.7689, + "step": 9898 + }, + { + "epoch": 0.5448291045186857, + "grad_norm": 0.7533506751060486, + "learning_rate": 8.301763945132845e-06, + "loss": 0.7671, + "step": 9899 + }, + { + "epoch": 0.5448841433210413, + "grad_norm": 0.6992233991622925, + "learning_rate": 8.301438418576581e-06, + "loss": 0.723, + "step": 9900 + }, + { + "epoch": 0.544939182123397, + "grad_norm": 0.7966120839118958, + "learning_rate": 8.301112867207589e-06, + "loss": 0.7968, + "step": 9901 + }, + { + "epoch": 0.5449942209257527, + "grad_norm": 0.800558865070343, + "learning_rate": 8.300787291028316e-06, + "loss": 0.8583, + "step": 9902 + }, + { + "epoch": 0.5450492597281084, + "grad_norm": 0.7019909024238586, + "learning_rate": 8.30046169004121e-06, + "loss": 0.7045, + "step": 9903 + }, + { + "epoch": 0.5451042985304639, + "grad_norm": 0.7778449654579163, + "learning_rate": 8.300136064248717e-06, + "loss": 0.7964, + "step": 9904 + }, + { + "epoch": 0.5451593373328196, + "grad_norm": 0.6894309520721436, + "learning_rate": 8.299810413653284e-06, + "loss": 0.7382, + "step": 9905 + }, + { + "epoch": 0.5452143761351753, + "grad_norm": 0.6942182183265686, + "learning_rate": 8.299484738257361e-06, + "loss": 0.73, + "step": 9906 + }, + { + "epoch": 0.545269414937531, + "grad_norm": 0.6607787609100342, + "learning_rate": 8.299159038063394e-06, + "loss": 0.6987, + "step": 9907 + }, + { + "epoch": 0.5453244537398866, + "grad_norm": 0.7447709441184998, + "learning_rate": 8.29883331307383e-06, + "loss": 0.7787, + "step": 9908 + }, + { + "epoch": 0.5453794925422423, + "grad_norm": 0.6315301656723022, + "learning_rate": 8.298507563291116e-06, + "loss": 0.7047, + "step": 9909 + }, + { + "epoch": 0.545434531344598, + "grad_norm": 0.8095656633377075, + "learning_rate": 8.298181788717705e-06, + "loss": 0.691, + "step": 9910 + }, + { + "epoch": 0.5454895701469537, + "grad_norm": 0.6419453024864197, + "learning_rate": 8.29785598935604e-06, + "loss": 0.7333, + "step": 9911 + }, + { + "epoch": 0.5455446089493092, + "grad_norm": 0.7209222316741943, + "learning_rate": 8.297530165208574e-06, + "loss": 0.8174, + "step": 9912 + }, + { + "epoch": 0.5455996477516649, + "grad_norm": 0.6778598427772522, + "learning_rate": 8.297204316277754e-06, + "loss": 0.7696, + "step": 9913 + }, + { + "epoch": 0.5456546865540206, + "grad_norm": 0.6573307514190674, + "learning_rate": 8.296878442566028e-06, + "loss": 0.7843, + "step": 9914 + }, + { + "epoch": 0.5457097253563763, + "grad_norm": 0.6987473964691162, + "learning_rate": 8.296552544075847e-06, + "loss": 0.809, + "step": 9915 + }, + { + "epoch": 0.5457647641587319, + "grad_norm": 0.7149204015731812, + "learning_rate": 8.29622662080966e-06, + "loss": 0.848, + "step": 9916 + }, + { + "epoch": 0.5458198029610876, + "grad_norm": 0.6252632141113281, + "learning_rate": 8.295900672769913e-06, + "loss": 0.7029, + "step": 9917 + }, + { + "epoch": 0.5458748417634433, + "grad_norm": 0.713376522064209, + "learning_rate": 8.295574699959062e-06, + "loss": 0.726, + "step": 9918 + }, + { + "epoch": 0.5459298805657989, + "grad_norm": 0.6864717602729797, + "learning_rate": 8.295248702379552e-06, + "loss": 0.7428, + "step": 9919 + }, + { + "epoch": 0.5459849193681545, + "grad_norm": 0.8085678219795227, + "learning_rate": 8.294922680033837e-06, + "loss": 0.8697, + "step": 9920 + }, + { + "epoch": 0.5460399581705102, + "grad_norm": 0.7366700768470764, + "learning_rate": 8.294596632924363e-06, + "loss": 0.7714, + "step": 9921 + }, + { + "epoch": 0.5460949969728659, + "grad_norm": 0.670632541179657, + "learning_rate": 8.294270561053583e-06, + "loss": 0.7032, + "step": 9922 + }, + { + "epoch": 0.5461500357752215, + "grad_norm": 0.7867220640182495, + "learning_rate": 8.293944464423946e-06, + "loss": 0.8903, + "step": 9923 + }, + { + "epoch": 0.5462050745775772, + "grad_norm": 0.8441565632820129, + "learning_rate": 8.293618343037907e-06, + "loss": 0.8694, + "step": 9924 + }, + { + "epoch": 0.5462601133799329, + "grad_norm": 0.7048027515411377, + "learning_rate": 8.293292196897913e-06, + "loss": 0.8226, + "step": 9925 + }, + { + "epoch": 0.5463151521822885, + "grad_norm": 0.6344078779220581, + "learning_rate": 8.292966026006416e-06, + "loss": 0.7615, + "step": 9926 + }, + { + "epoch": 0.5463701909846441, + "grad_norm": 0.6744484901428223, + "learning_rate": 8.292639830365867e-06, + "loss": 0.6944, + "step": 9927 + }, + { + "epoch": 0.5464252297869998, + "grad_norm": 0.8113303780555725, + "learning_rate": 8.292313609978721e-06, + "loss": 0.7558, + "step": 9928 + }, + { + "epoch": 0.5464802685893555, + "grad_norm": 0.640190839767456, + "learning_rate": 8.291987364847425e-06, + "loss": 0.7167, + "step": 9929 + }, + { + "epoch": 0.5465353073917112, + "grad_norm": 0.7714816331863403, + "learning_rate": 8.291661094974434e-06, + "loss": 0.8662, + "step": 9930 + }, + { + "epoch": 0.5465903461940668, + "grad_norm": 0.6785402894020081, + "learning_rate": 8.291334800362199e-06, + "loss": 0.6835, + "step": 9931 + }, + { + "epoch": 0.5466453849964225, + "grad_norm": 0.704868495464325, + "learning_rate": 8.291008481013173e-06, + "loss": 0.7343, + "step": 9932 + }, + { + "epoch": 0.5467004237987781, + "grad_norm": 0.7587466239929199, + "learning_rate": 8.290682136929809e-06, + "loss": 0.7856, + "step": 9933 + }, + { + "epoch": 0.5467554626011338, + "grad_norm": 0.7460505962371826, + "learning_rate": 8.290355768114557e-06, + "loss": 0.7463, + "step": 9934 + }, + { + "epoch": 0.5468105014034894, + "grad_norm": 0.7185021042823792, + "learning_rate": 8.290029374569873e-06, + "loss": 0.8106, + "step": 9935 + }, + { + "epoch": 0.5468655402058451, + "grad_norm": 0.7023874521255493, + "learning_rate": 8.289702956298209e-06, + "loss": 0.6863, + "step": 9936 + }, + { + "epoch": 0.5469205790082008, + "grad_norm": 0.8688495755195618, + "learning_rate": 8.289376513302017e-06, + "loss": 0.8898, + "step": 9937 + }, + { + "epoch": 0.5469756178105565, + "grad_norm": 0.6405122876167297, + "learning_rate": 8.289050045583752e-06, + "loss": 0.6804, + "step": 9938 + }, + { + "epoch": 0.5470306566129121, + "grad_norm": 0.8364881277084351, + "learning_rate": 8.288723553145868e-06, + "loss": 0.8356, + "step": 9939 + }, + { + "epoch": 0.5470856954152677, + "grad_norm": 0.6621617078781128, + "learning_rate": 8.288397035990818e-06, + "loss": 0.7508, + "step": 9940 + }, + { + "epoch": 0.5471407342176234, + "grad_norm": 0.6822347640991211, + "learning_rate": 8.288070494121056e-06, + "loss": 0.7722, + "step": 9941 + }, + { + "epoch": 0.5471957730199791, + "grad_norm": 0.6727223992347717, + "learning_rate": 8.287743927539036e-06, + "loss": 0.743, + "step": 9942 + }, + { + "epoch": 0.5472508118223347, + "grad_norm": 0.7852441668510437, + "learning_rate": 8.287417336247214e-06, + "loss": 0.8321, + "step": 9943 + }, + { + "epoch": 0.5473058506246904, + "grad_norm": 0.6982126235961914, + "learning_rate": 8.287090720248041e-06, + "loss": 0.6669, + "step": 9944 + }, + { + "epoch": 0.5473608894270461, + "grad_norm": 0.7820166945457458, + "learning_rate": 8.286764079543976e-06, + "loss": 0.7592, + "step": 9945 + }, + { + "epoch": 0.5474159282294018, + "grad_norm": 0.6868422627449036, + "learning_rate": 8.28643741413747e-06, + "loss": 0.8308, + "step": 9946 + }, + { + "epoch": 0.5474709670317573, + "grad_norm": 0.8227942585945129, + "learning_rate": 8.286110724030982e-06, + "loss": 0.7982, + "step": 9947 + }, + { + "epoch": 0.547526005834113, + "grad_norm": 0.6838171482086182, + "learning_rate": 8.285784009226964e-06, + "loss": 0.7907, + "step": 9948 + }, + { + "epoch": 0.5475810446364687, + "grad_norm": 0.7200812697410583, + "learning_rate": 8.285457269727875e-06, + "loss": 0.88, + "step": 9949 + }, + { + "epoch": 0.5476360834388244, + "grad_norm": 0.7469412684440613, + "learning_rate": 8.285130505536168e-06, + "loss": 0.8167, + "step": 9950 + }, + { + "epoch": 0.54769112224118, + "grad_norm": 0.6660227179527283, + "learning_rate": 8.284803716654298e-06, + "loss": 0.7685, + "step": 9951 + }, + { + "epoch": 0.5477461610435357, + "grad_norm": 0.7116572260856628, + "learning_rate": 8.284476903084723e-06, + "loss": 0.7415, + "step": 9952 + }, + { + "epoch": 0.5478011998458914, + "grad_norm": 0.6540791988372803, + "learning_rate": 8.284150064829899e-06, + "loss": 0.6571, + "step": 9953 + }, + { + "epoch": 0.5478562386482471, + "grad_norm": 0.7527759075164795, + "learning_rate": 8.283823201892283e-06, + "loss": 0.8678, + "step": 9954 + }, + { + "epoch": 0.5479112774506026, + "grad_norm": 0.7795953750610352, + "learning_rate": 8.283496314274331e-06, + "loss": 0.8086, + "step": 9955 + }, + { + "epoch": 0.5479663162529583, + "grad_norm": 0.862503170967102, + "learning_rate": 8.283169401978498e-06, + "loss": 0.7442, + "step": 9956 + }, + { + "epoch": 0.548021355055314, + "grad_norm": 0.6552054286003113, + "learning_rate": 8.282842465007244e-06, + "loss": 0.6664, + "step": 9957 + }, + { + "epoch": 0.5480763938576697, + "grad_norm": 0.7242427468299866, + "learning_rate": 8.282515503363024e-06, + "loss": 0.8199, + "step": 9958 + }, + { + "epoch": 0.5481314326600253, + "grad_norm": 0.7529763579368591, + "learning_rate": 8.282188517048295e-06, + "loss": 0.761, + "step": 9959 + }, + { + "epoch": 0.548186471462381, + "grad_norm": 0.7909425497055054, + "learning_rate": 8.281861506065519e-06, + "loss": 0.7389, + "step": 9960 + }, + { + "epoch": 0.5482415102647367, + "grad_norm": 0.6594850420951843, + "learning_rate": 8.281534470417147e-06, + "loss": 0.7473, + "step": 9961 + }, + { + "epoch": 0.5482965490670924, + "grad_norm": 0.6900844573974609, + "learning_rate": 8.281207410105642e-06, + "loss": 0.7551, + "step": 9962 + }, + { + "epoch": 0.5483515878694479, + "grad_norm": 0.6922640204429626, + "learning_rate": 8.28088032513346e-06, + "loss": 0.7654, + "step": 9963 + }, + { + "epoch": 0.5484066266718036, + "grad_norm": 0.7758432626724243, + "learning_rate": 8.28055321550306e-06, + "loss": 0.8033, + "step": 9964 + }, + { + "epoch": 0.5484616654741593, + "grad_norm": 0.7074280977249146, + "learning_rate": 8.2802260812169e-06, + "loss": 0.7302, + "step": 9965 + }, + { + "epoch": 0.5485167042765149, + "grad_norm": 0.7724928259849548, + "learning_rate": 8.27989892227744e-06, + "loss": 0.7621, + "step": 9966 + }, + { + "epoch": 0.5485717430788706, + "grad_norm": 0.7364168167114258, + "learning_rate": 8.279571738687137e-06, + "loss": 0.7587, + "step": 9967 + }, + { + "epoch": 0.5486267818812263, + "grad_norm": 0.7298350930213928, + "learning_rate": 8.27924453044845e-06, + "loss": 0.7371, + "step": 9968 + }, + { + "epoch": 0.548681820683582, + "grad_norm": 0.8056737780570984, + "learning_rate": 8.27891729756384e-06, + "loss": 0.9871, + "step": 9969 + }, + { + "epoch": 0.5487368594859375, + "grad_norm": 0.7499688267707825, + "learning_rate": 8.278590040035763e-06, + "loss": 0.8574, + "step": 9970 + }, + { + "epoch": 0.5487918982882932, + "grad_norm": 0.7398175001144409, + "learning_rate": 8.278262757866683e-06, + "loss": 0.744, + "step": 9971 + }, + { + "epoch": 0.5488469370906489, + "grad_norm": 0.7099171876907349, + "learning_rate": 8.277935451059058e-06, + "loss": 0.7108, + "step": 9972 + }, + { + "epoch": 0.5489019758930046, + "grad_norm": 0.6720188856124878, + "learning_rate": 8.277608119615345e-06, + "loss": 0.8565, + "step": 9973 + }, + { + "epoch": 0.5489570146953602, + "grad_norm": 0.7870737910270691, + "learning_rate": 8.27728076353801e-06, + "loss": 0.7429, + "step": 9974 + }, + { + "epoch": 0.5490120534977159, + "grad_norm": 0.7358133792877197, + "learning_rate": 8.276953382829507e-06, + "loss": 0.7549, + "step": 9975 + }, + { + "epoch": 0.5490670923000716, + "grad_norm": 0.8968467116355896, + "learning_rate": 8.276625977492303e-06, + "loss": 0.6983, + "step": 9976 + }, + { + "epoch": 0.5491221311024272, + "grad_norm": 0.7346875071525574, + "learning_rate": 8.276298547528852e-06, + "loss": 0.8541, + "step": 9977 + }, + { + "epoch": 0.5491771699047828, + "grad_norm": 0.7297229170799255, + "learning_rate": 8.27597109294162e-06, + "loss": 0.8378, + "step": 9978 + }, + { + "epoch": 0.5492322087071385, + "grad_norm": 0.6907635927200317, + "learning_rate": 8.275643613733064e-06, + "loss": 0.7058, + "step": 9979 + }, + { + "epoch": 0.5492872475094942, + "grad_norm": 0.7612239718437195, + "learning_rate": 8.27531610990565e-06, + "loss": 0.6827, + "step": 9980 + }, + { + "epoch": 0.5493422863118499, + "grad_norm": 1.3160386085510254, + "learning_rate": 8.274988581461837e-06, + "loss": 0.7357, + "step": 9981 + }, + { + "epoch": 0.5493973251142055, + "grad_norm": 0.6370541453361511, + "learning_rate": 8.274661028404083e-06, + "loss": 0.7323, + "step": 9982 + }, + { + "epoch": 0.5494523639165612, + "grad_norm": 0.7051724195480347, + "learning_rate": 8.274333450734856e-06, + "loss": 0.7714, + "step": 9983 + }, + { + "epoch": 0.5495074027189168, + "grad_norm": 0.7452969551086426, + "learning_rate": 8.274005848456614e-06, + "loss": 0.7516, + "step": 9984 + }, + { + "epoch": 0.5495624415212725, + "grad_norm": 0.7132626175880432, + "learning_rate": 8.273678221571823e-06, + "loss": 0.6417, + "step": 9985 + }, + { + "epoch": 0.5496174803236281, + "grad_norm": 0.7873446345329285, + "learning_rate": 8.273350570082941e-06, + "loss": 0.8457, + "step": 9986 + }, + { + "epoch": 0.5496725191259838, + "grad_norm": 0.691470205783844, + "learning_rate": 8.273022893992432e-06, + "loss": 0.7871, + "step": 9987 + }, + { + "epoch": 0.5497275579283395, + "grad_norm": 0.6671431064605713, + "learning_rate": 8.27269519330276e-06, + "loss": 0.6919, + "step": 9988 + }, + { + "epoch": 0.5497825967306952, + "grad_norm": 0.8026914596557617, + "learning_rate": 8.272367468016387e-06, + "loss": 0.6885, + "step": 9989 + }, + { + "epoch": 0.5498376355330508, + "grad_norm": 0.9003152251243591, + "learning_rate": 8.272039718135774e-06, + "loss": 0.7671, + "step": 9990 + }, + { + "epoch": 0.5498926743354065, + "grad_norm": 0.6515254378318787, + "learning_rate": 8.271711943663388e-06, + "loss": 0.7589, + "step": 9991 + }, + { + "epoch": 0.5499477131377621, + "grad_norm": 0.6495782136917114, + "learning_rate": 8.27138414460169e-06, + "loss": 0.7277, + "step": 9992 + }, + { + "epoch": 0.5500027519401178, + "grad_norm": 0.7564565539360046, + "learning_rate": 8.271056320953146e-06, + "loss": 0.6977, + "step": 9993 + }, + { + "epoch": 0.5500577907424734, + "grad_norm": 0.8551548719406128, + "learning_rate": 8.270728472720218e-06, + "loss": 0.684, + "step": 9994 + }, + { + "epoch": 0.5501128295448291, + "grad_norm": 0.6614843010902405, + "learning_rate": 8.270400599905369e-06, + "loss": 0.6559, + "step": 9995 + }, + { + "epoch": 0.5501678683471848, + "grad_norm": 0.6920068264007568, + "learning_rate": 8.270072702511065e-06, + "loss": 0.7497, + "step": 9996 + }, + { + "epoch": 0.5502229071495405, + "grad_norm": 0.7426198124885559, + "learning_rate": 8.26974478053977e-06, + "loss": 0.7434, + "step": 9997 + }, + { + "epoch": 0.550277945951896, + "grad_norm": 1.2630934715270996, + "learning_rate": 8.269416833993949e-06, + "loss": 0.7306, + "step": 9998 + }, + { + "epoch": 0.5503329847542517, + "grad_norm": 0.7069457769393921, + "learning_rate": 8.269088862876066e-06, + "loss": 0.6735, + "step": 9999 } ], "logging_steps": 1, @@ -57293,7 +70019,7 @@ "attributes": {} } }, - "total_flos": 2.414263242709795e+19, + "total_flos": 2.9507661855341937e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null