{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.26293817132021, "eval_steps": 500, "global_step": 18580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 58.19491958618164, "learning_rate": 1.6666666666666667e-06, "loss": 4.5462, "step": 20 }, { "epoch": 0.0, "grad_norm": 51.19196319580078, "learning_rate": 3.3333333333333333e-06, "loss": 4.6693, "step": 40 }, { "epoch": 0.0, "grad_norm": 45.4248161315918, "learning_rate": 5e-06, "loss": 4.6065, "step": 60 }, { "epoch": 0.0, "grad_norm": 57.08290100097656, "learning_rate": 6.666666666666667e-06, "loss": 4.4395, "step": 80 }, { "epoch": 0.0, "grad_norm": 40.65673828125, "learning_rate": 8.333333333333334e-06, "loss": 4.4641, "step": 100 }, { "epoch": 0.0, "grad_norm": 40.7547492980957, "learning_rate": 1e-05, "loss": 4.4638, "step": 120 }, { "epoch": 0.0, "grad_norm": 40.71052169799805, "learning_rate": 1.1666666666666668e-05, "loss": 4.3721, "step": 140 }, { "epoch": 0.0, "grad_norm": 32.69596862792969, "learning_rate": 1.3333333333333333e-05, "loss": 4.3784, "step": 160 }, { "epoch": 0.0, "grad_norm": 27.53285026550293, "learning_rate": 1.5e-05, "loss": 4.3627, "step": 180 }, { "epoch": 0.0, "grad_norm": 39.0136833190918, "learning_rate": 1.6666666666666667e-05, "loss": 4.2018, "step": 200 }, { "epoch": 0.0, "grad_norm": 39.9036750793457, "learning_rate": 1.8333333333333333e-05, "loss": 4.1214, "step": 220 }, { "epoch": 0.0, "grad_norm": 26.16208267211914, "learning_rate": 2e-05, "loss": 4.0551, "step": 240 }, { "epoch": 0.0, "grad_norm": 35.66220474243164, "learning_rate": 2.1666666666666667e-05, "loss": 4.0599, "step": 260 }, { "epoch": 0.0, "grad_norm": 22.310619354248047, "learning_rate": 2.3333333333333336e-05, "loss": 4.181, "step": 280 }, { "epoch": 0.0, "grad_norm": 31.29083824157715, "learning_rate": 2.5e-05, "loss": 4.0389, "step": 300 }, { "epoch": 0.0, "grad_norm": 18.66942596435547, "learning_rate": 2.6666666666666667e-05, "loss": 4.0888, "step": 320 }, { "epoch": 0.0, "grad_norm": 47.483428955078125, "learning_rate": 2.8333333333333335e-05, "loss": 4.0918, "step": 340 }, { "epoch": 0.01, "grad_norm": 51.05717468261719, "learning_rate": 3e-05, "loss": 3.9807, "step": 360 }, { "epoch": 0.01, "grad_norm": 67.01704406738281, "learning_rate": 3.1666666666666666e-05, "loss": 4.0331, "step": 380 }, { "epoch": 0.01, "grad_norm": 40.98155975341797, "learning_rate": 3.3333333333333335e-05, "loss": 4.039, "step": 400 }, { "epoch": 0.01, "grad_norm": 29.619321823120117, "learning_rate": 3.5e-05, "loss": 4.077, "step": 420 }, { "epoch": 0.01, "grad_norm": 41.605018615722656, "learning_rate": 3.6666666666666666e-05, "loss": 4.044, "step": 440 }, { "epoch": 0.01, "grad_norm": 34.36818313598633, "learning_rate": 3.8333333333333334e-05, "loss": 3.974, "step": 460 }, { "epoch": 0.01, "grad_norm": 26.917036056518555, "learning_rate": 4e-05, "loss": 4.0088, "step": 480 }, { "epoch": 0.01, "grad_norm": 25.219558715820312, "learning_rate": 4.166666666666667e-05, "loss": 3.8768, "step": 500 }, { "epoch": 0.01, "grad_norm": 24.45106315612793, "learning_rate": 4.3333333333333334e-05, "loss": 3.8979, "step": 520 }, { "epoch": 0.01, "grad_norm": 39.479461669921875, "learning_rate": 4.5e-05, "loss": 3.9241, "step": 540 }, { "epoch": 0.01, "grad_norm": 46.96614456176758, "learning_rate": 4.666666666666667e-05, "loss": 3.8796, "step": 560 }, { "epoch": 0.01, "grad_norm": 31.622241973876953, "learning_rate": 4.8333333333333334e-05, "loss": 3.9045, "step": 580 }, { "epoch": 0.01, "grad_norm": 146.8946990966797, "learning_rate": 5e-05, "loss": 3.941, "step": 600 }, { "epoch": 0.01, "grad_norm": 29.78015899658203, "learning_rate": 4.9999868880914903e-05, "loss": 3.9279, "step": 620 }, { "epoch": 0.01, "grad_norm": 44.591156005859375, "learning_rate": 4.999947552503497e-05, "loss": 3.8695, "step": 640 }, { "epoch": 0.01, "grad_norm": 35.80597686767578, "learning_rate": 4.9998819936486327e-05, "loss": 3.9277, "step": 660 }, { "epoch": 0.01, "grad_norm": 35.00313186645508, "learning_rate": 4.99979021221458e-05, "loss": 3.881, "step": 680 }, { "epoch": 0.01, "grad_norm": 28.8647403717041, "learning_rate": 4.999672209164081e-05, "loss": 3.8286, "step": 700 }, { "epoch": 0.01, "grad_norm": 33.56174087524414, "learning_rate": 4.999527985734932e-05, "loss": 3.8631, "step": 720 }, { "epoch": 0.01, "grad_norm": 63.59539794921875, "learning_rate": 4.999357543439969e-05, "loss": 3.8931, "step": 740 }, { "epoch": 0.01, "grad_norm": 54.89167785644531, "learning_rate": 4.999160884067051e-05, "loss": 3.8953, "step": 760 }, { "epoch": 0.01, "grad_norm": 33.9933967590332, "learning_rate": 4.998938009679042e-05, "loss": 3.9113, "step": 780 }, { "epoch": 0.01, "grad_norm": 56.342620849609375, "learning_rate": 4.998688922613788e-05, "loss": 3.8079, "step": 800 }, { "epoch": 0.01, "grad_norm": 35.17020797729492, "learning_rate": 4.998413625484095e-05, "loss": 3.8289, "step": 820 }, { "epoch": 0.01, "grad_norm": 36.69993209838867, "learning_rate": 4.998112121177699e-05, "loss": 3.9726, "step": 840 }, { "epoch": 0.01, "grad_norm": 41.2137565612793, "learning_rate": 4.997784412857239e-05, "loss": 3.8602, "step": 860 }, { "epoch": 0.01, "grad_norm": 49.4541130065918, "learning_rate": 4.99743050396022e-05, "loss": 3.8549, "step": 880 }, { "epoch": 0.01, "grad_norm": 40.87107849121094, "learning_rate": 4.997050398198977e-05, "loss": 3.7832, "step": 900 }, { "epoch": 0.01, "grad_norm": 31.820924758911133, "learning_rate": 4.9966440995606415e-05, "loss": 3.8991, "step": 920 }, { "epoch": 0.01, "grad_norm": 37.09877395629883, "learning_rate": 4.9962116123070924e-05, "loss": 3.9486, "step": 940 }, { "epoch": 0.01, "grad_norm": 40.25444412231445, "learning_rate": 4.995752940974918e-05, "loss": 3.848, "step": 960 }, { "epoch": 0.01, "grad_norm": 38.95152282714844, "learning_rate": 4.9952680903753627e-05, "loss": 3.723, "step": 980 }, { "epoch": 0.01, "grad_norm": 52.44506072998047, "learning_rate": 4.9947570655942796e-05, "loss": 3.864, "step": 1000 }, { "epoch": 0.01, "grad_norm": 59.793373107910156, "learning_rate": 4.994219871992077e-05, "loss": 3.794, "step": 1020 }, { "epoch": 0.01, "grad_norm": 40.9141960144043, "learning_rate": 4.993656515203662e-05, "loss": 3.8384, "step": 1040 }, { "epoch": 0.02, "grad_norm": 33.75545883178711, "learning_rate": 4.99306700113838e-05, "loss": 3.8811, "step": 1060 }, { "epoch": 0.02, "grad_norm": 30.463613510131836, "learning_rate": 4.9924513359799554e-05, "loss": 3.7411, "step": 1080 }, { "epoch": 0.02, "grad_norm": 36.24667739868164, "learning_rate": 4.991809526186424e-05, "loss": 3.8915, "step": 1100 }, { "epoch": 0.02, "grad_norm": 35.77268600463867, "learning_rate": 4.991141578490066e-05, "loss": 3.7547, "step": 1120 }, { "epoch": 0.02, "grad_norm": 43.09757995605469, "learning_rate": 4.990447499897339e-05, "loss": 3.8161, "step": 1140 }, { "epoch": 0.02, "grad_norm": 67.45648956298828, "learning_rate": 4.989727297688797e-05, "loss": 3.9635, "step": 1160 }, { "epoch": 0.02, "grad_norm": 31.597640991210938, "learning_rate": 4.98898097941902e-05, "loss": 3.8912, "step": 1180 }, { "epoch": 0.02, "grad_norm": 41.68192672729492, "learning_rate": 4.988208552916535e-05, "loss": 3.8112, "step": 1200 }, { "epoch": 0.02, "grad_norm": 36.489810943603516, "learning_rate": 4.9874100262837296e-05, "loss": 3.7838, "step": 1220 }, { "epoch": 0.02, "grad_norm": 31.755823135375977, "learning_rate": 4.986585407896772e-05, "loss": 3.8385, "step": 1240 }, { "epoch": 0.02, "grad_norm": 84.64984130859375, "learning_rate": 4.985734706405516e-05, "loss": 3.8727, "step": 1260 }, { "epoch": 0.02, "grad_norm": 32.23849868774414, "learning_rate": 4.98485793073342e-05, "loss": 3.8013, "step": 1280 }, { "epoch": 0.02, "grad_norm": 25.90882110595703, "learning_rate": 4.983955090077444e-05, "loss": 3.7387, "step": 1300 }, { "epoch": 0.02, "grad_norm": 43.255313873291016, "learning_rate": 4.9830261939079614e-05, "loss": 3.8756, "step": 1320 }, { "epoch": 0.02, "grad_norm": 35.833404541015625, "learning_rate": 4.982071251968652e-05, "loss": 3.7124, "step": 1340 }, { "epoch": 0.02, "grad_norm": 29.098703384399414, "learning_rate": 4.981090274276406e-05, "loss": 3.8525, "step": 1360 }, { "epoch": 0.02, "grad_norm": 35.16478729248047, "learning_rate": 4.980083271121214e-05, "loss": 3.8262, "step": 1380 }, { "epoch": 0.02, "grad_norm": 32.62320327758789, "learning_rate": 4.9790502530660635e-05, "loss": 3.8903, "step": 1400 }, { "epoch": 0.02, "grad_norm": 48.55181884765625, "learning_rate": 4.977991230946824e-05, "loss": 3.7363, "step": 1420 }, { "epoch": 0.02, "grad_norm": 46.640403747558594, "learning_rate": 4.976906215872138e-05, "loss": 3.9682, "step": 1440 }, { "epoch": 0.02, "grad_norm": 32.13254928588867, "learning_rate": 4.9757952192232985e-05, "loss": 3.6851, "step": 1460 }, { "epoch": 0.02, "grad_norm": 34.074649810791016, "learning_rate": 4.9746582526541355e-05, "loss": 3.7781, "step": 1480 }, { "epoch": 0.02, "grad_norm": 37.383548736572266, "learning_rate": 4.9734953280908904e-05, "loss": 3.7182, "step": 1500 }, { "epoch": 0.02, "grad_norm": 45.83818435668945, "learning_rate": 4.972306457732091e-05, "loss": 3.7685, "step": 1520 }, { "epoch": 0.02, "grad_norm": 35.88654327392578, "learning_rate": 4.9710916540484265e-05, "loss": 3.7627, "step": 1540 }, { "epoch": 0.02, "grad_norm": 29.5416202545166, "learning_rate": 4.96985092978261e-05, "loss": 3.8022, "step": 1560 }, { "epoch": 0.02, "grad_norm": 31.974184036254883, "learning_rate": 4.968584297949255e-05, "loss": 3.792, "step": 1580 }, { "epoch": 0.02, "grad_norm": 32.32705307006836, "learning_rate": 4.967291771834727e-05, "loss": 3.7238, "step": 1600 }, { "epoch": 0.02, "grad_norm": 29.011735916137695, "learning_rate": 4.9659733649970155e-05, "loss": 3.7215, "step": 1620 }, { "epoch": 0.02, "grad_norm": 33.73636245727539, "learning_rate": 4.9646290912655834e-05, "loss": 3.8132, "step": 1640 }, { "epoch": 0.02, "grad_norm": 38.57840347290039, "learning_rate": 4.9632589647412265e-05, "loss": 3.8606, "step": 1660 }, { "epoch": 0.02, "grad_norm": 33.149078369140625, "learning_rate": 4.9618629997959235e-05, "loss": 3.7518, "step": 1680 }, { "epoch": 0.02, "grad_norm": 58.5382194519043, "learning_rate": 4.960441211072686e-05, "loss": 3.7482, "step": 1700 }, { "epoch": 0.02, "grad_norm": 31.86609649658203, "learning_rate": 4.958993613485405e-05, "loss": 3.7683, "step": 1720 }, { "epoch": 0.02, "grad_norm": 28.98000144958496, "learning_rate": 4.9575202222186945e-05, "loss": 3.8361, "step": 1740 }, { "epoch": 0.02, "grad_norm": 37.06975555419922, "learning_rate": 4.956021052727731e-05, "loss": 3.7297, "step": 1760 }, { "epoch": 0.03, "grad_norm": 44.01863479614258, "learning_rate": 4.954496120738094e-05, "loss": 3.8244, "step": 1780 }, { "epoch": 0.03, "grad_norm": 31.08086585998535, "learning_rate": 4.9529454422455976e-05, "loss": 3.8144, "step": 1800 }, { "epoch": 0.03, "grad_norm": 36.80121994018555, "learning_rate": 4.951369033516127e-05, "loss": 3.7668, "step": 1820 }, { "epoch": 0.03, "grad_norm": 24.225065231323242, "learning_rate": 4.949766911085461e-05, "loss": 3.7929, "step": 1840 }, { "epoch": 0.03, "grad_norm": 33.50989532470703, "learning_rate": 4.948139091759108e-05, "loss": 3.7897, "step": 1860 }, { "epoch": 0.03, "grad_norm": 26.35730743408203, "learning_rate": 4.9464855926121225e-05, "loss": 3.8618, "step": 1880 }, { "epoch": 0.03, "grad_norm": 36.487464904785156, "learning_rate": 4.944806430988927e-05, "loss": 3.7205, "step": 1900 }, { "epoch": 0.03, "grad_norm": 35.87200164794922, "learning_rate": 4.943101624503132e-05, "loss": 3.8324, "step": 1920 }, { "epoch": 0.03, "grad_norm": 26.013994216918945, "learning_rate": 4.941371191037354e-05, "loss": 3.6997, "step": 1940 }, { "epoch": 0.03, "grad_norm": 42.59685134887695, "learning_rate": 4.939615148743017e-05, "loss": 3.7085, "step": 1960 }, { "epoch": 0.03, "grad_norm": 65.71659851074219, "learning_rate": 4.9378335160401766e-05, "loss": 3.8939, "step": 1980 }, { "epoch": 0.03, "grad_norm": 25.612024307250977, "learning_rate": 4.936026311617316e-05, "loss": 3.7231, "step": 2000 }, { "epoch": 0.03, "grad_norm": 28.377412796020508, "learning_rate": 4.9341935544311536e-05, "loss": 3.7476, "step": 2020 }, { "epoch": 0.03, "grad_norm": 29.760807037353516, "learning_rate": 4.9323352637064455e-05, "loss": 3.8374, "step": 2040 }, { "epoch": 0.03, "grad_norm": 35.875770568847656, "learning_rate": 4.9304514589357834e-05, "loss": 3.7073, "step": 2060 }, { "epoch": 0.03, "grad_norm": 26.299306869506836, "learning_rate": 4.928542159879386e-05, "loss": 3.736, "step": 2080 }, { "epoch": 0.03, "grad_norm": 40.1691780090332, "learning_rate": 4.926607386564898e-05, "loss": 3.7416, "step": 2100 }, { "epoch": 0.03, "grad_norm": 35.2581901550293, "learning_rate": 4.924647159287176e-05, "loss": 3.7917, "step": 2120 }, { "epoch": 0.03, "grad_norm": 24.038591384887695, "learning_rate": 4.9226614986080763e-05, "loss": 3.7164, "step": 2140 }, { "epoch": 0.03, "grad_norm": 41.96257019042969, "learning_rate": 4.92065042535624e-05, "loss": 3.8562, "step": 2160 }, { "epoch": 0.03, "grad_norm": 37.07769775390625, "learning_rate": 4.918613960626873e-05, "loss": 3.845, "step": 2180 }, { "epoch": 0.03, "grad_norm": 35.35500717163086, "learning_rate": 4.916552125781528e-05, "loss": 3.679, "step": 2200 }, { "epoch": 0.03, "grad_norm": 28.356767654418945, "learning_rate": 4.914464942447876e-05, "loss": 3.6217, "step": 2220 }, { "epoch": 0.03, "grad_norm": 32.50172805786133, "learning_rate": 4.912352432519484e-05, "loss": 3.8185, "step": 2240 }, { "epoch": 0.03, "grad_norm": 36.33710861206055, "learning_rate": 4.910214618155579e-05, "loss": 3.7401, "step": 2260 }, { "epoch": 0.03, "grad_norm": 42.05067443847656, "learning_rate": 4.908051521780824e-05, "loss": 3.6782, "step": 2280 }, { "epoch": 0.03, "grad_norm": 37.84385299682617, "learning_rate": 4.9058631660850765e-05, "loss": 3.7863, "step": 2300 }, { "epoch": 0.03, "grad_norm": 28.022615432739258, "learning_rate": 4.90364957402315e-05, "loss": 3.7804, "step": 2320 }, { "epoch": 0.03, "grad_norm": 38.274173736572266, "learning_rate": 4.9014107688145804e-05, "loss": 3.6898, "step": 2340 }, { "epoch": 0.03, "grad_norm": 29.532123565673828, "learning_rate": 4.899146773943374e-05, "loss": 3.7521, "step": 2360 }, { "epoch": 0.03, "grad_norm": 48.601417541503906, "learning_rate": 4.896857613157765e-05, "loss": 3.646, "step": 2380 }, { "epoch": 0.03, "grad_norm": 31.142457962036133, "learning_rate": 4.894543310469968e-05, "loss": 3.7694, "step": 2400 }, { "epoch": 0.03, "grad_norm": 39.75430679321289, "learning_rate": 4.8922038901559224e-05, "loss": 3.7673, "step": 2420 }, { "epoch": 0.03, "grad_norm": 46.01137924194336, "learning_rate": 4.8898393767550405e-05, "loss": 3.7022, "step": 2440 }, { "epoch": 0.03, "grad_norm": 26.171249389648438, "learning_rate": 4.887449795069948e-05, "loss": 3.7917, "step": 2460 }, { "epoch": 0.04, "grad_norm": 46.24589538574219, "learning_rate": 4.885035170166228e-05, "loss": 3.7352, "step": 2480 }, { "epoch": 0.04, "grad_norm": 31.69544219970703, "learning_rate": 4.882595527372152e-05, "loss": 3.694, "step": 2500 }, { "epoch": 0.04, "grad_norm": 35.99808883666992, "learning_rate": 4.880130892278419e-05, "loss": 3.7636, "step": 2520 }, { "epoch": 0.04, "grad_norm": 31.871978759765625, "learning_rate": 4.877641290737884e-05, "loss": 3.7472, "step": 2540 }, { "epoch": 0.04, "grad_norm": 35.04158401489258, "learning_rate": 4.87512674886529e-05, "loss": 3.7445, "step": 2560 }, { "epoch": 0.04, "grad_norm": 46.71685791015625, "learning_rate": 4.872587293036991e-05, "loss": 3.7141, "step": 2580 }, { "epoch": 0.04, "grad_norm": 26.907012939453125, "learning_rate": 4.870022949890676e-05, "loss": 3.748, "step": 2600 }, { "epoch": 0.04, "grad_norm": 26.9509334564209, "learning_rate": 4.867433746325093e-05, "loss": 3.7635, "step": 2620 }, { "epoch": 0.04, "grad_norm": 26.85176658630371, "learning_rate": 4.8648197094997616e-05, "loss": 3.824, "step": 2640 }, { "epoch": 0.04, "grad_norm": 22.88348960876465, "learning_rate": 4.8621808668346906e-05, "loss": 3.7504, "step": 2660 }, { "epoch": 0.04, "grad_norm": 27.76841163635254, "learning_rate": 4.859517246010091e-05, "loss": 3.8228, "step": 2680 }, { "epoch": 0.04, "grad_norm": 41.46321487426758, "learning_rate": 4.856828874966086e-05, "loss": 3.6509, "step": 2700 }, { "epoch": 0.04, "grad_norm": 28.96099090576172, "learning_rate": 4.854115781902414e-05, "loss": 3.7377, "step": 2720 }, { "epoch": 0.04, "grad_norm": 38.632015228271484, "learning_rate": 4.851377995278138e-05, "loss": 3.8471, "step": 2740 }, { "epoch": 0.04, "grad_norm": 32.76665496826172, "learning_rate": 4.8486155438113454e-05, "loss": 3.731, "step": 2760 }, { "epoch": 0.04, "grad_norm": 30.798906326293945, "learning_rate": 4.845828456478842e-05, "loss": 3.6953, "step": 2780 }, { "epoch": 0.04, "grad_norm": 35.173606872558594, "learning_rate": 4.8430167625158595e-05, "loss": 3.6521, "step": 2800 }, { "epoch": 0.04, "grad_norm": 50.02262496948242, "learning_rate": 4.840180491415733e-05, "loss": 3.6999, "step": 2820 }, { "epoch": 0.04, "grad_norm": 33.76813507080078, "learning_rate": 4.837319672929607e-05, "loss": 3.7118, "step": 2840 }, { "epoch": 0.04, "grad_norm": 24.56015396118164, "learning_rate": 4.834434337066112e-05, "loss": 3.7094, "step": 2860 }, { "epoch": 0.04, "grad_norm": 39.17055892944336, "learning_rate": 4.8315245140910556e-05, "loss": 3.799, "step": 2880 }, { "epoch": 0.04, "grad_norm": 29.631614685058594, "learning_rate": 4.828590234527106e-05, "loss": 3.7785, "step": 2900 }, { "epoch": 0.04, "grad_norm": 46.83203125, "learning_rate": 4.825631529153466e-05, "loss": 3.6311, "step": 2920 }, { "epoch": 0.04, "grad_norm": 34.5321044921875, "learning_rate": 4.822648429005554e-05, "loss": 3.7288, "step": 2940 }, { "epoch": 0.04, "grad_norm": 19.74892234802246, "learning_rate": 4.819640965374681e-05, "loss": 3.6749, "step": 2960 }, { "epoch": 0.04, "grad_norm": 51.736480712890625, "learning_rate": 4.8166091698077164e-05, "loss": 3.8733, "step": 2980 }, { "epoch": 0.04, "grad_norm": 24.50010871887207, "learning_rate": 4.813553074106761e-05, "loss": 3.7634, "step": 3000 }, { "epoch": 0.04, "grad_norm": 29.08304214477539, "learning_rate": 4.810472710328812e-05, "loss": 3.7277, "step": 3020 }, { "epoch": 0.04, "grad_norm": 55.230377197265625, "learning_rate": 4.80736811078543e-05, "loss": 3.7238, "step": 3040 }, { "epoch": 0.04, "grad_norm": 19.770660400390625, "learning_rate": 4.804239308042392e-05, "loss": 3.7202, "step": 3060 }, { "epoch": 0.04, "grad_norm": 28.955581665039062, "learning_rate": 4.8010863349193605e-05, "loss": 3.7079, "step": 3080 }, { "epoch": 0.04, "grad_norm": 32.6827278137207, "learning_rate": 4.7979092244895305e-05, "loss": 3.7488, "step": 3100 }, { "epoch": 0.04, "grad_norm": 28.665210723876953, "learning_rate": 4.794708010079289e-05, "loss": 3.6798, "step": 3120 }, { "epoch": 0.04, "grad_norm": 31.36636734008789, "learning_rate": 4.791482725267857e-05, "loss": 3.7233, "step": 3140 }, { "epoch": 0.04, "grad_norm": 28.98109245300293, "learning_rate": 4.7882334038869495e-05, "loss": 3.8137, "step": 3160 }, { "epoch": 0.05, "grad_norm": 25.13091278076172, "learning_rate": 4.784960080020408e-05, "loss": 3.756, "step": 3180 }, { "epoch": 0.05, "grad_norm": 43.819313049316406, "learning_rate": 4.781662788003851e-05, "loss": 3.7371, "step": 3200 }, { "epoch": 0.05, "grad_norm": 25.864599227905273, "learning_rate": 4.7783415624243124e-05, "loss": 3.604, "step": 3220 }, { "epoch": 0.05, "grad_norm": 38.96342468261719, "learning_rate": 4.7749964381198765e-05, "loss": 3.7482, "step": 3240 }, { "epoch": 0.05, "grad_norm": 28.412094116210938, "learning_rate": 4.7716274501793144e-05, "loss": 3.6766, "step": 3260 }, { "epoch": 0.05, "grad_norm": 35.93290328979492, "learning_rate": 4.768234633941716e-05, "loss": 3.6659, "step": 3280 }, { "epoch": 0.05, "grad_norm": 34.64625930786133, "learning_rate": 4.764818024996117e-05, "loss": 3.6739, "step": 3300 }, { "epoch": 0.05, "grad_norm": 32.466495513916016, "learning_rate": 4.76137765918113e-05, "loss": 3.7524, "step": 3320 }, { "epoch": 0.05, "grad_norm": 33.156776428222656, "learning_rate": 4.7579135725845635e-05, "loss": 3.7571, "step": 3340 }, { "epoch": 0.05, "grad_norm": 48.48731994628906, "learning_rate": 4.7544258015430463e-05, "loss": 3.6783, "step": 3360 }, { "epoch": 0.05, "grad_norm": 30.641870498657227, "learning_rate": 4.750914382641648e-05, "loss": 3.7549, "step": 3380 }, { "epoch": 0.05, "grad_norm": 31.7097110748291, "learning_rate": 4.747379352713489e-05, "loss": 3.6388, "step": 3400 }, { "epoch": 0.05, "grad_norm": 45.476951599121094, "learning_rate": 4.7438207488393616e-05, "loss": 3.7421, "step": 3420 }, { "epoch": 0.05, "grad_norm": 42.40350341796875, "learning_rate": 4.740238608347336e-05, "loss": 3.771, "step": 3440 }, { "epoch": 0.05, "grad_norm": 26.54286003112793, "learning_rate": 4.736632968812373e-05, "loss": 3.6409, "step": 3460 }, { "epoch": 0.05, "grad_norm": 33.44880676269531, "learning_rate": 4.733003868055923e-05, "loss": 3.6977, "step": 3480 }, { "epoch": 0.05, "grad_norm": 30.746978759765625, "learning_rate": 4.7293513441455364e-05, "loss": 3.6403, "step": 3500 }, { "epoch": 0.05, "grad_norm": 30.616453170776367, "learning_rate": 4.72567543539446e-05, "loss": 3.7039, "step": 3520 }, { "epoch": 0.05, "grad_norm": 28.486270904541016, "learning_rate": 4.721976180361238e-05, "loss": 3.6331, "step": 3540 }, { "epoch": 0.05, "grad_norm": 31.4039363861084, "learning_rate": 4.718253617849306e-05, "loss": 3.6498, "step": 3560 }, { "epoch": 0.05, "grad_norm": 22.35509490966797, "learning_rate": 4.714507786906581e-05, "loss": 3.709, "step": 3580 }, { "epoch": 0.05, "grad_norm": 25.957500457763672, "learning_rate": 4.710738726825059e-05, "loss": 3.7159, "step": 3600 }, { "epoch": 0.05, "grad_norm": 27.019580841064453, "learning_rate": 4.706946477140396e-05, "loss": 3.6971, "step": 3620 }, { "epoch": 0.05, "grad_norm": 32.743896484375, "learning_rate": 4.703131077631497e-05, "loss": 3.5543, "step": 3640 }, { "epoch": 0.05, "grad_norm": 30.018753051757812, "learning_rate": 4.699292568320097e-05, "loss": 3.6811, "step": 3660 }, { "epoch": 0.05, "grad_norm": 27.54176139831543, "learning_rate": 4.695430989470343e-05, "loss": 3.6593, "step": 3680 }, { "epoch": 0.05, "grad_norm": 30.283519744873047, "learning_rate": 4.69154638158837e-05, "loss": 3.551, "step": 3700 }, { "epoch": 0.05, "grad_norm": 26.505075454711914, "learning_rate": 4.687638785421875e-05, "loss": 3.7794, "step": 3720 }, { "epoch": 0.05, "grad_norm": 26.94403839111328, "learning_rate": 4.683708241959694e-05, "loss": 3.6415, "step": 3740 }, { "epoch": 0.05, "grad_norm": 31.006845474243164, "learning_rate": 4.679754792431368e-05, "loss": 3.6741, "step": 3760 }, { "epoch": 0.05, "grad_norm": 60.343318939208984, "learning_rate": 4.675778478306712e-05, "loss": 3.6502, "step": 3780 }, { "epoch": 0.05, "grad_norm": 52.47261047363281, "learning_rate": 4.671779341295378e-05, "loss": 3.6878, "step": 3800 }, { "epoch": 0.05, "grad_norm": 34.15403747558594, "learning_rate": 4.6677574233464226e-05, "loss": 3.7464, "step": 3820 }, { "epoch": 0.05, "grad_norm": 21.71308135986328, "learning_rate": 4.663712766647862e-05, "loss": 3.6239, "step": 3840 }, { "epoch": 0.05, "grad_norm": 25.242189407348633, "learning_rate": 4.65964541362623e-05, "loss": 3.8114, "step": 3860 }, { "epoch": 0.05, "grad_norm": 35.03647232055664, "learning_rate": 4.655555406946135e-05, "loss": 3.654, "step": 3880 }, { "epoch": 0.06, "grad_norm": 54.89191818237305, "learning_rate": 4.6514427895098134e-05, "loss": 3.6936, "step": 3900 }, { "epoch": 0.06, "grad_norm": 24.903459548950195, "learning_rate": 4.647307604456674e-05, "loss": 3.8267, "step": 3920 }, { "epoch": 0.06, "grad_norm": 33.852054595947266, "learning_rate": 4.643149895162854e-05, "loss": 3.661, "step": 3940 }, { "epoch": 0.06, "grad_norm": 35.687713623046875, "learning_rate": 4.6389697052407534e-05, "loss": 3.67, "step": 3960 }, { "epoch": 0.06, "grad_norm": 29.4704647064209, "learning_rate": 4.6347670785385884e-05, "loss": 3.7182, "step": 3980 }, { "epoch": 0.06, "grad_norm": 24.089828491210938, "learning_rate": 4.630542059139924e-05, "loss": 3.5781, "step": 4000 }, { "epoch": 0.06, "grad_norm": 34.60494613647461, "learning_rate": 4.626294691363213e-05, "loss": 3.7001, "step": 4020 }, { "epoch": 0.06, "grad_norm": 53.43947219848633, "learning_rate": 4.622025019761336e-05, "loss": 3.6048, "step": 4040 }, { "epoch": 0.06, "grad_norm": 35.322486877441406, "learning_rate": 4.617733089121127e-05, "loss": 3.6201, "step": 4060 }, { "epoch": 0.06, "grad_norm": 47.170005798339844, "learning_rate": 4.613418944462907e-05, "loss": 3.7443, "step": 4080 }, { "epoch": 0.06, "grad_norm": 30.616161346435547, "learning_rate": 4.6090826310400116e-05, "loss": 3.7685, "step": 4100 }, { "epoch": 0.06, "grad_norm": 24.628185272216797, "learning_rate": 4.6047241943383176e-05, "loss": 3.6677, "step": 4120 }, { "epoch": 0.06, "grad_norm": 38.79618453979492, "learning_rate": 4.600343680075764e-05, "loss": 3.744, "step": 4140 }, { "epoch": 0.06, "grad_norm": 37.38518524169922, "learning_rate": 4.595941134201871e-05, "loss": 3.7101, "step": 4160 }, { "epoch": 0.06, "grad_norm": 29.248828887939453, "learning_rate": 4.5915166028972624e-05, "loss": 3.7209, "step": 4180 }, { "epoch": 0.06, "grad_norm": 45.65785217285156, "learning_rate": 4.587070132573178e-05, "loss": 3.7605, "step": 4200 }, { "epoch": 0.06, "grad_norm": 24.220314025878906, "learning_rate": 4.582601769870988e-05, "loss": 3.6609, "step": 4220 }, { "epoch": 0.06, "grad_norm": 27.00070571899414, "learning_rate": 4.578111561661702e-05, "loss": 3.6754, "step": 4240 }, { "epoch": 0.06, "grad_norm": 75.85283660888672, "learning_rate": 4.573599555045479e-05, "loss": 3.6605, "step": 4260 }, { "epoch": 0.06, "grad_norm": 29.803096771240234, "learning_rate": 4.569065797351135e-05, "loss": 3.6287, "step": 4280 }, { "epoch": 0.06, "grad_norm": 26.4781494140625, "learning_rate": 4.5645103361356415e-05, "loss": 3.6301, "step": 4300 }, { "epoch": 0.06, "grad_norm": 47.245357513427734, "learning_rate": 4.5599332191836316e-05, "loss": 3.6776, "step": 4320 }, { "epoch": 0.06, "grad_norm": 26.005104064941406, "learning_rate": 4.555334494506896e-05, "loss": 3.6756, "step": 4340 }, { "epoch": 0.06, "grad_norm": 35.15077590942383, "learning_rate": 4.5507142103438794e-05, "loss": 3.7022, "step": 4360 }, { "epoch": 0.06, "grad_norm": 29.038782119750977, "learning_rate": 4.546072415159179e-05, "loss": 3.6325, "step": 4380 }, { "epoch": 0.06, "grad_norm": 30.944393157958984, "learning_rate": 4.541409157643027e-05, "loss": 3.6343, "step": 4400 }, { "epoch": 0.06, "grad_norm": 31.153432846069336, "learning_rate": 4.536724486710791e-05, "loss": 3.7739, "step": 4420 }, { "epoch": 0.06, "grad_norm": 40.95075225830078, "learning_rate": 4.53201845150245e-05, "loss": 3.6558, "step": 4440 }, { "epoch": 0.06, "grad_norm": 30.37499237060547, "learning_rate": 4.5272911013820876e-05, "loss": 3.6093, "step": 4460 }, { "epoch": 0.06, "grad_norm": 23.894237518310547, "learning_rate": 4.522542485937369e-05, "loss": 3.6415, "step": 4480 }, { "epoch": 0.06, "grad_norm": 69.29508209228516, "learning_rate": 4.517772654979023e-05, "loss": 3.696, "step": 4500 }, { "epoch": 0.06, "grad_norm": 31.464527130126953, "learning_rate": 4.5129816585403206e-05, "loss": 3.7147, "step": 4520 }, { "epoch": 0.06, "grad_norm": 30.76380729675293, "learning_rate": 4.508169546876547e-05, "loss": 3.6428, "step": 4540 }, { "epoch": 0.06, "grad_norm": 27.94367027282715, "learning_rate": 4.503336370464476e-05, "loss": 3.7018, "step": 4560 }, { "epoch": 0.06, "grad_norm": 22.166793823242188, "learning_rate": 4.49848218000184e-05, "loss": 3.7018, "step": 4580 }, { "epoch": 0.07, "grad_norm": 32.058921813964844, "learning_rate": 4.493607026406802e-05, "loss": 3.7035, "step": 4600 }, { "epoch": 0.07, "grad_norm": 28.55988311767578, "learning_rate": 4.488710960817416e-05, "loss": 3.7725, "step": 4620 }, { "epoch": 0.07, "grad_norm": 23.51280403137207, "learning_rate": 4.4837940345910925e-05, "loss": 3.7238, "step": 4640 }, { "epoch": 0.07, "grad_norm": 37.3757209777832, "learning_rate": 4.4788562993040614e-05, "loss": 3.701, "step": 4660 }, { "epoch": 0.07, "grad_norm": 38.56554412841797, "learning_rate": 4.473897806750829e-05, "loss": 3.7174, "step": 4680 }, { "epoch": 0.07, "grad_norm": 29.553325653076172, "learning_rate": 4.4689186089436366e-05, "loss": 3.627, "step": 4700 }, { "epoch": 0.07, "grad_norm": 33.66290283203125, "learning_rate": 4.463918758111912e-05, "loss": 3.6307, "step": 4720 }, { "epoch": 0.07, "grad_norm": 29.957775115966797, "learning_rate": 4.4588983067017257e-05, "loss": 3.6157, "step": 4740 }, { "epoch": 0.07, "grad_norm": 35.32748794555664, "learning_rate": 4.4538573073752365e-05, "loss": 3.5961, "step": 4760 }, { "epoch": 0.07, "grad_norm": 24.597824096679688, "learning_rate": 4.448795813010142e-05, "loss": 3.5881, "step": 4780 }, { "epoch": 0.07, "grad_norm": 26.248044967651367, "learning_rate": 4.443713876699124e-05, "loss": 3.6057, "step": 4800 }, { "epoch": 0.07, "grad_norm": 25.942325592041016, "learning_rate": 4.4386115517492874e-05, "loss": 3.6286, "step": 4820 }, { "epoch": 0.07, "grad_norm": 42.028316497802734, "learning_rate": 4.43348889168161e-05, "loss": 3.6306, "step": 4840 }, { "epoch": 0.07, "grad_norm": 24.317644119262695, "learning_rate": 4.4283459502303695e-05, "loss": 3.5992, "step": 4860 }, { "epoch": 0.07, "grad_norm": 43.174903869628906, "learning_rate": 4.4231827813425885e-05, "loss": 3.6493, "step": 4880 }, { "epoch": 0.07, "grad_norm": 33.58101272583008, "learning_rate": 4.417999439177466e-05, "loss": 3.6843, "step": 4900 }, { "epoch": 0.07, "grad_norm": 34.096824645996094, "learning_rate": 4.412795978105807e-05, "loss": 3.6134, "step": 4920 }, { "epoch": 0.07, "grad_norm": 35.04353713989258, "learning_rate": 4.4075724527094584e-05, "loss": 3.5916, "step": 4940 }, { "epoch": 0.07, "grad_norm": 28.97658920288086, "learning_rate": 4.402328917780728e-05, "loss": 3.6362, "step": 4960 }, { "epoch": 0.07, "grad_norm": 35.05881118774414, "learning_rate": 4.397065428321817e-05, "loss": 3.7566, "step": 4980 }, { "epoch": 0.07, "grad_norm": 27.057044982910156, "learning_rate": 4.391782039544238e-05, "loss": 3.4967, "step": 5000 }, { "epoch": 0.07, "grad_norm": 22.590089797973633, "learning_rate": 4.386478806868241e-05, "loss": 3.6759, "step": 5020 }, { "epoch": 0.07, "grad_norm": 34.77460479736328, "learning_rate": 4.3811557859222254e-05, "loss": 3.6893, "step": 5040 }, { "epoch": 0.07, "grad_norm": 24.440248489379883, "learning_rate": 4.375813032542164e-05, "loss": 3.7167, "step": 5060 }, { "epoch": 0.07, "grad_norm": 42.91717529296875, "learning_rate": 4.3704506027710105e-05, "loss": 3.5893, "step": 5080 }, { "epoch": 0.07, "grad_norm": 34.991634368896484, "learning_rate": 4.365068552858115e-05, "loss": 3.5482, "step": 5100 }, { "epoch": 0.07, "grad_norm": 37.62036895751953, "learning_rate": 4.3596669392586365e-05, "loss": 3.5972, "step": 5120 }, { "epoch": 0.07, "grad_norm": 29.56283950805664, "learning_rate": 4.354245818632944e-05, "loss": 3.6804, "step": 5140 }, { "epoch": 0.07, "grad_norm": 35.37843322753906, "learning_rate": 4.348805247846027e-05, "loss": 3.6491, "step": 5160 }, { "epoch": 0.07, "grad_norm": 39.210906982421875, "learning_rate": 4.343345283966901e-05, "loss": 3.6268, "step": 5180 }, { "epoch": 0.07, "grad_norm": 26.60144805908203, "learning_rate": 4.337865984268001e-05, "loss": 3.6277, "step": 5200 }, { "epoch": 0.07, "grad_norm": 32.668052673339844, "learning_rate": 4.33236740622459e-05, "loss": 3.6159, "step": 5220 }, { "epoch": 0.07, "grad_norm": 43.837833404541016, "learning_rate": 4.326849607514148e-05, "loss": 3.5939, "step": 5240 }, { "epoch": 0.07, "grad_norm": 20.860111236572266, "learning_rate": 4.321312646015775e-05, "loss": 3.624, "step": 5260 }, { "epoch": 0.07, "grad_norm": 24.005277633666992, "learning_rate": 4.3157565798095753e-05, "loss": 3.6098, "step": 5280 }, { "epoch": 0.08, "grad_norm": 23.65524673461914, "learning_rate": 4.3101814671760546e-05, "loss": 3.6969, "step": 5300 }, { "epoch": 0.08, "grad_norm": 40.98033905029297, "learning_rate": 4.304587366595506e-05, "loss": 3.8225, "step": 5320 }, { "epoch": 0.08, "grad_norm": 28.647207260131836, "learning_rate": 4.298974336747397e-05, "loss": 3.6742, "step": 5340 }, { "epoch": 0.08, "grad_norm": 20.806941986083984, "learning_rate": 4.2933424365097564e-05, "loss": 3.5679, "step": 5360 }, { "epoch": 0.08, "grad_norm": 22.459196090698242, "learning_rate": 4.287691724958551e-05, "loss": 3.6389, "step": 5380 }, { "epoch": 0.08, "grad_norm": 23.558490753173828, "learning_rate": 4.2820222613670736e-05, "loss": 3.6654, "step": 5400 }, { "epoch": 0.08, "grad_norm": 20.315793991088867, "learning_rate": 4.276334105205312e-05, "loss": 3.5976, "step": 5420 }, { "epoch": 0.08, "grad_norm": 21.125396728515625, "learning_rate": 4.2706273161393327e-05, "loss": 3.5712, "step": 5440 }, { "epoch": 0.08, "grad_norm": 25.103483200073242, "learning_rate": 4.2649019540306545e-05, "loss": 3.616, "step": 5460 }, { "epoch": 0.08, "grad_norm": 23.65394401550293, "learning_rate": 4.2591580789356156e-05, "loss": 3.6587, "step": 5480 }, { "epoch": 0.08, "grad_norm": 31.216896057128906, "learning_rate": 4.253395751104748e-05, "loss": 3.7161, "step": 5500 }, { "epoch": 0.08, "grad_norm": 28.144855499267578, "learning_rate": 4.247615030982144e-05, "loss": 3.6847, "step": 5520 }, { "epoch": 0.08, "grad_norm": 23.597564697265625, "learning_rate": 4.241815979204822e-05, "loss": 3.6556, "step": 5540 }, { "epoch": 0.08, "grad_norm": 41.00291061401367, "learning_rate": 4.2359986566020906e-05, "loss": 3.7665, "step": 5560 }, { "epoch": 0.08, "grad_norm": 37.05702209472656, "learning_rate": 4.230163124194913e-05, "loss": 3.5916, "step": 5580 }, { "epoch": 0.08, "grad_norm": 28.161930084228516, "learning_rate": 4.224309443195261e-05, "loss": 3.6887, "step": 5600 }, { "epoch": 0.08, "grad_norm": 31.685361862182617, "learning_rate": 4.2184376750054786e-05, "loss": 3.5724, "step": 5620 }, { "epoch": 0.08, "grad_norm": 38.13533020019531, "learning_rate": 4.2125478812176364e-05, "loss": 3.664, "step": 5640 }, { "epoch": 0.08, "grad_norm": 20.385272979736328, "learning_rate": 4.206640123612884e-05, "loss": 3.73, "step": 5660 }, { "epoch": 0.08, "grad_norm": 30.926259994506836, "learning_rate": 4.200714464160804e-05, "loss": 3.6472, "step": 5680 }, { "epoch": 0.08, "grad_norm": 19.820131301879883, "learning_rate": 4.194770965018758e-05, "loss": 3.6226, "step": 5700 }, { "epoch": 0.08, "grad_norm": 21.318801879882812, "learning_rate": 4.188809688531241e-05, "loss": 3.635, "step": 5720 }, { "epoch": 0.08, "grad_norm": 18.304567337036133, "learning_rate": 4.182830697229223e-05, "loss": 3.625, "step": 5740 }, { "epoch": 0.08, "grad_norm": 24.25802230834961, "learning_rate": 4.176834053829492e-05, "loss": 3.5844, "step": 5760 }, { "epoch": 0.08, "grad_norm": 53.09843444824219, "learning_rate": 4.170819821234001e-05, "loss": 3.7058, "step": 5780 }, { "epoch": 0.08, "grad_norm": 39.87876510620117, "learning_rate": 4.164788062529203e-05, "loss": 3.725, "step": 5800 }, { "epoch": 0.08, "grad_norm": 32.36482620239258, "learning_rate": 4.1587388409853935e-05, "loss": 3.5355, "step": 5820 }, { "epoch": 0.08, "grad_norm": 28.59760284423828, "learning_rate": 4.1526722200560445e-05, "loss": 3.6528, "step": 5840 }, { "epoch": 0.08, "grad_norm": 21.3729305267334, "learning_rate": 4.146588263377137e-05, "loss": 3.6428, "step": 5860 }, { "epoch": 0.08, "grad_norm": 20.160661697387695, "learning_rate": 4.140487034766499e-05, "loss": 3.6116, "step": 5880 }, { "epoch": 0.08, "grad_norm": 31.58021354675293, "learning_rate": 4.134368598223132e-05, "loss": 3.6302, "step": 5900 }, { "epoch": 0.08, "grad_norm": 30.793672561645508, "learning_rate": 4.128233017926538e-05, "loss": 3.5663, "step": 5920 }, { "epoch": 0.08, "grad_norm": 22.589147567749023, "learning_rate": 4.122080358236055e-05, "loss": 3.6292, "step": 5940 }, { "epoch": 0.08, "grad_norm": 32.27565383911133, "learning_rate": 4.1159106836901674e-05, "loss": 3.5806, "step": 5960 }, { "epoch": 0.08, "grad_norm": 37.15829849243164, "learning_rate": 4.109724059005844e-05, "loss": 3.5662, "step": 5980 }, { "epoch": 0.08, "grad_norm": 38.23238754272461, "learning_rate": 4.10352054907785e-05, "loss": 3.6842, "step": 6000 }, { "epoch": 0.09, "grad_norm": 24.37531089782715, "learning_rate": 4.0973002189780694e-05, "loss": 3.6153, "step": 6020 }, { "epoch": 0.09, "grad_norm": 24.309982299804688, "learning_rate": 4.0910631339548206e-05, "loss": 3.6502, "step": 6040 }, { "epoch": 0.09, "grad_norm": 24.007654190063477, "learning_rate": 4.084809359432175e-05, "loss": 3.7203, "step": 6060 }, { "epoch": 0.09, "grad_norm": 24.977094650268555, "learning_rate": 4.0785389610092686e-05, "loss": 3.5413, "step": 6080 }, { "epoch": 0.09, "grad_norm": 27.397930145263672, "learning_rate": 4.072252004459611e-05, "loss": 3.5612, "step": 6100 }, { "epoch": 0.09, "grad_norm": 26.012800216674805, "learning_rate": 4.065948555730405e-05, "loss": 3.6385, "step": 6120 }, { "epoch": 0.09, "grad_norm": 29.745574951171875, "learning_rate": 4.0596286809418435e-05, "loss": 3.6646, "step": 6140 }, { "epoch": 0.09, "grad_norm": 30.76190185546875, "learning_rate": 4.053292446386422e-05, "loss": 3.6622, "step": 6160 }, { "epoch": 0.09, "grad_norm": 27.577564239501953, "learning_rate": 4.046939918528243e-05, "loss": 3.701, "step": 6180 }, { "epoch": 0.09, "grad_norm": 31.610410690307617, "learning_rate": 4.0405711640023186e-05, "loss": 3.5977, "step": 6200 }, { "epoch": 0.09, "grad_norm": 28.61423110961914, "learning_rate": 4.034186249613869e-05, "loss": 3.7307, "step": 6220 }, { "epoch": 0.09, "grad_norm": 44.62327575683594, "learning_rate": 4.027785242337626e-05, "loss": 3.7055, "step": 6240 }, { "epoch": 0.09, "grad_norm": 32.20371627807617, "learning_rate": 4.0213682093171254e-05, "loss": 3.6186, "step": 6260 }, { "epoch": 0.09, "grad_norm": 32.36015701293945, "learning_rate": 4.014935217864009e-05, "loss": 3.5798, "step": 6280 }, { "epoch": 0.09, "grad_norm": 36.1356201171875, "learning_rate": 4.008486335457312e-05, "loss": 3.6395, "step": 6300 }, { "epoch": 0.09, "grad_norm": 20.485820770263672, "learning_rate": 4.0020216297427594e-05, "loss": 3.6075, "step": 6320 }, { "epoch": 0.09, "grad_norm": 21.503564834594727, "learning_rate": 3.995541168532055e-05, "loss": 3.6099, "step": 6340 }, { "epoch": 0.09, "grad_norm": 29.125812530517578, "learning_rate": 3.9890450198021704e-05, "loss": 3.6665, "step": 6360 }, { "epoch": 0.09, "grad_norm": 24.479976654052734, "learning_rate": 3.982533251694632e-05, "loss": 3.7168, "step": 6380 }, { "epoch": 0.09, "grad_norm": 36.184410095214844, "learning_rate": 3.976005932514807e-05, "loss": 3.5771, "step": 6400 }, { "epoch": 0.09, "grad_norm": 28.156030654907227, "learning_rate": 3.969463130731183e-05, "loss": 3.6353, "step": 6420 }, { "epoch": 0.09, "grad_norm": 25.22379493713379, "learning_rate": 3.962904914974656e-05, "loss": 3.5015, "step": 6440 }, { "epoch": 0.09, "grad_norm": 31.427339553833008, "learning_rate": 3.9563313540378055e-05, "loss": 3.5712, "step": 6460 }, { "epoch": 0.09, "grad_norm": 19.2696590423584, "learning_rate": 3.949742516874175e-05, "loss": 3.5929, "step": 6480 }, { "epoch": 0.09, "grad_norm": 23.234111785888672, "learning_rate": 3.943138472597549e-05, "loss": 3.6166, "step": 6500 }, { "epoch": 0.09, "grad_norm": 26.726085662841797, "learning_rate": 3.936519290481226e-05, "loss": 3.6748, "step": 6520 }, { "epoch": 0.09, "grad_norm": 34.712257385253906, "learning_rate": 3.929885039957296e-05, "loss": 3.64, "step": 6540 }, { "epoch": 0.09, "grad_norm": 25.96158218383789, "learning_rate": 3.923235790615907e-05, "loss": 3.6119, "step": 6560 }, { "epoch": 0.09, "grad_norm": 34.04408264160156, "learning_rate": 3.916571612204537e-05, "loss": 3.6881, "step": 6580 }, { "epoch": 0.09, "grad_norm": 20.656030654907227, "learning_rate": 3.909892574627266e-05, "loss": 3.6462, "step": 6600 }, { "epoch": 0.09, "grad_norm": 23.8648738861084, "learning_rate": 3.9031987479440367e-05, "loss": 3.597, "step": 6620 }, { "epoch": 0.09, "grad_norm": 34.48773193359375, "learning_rate": 3.896490202369924e-05, "loss": 3.5781, "step": 6640 }, { "epoch": 0.09, "grad_norm": 26.09375762939453, "learning_rate": 3.8897670082743955e-05, "loss": 3.5463, "step": 6660 }, { "epoch": 0.09, "grad_norm": 25.49962615966797, "learning_rate": 3.883029236180577e-05, "loss": 3.637, "step": 6680 }, { "epoch": 0.09, "grad_norm": 37.70731735229492, "learning_rate": 3.876276956764509e-05, "loss": 3.6515, "step": 6700 }, { "epoch": 0.1, "grad_norm": 53.345558166503906, "learning_rate": 3.8695102408544076e-05, "loss": 3.521, "step": 6720 }, { "epoch": 0.1, "grad_norm": 34.147884368896484, "learning_rate": 3.862729159429921e-05, "loss": 3.6443, "step": 6740 }, { "epoch": 0.1, "grad_norm": 29.45001220703125, "learning_rate": 3.855933783621384e-05, "loss": 3.5976, "step": 6760 }, { "epoch": 0.1, "grad_norm": 29.933969497680664, "learning_rate": 3.849124184709073e-05, "loss": 3.6396, "step": 6780 }, { "epoch": 0.1, "grad_norm": 38.85334014892578, "learning_rate": 3.84230043412246e-05, "loss": 3.6518, "step": 6800 }, { "epoch": 0.1, "grad_norm": 34.85492706298828, "learning_rate": 3.835462603439458e-05, "loss": 3.6577, "step": 6820 }, { "epoch": 0.1, "grad_norm": 29.77360725402832, "learning_rate": 3.828610764385676e-05, "loss": 3.6026, "step": 6840 }, { "epoch": 0.1, "grad_norm": 38.89609909057617, "learning_rate": 3.821744988833663e-05, "loss": 3.6144, "step": 6860 }, { "epoch": 0.1, "grad_norm": 25.664960861206055, "learning_rate": 3.814865348802157e-05, "loss": 3.5826, "step": 6880 }, { "epoch": 0.1, "grad_norm": 31.955894470214844, "learning_rate": 3.807971916455325e-05, "loss": 3.6973, "step": 6900 }, { "epoch": 0.1, "grad_norm": 23.378131866455078, "learning_rate": 3.8010647641020115e-05, "loss": 3.6875, "step": 6920 }, { "epoch": 0.1, "grad_norm": 45.89334487915039, "learning_rate": 3.794143964194976e-05, "loss": 3.5457, "step": 6940 }, { "epoch": 0.1, "grad_norm": 32.45075988769531, "learning_rate": 3.787209589330134e-05, "loss": 3.5719, "step": 6960 }, { "epoch": 0.1, "grad_norm": 32.06966018676758, "learning_rate": 3.7802617122457975e-05, "loss": 3.6324, "step": 6980 }, { "epoch": 0.1, "grad_norm": 27.93364715576172, "learning_rate": 3.773300405821908e-05, "loss": 3.6093, "step": 7000 }, { "epoch": 0.1, "grad_norm": 23.111515045166016, "learning_rate": 3.766325743079277e-05, "loss": 3.5292, "step": 7020 }, { "epoch": 0.1, "grad_norm": 24.405742645263672, "learning_rate": 3.759337797178816e-05, "loss": 3.5969, "step": 7040 }, { "epoch": 0.1, "grad_norm": 37.218467712402344, "learning_rate": 3.752336641420772e-05, "loss": 3.653, "step": 7060 }, { "epoch": 0.1, "grad_norm": 32.396522521972656, "learning_rate": 3.745322349243954e-05, "loss": 3.6483, "step": 7080 }, { "epoch": 0.1, "grad_norm": 35.53373336791992, "learning_rate": 3.7382949942249694e-05, "loss": 3.6356, "step": 7100 }, { "epoch": 0.1, "grad_norm": 33.19758987426758, "learning_rate": 3.731254650077446e-05, "loss": 3.6017, "step": 7120 }, { "epoch": 0.1, "grad_norm": 36.60466003417969, "learning_rate": 3.7242013906512626e-05, "loss": 3.6246, "step": 7140 }, { "epoch": 0.1, "grad_norm": 21.257328033447266, "learning_rate": 3.717135289931774e-05, "loss": 3.6046, "step": 7160 }, { "epoch": 0.1, "grad_norm": 25.697444915771484, "learning_rate": 3.7100564220390326e-05, "loss": 3.6154, "step": 7180 }, { "epoch": 0.1, "grad_norm": 28.491622924804688, "learning_rate": 3.702964861227013e-05, "loss": 3.6983, "step": 7200 }, { "epoch": 0.1, "grad_norm": 26.819791793823242, "learning_rate": 3.695860681882832e-05, "loss": 3.5722, "step": 7220 }, { "epoch": 0.1, "grad_norm": 25.864788055419922, "learning_rate": 3.6887439585259694e-05, "loss": 3.6825, "step": 7240 }, { "epoch": 0.1, "grad_norm": 22.492717742919922, "learning_rate": 3.681614765807486e-05, "loss": 3.6377, "step": 7260 }, { "epoch": 0.1, "grad_norm": 31.227336883544922, "learning_rate": 3.6744731785092395e-05, "loss": 3.5476, "step": 7280 }, { "epoch": 0.1, "grad_norm": 29.010467529296875, "learning_rate": 3.6673192715431015e-05, "loss": 3.6285, "step": 7300 }, { "epoch": 0.1, "grad_norm": 38.40274429321289, "learning_rate": 3.6601531199501714e-05, "loss": 3.6941, "step": 7320 }, { "epoch": 0.1, "grad_norm": 26.361167907714844, "learning_rate": 3.652974798899988e-05, "loss": 3.5772, "step": 7340 }, { "epoch": 0.1, "grad_norm": 30.241390228271484, "learning_rate": 3.645784383689742e-05, "loss": 3.5177, "step": 7360 }, { "epoch": 0.1, "grad_norm": 43.579349517822266, "learning_rate": 3.6385819497434876e-05, "loss": 3.7467, "step": 7380 }, { "epoch": 0.1, "grad_norm": 47.42546081542969, "learning_rate": 3.631367572611348e-05, "loss": 3.6651, "step": 7400 }, { "epoch": 0.11, "grad_norm": 29.63494110107422, "learning_rate": 3.6241413279687254e-05, "loss": 3.6368, "step": 7420 }, { "epoch": 0.11, "grad_norm": 23.63068389892578, "learning_rate": 3.616903291615506e-05, "loss": 3.4684, "step": 7440 }, { "epoch": 0.11, "grad_norm": 24.25609588623047, "learning_rate": 3.6096535394752676e-05, "loss": 3.6177, "step": 7460 }, { "epoch": 0.11, "grad_norm": 23.829919815063477, "learning_rate": 3.6023921475944794e-05, "loss": 3.6008, "step": 7480 }, { "epoch": 0.11, "grad_norm": 23.879764556884766, "learning_rate": 3.595119192141706e-05, "loss": 3.6926, "step": 7500 }, { "epoch": 0.11, "grad_norm": 22.195941925048828, "learning_rate": 3.5878347494068084e-05, "loss": 3.6049, "step": 7520 }, { "epoch": 0.11, "grad_norm": 48.3228645324707, "learning_rate": 3.580538895800144e-05, "loss": 3.64, "step": 7540 }, { "epoch": 0.11, "grad_norm": 33.77362823486328, "learning_rate": 3.5732317078517654e-05, "loss": 3.573, "step": 7560 }, { "epoch": 0.11, "grad_norm": 29.266658782958984, "learning_rate": 3.565913262210615e-05, "loss": 3.6385, "step": 7580 }, { "epoch": 0.11, "grad_norm": 42.985694885253906, "learning_rate": 3.5585836356437264e-05, "loss": 3.5987, "step": 7600 }, { "epoch": 0.11, "grad_norm": 28.579496383666992, "learning_rate": 3.551242905035412e-05, "loss": 3.6161, "step": 7620 }, { "epoch": 0.11, "grad_norm": 27.196502685546875, "learning_rate": 3.5438911473864634e-05, "loss": 3.5763, "step": 7640 }, { "epoch": 0.11, "grad_norm": 28.27582359313965, "learning_rate": 3.5365284398133405e-05, "loss": 3.6452, "step": 7660 }, { "epoch": 0.11, "grad_norm": 27.310009002685547, "learning_rate": 3.52915485954736e-05, "loss": 3.6718, "step": 7680 }, { "epoch": 0.11, "grad_norm": 18.603565216064453, "learning_rate": 3.521770483933891e-05, "loss": 3.7397, "step": 7700 }, { "epoch": 0.11, "grad_norm": 26.25426483154297, "learning_rate": 3.514375390431539e-05, "loss": 3.6665, "step": 7720 }, { "epoch": 0.11, "grad_norm": 29.20294952392578, "learning_rate": 3.506969656611335e-05, "loss": 3.551, "step": 7740 }, { "epoch": 0.11, "grad_norm": 37.7564697265625, "learning_rate": 3.4995533601559226e-05, "loss": 3.58, "step": 7760 }, { "epoch": 0.11, "grad_norm": 25.87001609802246, "learning_rate": 3.4921265788587435e-05, "loss": 3.5855, "step": 7780 }, { "epoch": 0.11, "grad_norm": 26.17401123046875, "learning_rate": 3.484689390623218e-05, "loss": 3.5951, "step": 7800 }, { "epoch": 0.11, "grad_norm": 29.20701026916504, "learning_rate": 3.4772418734619324e-05, "loss": 3.6267, "step": 7820 }, { "epoch": 0.11, "grad_norm": 60.92488098144531, "learning_rate": 3.4697841054958165e-05, "loss": 3.5733, "step": 7840 }, { "epoch": 0.11, "grad_norm": 23.196178436279297, "learning_rate": 3.462316164953328e-05, "loss": 3.6283, "step": 7860 }, { "epoch": 0.11, "grad_norm": 23.13970184326172, "learning_rate": 3.45483813016963e-05, "loss": 3.6558, "step": 7880 }, { "epoch": 0.11, "grad_norm": 36.5677375793457, "learning_rate": 3.447350079585767e-05, "loss": 3.8141, "step": 7900 }, { "epoch": 0.11, "grad_norm": 24.820940017700195, "learning_rate": 3.4398520917478476e-05, "loss": 3.6439, "step": 7920 }, { "epoch": 0.11, "grad_norm": 19.9990291595459, "learning_rate": 3.4323442453062174e-05, "loss": 3.601, "step": 7940 }, { "epoch": 0.11, "grad_norm": 20.419004440307617, "learning_rate": 3.42482661901463e-05, "loss": 3.4856, "step": 7960 }, { "epoch": 0.11, "grad_norm": 24.06426429748535, "learning_rate": 3.417299291729431e-05, "loss": 3.679, "step": 7980 }, { "epoch": 0.11, "grad_norm": 23.68332862854004, "learning_rate": 3.409762342408719e-05, "loss": 3.6538, "step": 8000 }, { "epoch": 0.11, "grad_norm": 22.80304527282715, "learning_rate": 3.402215850111528e-05, "loss": 3.6685, "step": 8020 }, { "epoch": 0.11, "grad_norm": 30.03902244567871, "learning_rate": 3.3946598939969896e-05, "loss": 3.633, "step": 8040 }, { "epoch": 0.11, "grad_norm": 31.799922943115234, "learning_rate": 3.38709455332351e-05, "loss": 3.5756, "step": 8060 }, { "epoch": 0.11, "grad_norm": 29.18169403076172, "learning_rate": 3.379519907447931e-05, "loss": 3.5886, "step": 8080 }, { "epoch": 0.11, "grad_norm": 34.412113189697266, "learning_rate": 3.3719360358247054e-05, "loss": 3.5254, "step": 8100 }, { "epoch": 0.11, "grad_norm": 38.046695709228516, "learning_rate": 3.3643430180050574e-05, "loss": 3.6677, "step": 8120 }, { "epoch": 0.12, "grad_norm": 23.16988182067871, "learning_rate": 3.35674093363615e-05, "loss": 3.5864, "step": 8140 }, { "epoch": 0.12, "grad_norm": 59.21152114868164, "learning_rate": 3.349129862460251e-05, "loss": 3.4903, "step": 8160 }, { "epoch": 0.12, "grad_norm": 21.080909729003906, "learning_rate": 3.341509884313897e-05, "loss": 3.5803, "step": 8180 }, { "epoch": 0.12, "grad_norm": 26.221805572509766, "learning_rate": 3.333881079127052e-05, "loss": 3.5238, "step": 8200 }, { "epoch": 0.12, "grad_norm": 21.8948917388916, "learning_rate": 3.326243526922272e-05, "loss": 3.5498, "step": 8220 }, { "epoch": 0.12, "grad_norm": 29.98341178894043, "learning_rate": 3.3185973078138664e-05, "loss": 3.6218, "step": 8240 }, { "epoch": 0.12, "grad_norm": 21.86969757080078, "learning_rate": 3.310942502007056e-05, "loss": 3.5104, "step": 8260 }, { "epoch": 0.12, "grad_norm": 29.3415584564209, "learning_rate": 3.303279189797131e-05, "loss": 3.5253, "step": 8280 }, { "epoch": 0.12, "grad_norm": 30.171510696411133, "learning_rate": 3.29560745156861e-05, "loss": 3.6886, "step": 8300 }, { "epoch": 0.12, "grad_norm": 24.074813842773438, "learning_rate": 3.287927367794397e-05, "loss": 3.6401, "step": 8320 }, { "epoch": 0.12, "grad_norm": 25.059324264526367, "learning_rate": 3.2802390190349366e-05, "loss": 3.5847, "step": 8340 }, { "epoch": 0.12, "grad_norm": 19.766672134399414, "learning_rate": 3.272542485937369e-05, "loss": 3.5812, "step": 8360 }, { "epoch": 0.12, "grad_norm": 25.08376693725586, "learning_rate": 3.264837849234685e-05, "loss": 3.55, "step": 8380 }, { "epoch": 0.12, "grad_norm": 27.044347763061523, "learning_rate": 3.2571251897448765e-05, "loss": 3.5347, "step": 8400 }, { "epoch": 0.12, "grad_norm": 23.3479061126709, "learning_rate": 3.249404588370094e-05, "loss": 3.5016, "step": 8420 }, { "epoch": 0.12, "grad_norm": 25.586896896362305, "learning_rate": 3.241676126095792e-05, "loss": 3.537, "step": 8440 }, { "epoch": 0.12, "grad_norm": 31.54664421081543, "learning_rate": 3.233939883989882e-05, "loss": 3.6093, "step": 8460 }, { "epoch": 0.12, "grad_norm": 44.6853141784668, "learning_rate": 3.226195943201883e-05, "loss": 3.6135, "step": 8480 }, { "epoch": 0.12, "grad_norm": 43.322757720947266, "learning_rate": 3.218444384962071e-05, "loss": 3.6048, "step": 8500 }, { "epoch": 0.12, "grad_norm": 19.633960723876953, "learning_rate": 3.210685290580622e-05, "loss": 3.5767, "step": 8520 }, { "epoch": 0.12, "grad_norm": 23.640382766723633, "learning_rate": 3.202918741446764e-05, "loss": 3.5961, "step": 8540 }, { "epoch": 0.12, "grad_norm": 46.06730270385742, "learning_rate": 3.1951448190279255e-05, "loss": 3.5757, "step": 8560 }, { "epoch": 0.12, "grad_norm": 24.966190338134766, "learning_rate": 3.187363604868872e-05, "loss": 3.5488, "step": 8580 }, { "epoch": 0.12, "grad_norm": 61.91409683227539, "learning_rate": 3.1795751805908573e-05, "loss": 3.6554, "step": 8600 }, { "epoch": 0.12, "grad_norm": 80.80062103271484, "learning_rate": 3.171779627890769e-05, "loss": 3.6226, "step": 8620 }, { "epoch": 0.12, "grad_norm": 20.951128005981445, "learning_rate": 3.163977028540263e-05, "loss": 3.6122, "step": 8640 }, { "epoch": 0.12, "grad_norm": 19.875343322753906, "learning_rate": 3.156167464384917e-05, "loss": 3.5637, "step": 8660 }, { "epoch": 0.12, "grad_norm": 21.2697811126709, "learning_rate": 3.1483510173433626e-05, "loss": 3.537, "step": 8680 }, { "epoch": 0.12, "grad_norm": 22.24051856994629, "learning_rate": 3.1405277694064305e-05, "loss": 3.5661, "step": 8700 }, { "epoch": 0.12, "grad_norm": 21.55095863342285, "learning_rate": 3.1326978026362904e-05, "loss": 3.5573, "step": 8720 }, { "epoch": 0.12, "grad_norm": 32.11522674560547, "learning_rate": 3.124861199165588e-05, "loss": 3.5995, "step": 8740 }, { "epoch": 0.12, "grad_norm": 22.775867462158203, "learning_rate": 3.117018041196585e-05, "loss": 3.6436, "step": 8760 }, { "epoch": 0.12, "grad_norm": 23.462509155273438, "learning_rate": 3.109168411000299e-05, "loss": 3.601, "step": 8780 }, { "epoch": 0.12, "grad_norm": 23.43865203857422, "learning_rate": 3.101312390915634e-05, "loss": 3.6081, "step": 8800 }, { "epoch": 0.12, "grad_norm": 27.17888832092285, "learning_rate": 3.0934500633485255e-05, "loss": 3.6257, "step": 8820 }, { "epoch": 0.13, "grad_norm": 31.697662353515625, "learning_rate": 3.0855815107710666e-05, "loss": 3.5902, "step": 8840 }, { "epoch": 0.13, "grad_norm": 37.07548904418945, "learning_rate": 3.0777068157206536e-05, "loss": 3.6514, "step": 8860 }, { "epoch": 0.13, "grad_norm": 20.554109573364258, "learning_rate": 3.069826060799109e-05, "loss": 3.5068, "step": 8880 }, { "epoch": 0.13, "grad_norm": 22.490015029907227, "learning_rate": 3.061939328671824e-05, "loss": 3.6488, "step": 8900 }, { "epoch": 0.13, "grad_norm": 25.30253791809082, "learning_rate": 3.0540467020668864e-05, "loss": 3.5931, "step": 8920 }, { "epoch": 0.13, "grad_norm": 22.97053337097168, "learning_rate": 3.0461482637742135e-05, "loss": 3.5475, "step": 8940 }, { "epoch": 0.13, "grad_norm": 22.68851661682129, "learning_rate": 3.0382440966446875e-05, "loss": 3.619, "step": 8960 }, { "epoch": 0.13, "grad_norm": 28.575305938720703, "learning_rate": 3.03033428358928e-05, "loss": 3.5188, "step": 8980 }, { "epoch": 0.13, "grad_norm": 21.909072875976562, "learning_rate": 3.0224189075781884e-05, "loss": 3.5988, "step": 9000 }, { "epoch": 0.13, "grad_norm": 36.04661560058594, "learning_rate": 3.014498051639959e-05, "loss": 3.569, "step": 9020 }, { "epoch": 0.13, "grad_norm": 157.5665740966797, "learning_rate": 3.0065717988606257e-05, "loss": 3.6474, "step": 9040 }, { "epoch": 0.13, "grad_norm": 37.57924270629883, "learning_rate": 2.9986402323828272e-05, "loss": 3.5874, "step": 9060 }, { "epoch": 0.13, "grad_norm": 26.661418914794922, "learning_rate": 2.990703435404944e-05, "loss": 3.5982, "step": 9080 }, { "epoch": 0.13, "grad_norm": 38.95368957519531, "learning_rate": 2.9827614911802203e-05, "loss": 3.4998, "step": 9100 }, { "epoch": 0.13, "grad_norm": 34.97966003417969, "learning_rate": 2.9748144830158924e-05, "loss": 3.5486, "step": 9120 }, { "epoch": 0.13, "grad_norm": 27.682832717895508, "learning_rate": 2.9668624942723162e-05, "loss": 3.6144, "step": 9140 }, { "epoch": 0.13, "grad_norm": 29.238054275512695, "learning_rate": 2.9589056083620902e-05, "loss": 3.6442, "step": 9160 }, { "epoch": 0.13, "grad_norm": 31.821439743041992, "learning_rate": 2.9509439087491835e-05, "loss": 3.6221, "step": 9180 }, { "epoch": 0.13, "grad_norm": 21.238704681396484, "learning_rate": 2.9429774789480575e-05, "loss": 3.6278, "step": 9200 }, { "epoch": 0.13, "grad_norm": 28.50370216369629, "learning_rate": 2.9350064025227897e-05, "loss": 3.6592, "step": 9220 }, { "epoch": 0.13, "grad_norm": 22.938095092773438, "learning_rate": 2.927030763086201e-05, "loss": 3.519, "step": 9240 }, { "epoch": 0.13, "grad_norm": 27.523639678955078, "learning_rate": 2.9190506442989752e-05, "loss": 3.5285, "step": 9260 }, { "epoch": 0.13, "grad_norm": 27.63553237915039, "learning_rate": 2.9110661298687824e-05, "loss": 3.5641, "step": 9280 }, { "epoch": 0.13, "grad_norm": 43.626129150390625, "learning_rate": 2.9030773035493997e-05, "loss": 3.5764, "step": 9300 }, { "epoch": 0.13, "grad_norm": 20.081253051757812, "learning_rate": 2.8950842491398357e-05, "loss": 3.6518, "step": 9320 }, { "epoch": 0.13, "grad_norm": 20.68466567993164, "learning_rate": 2.8870870504834496e-05, "loss": 3.6206, "step": 9340 }, { "epoch": 0.13, "grad_norm": 72.61478424072266, "learning_rate": 2.8790857914670698e-05, "loss": 3.5108, "step": 9360 }, { "epoch": 0.13, "grad_norm": 23.662805557250977, "learning_rate": 2.871080556020118e-05, "loss": 3.6223, "step": 9380 }, { "epoch": 0.13, "grad_norm": 25.221176147460938, "learning_rate": 2.863071428113726e-05, "loss": 3.644, "step": 9400 }, { "epoch": 0.13, "grad_norm": 42.87479782104492, "learning_rate": 2.8550584917598554e-05, "loss": 3.7027, "step": 9420 }, { "epoch": 0.13, "grad_norm": 30.936325073242188, "learning_rate": 2.8470418310104173e-05, "loss": 3.5493, "step": 9440 }, { "epoch": 0.13, "grad_norm": 24.074983596801758, "learning_rate": 2.8390215299563884e-05, "loss": 3.4781, "step": 9460 }, { "epoch": 0.13, "grad_norm": 22.818313598632812, "learning_rate": 2.8309976727269332e-05, "loss": 3.5558, "step": 9480 }, { "epoch": 0.13, "grad_norm": 33.634605407714844, "learning_rate": 2.8229703434885163e-05, "loss": 3.5958, "step": 9500 }, { "epoch": 0.13, "grad_norm": 29.69095802307129, "learning_rate": 2.814939626444023e-05, "loss": 3.5682, "step": 9520 }, { "epoch": 0.14, "grad_norm": 29.696638107299805, "learning_rate": 2.8069056058318755e-05, "loss": 3.5676, "step": 9540 }, { "epoch": 0.14, "grad_norm": 34.828269958496094, "learning_rate": 2.7988683659251474e-05, "loss": 3.482, "step": 9560 }, { "epoch": 0.14, "grad_norm": 21.408599853515625, "learning_rate": 2.7908279910306835e-05, "loss": 3.5189, "step": 9580 }, { "epoch": 0.14, "grad_norm": 21.67983627319336, "learning_rate": 2.782784565488211e-05, "loss": 3.5703, "step": 9600 }, { "epoch": 0.14, "grad_norm": 24.80797576904297, "learning_rate": 2.7747381736694572e-05, "loss": 3.512, "step": 9620 }, { "epoch": 0.14, "grad_norm": 35.9987678527832, "learning_rate": 2.766688899977266e-05, "loss": 3.5937, "step": 9640 }, { "epoch": 0.14, "grad_norm": 24.59494400024414, "learning_rate": 2.7586368288447095e-05, "loss": 3.5829, "step": 9660 }, { "epoch": 0.14, "grad_norm": 26.161178588867188, "learning_rate": 2.7505820447342028e-05, "loss": 3.5978, "step": 9680 }, { "epoch": 0.14, "grad_norm": 17.551490783691406, "learning_rate": 2.7425246321366203e-05, "loss": 3.527, "step": 9700 }, { "epoch": 0.14, "grad_norm": 31.15974998474121, "learning_rate": 2.7344646755704078e-05, "loss": 3.6422, "step": 9720 }, { "epoch": 0.14, "grad_norm": 24.26822280883789, "learning_rate": 2.7264022595806948e-05, "loss": 3.5971, "step": 9740 }, { "epoch": 0.14, "grad_norm": 24.76336669921875, "learning_rate": 2.71833746873841e-05, "loss": 3.5972, "step": 9760 }, { "epoch": 0.14, "grad_norm": 18.041610717773438, "learning_rate": 2.7102703876393944e-05, "loss": 3.5832, "step": 9780 }, { "epoch": 0.14, "grad_norm": 45.17101287841797, "learning_rate": 2.7022011009035107e-05, "loss": 3.5754, "step": 9800 }, { "epoch": 0.14, "grad_norm": 30.87299156188965, "learning_rate": 2.6941296931737585e-05, "loss": 3.6022, "step": 9820 }, { "epoch": 0.14, "grad_norm": 32.659786224365234, "learning_rate": 2.686056249115385e-05, "loss": 3.5947, "step": 9840 }, { "epoch": 0.14, "grad_norm": 25.66715431213379, "learning_rate": 2.6779808534149987e-05, "loss": 3.5997, "step": 9860 }, { "epoch": 0.14, "grad_norm": 24.735023498535156, "learning_rate": 2.6699035907796792e-05, "loss": 3.5619, "step": 9880 }, { "epoch": 0.14, "grad_norm": 24.357643127441406, "learning_rate": 2.6618245459360897e-05, "loss": 3.6028, "step": 9900 }, { "epoch": 0.14, "grad_norm": 29.255617141723633, "learning_rate": 2.6537438036295875e-05, "loss": 3.5231, "step": 9920 }, { "epoch": 0.14, "grad_norm": 19.508926391601562, "learning_rate": 2.6456614486233343e-05, "loss": 3.5555, "step": 9940 }, { "epoch": 0.14, "grad_norm": 29.48908042907715, "learning_rate": 2.6375775656974123e-05, "loss": 3.5376, "step": 9960 }, { "epoch": 0.14, "grad_norm": 23.857908248901367, "learning_rate": 2.629492239647926e-05, "loss": 3.5641, "step": 9980 }, { "epoch": 0.14, "grad_norm": 39.08363342285156, "learning_rate": 2.621405555286121e-05, "loss": 3.5957, "step": 10000 }, { "epoch": 0.14, "grad_norm": 30.3124942779541, "learning_rate": 2.6133175974374892e-05, "loss": 3.5933, "step": 10020 }, { "epoch": 0.14, "grad_norm": 23.010902404785156, "learning_rate": 2.6052284509408804e-05, "loss": 3.573, "step": 10040 }, { "epoch": 0.14, "grad_norm": 34.93478775024414, "learning_rate": 2.5971382006476154e-05, "loss": 3.5641, "step": 10060 }, { "epoch": 0.14, "grad_norm": 28.346033096313477, "learning_rate": 2.5890469314205897e-05, "loss": 3.5833, "step": 10080 }, { "epoch": 0.14, "grad_norm": 41.69817352294922, "learning_rate": 2.5809547281333902e-05, "loss": 3.5718, "step": 10100 }, { "epoch": 0.14, "grad_norm": 36.409114837646484, "learning_rate": 2.5728616756693997e-05, "loss": 3.5675, "step": 10120 }, { "epoch": 0.14, "grad_norm": 23.812952041625977, "learning_rate": 2.564767858920909e-05, "loss": 3.6445, "step": 10140 }, { "epoch": 0.14, "grad_norm": 31.33305549621582, "learning_rate": 2.556673362788225e-05, "loss": 3.5669, "step": 10160 }, { "epoch": 0.14, "grad_norm": 34.65876770019531, "learning_rate": 2.5485782721787837e-05, "loss": 3.53, "step": 10180 }, { "epoch": 0.14, "grad_norm": 21.920583724975586, "learning_rate": 2.540482672006254e-05, "loss": 3.5825, "step": 10200 }, { "epoch": 0.14, "grad_norm": 48.019004821777344, "learning_rate": 2.5323866471896512e-05, "loss": 3.5733, "step": 10220 }, { "epoch": 0.14, "grad_norm": 52.00071334838867, "learning_rate": 2.5242902826524434e-05, "loss": 3.5487, "step": 10240 }, { "epoch": 0.15, "grad_norm": 47.33055877685547, "learning_rate": 2.5161936633216653e-05, "loss": 3.5076, "step": 10260 }, { "epoch": 0.15, "grad_norm": 20.109745025634766, "learning_rate": 2.5080968741270223e-05, "loss": 3.5991, "step": 10280 }, { "epoch": 0.15, "grad_norm": 23.458494186401367, "learning_rate": 2.5e-05, "loss": 3.6357, "step": 10300 }, { "epoch": 0.15, "grad_norm": 23.68842315673828, "learning_rate": 2.4919031258729786e-05, "loss": 3.5449, "step": 10320 }, { "epoch": 0.15, "grad_norm": 21.289154052734375, "learning_rate": 2.4838063366783353e-05, "loss": 3.6704, "step": 10340 }, { "epoch": 0.15, "grad_norm": 23.132051467895508, "learning_rate": 2.4757097173475572e-05, "loss": 3.6327, "step": 10360 }, { "epoch": 0.15, "grad_norm": 29.875104904174805, "learning_rate": 2.4676133528103497e-05, "loss": 3.5294, "step": 10380 }, { "epoch": 0.15, "grad_norm": 23.41105079650879, "learning_rate": 2.4595173279937464e-05, "loss": 3.5995, "step": 10400 }, { "epoch": 0.15, "grad_norm": 22.15860939025879, "learning_rate": 2.451421727821217e-05, "loss": 3.6109, "step": 10420 }, { "epoch": 0.15, "grad_norm": 28.534278869628906, "learning_rate": 2.443326637211775e-05, "loss": 3.6389, "step": 10440 }, { "epoch": 0.15, "grad_norm": 26.33950424194336, "learning_rate": 2.435232141079092e-05, "loss": 3.6083, "step": 10460 }, { "epoch": 0.15, "grad_norm": 19.027633666992188, "learning_rate": 2.4271383243306016e-05, "loss": 3.5256, "step": 10480 }, { "epoch": 0.15, "grad_norm": 28.898550033569336, "learning_rate": 2.419045271866611e-05, "loss": 3.61, "step": 10500 }, { "epoch": 0.15, "grad_norm": 35.347347259521484, "learning_rate": 2.410953068579411e-05, "loss": 3.616, "step": 10520 }, { "epoch": 0.15, "grad_norm": 23.184894561767578, "learning_rate": 2.402861799352386e-05, "loss": 3.6263, "step": 10540 }, { "epoch": 0.15, "grad_norm": 32.66107177734375, "learning_rate": 2.3947715490591206e-05, "loss": 3.5446, "step": 10560 }, { "epoch": 0.15, "grad_norm": 20.614028930664062, "learning_rate": 2.3866824025625124e-05, "loss": 3.5989, "step": 10580 }, { "epoch": 0.15, "grad_norm": 25.750699996948242, "learning_rate": 2.3785944447138802e-05, "loss": 3.5197, "step": 10600 }, { "epoch": 0.15, "grad_norm": 23.97648048400879, "learning_rate": 2.370507760352074e-05, "loss": 3.6399, "step": 10620 }, { "epoch": 0.15, "grad_norm": 19.600095748901367, "learning_rate": 2.362422434302588e-05, "loss": 3.5295, "step": 10640 }, { "epoch": 0.15, "grad_norm": 27.21882438659668, "learning_rate": 2.3543385513766656e-05, "loss": 3.512, "step": 10660 }, { "epoch": 0.15, "grad_norm": 27.75621795654297, "learning_rate": 2.3462561963704134e-05, "loss": 3.5351, "step": 10680 }, { "epoch": 0.15, "grad_norm": 27.200828552246094, "learning_rate": 2.338175454063911e-05, "loss": 3.5038, "step": 10700 }, { "epoch": 0.15, "grad_norm": 27.96784782409668, "learning_rate": 2.3300964092203207e-05, "loss": 3.6097, "step": 10720 }, { "epoch": 0.15, "grad_norm": 28.206979751586914, "learning_rate": 2.3220191465850015e-05, "loss": 3.5254, "step": 10740 }, { "epoch": 0.15, "grad_norm": 22.781152725219727, "learning_rate": 2.3139437508846155e-05, "loss": 3.5857, "step": 10760 }, { "epoch": 0.15, "grad_norm": 23.07236099243164, "learning_rate": 2.305870306826242e-05, "loss": 3.4872, "step": 10780 }, { "epoch": 0.15, "grad_norm": 22.408714294433594, "learning_rate": 2.29779889909649e-05, "loss": 3.5115, "step": 10800 }, { "epoch": 0.15, "grad_norm": 23.98442268371582, "learning_rate": 2.289729612360606e-05, "loss": 3.6297, "step": 10820 }, { "epoch": 0.15, "grad_norm": 29.503135681152344, "learning_rate": 2.2816625312615903e-05, "loss": 3.6209, "step": 10840 }, { "epoch": 0.15, "grad_norm": 30.787010192871094, "learning_rate": 2.2735977404193058e-05, "loss": 3.4921, "step": 10860 }, { "epoch": 0.15, "grad_norm": 24.088376998901367, "learning_rate": 2.2655353244295928e-05, "loss": 3.5582, "step": 10880 }, { "epoch": 0.15, "grad_norm": 25.253761291503906, "learning_rate": 2.25747536786338e-05, "loss": 3.5297, "step": 10900 }, { "epoch": 0.15, "grad_norm": 24.333845138549805, "learning_rate": 2.2494179552657978e-05, "loss": 3.6105, "step": 10920 }, { "epoch": 0.15, "grad_norm": 32.39179229736328, "learning_rate": 2.241363171155291e-05, "loss": 3.6122, "step": 10940 }, { "epoch": 0.16, "grad_norm": 31.885894775390625, "learning_rate": 2.2333111000227342e-05, "loss": 3.6358, "step": 10960 }, { "epoch": 0.16, "grad_norm": 22.056386947631836, "learning_rate": 2.225261826330543e-05, "loss": 3.5181, "step": 10980 }, { "epoch": 0.16, "grad_norm": 23.47673225402832, "learning_rate": 2.2172154345117894e-05, "loss": 3.4853, "step": 11000 }, { "epoch": 0.16, "grad_norm": 23.548656463623047, "learning_rate": 2.2091720089693168e-05, "loss": 3.5468, "step": 11020 }, { "epoch": 0.16, "grad_norm": 16.66544532775879, "learning_rate": 2.201131634074853e-05, "loss": 3.626, "step": 11040 }, { "epoch": 0.16, "grad_norm": 30.556697845458984, "learning_rate": 2.1930943941681254e-05, "loss": 3.5565, "step": 11060 }, { "epoch": 0.16, "grad_norm": 62.914642333984375, "learning_rate": 2.1850603735559778e-05, "loss": 3.554, "step": 11080 }, { "epoch": 0.16, "grad_norm": 25.617481231689453, "learning_rate": 2.177029656511485e-05, "loss": 3.5449, "step": 11100 }, { "epoch": 0.16, "grad_norm": 33.26127243041992, "learning_rate": 2.169002327273068e-05, "loss": 3.6071, "step": 11120 }, { "epoch": 0.16, "grad_norm": 21.895418167114258, "learning_rate": 2.160978470043612e-05, "loss": 3.4622, "step": 11140 }, { "epoch": 0.16, "grad_norm": 25.30924415588379, "learning_rate": 2.152958168989584e-05, "loss": 3.5169, "step": 11160 }, { "epoch": 0.16, "grad_norm": 28.7779541015625, "learning_rate": 2.1449415082401455e-05, "loss": 3.5817, "step": 11180 }, { "epoch": 0.16, "grad_norm": 24.38544273376465, "learning_rate": 2.136928571886275e-05, "loss": 3.5433, "step": 11200 }, { "epoch": 0.16, "grad_norm": 36.38949966430664, "learning_rate": 2.1289194439798818e-05, "loss": 3.5653, "step": 11220 }, { "epoch": 0.16, "grad_norm": 36.11268615722656, "learning_rate": 2.12091420853293e-05, "loss": 3.4839, "step": 11240 }, { "epoch": 0.16, "grad_norm": 18.36191749572754, "learning_rate": 2.1129129495165507e-05, "loss": 3.5532, "step": 11260 }, { "epoch": 0.16, "grad_norm": 27.239763259887695, "learning_rate": 2.1049157508601642e-05, "loss": 3.5536, "step": 11280 }, { "epoch": 0.16, "grad_norm": 25.459758758544922, "learning_rate": 2.0969226964506006e-05, "loss": 3.4878, "step": 11300 }, { "epoch": 0.16, "grad_norm": 28.359439849853516, "learning_rate": 2.0889338701312185e-05, "loss": 3.563, "step": 11320 }, { "epoch": 0.16, "grad_norm": 25.177392959594727, "learning_rate": 2.0809493557010247e-05, "loss": 3.6313, "step": 11340 }, { "epoch": 0.16, "grad_norm": 26.633609771728516, "learning_rate": 2.072969236913799e-05, "loss": 3.6034, "step": 11360 }, { "epoch": 0.16, "grad_norm": 19.589900970458984, "learning_rate": 2.0649935974772105e-05, "loss": 3.6429, "step": 11380 }, { "epoch": 0.16, "grad_norm": 35.16368865966797, "learning_rate": 2.0570225210519434e-05, "loss": 3.5154, "step": 11400 }, { "epoch": 0.16, "grad_norm": 37.59727478027344, "learning_rate": 2.0490560912508168e-05, "loss": 3.5652, "step": 11420 }, { "epoch": 0.16, "grad_norm": 37.76837158203125, "learning_rate": 2.04109439163791e-05, "loss": 3.6911, "step": 11440 }, { "epoch": 0.16, "grad_norm": 27.86673355102539, "learning_rate": 2.0331375057276844e-05, "loss": 3.4824, "step": 11460 }, { "epoch": 0.16, "grad_norm": 51.12165832519531, "learning_rate": 2.025185516984108e-05, "loss": 3.558, "step": 11480 }, { "epoch": 0.16, "grad_norm": 22.489160537719727, "learning_rate": 2.0172385088197803e-05, "loss": 3.5595, "step": 11500 }, { "epoch": 0.16, "grad_norm": 19.6495304107666, "learning_rate": 2.0092965645950564e-05, "loss": 3.5679, "step": 11520 }, { "epoch": 0.16, "grad_norm": 19.997142791748047, "learning_rate": 2.001359767617173e-05, "loss": 3.5332, "step": 11540 }, { "epoch": 0.16, "grad_norm": 34.29532241821289, "learning_rate": 1.9934282011393753e-05, "loss": 3.4848, "step": 11560 }, { "epoch": 0.16, "grad_norm": 20.737041473388672, "learning_rate": 1.985501948360041e-05, "loss": 3.4874, "step": 11580 }, { "epoch": 0.16, "grad_norm": 30.11549186706543, "learning_rate": 1.9775810924218125e-05, "loss": 3.5166, "step": 11600 }, { "epoch": 0.16, "grad_norm": 23.56212615966797, "learning_rate": 1.9696657164107202e-05, "loss": 3.652, "step": 11620 }, { "epoch": 0.16, "grad_norm": 20.44150733947754, "learning_rate": 1.9617559033553128e-05, "loss": 3.5137, "step": 11640 }, { "epoch": 0.17, "grad_norm": 33.37120819091797, "learning_rate": 1.9538517362257868e-05, "loss": 3.5163, "step": 11660 }, { "epoch": 0.17, "grad_norm": 29.839820861816406, "learning_rate": 1.945953297933115e-05, "loss": 3.5979, "step": 11680 }, { "epoch": 0.17, "grad_norm": 25.600812911987305, "learning_rate": 1.9380606713281775e-05, "loss": 3.6111, "step": 11700 }, { "epoch": 0.17, "grad_norm": 40.76740264892578, "learning_rate": 1.9301739392008923e-05, "loss": 3.6727, "step": 11720 }, { "epoch": 0.17, "grad_norm": 25.436763763427734, "learning_rate": 1.9222931842793473e-05, "loss": 3.6145, "step": 11740 }, { "epoch": 0.17, "grad_norm": 19.53345489501953, "learning_rate": 1.9144184892289337e-05, "loss": 3.5486, "step": 11760 }, { "epoch": 0.17, "grad_norm": 21.103118896484375, "learning_rate": 1.9065499366514757e-05, "loss": 3.5796, "step": 11780 }, { "epoch": 0.17, "grad_norm": 26.135894775390625, "learning_rate": 1.8986876090843667e-05, "loss": 3.5905, "step": 11800 }, { "epoch": 0.17, "grad_norm": 32.71371841430664, "learning_rate": 1.8908315889997007e-05, "loss": 3.531, "step": 11820 }, { "epoch": 0.17, "grad_norm": 23.510149002075195, "learning_rate": 1.882981958803414e-05, "loss": 3.5597, "step": 11840 }, { "epoch": 0.17, "grad_norm": 23.804306030273438, "learning_rate": 1.8751388008344117e-05, "loss": 3.5755, "step": 11860 }, { "epoch": 0.17, "grad_norm": 33.7330436706543, "learning_rate": 1.8673021973637095e-05, "loss": 3.5092, "step": 11880 }, { "epoch": 0.17, "grad_norm": 20.88456153869629, "learning_rate": 1.859472230593569e-05, "loss": 3.6001, "step": 11900 }, { "epoch": 0.17, "grad_norm": 28.640546798706055, "learning_rate": 1.8516489826566376e-05, "loss": 3.5419, "step": 11920 }, { "epoch": 0.17, "grad_norm": 25.2142391204834, "learning_rate": 1.8438325356150826e-05, "loss": 3.465, "step": 11940 }, { "epoch": 0.17, "grad_norm": 27.663267135620117, "learning_rate": 1.836022971459737e-05, "loss": 3.5017, "step": 11960 }, { "epoch": 0.17, "grad_norm": 31.913984298706055, "learning_rate": 1.828220372109232e-05, "loss": 3.5187, "step": 11980 }, { "epoch": 0.17, "grad_norm": 29.825590133666992, "learning_rate": 1.820424819409143e-05, "loss": 3.5469, "step": 12000 }, { "epoch": 0.17, "grad_norm": 18.72800636291504, "learning_rate": 1.8126363951311287e-05, "loss": 3.5486, "step": 12020 }, { "epoch": 0.17, "grad_norm": 30.366409301757812, "learning_rate": 1.804855180972075e-05, "loss": 3.5487, "step": 12040 }, { "epoch": 0.17, "grad_norm": 25.00339698791504, "learning_rate": 1.797081258553236e-05, "loss": 3.4778, "step": 12060 }, { "epoch": 0.17, "grad_norm": 29.204017639160156, "learning_rate": 1.7893147094193786e-05, "loss": 3.446, "step": 12080 }, { "epoch": 0.17, "grad_norm": 28.64485740661621, "learning_rate": 1.7815556150379298e-05, "loss": 3.5421, "step": 12100 }, { "epoch": 0.17, "grad_norm": 31.4785213470459, "learning_rate": 1.7738040567981166e-05, "loss": 3.5075, "step": 12120 }, { "epoch": 0.17, "grad_norm": 28.798315048217773, "learning_rate": 1.766060116010118e-05, "loss": 3.5049, "step": 12140 }, { "epoch": 0.17, "grad_norm": 27.112850189208984, "learning_rate": 1.7583238739042086e-05, "loss": 3.5939, "step": 12160 }, { "epoch": 0.17, "grad_norm": 24.396697998046875, "learning_rate": 1.7505954116299063e-05, "loss": 3.4596, "step": 12180 }, { "epoch": 0.17, "grad_norm": 18.46675682067871, "learning_rate": 1.7428748102551237e-05, "loss": 3.4861, "step": 12200 }, { "epoch": 0.17, "grad_norm": 25.615234375, "learning_rate": 1.7351621507653157e-05, "loss": 3.5211, "step": 12220 }, { "epoch": 0.17, "grad_norm": 23.890607833862305, "learning_rate": 1.7274575140626318e-05, "loss": 3.5255, "step": 12240 }, { "epoch": 0.17, "grad_norm": 19.983030319213867, "learning_rate": 1.7197609809650643e-05, "loss": 3.5567, "step": 12260 }, { "epoch": 0.17, "grad_norm": 24.144041061401367, "learning_rate": 1.712072632205604e-05, "loss": 3.5586, "step": 12280 }, { "epoch": 0.17, "grad_norm": 35.812835693359375, "learning_rate": 1.704392548431391e-05, "loss": 3.5274, "step": 12300 }, { "epoch": 0.17, "grad_norm": 18.759809494018555, "learning_rate": 1.6967208102028697e-05, "loss": 3.5823, "step": 12320 }, { "epoch": 0.17, "grad_norm": 19.85857391357422, "learning_rate": 1.6890574979929448e-05, "loss": 3.5583, "step": 12340 }, { "epoch": 0.17, "grad_norm": 21.088096618652344, "learning_rate": 1.6814026921861335e-05, "loss": 3.5084, "step": 12360 }, { "epoch": 0.18, "grad_norm": 21.760805130004883, "learning_rate": 1.6737564730777284e-05, "loss": 3.4753, "step": 12380 }, { "epoch": 0.18, "grad_norm": 25.510221481323242, "learning_rate": 1.666118920872949e-05, "loss": 3.6024, "step": 12400 }, { "epoch": 0.18, "grad_norm": 29.43688201904297, "learning_rate": 1.658490115686104e-05, "loss": 3.647, "step": 12420 }, { "epoch": 0.18, "grad_norm": 19.17232322692871, "learning_rate": 1.6508701375397487e-05, "loss": 3.5505, "step": 12440 }, { "epoch": 0.18, "grad_norm": 27.04405975341797, "learning_rate": 1.64325906636385e-05, "loss": 3.5158, "step": 12460 }, { "epoch": 0.18, "grad_norm": 34.61522674560547, "learning_rate": 1.635656981994943e-05, "loss": 3.5723, "step": 12480 }, { "epoch": 0.18, "grad_norm": 22.05956268310547, "learning_rate": 1.6280639641752942e-05, "loss": 3.5133, "step": 12500 }, { "epoch": 0.18, "grad_norm": 26.21821403503418, "learning_rate": 1.6204800925520685e-05, "loss": 3.4956, "step": 12520 }, { "epoch": 0.18, "grad_norm": 16.636159896850586, "learning_rate": 1.6129054466764904e-05, "loss": 3.5843, "step": 12540 }, { "epoch": 0.18, "grad_norm": 27.356168746948242, "learning_rate": 1.60534010600301e-05, "loss": 3.5189, "step": 12560 }, { "epoch": 0.18, "grad_norm": 19.394620895385742, "learning_rate": 1.5977841498884723e-05, "loss": 3.5838, "step": 12580 }, { "epoch": 0.18, "grad_norm": 31.54738426208496, "learning_rate": 1.5902376575912815e-05, "loss": 3.6633, "step": 12600 }, { "epoch": 0.18, "grad_norm": 23.533172607421875, "learning_rate": 1.5827007082705698e-05, "loss": 3.5234, "step": 12620 }, { "epoch": 0.18, "grad_norm": 25.580156326293945, "learning_rate": 1.5751733809853704e-05, "loss": 3.5478, "step": 12640 }, { "epoch": 0.18, "grad_norm": 28.20244789123535, "learning_rate": 1.5676557546937838e-05, "loss": 3.49, "step": 12660 }, { "epoch": 0.18, "grad_norm": 26.89322280883789, "learning_rate": 1.5601479082521526e-05, "loss": 3.5238, "step": 12680 }, { "epoch": 0.18, "grad_norm": 25.817209243774414, "learning_rate": 1.552649920414233e-05, "loss": 3.5417, "step": 12700 }, { "epoch": 0.18, "grad_norm": 30.55599594116211, "learning_rate": 1.545161869830371e-05, "loss": 3.5908, "step": 12720 }, { "epoch": 0.18, "grad_norm": 49.497894287109375, "learning_rate": 1.5376838350466725e-05, "loss": 3.6647, "step": 12740 }, { "epoch": 0.18, "grad_norm": 17.040536880493164, "learning_rate": 1.5302158945041838e-05, "loss": 3.5271, "step": 12760 }, { "epoch": 0.18, "grad_norm": 31.21510124206543, "learning_rate": 1.5227581265380685e-05, "loss": 3.4708, "step": 12780 }, { "epoch": 0.18, "grad_norm": 26.194982528686523, "learning_rate": 1.5153106093767827e-05, "loss": 3.5831, "step": 12800 }, { "epoch": 0.18, "grad_norm": 21.868547439575195, "learning_rate": 1.5078734211412573e-05, "loss": 3.532, "step": 12820 }, { "epoch": 0.18, "grad_norm": 27.728635787963867, "learning_rate": 1.5004466398440775e-05, "loss": 3.5432, "step": 12840 }, { "epoch": 0.18, "grad_norm": 22.69495964050293, "learning_rate": 1.493030343388666e-05, "loss": 3.5464, "step": 12860 }, { "epoch": 0.18, "grad_norm": 22.85190773010254, "learning_rate": 1.4856246095684622e-05, "loss": 3.5686, "step": 12880 }, { "epoch": 0.18, "grad_norm": 18.40359115600586, "learning_rate": 1.4782295160661103e-05, "loss": 3.4922, "step": 12900 }, { "epoch": 0.18, "grad_norm": 27.058635711669922, "learning_rate": 1.4708451404526407e-05, "loss": 3.5231, "step": 12920 }, { "epoch": 0.18, "grad_norm": 28.491573333740234, "learning_rate": 1.4634715601866606e-05, "loss": 3.502, "step": 12940 }, { "epoch": 0.18, "grad_norm": 26.94320297241211, "learning_rate": 1.4561088526135375e-05, "loss": 3.5746, "step": 12960 }, { "epoch": 0.18, "grad_norm": 26.503097534179688, "learning_rate": 1.4487570949645888e-05, "loss": 3.5195, "step": 12980 }, { "epoch": 0.18, "grad_norm": 32.38998794555664, "learning_rate": 1.4414163643562755e-05, "loss": 3.5195, "step": 13000 }, { "epoch": 0.18, "grad_norm": 29.240285873413086, "learning_rate": 1.434086737789386e-05, "loss": 3.6301, "step": 13020 }, { "epoch": 0.18, "grad_norm": 19.996788024902344, "learning_rate": 1.4267682921482356e-05, "loss": 3.5252, "step": 13040 }, { "epoch": 0.18, "grad_norm": 26.969528198242188, "learning_rate": 1.419461104199856e-05, "loss": 3.546, "step": 13060 }, { "epoch": 0.19, "grad_norm": 39.526832580566406, "learning_rate": 1.412165250593192e-05, "loss": 3.5464, "step": 13080 }, { "epoch": 0.19, "grad_norm": 22.60038948059082, "learning_rate": 1.4048808078582942e-05, "loss": 3.475, "step": 13100 }, { "epoch": 0.19, "grad_norm": 20.97502899169922, "learning_rate": 1.3976078524055203e-05, "loss": 3.5398, "step": 13120 }, { "epoch": 0.19, "grad_norm": 16.191055297851562, "learning_rate": 1.3903464605247325e-05, "loss": 3.4869, "step": 13140 }, { "epoch": 0.19, "grad_norm": 22.308320999145508, "learning_rate": 1.3830967083844942e-05, "loss": 3.4316, "step": 13160 }, { "epoch": 0.19, "grad_norm": 23.294443130493164, "learning_rate": 1.375858672031276e-05, "loss": 3.6033, "step": 13180 }, { "epoch": 0.19, "grad_norm": 24.432270050048828, "learning_rate": 1.368632427388653e-05, "loss": 3.4829, "step": 13200 }, { "epoch": 0.19, "grad_norm": 24.131166458129883, "learning_rate": 1.3614180502565135e-05, "loss": 3.5721, "step": 13220 }, { "epoch": 0.19, "grad_norm": 36.706668853759766, "learning_rate": 1.3542156163102582e-05, "loss": 3.4877, "step": 13240 }, { "epoch": 0.19, "grad_norm": 22.797359466552734, "learning_rate": 1.3470252011000123e-05, "loss": 3.539, "step": 13260 }, { "epoch": 0.19, "grad_norm": 26.030719757080078, "learning_rate": 1.3398468800498293e-05, "loss": 3.5415, "step": 13280 }, { "epoch": 0.19, "grad_norm": 24.123130798339844, "learning_rate": 1.3326807284568984e-05, "loss": 3.5354, "step": 13300 }, { "epoch": 0.19, "grad_norm": 24.965229034423828, "learning_rate": 1.3255268214907613e-05, "loss": 3.387, "step": 13320 }, { "epoch": 0.19, "grad_norm": 27.477920532226562, "learning_rate": 1.3183852341925145e-05, "loss": 3.5484, "step": 13340 }, { "epoch": 0.19, "grad_norm": 27.259984970092773, "learning_rate": 1.3112560414740315e-05, "loss": 3.5104, "step": 13360 }, { "epoch": 0.19, "grad_norm": 34.4488525390625, "learning_rate": 1.3041393181171688e-05, "loss": 3.5881, "step": 13380 }, { "epoch": 0.19, "grad_norm": 30.281177520751953, "learning_rate": 1.2970351387729873e-05, "loss": 3.5851, "step": 13400 }, { "epoch": 0.19, "grad_norm": 21.918861389160156, "learning_rate": 1.2899435779609682e-05, "loss": 3.5427, "step": 13420 }, { "epoch": 0.19, "grad_norm": 30.275283813476562, "learning_rate": 1.2828647100682261e-05, "loss": 3.6322, "step": 13440 }, { "epoch": 0.19, "grad_norm": 57.30770492553711, "learning_rate": 1.275798609348738e-05, "loss": 3.5871, "step": 13460 }, { "epoch": 0.19, "grad_norm": 21.865861892700195, "learning_rate": 1.2687453499225545e-05, "loss": 3.5117, "step": 13480 }, { "epoch": 0.19, "grad_norm": 19.6708927154541, "learning_rate": 1.2617050057750322e-05, "loss": 3.5015, "step": 13500 }, { "epoch": 0.19, "grad_norm": 27.376646041870117, "learning_rate": 1.2546776507560468e-05, "loss": 3.5206, "step": 13520 }, { "epoch": 0.19, "grad_norm": 44.89965057373047, "learning_rate": 1.2476633585792286e-05, "loss": 3.5766, "step": 13540 }, { "epoch": 0.19, "grad_norm": 37.562286376953125, "learning_rate": 1.2406622028211844e-05, "loss": 3.5488, "step": 13560 }, { "epoch": 0.19, "grad_norm": 41.833892822265625, "learning_rate": 1.2336742569207235e-05, "loss": 3.6429, "step": 13580 }, { "epoch": 0.19, "grad_norm": 44.58812713623047, "learning_rate": 1.2266995941780934e-05, "loss": 3.5362, "step": 13600 }, { "epoch": 0.19, "grad_norm": 26.543933868408203, "learning_rate": 1.2197382877542041e-05, "loss": 3.5761, "step": 13620 }, { "epoch": 0.19, "grad_norm": 25.108768463134766, "learning_rate": 1.2127904106698666e-05, "loss": 3.4656, "step": 13640 }, { "epoch": 0.19, "grad_norm": 22.213363647460938, "learning_rate": 1.2058560358050241e-05, "loss": 3.5438, "step": 13660 }, { "epoch": 0.19, "grad_norm": 25.67693519592285, "learning_rate": 1.1989352358979888e-05, "loss": 3.5508, "step": 13680 }, { "epoch": 0.19, "grad_norm": 23.043434143066406, "learning_rate": 1.1920280835446748e-05, "loss": 3.5901, "step": 13700 }, { "epoch": 0.19, "grad_norm": 25.011388778686523, "learning_rate": 1.1851346511978425e-05, "loss": 3.5773, "step": 13720 }, { "epoch": 0.19, "grad_norm": 29.659713745117188, "learning_rate": 1.1782550111663369e-05, "loss": 3.5795, "step": 13740 }, { "epoch": 0.19, "grad_norm": 28.714496612548828, "learning_rate": 1.1713892356143239e-05, "loss": 3.5942, "step": 13760 }, { "epoch": 0.2, "grad_norm": 29.76102066040039, "learning_rate": 1.1645373965605425e-05, "loss": 3.5008, "step": 13780 }, { "epoch": 0.2, "grad_norm": 36.479854583740234, "learning_rate": 1.1576995658775405e-05, "loss": 3.4347, "step": 13800 }, { "epoch": 0.2, "grad_norm": 22.23845100402832, "learning_rate": 1.1508758152909273e-05, "loss": 3.559, "step": 13820 }, { "epoch": 0.2, "grad_norm": 52.79865646362305, "learning_rate": 1.1440662163786167e-05, "loss": 3.5128, "step": 13840 }, { "epoch": 0.2, "grad_norm": 23.489912033081055, "learning_rate": 1.1372708405700793e-05, "loss": 3.6525, "step": 13860 }, { "epoch": 0.2, "grad_norm": 22.640663146972656, "learning_rate": 1.1304897591455928e-05, "loss": 3.5387, "step": 13880 }, { "epoch": 0.2, "grad_norm": 23.30765724182129, "learning_rate": 1.1237230432354912e-05, "loss": 3.5714, "step": 13900 }, { "epoch": 0.2, "grad_norm": 27.42571258544922, "learning_rate": 1.1169707638194238e-05, "loss": 3.6333, "step": 13920 }, { "epoch": 0.2, "grad_norm": 17.366491317749023, "learning_rate": 1.1102329917256046e-05, "loss": 3.5651, "step": 13940 }, { "epoch": 0.2, "grad_norm": 17.73781967163086, "learning_rate": 1.103509797630077e-05, "loss": 3.6944, "step": 13960 }, { "epoch": 0.2, "grad_norm": 23.699871063232422, "learning_rate": 1.0968012520559634e-05, "loss": 3.5914, "step": 13980 }, { "epoch": 0.2, "grad_norm": 21.75518798828125, "learning_rate": 1.0901074253727336e-05, "loss": 3.592, "step": 14000 }, { "epoch": 0.2, "grad_norm": 26.731660842895508, "learning_rate": 1.083428387795463e-05, "loss": 3.5461, "step": 14020 }, { "epoch": 0.2, "grad_norm": 20.746625900268555, "learning_rate": 1.0767642093840932e-05, "loss": 3.5951, "step": 14040 }, { "epoch": 0.2, "grad_norm": 29.204326629638672, "learning_rate": 1.0701149600427044e-05, "loss": 3.591, "step": 14060 }, { "epoch": 0.2, "grad_norm": 23.31056022644043, "learning_rate": 1.0634807095187737e-05, "loss": 3.4382, "step": 14080 }, { "epoch": 0.2, "grad_norm": 20.306123733520508, "learning_rate": 1.0568615274024522e-05, "loss": 3.5539, "step": 14100 }, { "epoch": 0.2, "grad_norm": 19.775318145751953, "learning_rate": 1.0502574831258259e-05, "loss": 3.5532, "step": 14120 }, { "epoch": 0.2, "grad_norm": 23.407949447631836, "learning_rate": 1.043668645962195e-05, "loss": 3.4811, "step": 14140 }, { "epoch": 0.2, "grad_norm": 27.759410858154297, "learning_rate": 1.0370950850253449e-05, "loss": 3.6196, "step": 14160 }, { "epoch": 0.2, "grad_norm": 35.55162048339844, "learning_rate": 1.0305368692688174e-05, "loss": 3.4774, "step": 14180 }, { "epoch": 0.2, "grad_norm": 23.29683494567871, "learning_rate": 1.0239940674851941e-05, "loss": 3.5437, "step": 14200 }, { "epoch": 0.2, "grad_norm": 23.486143112182617, "learning_rate": 1.0174667483053682e-05, "loss": 3.671, "step": 14220 }, { "epoch": 0.2, "grad_norm": 19.907581329345703, "learning_rate": 1.0109549801978305e-05, "loss": 3.4272, "step": 14240 }, { "epoch": 0.2, "grad_norm": 35.4050407409668, "learning_rate": 1.0044588314679451e-05, "loss": 3.5397, "step": 14260 }, { "epoch": 0.2, "grad_norm": 28.344934463500977, "learning_rate": 9.979783702572412e-06, "loss": 3.5157, "step": 14280 }, { "epoch": 0.2, "grad_norm": 27.65180015563965, "learning_rate": 9.915136645426884e-06, "loss": 3.5073, "step": 14300 }, { "epoch": 0.2, "grad_norm": 21.25212860107422, "learning_rate": 9.850647821359918e-06, "loss": 3.5119, "step": 14320 }, { "epoch": 0.2, "grad_norm": 25.691951751708984, "learning_rate": 9.786317906828747e-06, "loss": 3.6237, "step": 14340 }, { "epoch": 0.2, "grad_norm": 32.24767303466797, "learning_rate": 9.722147576623743e-06, "loss": 3.5211, "step": 14360 }, { "epoch": 0.2, "grad_norm": 20.08036231994629, "learning_rate": 9.658137503861314e-06, "loss": 3.4558, "step": 14380 }, { "epoch": 0.2, "grad_norm": 22.380619049072266, "learning_rate": 9.594288359976817e-06, "loss": 3.4814, "step": 14400 }, { "epoch": 0.2, "grad_norm": 28.68309211730957, "learning_rate": 9.530600814717575e-06, "loss": 3.5701, "step": 14420 }, { "epoch": 0.2, "grad_norm": 22.603858947753906, "learning_rate": 9.467075536135787e-06, "loss": 3.5527, "step": 14440 }, { "epoch": 0.2, "grad_norm": 20.50914192199707, "learning_rate": 9.403713190581576e-06, "loss": 3.4903, "step": 14460 }, { "epoch": 0.2, "grad_norm": 30.767824172973633, "learning_rate": 9.340514442695952e-06, "loss": 3.5184, "step": 14480 }, { "epoch": 0.21, "grad_norm": 23.36814308166504, "learning_rate": 9.277479955403887e-06, "loss": 3.4903, "step": 14500 }, { "epoch": 0.21, "grad_norm": 34.33296203613281, "learning_rate": 9.214610389907327e-06, "loss": 3.5716, "step": 14520 }, { "epoch": 0.21, "grad_norm": 21.11824607849121, "learning_rate": 9.15190640567825e-06, "loss": 3.6187, "step": 14540 }, { "epoch": 0.21, "grad_norm": 38.17007064819336, "learning_rate": 9.0893686604518e-06, "loss": 3.5029, "step": 14560 }, { "epoch": 0.21, "grad_norm": 22.7441349029541, "learning_rate": 9.026997810219312e-06, "loss": 3.552, "step": 14580 }, { "epoch": 0.21, "grad_norm": 27.61566734313965, "learning_rate": 8.964794509221508e-06, "loss": 3.5794, "step": 14600 }, { "epoch": 0.21, "grad_norm": 28.213449478149414, "learning_rate": 8.902759409941566e-06, "loss": 3.6239, "step": 14620 }, { "epoch": 0.21, "grad_norm": 99.79093933105469, "learning_rate": 8.840893163098331e-06, "loss": 3.5571, "step": 14640 }, { "epoch": 0.21, "grad_norm": 22.567502975463867, "learning_rate": 8.779196417639466e-06, "loss": 3.6038, "step": 14660 }, { "epoch": 0.21, "grad_norm": 16.769285202026367, "learning_rate": 8.71766982073462e-06, "loss": 3.5192, "step": 14680 }, { "epoch": 0.21, "grad_norm": 21.841182708740234, "learning_rate": 8.656314017768693e-06, "loss": 3.4728, "step": 14700 }, { "epoch": 0.21, "grad_norm": 29.540746688842773, "learning_rate": 8.595129652335019e-06, "loss": 3.4656, "step": 14720 }, { "epoch": 0.21, "grad_norm": 31.932985305786133, "learning_rate": 8.534117366228644e-06, "loss": 3.5597, "step": 14740 }, { "epoch": 0.21, "grad_norm": 22.49396324157715, "learning_rate": 8.47327779943957e-06, "loss": 3.5653, "step": 14760 }, { "epoch": 0.21, "grad_norm": 26.135406494140625, "learning_rate": 8.412611590146069e-06, "loss": 3.5669, "step": 14780 }, { "epoch": 0.21, "grad_norm": 26.18607521057129, "learning_rate": 8.352119374707978e-06, "loss": 3.4971, "step": 14800 }, { "epoch": 0.21, "grad_norm": 32.66505813598633, "learning_rate": 8.29180178766e-06, "loss": 3.6041, "step": 14820 }, { "epoch": 0.21, "grad_norm": 22.13516616821289, "learning_rate": 8.23165946170509e-06, "loss": 3.4845, "step": 14840 }, { "epoch": 0.21, "grad_norm": 19.47613525390625, "learning_rate": 8.171693027707772e-06, "loss": 3.582, "step": 14860 }, { "epoch": 0.21, "grad_norm": 28.822246551513672, "learning_rate": 8.111903114687591e-06, "loss": 3.5498, "step": 14880 }, { "epoch": 0.21, "grad_norm": 27.130552291870117, "learning_rate": 8.052290349812419e-06, "loss": 3.5724, "step": 14900 }, { "epoch": 0.21, "grad_norm": 20.185958862304688, "learning_rate": 7.992855358391967e-06, "loss": 3.5115, "step": 14920 }, { "epoch": 0.21, "grad_norm": 19.644073486328125, "learning_rate": 7.933598763871155e-06, "loss": 3.5116, "step": 14940 }, { "epoch": 0.21, "grad_norm": 28.123699188232422, "learning_rate": 7.87452118782363e-06, "loss": 3.5057, "step": 14960 }, { "epoch": 0.21, "grad_norm": 18.76272964477539, "learning_rate": 7.815623249945214e-06, "loss": 3.579, "step": 14980 }, { "epoch": 0.21, "grad_norm": 24.90170669555664, "learning_rate": 7.756905568047393e-06, "loss": 3.4875, "step": 15000 }, { "epoch": 0.21, "grad_norm": 28.07253074645996, "learning_rate": 7.698368758050877e-06, "loss": 3.4413, "step": 15020 }, { "epoch": 0.21, "grad_norm": 25.81130027770996, "learning_rate": 7.640013433979093e-06, "loss": 3.5166, "step": 15040 }, { "epoch": 0.21, "grad_norm": 19.91429901123047, "learning_rate": 7.58184020795179e-06, "loss": 3.6086, "step": 15060 }, { "epoch": 0.21, "grad_norm": 22.920394897460938, "learning_rate": 7.523849690178567e-06, "loss": 3.4341, "step": 15080 }, { "epoch": 0.21, "grad_norm": 32.82981872558594, "learning_rate": 7.466042488952521e-06, "loss": 3.5264, "step": 15100 }, { "epoch": 0.21, "grad_norm": 23.146589279174805, "learning_rate": 7.408419210643847e-06, "loss": 3.4571, "step": 15120 }, { "epoch": 0.21, "grad_norm": 16.78236198425293, "learning_rate": 7.350980459693455e-06, "loss": 3.5377, "step": 15140 }, { "epoch": 0.21, "grad_norm": 19.41143035888672, "learning_rate": 7.293726838606674e-06, "loss": 3.5262, "step": 15160 }, { "epoch": 0.21, "grad_norm": 35.83265686035156, "learning_rate": 7.236658947946886e-06, "loss": 3.5389, "step": 15180 }, { "epoch": 0.22, "grad_norm": 18.50286102294922, "learning_rate": 7.179777386329276e-06, "loss": 3.4822, "step": 15200 }, { "epoch": 0.22, "grad_norm": 22.09296226501465, "learning_rate": 7.123082750414486e-06, "loss": 3.6018, "step": 15220 }, { "epoch": 0.22, "grad_norm": 20.790420532226562, "learning_rate": 7.066575634902436e-06, "loss": 3.5642, "step": 15240 }, { "epoch": 0.22, "grad_norm": 29.84691047668457, "learning_rate": 7.010256632526035e-06, "loss": 3.6224, "step": 15260 }, { "epoch": 0.22, "grad_norm": 45.16112518310547, "learning_rate": 6.9541263340449496e-06, "loss": 3.5078, "step": 15280 }, { "epoch": 0.22, "grad_norm": 24.578792572021484, "learning_rate": 6.898185328239468e-06, "loss": 3.571, "step": 15300 }, { "epoch": 0.22, "grad_norm": 28.100582122802734, "learning_rate": 6.842434201904255e-06, "loss": 3.4775, "step": 15320 }, { "epoch": 0.22, "grad_norm": 31.31978988647461, "learning_rate": 6.786873539842259e-06, "loss": 3.586, "step": 15340 }, { "epoch": 0.22, "grad_norm": 25.426467895507812, "learning_rate": 6.731503924858518e-06, "loss": 3.6732, "step": 15360 }, { "epoch": 0.22, "grad_norm": 29.726364135742188, "learning_rate": 6.676325937754102e-06, "loss": 3.4458, "step": 15380 }, { "epoch": 0.22, "grad_norm": 35.88907241821289, "learning_rate": 6.621340157319997e-06, "loss": 3.5081, "step": 15400 }, { "epoch": 0.22, "grad_norm": 18.98563003540039, "learning_rate": 6.566547160330999e-06, "loss": 3.4117, "step": 15420 }, { "epoch": 0.22, "grad_norm": 23.43938446044922, "learning_rate": 6.511947521539738e-06, "loss": 3.5529, "step": 15440 }, { "epoch": 0.22, "grad_norm": 27.628211975097656, "learning_rate": 6.457541813670564e-06, "loss": 3.6043, "step": 15460 }, { "epoch": 0.22, "grad_norm": 32.65241241455078, "learning_rate": 6.403330607413643e-06, "loss": 3.5273, "step": 15480 }, { "epoch": 0.22, "grad_norm": 23.956153869628906, "learning_rate": 6.349314471418849e-06, "loss": 3.62, "step": 15500 }, { "epoch": 0.22, "grad_norm": 26.069808959960938, "learning_rate": 6.295493972289904e-06, "loss": 3.5688, "step": 15520 }, { "epoch": 0.22, "grad_norm": 39.316566467285156, "learning_rate": 6.241869674578363e-06, "loss": 3.5178, "step": 15540 }, { "epoch": 0.22, "grad_norm": 27.55044174194336, "learning_rate": 6.188442140777742e-06, "loss": 3.4732, "step": 15560 }, { "epoch": 0.22, "grad_norm": 20.663110733032227, "learning_rate": 6.1352119313175945e-06, "loss": 3.471, "step": 15580 }, { "epoch": 0.22, "grad_norm": 24.35353660583496, "learning_rate": 6.082179604557617e-06, "loss": 3.503, "step": 15600 }, { "epoch": 0.22, "grad_norm": 21.658618927001953, "learning_rate": 6.029345716781837e-06, "loss": 3.5414, "step": 15620 }, { "epoch": 0.22, "grad_norm": 29.32859230041504, "learning_rate": 5.9767108221927216e-06, "loss": 3.4492, "step": 15640 }, { "epoch": 0.22, "grad_norm": 29.497888565063477, "learning_rate": 5.924275472905424e-06, "loss": 3.6211, "step": 15660 }, { "epoch": 0.22, "grad_norm": 22.027616500854492, "learning_rate": 5.872040218941929e-06, "loss": 3.6381, "step": 15680 }, { "epoch": 0.22, "grad_norm": 25.56600570678711, "learning_rate": 5.820005608225346e-06, "loss": 3.6468, "step": 15700 }, { "epoch": 0.22, "grad_norm": 20.85995864868164, "learning_rate": 5.768172186574122e-06, "loss": 3.5111, "step": 15720 }, { "epoch": 0.22, "grad_norm": 27.828828811645508, "learning_rate": 5.716540497696307e-06, "loss": 3.4975, "step": 15740 }, { "epoch": 0.22, "grad_norm": 28.20557403564453, "learning_rate": 5.665111083183905e-06, "loss": 3.5542, "step": 15760 }, { "epoch": 0.22, "grad_norm": 27.977331161499023, "learning_rate": 5.613884482507123e-06, "loss": 3.5096, "step": 15780 }, { "epoch": 0.22, "grad_norm": 15.945253372192383, "learning_rate": 5.562861233008774e-06, "loss": 3.4329, "step": 15800 }, { "epoch": 0.22, "grad_norm": 19.057100296020508, "learning_rate": 5.512041869898585e-06, "loss": 3.5043, "step": 15820 }, { "epoch": 0.22, "grad_norm": 19.960477828979492, "learning_rate": 5.46142692624764e-06, "loss": 3.4124, "step": 15840 }, { "epoch": 0.22, "grad_norm": 20.818775177001953, "learning_rate": 5.411016932982752e-06, "loss": 3.4409, "step": 15860 }, { "epoch": 0.22, "grad_norm": 19.86461639404297, "learning_rate": 5.360812418880884e-06, "loss": 3.6115, "step": 15880 }, { "epoch": 0.23, "grad_norm": 26.129798889160156, "learning_rate": 5.310813910563644e-06, "loss": 3.5875, "step": 15900 }, { "epoch": 0.23, "grad_norm": 17.876537322998047, "learning_rate": 5.261021932491714e-06, "loss": 3.5214, "step": 15920 }, { "epoch": 0.23, "grad_norm": 37.67085266113281, "learning_rate": 5.2114370069593965e-06, "loss": 3.6228, "step": 15940 }, { "epoch": 0.23, "grad_norm": 21.973346710205078, "learning_rate": 5.162059654089083e-06, "loss": 3.457, "step": 15960 }, { "epoch": 0.23, "grad_norm": 30.133270263671875, "learning_rate": 5.112890391825845e-06, "loss": 3.4729, "step": 15980 }, { "epoch": 0.23, "grad_norm": 24.22100830078125, "learning_rate": 5.063929735931985e-06, "loss": 3.5727, "step": 16000 }, { "epoch": 0.23, "grad_norm": 25.7775821685791, "learning_rate": 5.015178199981602e-06, "loss": 3.5195, "step": 16020 }, { "epoch": 0.23, "grad_norm": 21.86203384399414, "learning_rate": 4.966636295355253e-06, "loss": 3.5248, "step": 16040 }, { "epoch": 0.23, "grad_norm": 31.954734802246094, "learning_rate": 4.918304531234533e-06, "loss": 3.5392, "step": 16060 }, { "epoch": 0.23, "grad_norm": 19.251066207885742, "learning_rate": 4.870183414596794e-06, "loss": 3.5204, "step": 16080 }, { "epoch": 0.23, "grad_norm": 30.7813777923584, "learning_rate": 4.8222734502097665e-06, "loss": 3.5081, "step": 16100 }, { "epoch": 0.23, "grad_norm": 24.208667755126953, "learning_rate": 4.7745751406263165e-06, "loss": 3.5487, "step": 16120 }, { "epoch": 0.23, "grad_norm": 24.639707565307617, "learning_rate": 4.727088986179129e-06, "loss": 3.5998, "step": 16140 }, { "epoch": 0.23, "grad_norm": 23.774940490722656, "learning_rate": 4.679815484975505e-06, "loss": 3.4195, "step": 16160 }, { "epoch": 0.23, "grad_norm": 20.040874481201172, "learning_rate": 4.6327551328920945e-06, "loss": 3.5555, "step": 16180 }, { "epoch": 0.23, "grad_norm": 32.899959564208984, "learning_rate": 4.585908423569724e-06, "loss": 3.5204, "step": 16200 }, { "epoch": 0.23, "grad_norm": 29.705387115478516, "learning_rate": 4.539275848408217e-06, "loss": 3.5667, "step": 16220 }, { "epoch": 0.23, "grad_norm": 19.238882064819336, "learning_rate": 4.492857896561204e-06, "loss": 3.4192, "step": 16240 }, { "epoch": 0.23, "grad_norm": 18.87932777404785, "learning_rate": 4.446655054931051e-06, "loss": 3.4987, "step": 16260 }, { "epoch": 0.23, "grad_norm": 25.21925163269043, "learning_rate": 4.4006678081636884e-06, "loss": 3.6039, "step": 16280 }, { "epoch": 0.23, "grad_norm": 42.43282699584961, "learning_rate": 4.35489663864359e-06, "loss": 3.5736, "step": 16300 }, { "epoch": 0.23, "grad_norm": 34.624874114990234, "learning_rate": 4.309342026488653e-06, "loss": 3.4077, "step": 16320 }, { "epoch": 0.23, "grad_norm": 28.791912078857422, "learning_rate": 4.264004449545206e-06, "loss": 3.511, "step": 16340 }, { "epoch": 0.23, "grad_norm": 21.95167350769043, "learning_rate": 4.218884383382987e-06, "loss": 3.4688, "step": 16360 }, { "epoch": 0.23, "grad_norm": 29.78521156311035, "learning_rate": 4.173982301290122e-06, "loss": 3.4808, "step": 16380 }, { "epoch": 0.23, "grad_norm": 24.426288604736328, "learning_rate": 4.129298674268225e-06, "loss": 3.5356, "step": 16400 }, { "epoch": 0.23, "grad_norm": 23.966203689575195, "learning_rate": 4.084833971027379e-06, "loss": 3.5471, "step": 16420 }, { "epoch": 0.23, "grad_norm": 26.148141860961914, "learning_rate": 4.040588657981301e-06, "loss": 3.4811, "step": 16440 }, { "epoch": 0.23, "grad_norm": 29.300769805908203, "learning_rate": 3.99656319924237e-06, "loss": 3.5584, "step": 16460 }, { "epoch": 0.23, "grad_norm": 27.95845603942871, "learning_rate": 3.952758056616826e-06, "loss": 3.5451, "step": 16480 }, { "epoch": 0.23, "grad_norm": 21.04642677307129, "learning_rate": 3.90917368959989e-06, "loss": 3.5002, "step": 16500 }, { "epoch": 0.23, "grad_norm": 38.936370849609375, "learning_rate": 3.865810555370936e-06, "loss": 3.4524, "step": 16520 }, { "epoch": 0.23, "grad_norm": 24.49747085571289, "learning_rate": 3.822669108788738e-06, "loss": 3.4887, "step": 16540 }, { "epoch": 0.23, "grad_norm": 36.478424072265625, "learning_rate": 3.7797498023866396e-06, "loss": 3.5807, "step": 16560 }, { "epoch": 0.23, "grad_norm": 19.034469604492188, "learning_rate": 3.737053086367873e-06, "loss": 3.5806, "step": 16580 }, { "epoch": 0.23, "grad_norm": 26.125469207763672, "learning_rate": 3.694579408600771e-06, "loss": 3.4561, "step": 16600 }, { "epoch": 0.24, "grad_norm": 30.372791290283203, "learning_rate": 3.6523292146141227e-06, "loss": 3.5875, "step": 16620 }, { "epoch": 0.24, "grad_norm": 25.58843421936035, "learning_rate": 3.6103029475924726e-06, "loss": 3.498, "step": 16640 }, { "epoch": 0.24, "grad_norm": 27.708513259887695, "learning_rate": 3.56850104837147e-06, "loss": 3.5339, "step": 16660 }, { "epoch": 0.24, "grad_norm": 28.574857711791992, "learning_rate": 3.5269239554332563e-06, "loss": 3.5488, "step": 16680 }, { "epoch": 0.24, "grad_norm": 37.658775329589844, "learning_rate": 3.4855721049018688e-06, "loss": 3.5008, "step": 16700 }, { "epoch": 0.24, "grad_norm": 31.66521644592285, "learning_rate": 3.4444459305386507e-06, "loss": 3.4864, "step": 16720 }, { "epoch": 0.24, "grad_norm": 18.813400268554688, "learning_rate": 3.403545863737706e-06, "loss": 3.5685, "step": 16740 }, { "epoch": 0.24, "grad_norm": 24.066808700561523, "learning_rate": 3.3628723335213885e-06, "loss": 3.5549, "step": 16760 }, { "epoch": 0.24, "grad_norm": 23.029767990112305, "learning_rate": 3.322425766535778e-06, "loss": 3.4389, "step": 16780 }, { "epoch": 0.24, "grad_norm": 25.6854190826416, "learning_rate": 3.2822065870462217e-06, "loss": 3.4405, "step": 16800 }, { "epoch": 0.24, "grad_norm": 26.14007568359375, "learning_rate": 3.2422152169328922e-06, "loss": 3.5291, "step": 16820 }, { "epoch": 0.24, "grad_norm": 22.264413833618164, "learning_rate": 3.2024520756863243e-06, "loss": 3.613, "step": 16840 }, { "epoch": 0.24, "grad_norm": 32.82612991333008, "learning_rate": 3.1629175804030658e-06, "loss": 3.4603, "step": 16860 }, { "epoch": 0.24, "grad_norm": 32.21546173095703, "learning_rate": 3.1236121457812544e-06, "loss": 3.5886, "step": 16880 }, { "epoch": 0.24, "grad_norm": 18.181438446044922, "learning_rate": 3.08453618411631e-06, "loss": 3.4568, "step": 16900 }, { "epoch": 0.24, "grad_norm": 23.75358772277832, "learning_rate": 3.0456901052965724e-06, "loss": 3.5491, "step": 16920 }, { "epoch": 0.24, "grad_norm": 17.43839454650879, "learning_rate": 3.0070743167990273e-06, "loss": 3.5776, "step": 16940 }, { "epoch": 0.24, "grad_norm": 20.617218017578125, "learning_rate": 2.9686892236850337e-06, "loss": 3.539, "step": 16960 }, { "epoch": 0.24, "grad_norm": 19.83597755432129, "learning_rate": 2.93053522859604e-06, "loss": 3.3575, "step": 16980 }, { "epoch": 0.24, "grad_norm": 18.4063720703125, "learning_rate": 2.892612731749414e-06, "loss": 3.3658, "step": 17000 }, { "epoch": 0.24, "grad_norm": 23.77143096923828, "learning_rate": 2.85492213093419e-06, "loss": 3.4393, "step": 17020 }, { "epoch": 0.24, "grad_norm": 22.04786491394043, "learning_rate": 2.8174638215069493e-06, "loss": 3.5262, "step": 17040 }, { "epoch": 0.24, "grad_norm": 24.277013778686523, "learning_rate": 2.780238196387619e-06, "loss": 3.4419, "step": 17060 }, { "epoch": 0.24, "grad_norm": 30.128318786621094, "learning_rate": 2.743245646055398e-06, "loss": 3.5387, "step": 17080 }, { "epoch": 0.24, "grad_norm": 26.737049102783203, "learning_rate": 2.7064865585446434e-06, "loss": 3.4134, "step": 17100 }, { "epoch": 0.24, "grad_norm": 26.093942642211914, "learning_rate": 2.6699613194407725e-06, "loss": 3.5691, "step": 17120 }, { "epoch": 0.24, "grad_norm": 30.150657653808594, "learning_rate": 2.6336703118762766e-06, "loss": 3.4658, "step": 17140 }, { "epoch": 0.24, "grad_norm": 21.17641258239746, "learning_rate": 2.597613916526637e-06, "loss": 3.4942, "step": 17160 }, { "epoch": 0.24, "grad_norm": 28.02484130859375, "learning_rate": 2.5617925116063924e-06, "loss": 3.4448, "step": 17180 }, { "epoch": 0.24, "grad_norm": 24.14384651184082, "learning_rate": 2.52620647286512e-06, "loss": 3.5448, "step": 17200 }, { "epoch": 0.24, "grad_norm": 25.69285774230957, "learning_rate": 2.4908561735835306e-06, "loss": 3.5668, "step": 17220 }, { "epoch": 0.24, "grad_norm": 19.125316619873047, "learning_rate": 2.4557419845695427e-06, "loss": 3.5204, "step": 17240 }, { "epoch": 0.24, "grad_norm": 23.64023208618164, "learning_rate": 2.420864274154372e-06, "loss": 3.4345, "step": 17260 }, { "epoch": 0.24, "grad_norm": 24.39943504333496, "learning_rate": 2.3862234081887036e-06, "loss": 3.5515, "step": 17280 }, { "epoch": 0.24, "grad_norm": 24.969629287719727, "learning_rate": 2.351819750038828e-06, "loss": 3.4973, "step": 17300 }, { "epoch": 0.25, "grad_norm": 18.182634353637695, "learning_rate": 2.317653660582844e-06, "loss": 3.6065, "step": 17320 }, { "epoch": 0.25, "grad_norm": 27.141944885253906, "learning_rate": 2.2837254982068567e-06, "loss": 3.5106, "step": 17340 }, { "epoch": 0.25, "grad_norm": 20.061452865600586, "learning_rate": 2.250035618801241e-06, "loss": 3.4274, "step": 17360 }, { "epoch": 0.25, "grad_norm": 39.53497314453125, "learning_rate": 2.2165843757568805e-06, "loss": 3.4597, "step": 17380 }, { "epoch": 0.25, "grad_norm": 14.67487907409668, "learning_rate": 2.183372119961499e-06, "loss": 3.5732, "step": 17400 }, { "epoch": 0.25, "grad_norm": 14.984709739685059, "learning_rate": 2.15039919979593e-06, "loss": 3.4735, "step": 17420 }, { "epoch": 0.25, "grad_norm": 30.988039016723633, "learning_rate": 2.117665961130513e-06, "loss": 3.4269, "step": 17440 }, { "epoch": 0.25, "grad_norm": 23.8664493560791, "learning_rate": 2.0851727473214315e-06, "loss": 3.4997, "step": 17460 }, { "epoch": 0.25, "grad_norm": 24.271230697631836, "learning_rate": 2.05291989920712e-06, "loss": 3.5919, "step": 17480 }, { "epoch": 0.25, "grad_norm": 32.17240524291992, "learning_rate": 2.020907755104698e-06, "loss": 3.4734, "step": 17500 }, { "epoch": 0.25, "grad_norm": 24.72242546081543, "learning_rate": 1.9891366508064003e-06, "loss": 3.5043, "step": 17520 }, { "epoch": 0.25, "grad_norm": 29.70708656311035, "learning_rate": 1.957606919576088e-06, "loss": 3.4543, "step": 17540 }, { "epoch": 0.25, "grad_norm": 29.549745559692383, "learning_rate": 1.926318892145712e-06, "loss": 3.4355, "step": 17560 }, { "epoch": 0.25, "grad_norm": 25.912363052368164, "learning_rate": 1.8952728967118804e-06, "loss": 3.4614, "step": 17580 }, { "epoch": 0.25, "grad_norm": 22.835115432739258, "learning_rate": 1.864469258932397e-06, "loss": 3.5498, "step": 17600 }, { "epoch": 0.25, "grad_norm": 20.103981018066406, "learning_rate": 1.8339083019228404e-06, "loss": 3.5791, "step": 17620 }, { "epoch": 0.25, "grad_norm": 24.153532028198242, "learning_rate": 1.803590346253195e-06, "loss": 3.495, "step": 17640 }, { "epoch": 0.25, "grad_norm": 19.70048713684082, "learning_rate": 1.7735157099444593e-06, "loss": 3.5439, "step": 17660 }, { "epoch": 0.25, "grad_norm": 23.056358337402344, "learning_rate": 1.7436847084653456e-06, "loss": 3.4222, "step": 17680 }, { "epoch": 0.25, "grad_norm": 25.633689880371094, "learning_rate": 1.7140976547289438e-06, "loss": 3.5387, "step": 17700 }, { "epoch": 0.25, "grad_norm": 30.34889030456543, "learning_rate": 1.6847548590894435e-06, "loss": 3.5579, "step": 17720 }, { "epoch": 0.25, "grad_norm": 19.06514549255371, "learning_rate": 1.6556566293388892e-06, "loss": 3.4082, "step": 17740 }, { "epoch": 0.25, "grad_norm": 16.91566276550293, "learning_rate": 1.626803270703936e-06, "loss": 3.5513, "step": 17760 }, { "epoch": 0.25, "grad_norm": 26.17884635925293, "learning_rate": 1.5981950858426714e-06, "loss": 3.5068, "step": 17780 }, { "epoch": 0.25, "grad_norm": 35.77287292480469, "learning_rate": 1.5698323748414124e-06, "loss": 3.4825, "step": 17800 }, { "epoch": 0.25, "grad_norm": 25.684925079345703, "learning_rate": 1.5417154352115742e-06, "loss": 3.5529, "step": 17820 }, { "epoch": 0.25, "grad_norm": 23.964488983154297, "learning_rate": 1.5138445618865544e-06, "loss": 3.549, "step": 17840 }, { "epoch": 0.25, "grad_norm": 20.69983673095703, "learning_rate": 1.4862200472186199e-06, "loss": 3.5607, "step": 17860 }, { "epoch": 0.25, "grad_norm": 24.382530212402344, "learning_rate": 1.458842180975864e-06, "loss": 3.4468, "step": 17880 }, { "epoch": 0.25, "grad_norm": 20.305166244506836, "learning_rate": 1.4317112503391432e-06, "loss": 3.5468, "step": 17900 }, { "epoch": 0.25, "grad_norm": 20.76270294189453, "learning_rate": 1.4048275398990896e-06, "loss": 3.5828, "step": 17920 }, { "epoch": 0.25, "grad_norm": 31.468564987182617, "learning_rate": 1.3781913316530948e-06, "loss": 3.6117, "step": 17940 }, { "epoch": 0.25, "grad_norm": 22.185617446899414, "learning_rate": 1.351802905002386e-06, "loss": 3.4663, "step": 17960 }, { "epoch": 0.25, "grad_norm": 27.454687118530273, "learning_rate": 1.32566253674907e-06, "loss": 3.4419, "step": 17980 }, { "epoch": 0.25, "grad_norm": 20.76512336730957, "learning_rate": 1.2997705010932393e-06, "loss": 3.5315, "step": 18000 }, { "epoch": 0.26, "grad_norm": 27.795419692993164, "learning_rate": 1.274127069630096e-06, "loss": 3.5435, "step": 18020 }, { "epoch": 0.26, "grad_norm": 45.871864318847656, "learning_rate": 1.2487325113471032e-06, "loss": 3.3871, "step": 18040 }, { "epoch": 0.26, "grad_norm": 15.510208129882812, "learning_rate": 1.2235870926211619e-06, "loss": 3.5862, "step": 18060 }, { "epoch": 0.26, "grad_norm": 26.943269729614258, "learning_rate": 1.1986910772158104e-06, "loss": 3.5032, "step": 18080 }, { "epoch": 0.26, "grad_norm": 28.423053741455078, "learning_rate": 1.1740447262784781e-06, "loss": 3.4936, "step": 18100 }, { "epoch": 0.26, "grad_norm": 25.210853576660156, "learning_rate": 1.1496482983377189e-06, "loss": 3.4515, "step": 18120 }, { "epoch": 0.26, "grad_norm": 32.88740921020508, "learning_rate": 1.125502049300517e-06, "loss": 3.5196, "step": 18140 }, { "epoch": 0.26, "grad_norm": 20.562488555908203, "learning_rate": 1.1016062324496008e-06, "loss": 3.4467, "step": 18160 }, { "epoch": 0.26, "grad_norm": 21.112634658813477, "learning_rate": 1.0779610984407773e-06, "loss": 3.5286, "step": 18180 }, { "epoch": 0.26, "grad_norm": 29.323238372802734, "learning_rate": 1.0545668953003241e-06, "loss": 3.4971, "step": 18200 }, { "epoch": 0.26, "grad_norm": 24.024930953979492, "learning_rate": 1.0314238684223515e-06, "loss": 3.5919, "step": 18220 }, { "epoch": 0.26, "grad_norm": 29.396581649780273, "learning_rate": 1.0085322605662666e-06, "loss": 3.4255, "step": 18240 }, { "epoch": 0.26, "grad_norm": 19.502662658691406, "learning_rate": 9.858923118542002e-07, "loss": 3.464, "step": 18260 }, { "epoch": 0.26, "grad_norm": 20.03078269958496, "learning_rate": 9.635042597685023e-07, "loss": 3.4305, "step": 18280 }, { "epoch": 0.26, "grad_norm": 18.905967712402344, "learning_rate": 9.413683391492456e-07, "loss": 3.6401, "step": 18300 }, { "epoch": 0.26, "grad_norm": 23.61101531982422, "learning_rate": 9.194847821917623e-07, "loss": 3.5543, "step": 18320 }, { "epoch": 0.26, "grad_norm": 18.563806533813477, "learning_rate": 8.978538184442137e-07, "loss": 3.4395, "step": 18340 }, { "epoch": 0.26, "grad_norm": 21.695003509521484, "learning_rate": 8.764756748051662e-07, "loss": 3.4193, "step": 18360 }, { "epoch": 0.26, "grad_norm": 21.57720947265625, "learning_rate": 8.553505755212382e-07, "loss": 3.5357, "step": 18380 }, { "epoch": 0.26, "grad_norm": 30.37428855895996, "learning_rate": 8.344787421847217e-07, "loss": 3.5414, "step": 18400 }, { "epoch": 0.26, "grad_norm": 42.314064025878906, "learning_rate": 8.138603937312722e-07, "loss": 3.5528, "step": 18420 }, { "epoch": 0.26, "grad_norm": 22.21116065979004, "learning_rate": 7.934957464376058e-07, "loss": 3.6419, "step": 18440 }, { "epoch": 0.26, "grad_norm": 26.877450942993164, "learning_rate": 7.733850139192395e-07, "loss": 3.5869, "step": 18460 }, { "epoch": 0.26, "grad_norm": 21.281030654907227, "learning_rate": 7.535284071282455e-07, "loss": 3.6047, "step": 18480 }, { "epoch": 0.26, "grad_norm": 20.147789001464844, "learning_rate": 7.339261343510206e-07, "loss": 3.4247, "step": 18500 }, { "epoch": 0.26, "grad_norm": 19.394601821899414, "learning_rate": 7.145784012061424e-07, "loss": 3.5844, "step": 18520 }, { "epoch": 0.26, "grad_norm": 22.156579971313477, "learning_rate": 6.954854106421715e-07, "loss": 3.5348, "step": 18540 }, { "epoch": 0.26, "grad_norm": 28.641721725463867, "learning_rate": 6.766473629355452e-07, "loss": 3.5451, "step": 18560 }, { "epoch": 0.26, "grad_norm": 19.47591209411621, "learning_rate": 6.580644556884702e-07, "loss": 3.5458, "step": 18580 } ], "logging_steps": 20, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 4.037882943504384e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }