diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,6523 +1,27 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.26293817132021, + "epoch": 0.0002830335536277826, "eval_steps": 500, - "global_step": 18580, + "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 58.19491958618164, - "learning_rate": 1.6666666666666667e-06, - "loss": 4.5462, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 3.5917, "step": 20 - }, - { - "epoch": 0.0, - "grad_norm": 51.19196319580078, - "learning_rate": 3.3333333333333333e-06, - "loss": 4.6693, - "step": 40 - }, - { - "epoch": 0.0, - "grad_norm": 45.4248161315918, - "learning_rate": 5e-06, - "loss": 4.6065, - "step": 60 - }, - { - "epoch": 0.0, - "grad_norm": 57.08290100097656, - "learning_rate": 6.666666666666667e-06, - "loss": 4.4395, - "step": 80 - }, - { - "epoch": 0.0, - "grad_norm": 40.65673828125, - "learning_rate": 8.333333333333334e-06, - "loss": 4.4641, - "step": 100 - }, - { - "epoch": 0.0, - "grad_norm": 40.7547492980957, - "learning_rate": 1e-05, - "loss": 4.4638, - "step": 120 - }, - { - "epoch": 0.0, - "grad_norm": 40.71052169799805, - "learning_rate": 1.1666666666666668e-05, - "loss": 4.3721, - "step": 140 - }, - { - "epoch": 0.0, - "grad_norm": 32.69596862792969, - "learning_rate": 1.3333333333333333e-05, - "loss": 4.3784, - "step": 160 - }, - { - "epoch": 0.0, - "grad_norm": 27.53285026550293, - "learning_rate": 1.5e-05, - "loss": 4.3627, - "step": 180 - }, - { - "epoch": 0.0, - "grad_norm": 39.0136833190918, - "learning_rate": 1.6666666666666667e-05, - "loss": 4.2018, - "step": 200 - }, - { - "epoch": 0.0, - "grad_norm": 39.9036750793457, - "learning_rate": 1.8333333333333333e-05, - "loss": 4.1214, - "step": 220 - }, - { - "epoch": 0.0, - "grad_norm": 26.16208267211914, - "learning_rate": 2e-05, - "loss": 4.0551, - "step": 240 - }, - { - "epoch": 0.0, - "grad_norm": 35.66220474243164, - "learning_rate": 2.1666666666666667e-05, - "loss": 4.0599, - "step": 260 - }, - { - "epoch": 0.0, - "grad_norm": 22.310619354248047, - "learning_rate": 2.3333333333333336e-05, - "loss": 4.181, - "step": 280 - }, - { - "epoch": 0.0, - "grad_norm": 31.29083824157715, - "learning_rate": 2.5e-05, - "loss": 4.0389, - "step": 300 - }, - { - "epoch": 0.0, - "grad_norm": 18.66942596435547, - "learning_rate": 2.6666666666666667e-05, - "loss": 4.0888, - "step": 320 - }, - { - "epoch": 0.0, - "grad_norm": 47.483428955078125, - "learning_rate": 2.8333333333333335e-05, - "loss": 4.0918, - "step": 340 - }, - { - "epoch": 0.01, - "grad_norm": 51.05717468261719, - "learning_rate": 3e-05, - "loss": 3.9807, - "step": 360 - }, - { - "epoch": 0.01, - "grad_norm": 67.01704406738281, - "learning_rate": 3.1666666666666666e-05, - "loss": 4.0331, - "step": 380 - }, - { - "epoch": 0.01, - "grad_norm": 40.98155975341797, - "learning_rate": 3.3333333333333335e-05, - "loss": 4.039, - "step": 400 - }, - { - "epoch": 0.01, - "grad_norm": 29.619321823120117, - "learning_rate": 3.5e-05, - "loss": 4.077, - "step": 420 - }, - { - "epoch": 0.01, - "grad_norm": 41.605018615722656, - "learning_rate": 3.6666666666666666e-05, - "loss": 4.044, - "step": 440 - }, - { - "epoch": 0.01, - "grad_norm": 34.36818313598633, - "learning_rate": 3.8333333333333334e-05, - "loss": 3.974, - "step": 460 - }, - { - "epoch": 0.01, - "grad_norm": 26.917036056518555, - "learning_rate": 4e-05, - "loss": 4.0088, - "step": 480 - }, - { - "epoch": 0.01, - "grad_norm": 25.219558715820312, - "learning_rate": 4.166666666666667e-05, - "loss": 3.8768, - "step": 500 - }, - { - "epoch": 0.01, - "grad_norm": 24.45106315612793, - "learning_rate": 4.3333333333333334e-05, - "loss": 3.8979, - "step": 520 - }, - { - "epoch": 0.01, - "grad_norm": 39.479461669921875, - "learning_rate": 4.5e-05, - "loss": 3.9241, - "step": 540 - }, - { - "epoch": 0.01, - "grad_norm": 46.96614456176758, - "learning_rate": 4.666666666666667e-05, - "loss": 3.8796, - "step": 560 - }, - { - "epoch": 0.01, - "grad_norm": 31.622241973876953, - "learning_rate": 4.8333333333333334e-05, - "loss": 3.9045, - "step": 580 - }, - { - "epoch": 0.01, - "grad_norm": 146.8946990966797, - "learning_rate": 5e-05, - "loss": 3.941, - "step": 600 - }, - { - "epoch": 0.01, - "grad_norm": 29.78015899658203, - "learning_rate": 4.9999868880914903e-05, - "loss": 3.9279, - "step": 620 - }, - { - "epoch": 0.01, - "grad_norm": 44.591156005859375, - "learning_rate": 4.999947552503497e-05, - "loss": 3.8695, - "step": 640 - }, - { - "epoch": 0.01, - "grad_norm": 35.80597686767578, - "learning_rate": 4.9998819936486327e-05, - "loss": 3.9277, - "step": 660 - }, - { - "epoch": 0.01, - "grad_norm": 35.00313186645508, - "learning_rate": 4.99979021221458e-05, - "loss": 3.881, - "step": 680 - }, - { - "epoch": 0.01, - "grad_norm": 28.8647403717041, - "learning_rate": 4.999672209164081e-05, - "loss": 3.8286, - "step": 700 - }, - { - "epoch": 0.01, - "grad_norm": 33.56174087524414, - "learning_rate": 4.999527985734932e-05, - "loss": 3.8631, - "step": 720 - }, - { - "epoch": 0.01, - "grad_norm": 63.59539794921875, - "learning_rate": 4.999357543439969e-05, - "loss": 3.8931, - "step": 740 - }, - { - "epoch": 0.01, - "grad_norm": 54.89167785644531, - "learning_rate": 4.999160884067051e-05, - "loss": 3.8953, - "step": 760 - }, - { - "epoch": 0.01, - "grad_norm": 33.9933967590332, - "learning_rate": 4.998938009679042e-05, - "loss": 3.9113, - "step": 780 - }, - { - "epoch": 0.01, - "grad_norm": 56.342620849609375, - "learning_rate": 4.998688922613788e-05, - "loss": 3.8079, - "step": 800 - }, - { - "epoch": 0.01, - "grad_norm": 35.17020797729492, - "learning_rate": 4.998413625484095e-05, - "loss": 3.8289, - "step": 820 - }, - { - "epoch": 0.01, - "grad_norm": 36.69993209838867, - "learning_rate": 4.998112121177699e-05, - "loss": 3.9726, - "step": 840 - }, - { - "epoch": 0.01, - "grad_norm": 41.2137565612793, - "learning_rate": 4.997784412857239e-05, - "loss": 3.8602, - "step": 860 - }, - { - "epoch": 0.01, - "grad_norm": 49.4541130065918, - "learning_rate": 4.99743050396022e-05, - "loss": 3.8549, - "step": 880 - }, - { - "epoch": 0.01, - "grad_norm": 40.87107849121094, - "learning_rate": 4.997050398198977e-05, - "loss": 3.7832, - "step": 900 - }, - { - "epoch": 0.01, - "grad_norm": 31.820924758911133, - "learning_rate": 4.9966440995606415e-05, - "loss": 3.8991, - "step": 920 - }, - { - "epoch": 0.01, - "grad_norm": 37.09877395629883, - "learning_rate": 4.9962116123070924e-05, - "loss": 3.9486, - "step": 940 - }, - { - "epoch": 0.01, - "grad_norm": 40.25444412231445, - "learning_rate": 4.995752940974918e-05, - "loss": 3.848, - "step": 960 - }, - { - "epoch": 0.01, - "grad_norm": 38.95152282714844, - "learning_rate": 4.9952680903753627e-05, - "loss": 3.723, - "step": 980 - }, - { - "epoch": 0.01, - "grad_norm": 52.44506072998047, - "learning_rate": 4.9947570655942796e-05, - "loss": 3.864, - "step": 1000 - }, - { - "epoch": 0.01, - "grad_norm": 59.793373107910156, - "learning_rate": 4.994219871992077e-05, - "loss": 3.794, - "step": 1020 - }, - { - "epoch": 0.01, - "grad_norm": 40.9141960144043, - "learning_rate": 4.993656515203662e-05, - "loss": 3.8384, - "step": 1040 - }, - { - "epoch": 0.02, - "grad_norm": 33.75545883178711, - "learning_rate": 4.99306700113838e-05, - "loss": 3.8811, - "step": 1060 - }, - { - "epoch": 0.02, - "grad_norm": 30.463613510131836, - "learning_rate": 4.9924513359799554e-05, - "loss": 3.7411, - "step": 1080 - }, - { - "epoch": 0.02, - "grad_norm": 36.24667739868164, - "learning_rate": 4.991809526186424e-05, - "loss": 3.8915, - "step": 1100 - }, - { - "epoch": 0.02, - "grad_norm": 35.77268600463867, - "learning_rate": 4.991141578490066e-05, - "loss": 3.7547, - "step": 1120 - }, - { - "epoch": 0.02, - "grad_norm": 43.09757995605469, - "learning_rate": 4.990447499897339e-05, - "loss": 3.8161, - "step": 1140 - }, - { - "epoch": 0.02, - "grad_norm": 67.45648956298828, - "learning_rate": 4.989727297688797e-05, - "loss": 3.9635, - "step": 1160 - }, - { - "epoch": 0.02, - "grad_norm": 31.597640991210938, - "learning_rate": 4.98898097941902e-05, - "loss": 3.8912, - "step": 1180 - }, - { - "epoch": 0.02, - "grad_norm": 41.68192672729492, - "learning_rate": 4.988208552916535e-05, - "loss": 3.8112, - "step": 1200 - }, - { - "epoch": 0.02, - "grad_norm": 36.489810943603516, - "learning_rate": 4.9874100262837296e-05, - "loss": 3.7838, - "step": 1220 - }, - { - "epoch": 0.02, - "grad_norm": 31.755823135375977, - "learning_rate": 4.986585407896772e-05, - "loss": 3.8385, - "step": 1240 - }, - { - "epoch": 0.02, - "grad_norm": 84.64984130859375, - "learning_rate": 4.985734706405516e-05, - "loss": 3.8727, - "step": 1260 - }, - { - "epoch": 0.02, - "grad_norm": 32.23849868774414, - "learning_rate": 4.98485793073342e-05, - "loss": 3.8013, - "step": 1280 - }, - { - "epoch": 0.02, - "grad_norm": 25.90882110595703, - "learning_rate": 4.983955090077444e-05, - "loss": 3.7387, - "step": 1300 - }, - { - "epoch": 0.02, - "grad_norm": 43.255313873291016, - "learning_rate": 4.9830261939079614e-05, - "loss": 3.8756, - "step": 1320 - }, - { - "epoch": 0.02, - "grad_norm": 35.833404541015625, - "learning_rate": 4.982071251968652e-05, - "loss": 3.7124, - "step": 1340 - }, - { - "epoch": 0.02, - "grad_norm": 29.098703384399414, - "learning_rate": 4.981090274276406e-05, - "loss": 3.8525, - "step": 1360 - }, - { - "epoch": 0.02, - "grad_norm": 35.16478729248047, - "learning_rate": 4.980083271121214e-05, - "loss": 3.8262, - "step": 1380 - }, - { - "epoch": 0.02, - "grad_norm": 32.62320327758789, - "learning_rate": 4.9790502530660635e-05, - "loss": 3.8903, - "step": 1400 - }, - { - "epoch": 0.02, - "grad_norm": 48.55181884765625, - "learning_rate": 4.977991230946824e-05, - "loss": 3.7363, - "step": 1420 - }, - { - "epoch": 0.02, - "grad_norm": 46.640403747558594, - "learning_rate": 4.976906215872138e-05, - "loss": 3.9682, - "step": 1440 - }, - { - "epoch": 0.02, - "grad_norm": 32.13254928588867, - "learning_rate": 4.9757952192232985e-05, - "loss": 3.6851, - "step": 1460 - }, - { - "epoch": 0.02, - "grad_norm": 34.074649810791016, - "learning_rate": 4.9746582526541355e-05, - "loss": 3.7781, - "step": 1480 - }, - { - "epoch": 0.02, - "grad_norm": 37.383548736572266, - "learning_rate": 4.9734953280908904e-05, - "loss": 3.7182, - "step": 1500 - }, - { - "epoch": 0.02, - "grad_norm": 45.83818435668945, - "learning_rate": 4.972306457732091e-05, - "loss": 3.7685, - "step": 1520 - }, - { - "epoch": 0.02, - "grad_norm": 35.88654327392578, - "learning_rate": 4.9710916540484265e-05, - "loss": 3.7627, - "step": 1540 - }, - { - "epoch": 0.02, - "grad_norm": 29.5416202545166, - "learning_rate": 4.96985092978261e-05, - "loss": 3.8022, - "step": 1560 - }, - { - "epoch": 0.02, - "grad_norm": 31.974184036254883, - "learning_rate": 4.968584297949255e-05, - "loss": 3.792, - "step": 1580 - }, - { - "epoch": 0.02, - "grad_norm": 32.32705307006836, - "learning_rate": 4.967291771834727e-05, - "loss": 3.7238, - "step": 1600 - }, - { - "epoch": 0.02, - "grad_norm": 29.011735916137695, - "learning_rate": 4.9659733649970155e-05, - "loss": 3.7215, - "step": 1620 - }, - { - "epoch": 0.02, - "grad_norm": 33.73636245727539, - "learning_rate": 4.9646290912655834e-05, - "loss": 3.8132, - "step": 1640 - }, - { - "epoch": 0.02, - "grad_norm": 38.57840347290039, - "learning_rate": 4.9632589647412265e-05, - "loss": 3.8606, - "step": 1660 - }, - { - "epoch": 0.02, - "grad_norm": 33.149078369140625, - "learning_rate": 4.9618629997959235e-05, - "loss": 3.7518, - "step": 1680 - }, - { - "epoch": 0.02, - "grad_norm": 58.5382194519043, - "learning_rate": 4.960441211072686e-05, - "loss": 3.7482, - "step": 1700 - }, - { - "epoch": 0.02, - "grad_norm": 31.86609649658203, - "learning_rate": 4.958993613485405e-05, - "loss": 3.7683, - "step": 1720 - }, - { - "epoch": 0.02, - "grad_norm": 28.98000144958496, - "learning_rate": 4.9575202222186945e-05, - "loss": 3.8361, - "step": 1740 - }, - { - "epoch": 0.02, - "grad_norm": 37.06975555419922, - "learning_rate": 4.956021052727731e-05, - "loss": 3.7297, - "step": 1760 - }, - { - "epoch": 0.03, - "grad_norm": 44.01863479614258, - "learning_rate": 4.954496120738094e-05, - "loss": 3.8244, - "step": 1780 - }, - { - "epoch": 0.03, - "grad_norm": 31.08086585998535, - "learning_rate": 4.9529454422455976e-05, - "loss": 3.8144, - "step": 1800 - }, - { - "epoch": 0.03, - "grad_norm": 36.80121994018555, - "learning_rate": 4.951369033516127e-05, - "loss": 3.7668, - "step": 1820 - }, - { - "epoch": 0.03, - "grad_norm": 24.225065231323242, - "learning_rate": 4.949766911085461e-05, - "loss": 3.7929, - "step": 1840 - }, - { - "epoch": 0.03, - "grad_norm": 33.50989532470703, - "learning_rate": 4.948139091759108e-05, - "loss": 3.7897, - "step": 1860 - }, - { - "epoch": 0.03, - "grad_norm": 26.35730743408203, - "learning_rate": 4.9464855926121225e-05, - "loss": 3.8618, - "step": 1880 - }, - { - "epoch": 0.03, - "grad_norm": 36.487464904785156, - "learning_rate": 4.944806430988927e-05, - "loss": 3.7205, - "step": 1900 - }, - { - "epoch": 0.03, - "grad_norm": 35.87200164794922, - "learning_rate": 4.943101624503132e-05, - "loss": 3.8324, - "step": 1920 - }, - { - "epoch": 0.03, - "grad_norm": 26.013994216918945, - "learning_rate": 4.941371191037354e-05, - "loss": 3.6997, - "step": 1940 - }, - { - "epoch": 0.03, - "grad_norm": 42.59685134887695, - "learning_rate": 4.939615148743017e-05, - "loss": 3.7085, - "step": 1960 - }, - { - "epoch": 0.03, - "grad_norm": 65.71659851074219, - "learning_rate": 4.9378335160401766e-05, - "loss": 3.8939, - "step": 1980 - }, - { - "epoch": 0.03, - "grad_norm": 25.612024307250977, - "learning_rate": 4.936026311617316e-05, - "loss": 3.7231, - "step": 2000 - }, - { - "epoch": 0.03, - "grad_norm": 28.377412796020508, - "learning_rate": 4.9341935544311536e-05, - "loss": 3.7476, - "step": 2020 - }, - { - "epoch": 0.03, - "grad_norm": 29.760807037353516, - "learning_rate": 4.9323352637064455e-05, - "loss": 3.8374, - "step": 2040 - }, - { - "epoch": 0.03, - "grad_norm": 35.875770568847656, - "learning_rate": 4.9304514589357834e-05, - "loss": 3.7073, - "step": 2060 - }, - { - "epoch": 0.03, - "grad_norm": 26.299306869506836, - "learning_rate": 4.928542159879386e-05, - "loss": 3.736, - "step": 2080 - }, - { - "epoch": 0.03, - "grad_norm": 40.1691780090332, - "learning_rate": 4.926607386564898e-05, - "loss": 3.7416, - "step": 2100 - }, - { - "epoch": 0.03, - "grad_norm": 35.2581901550293, - "learning_rate": 4.924647159287176e-05, - "loss": 3.7917, - "step": 2120 - }, - { - "epoch": 0.03, - "grad_norm": 24.038591384887695, - "learning_rate": 4.9226614986080763e-05, - "loss": 3.7164, - "step": 2140 - }, - { - "epoch": 0.03, - "grad_norm": 41.96257019042969, - "learning_rate": 4.92065042535624e-05, - "loss": 3.8562, - "step": 2160 - }, - { - "epoch": 0.03, - "grad_norm": 37.07769775390625, - "learning_rate": 4.918613960626873e-05, - "loss": 3.845, - "step": 2180 - }, - { - "epoch": 0.03, - "grad_norm": 35.35500717163086, - "learning_rate": 4.916552125781528e-05, - "loss": 3.679, - "step": 2200 - }, - { - "epoch": 0.03, - "grad_norm": 28.356767654418945, - "learning_rate": 4.914464942447876e-05, - "loss": 3.6217, - "step": 2220 - }, - { - "epoch": 0.03, - "grad_norm": 32.50172805786133, - "learning_rate": 4.912352432519484e-05, - "loss": 3.8185, - "step": 2240 - }, - { - "epoch": 0.03, - "grad_norm": 36.33710861206055, - "learning_rate": 4.910214618155579e-05, - "loss": 3.7401, - "step": 2260 - }, - { - "epoch": 0.03, - "grad_norm": 42.05067443847656, - "learning_rate": 4.908051521780824e-05, - "loss": 3.6782, - "step": 2280 - }, - { - "epoch": 0.03, - "grad_norm": 37.84385299682617, - "learning_rate": 4.9058631660850765e-05, - "loss": 3.7863, - "step": 2300 - }, - { - "epoch": 0.03, - "grad_norm": 28.022615432739258, - "learning_rate": 4.90364957402315e-05, - "loss": 3.7804, - "step": 2320 - }, - { - "epoch": 0.03, - "grad_norm": 38.274173736572266, - "learning_rate": 4.9014107688145804e-05, - "loss": 3.6898, - "step": 2340 - }, - { - "epoch": 0.03, - "grad_norm": 29.532123565673828, - "learning_rate": 4.899146773943374e-05, - "loss": 3.7521, - "step": 2360 - }, - { - "epoch": 0.03, - "grad_norm": 48.601417541503906, - "learning_rate": 4.896857613157765e-05, - "loss": 3.646, - "step": 2380 - }, - { - "epoch": 0.03, - "grad_norm": 31.142457962036133, - "learning_rate": 4.894543310469968e-05, - "loss": 3.7694, - "step": 2400 - }, - { - "epoch": 0.03, - "grad_norm": 39.75430679321289, - "learning_rate": 4.8922038901559224e-05, - "loss": 3.7673, - "step": 2420 - }, - { - "epoch": 0.03, - "grad_norm": 46.01137924194336, - "learning_rate": 4.8898393767550405e-05, - "loss": 3.7022, - "step": 2440 - }, - { - "epoch": 0.03, - "grad_norm": 26.171249389648438, - "learning_rate": 4.887449795069948e-05, - "loss": 3.7917, - "step": 2460 - }, - { - "epoch": 0.04, - "grad_norm": 46.24589538574219, - "learning_rate": 4.885035170166228e-05, - "loss": 3.7352, - "step": 2480 - }, - { - "epoch": 0.04, - "grad_norm": 31.69544219970703, - "learning_rate": 4.882595527372152e-05, - "loss": 3.694, - "step": 2500 - }, - { - "epoch": 0.04, - "grad_norm": 35.99808883666992, - "learning_rate": 4.880130892278419e-05, - "loss": 3.7636, - "step": 2520 - }, - { - "epoch": 0.04, - "grad_norm": 31.871978759765625, - "learning_rate": 4.877641290737884e-05, - "loss": 3.7472, - "step": 2540 - }, - { - "epoch": 0.04, - "grad_norm": 35.04158401489258, - "learning_rate": 4.87512674886529e-05, - "loss": 3.7445, - "step": 2560 - }, - { - "epoch": 0.04, - "grad_norm": 46.71685791015625, - "learning_rate": 4.872587293036991e-05, - "loss": 3.7141, - "step": 2580 - }, - { - "epoch": 0.04, - "grad_norm": 26.907012939453125, - "learning_rate": 4.870022949890676e-05, - "loss": 3.748, - "step": 2600 - }, - { - "epoch": 0.04, - "grad_norm": 26.9509334564209, - "learning_rate": 4.867433746325093e-05, - "loss": 3.7635, - "step": 2620 - }, - { - "epoch": 0.04, - "grad_norm": 26.85176658630371, - "learning_rate": 4.8648197094997616e-05, - "loss": 3.824, - "step": 2640 - }, - { - "epoch": 0.04, - "grad_norm": 22.88348960876465, - "learning_rate": 4.8621808668346906e-05, - "loss": 3.7504, - "step": 2660 - }, - { - "epoch": 0.04, - "grad_norm": 27.76841163635254, - "learning_rate": 4.859517246010091e-05, - "loss": 3.8228, - "step": 2680 - }, - { - "epoch": 0.04, - "grad_norm": 41.46321487426758, - "learning_rate": 4.856828874966086e-05, - "loss": 3.6509, - "step": 2700 - }, - { - "epoch": 0.04, - "grad_norm": 28.96099090576172, - "learning_rate": 4.854115781902414e-05, - "loss": 3.7377, - "step": 2720 - }, - { - "epoch": 0.04, - "grad_norm": 38.632015228271484, - "learning_rate": 4.851377995278138e-05, - "loss": 3.8471, - "step": 2740 - }, - { - "epoch": 0.04, - "grad_norm": 32.76665496826172, - "learning_rate": 4.8486155438113454e-05, - "loss": 3.731, - "step": 2760 - }, - { - "epoch": 0.04, - "grad_norm": 30.798906326293945, - "learning_rate": 4.845828456478842e-05, - "loss": 3.6953, - "step": 2780 - }, - { - "epoch": 0.04, - "grad_norm": 35.173606872558594, - "learning_rate": 4.8430167625158595e-05, - "loss": 3.6521, - "step": 2800 - }, - { - "epoch": 0.04, - "grad_norm": 50.02262496948242, - "learning_rate": 4.840180491415733e-05, - "loss": 3.6999, - "step": 2820 - }, - { - "epoch": 0.04, - "grad_norm": 33.76813507080078, - "learning_rate": 4.837319672929607e-05, - "loss": 3.7118, - "step": 2840 - }, - { - "epoch": 0.04, - "grad_norm": 24.56015396118164, - "learning_rate": 4.834434337066112e-05, - "loss": 3.7094, - "step": 2860 - }, - { - "epoch": 0.04, - "grad_norm": 39.17055892944336, - "learning_rate": 4.8315245140910556e-05, - "loss": 3.799, - "step": 2880 - }, - { - "epoch": 0.04, - "grad_norm": 29.631614685058594, - "learning_rate": 4.828590234527106e-05, - "loss": 3.7785, - "step": 2900 - }, - { - "epoch": 0.04, - "grad_norm": 46.83203125, - "learning_rate": 4.825631529153466e-05, - "loss": 3.6311, - "step": 2920 - }, - { - "epoch": 0.04, - "grad_norm": 34.5321044921875, - "learning_rate": 4.822648429005554e-05, - "loss": 3.7288, - "step": 2940 - }, - { - "epoch": 0.04, - "grad_norm": 19.74892234802246, - "learning_rate": 4.819640965374681e-05, - "loss": 3.6749, - "step": 2960 - }, - { - "epoch": 0.04, - "grad_norm": 51.736480712890625, - "learning_rate": 4.8166091698077164e-05, - "loss": 3.8733, - "step": 2980 - }, - { - "epoch": 0.04, - "grad_norm": 24.50010871887207, - "learning_rate": 4.813553074106761e-05, - "loss": 3.7634, - "step": 3000 - }, - { - "epoch": 0.04, - "grad_norm": 29.08304214477539, - "learning_rate": 4.810472710328812e-05, - "loss": 3.7277, - "step": 3020 - }, - { - "epoch": 0.04, - "grad_norm": 55.230377197265625, - "learning_rate": 4.80736811078543e-05, - "loss": 3.7238, - "step": 3040 - }, - { - "epoch": 0.04, - "grad_norm": 19.770660400390625, - "learning_rate": 4.804239308042392e-05, - "loss": 3.7202, - "step": 3060 - }, - { - "epoch": 0.04, - "grad_norm": 28.955581665039062, - "learning_rate": 4.8010863349193605e-05, - "loss": 3.7079, - "step": 3080 - }, - { - "epoch": 0.04, - "grad_norm": 32.6827278137207, - "learning_rate": 4.7979092244895305e-05, - "loss": 3.7488, - "step": 3100 - }, - { - "epoch": 0.04, - "grad_norm": 28.665210723876953, - "learning_rate": 4.794708010079289e-05, - "loss": 3.6798, - "step": 3120 - }, - { - "epoch": 0.04, - "grad_norm": 31.36636734008789, - "learning_rate": 4.791482725267857e-05, - "loss": 3.7233, - "step": 3140 - }, - { - "epoch": 0.04, - "grad_norm": 28.98109245300293, - "learning_rate": 4.7882334038869495e-05, - "loss": 3.8137, - "step": 3160 - }, - { - "epoch": 0.05, - "grad_norm": 25.13091278076172, - "learning_rate": 4.784960080020408e-05, - "loss": 3.756, - "step": 3180 - }, - { - "epoch": 0.05, - "grad_norm": 43.819313049316406, - "learning_rate": 4.781662788003851e-05, - "loss": 3.7371, - "step": 3200 - }, - { - "epoch": 0.05, - "grad_norm": 25.864599227905273, - "learning_rate": 4.7783415624243124e-05, - "loss": 3.604, - "step": 3220 - }, - { - "epoch": 0.05, - "grad_norm": 38.96342468261719, - "learning_rate": 4.7749964381198765e-05, - "loss": 3.7482, - "step": 3240 - }, - { - "epoch": 0.05, - "grad_norm": 28.412094116210938, - "learning_rate": 4.7716274501793144e-05, - "loss": 3.6766, - "step": 3260 - }, - { - "epoch": 0.05, - "grad_norm": 35.93290328979492, - "learning_rate": 4.768234633941716e-05, - "loss": 3.6659, - "step": 3280 - }, - { - "epoch": 0.05, - "grad_norm": 34.64625930786133, - "learning_rate": 4.764818024996117e-05, - "loss": 3.6739, - "step": 3300 - }, - { - "epoch": 0.05, - "grad_norm": 32.466495513916016, - "learning_rate": 4.76137765918113e-05, - "loss": 3.7524, - "step": 3320 - }, - { - "epoch": 0.05, - "grad_norm": 33.156776428222656, - "learning_rate": 4.7579135725845635e-05, - "loss": 3.7571, - "step": 3340 - }, - { - "epoch": 0.05, - "grad_norm": 48.48731994628906, - "learning_rate": 4.7544258015430463e-05, - "loss": 3.6783, - "step": 3360 - }, - { - "epoch": 0.05, - "grad_norm": 30.641870498657227, - "learning_rate": 4.750914382641648e-05, - "loss": 3.7549, - "step": 3380 - }, - { - "epoch": 0.05, - "grad_norm": 31.7097110748291, - "learning_rate": 4.747379352713489e-05, - "loss": 3.6388, - "step": 3400 - }, - { - "epoch": 0.05, - "grad_norm": 45.476951599121094, - "learning_rate": 4.7438207488393616e-05, - "loss": 3.7421, - "step": 3420 - }, - { - "epoch": 0.05, - "grad_norm": 42.40350341796875, - "learning_rate": 4.740238608347336e-05, - "loss": 3.771, - "step": 3440 - }, - { - "epoch": 0.05, - "grad_norm": 26.54286003112793, - "learning_rate": 4.736632968812373e-05, - "loss": 3.6409, - "step": 3460 - }, - { - "epoch": 0.05, - "grad_norm": 33.44880676269531, - "learning_rate": 4.733003868055923e-05, - "loss": 3.6977, - "step": 3480 - }, - { - "epoch": 0.05, - "grad_norm": 30.746978759765625, - "learning_rate": 4.7293513441455364e-05, - "loss": 3.6403, - "step": 3500 - }, - { - "epoch": 0.05, - "grad_norm": 30.616453170776367, - "learning_rate": 4.72567543539446e-05, - "loss": 3.7039, - "step": 3520 - }, - { - "epoch": 0.05, - "grad_norm": 28.486270904541016, - "learning_rate": 4.721976180361238e-05, - "loss": 3.6331, - "step": 3540 - }, - { - "epoch": 0.05, - "grad_norm": 31.4039363861084, - "learning_rate": 4.718253617849306e-05, - "loss": 3.6498, - "step": 3560 - }, - { - "epoch": 0.05, - "grad_norm": 22.35509490966797, - "learning_rate": 4.714507786906581e-05, - "loss": 3.709, - "step": 3580 - }, - { - "epoch": 0.05, - "grad_norm": 25.957500457763672, - "learning_rate": 4.710738726825059e-05, - "loss": 3.7159, - "step": 3600 - }, - { - "epoch": 0.05, - "grad_norm": 27.019580841064453, - "learning_rate": 4.706946477140396e-05, - "loss": 3.6971, - "step": 3620 - }, - { - "epoch": 0.05, - "grad_norm": 32.743896484375, - "learning_rate": 4.703131077631497e-05, - "loss": 3.5543, - "step": 3640 - }, - { - "epoch": 0.05, - "grad_norm": 30.018753051757812, - "learning_rate": 4.699292568320097e-05, - "loss": 3.6811, - "step": 3660 - }, - { - "epoch": 0.05, - "grad_norm": 27.54176139831543, - "learning_rate": 4.695430989470343e-05, - "loss": 3.6593, - "step": 3680 - }, - { - "epoch": 0.05, - "grad_norm": 30.283519744873047, - "learning_rate": 4.69154638158837e-05, - "loss": 3.551, - "step": 3700 - }, - { - "epoch": 0.05, - "grad_norm": 26.505075454711914, - "learning_rate": 4.687638785421875e-05, - "loss": 3.7794, - "step": 3720 - }, - { - "epoch": 0.05, - "grad_norm": 26.94403839111328, - "learning_rate": 4.683708241959694e-05, - "loss": 3.6415, - "step": 3740 - }, - { - "epoch": 0.05, - "grad_norm": 31.006845474243164, - "learning_rate": 4.679754792431368e-05, - "loss": 3.6741, - "step": 3760 - }, - { - "epoch": 0.05, - "grad_norm": 60.343318939208984, - "learning_rate": 4.675778478306712e-05, - "loss": 3.6502, - "step": 3780 - }, - { - "epoch": 0.05, - "grad_norm": 52.47261047363281, - "learning_rate": 4.671779341295378e-05, - "loss": 3.6878, - "step": 3800 - }, - { - "epoch": 0.05, - "grad_norm": 34.15403747558594, - "learning_rate": 4.6677574233464226e-05, - "loss": 3.7464, - "step": 3820 - }, - { - "epoch": 0.05, - "grad_norm": 21.71308135986328, - "learning_rate": 4.663712766647862e-05, - "loss": 3.6239, - "step": 3840 - }, - { - "epoch": 0.05, - "grad_norm": 25.242189407348633, - "learning_rate": 4.65964541362623e-05, - "loss": 3.8114, - "step": 3860 - }, - { - "epoch": 0.05, - "grad_norm": 35.03647232055664, - "learning_rate": 4.655555406946135e-05, - "loss": 3.654, - "step": 3880 - }, - { - "epoch": 0.06, - "grad_norm": 54.89191818237305, - "learning_rate": 4.6514427895098134e-05, - "loss": 3.6936, - "step": 3900 - }, - { - "epoch": 0.06, - "grad_norm": 24.903459548950195, - "learning_rate": 4.647307604456674e-05, - "loss": 3.8267, - "step": 3920 - }, - { - "epoch": 0.06, - "grad_norm": 33.852054595947266, - "learning_rate": 4.643149895162854e-05, - "loss": 3.661, - "step": 3940 - }, - { - "epoch": 0.06, - "grad_norm": 35.687713623046875, - "learning_rate": 4.6389697052407534e-05, - "loss": 3.67, - "step": 3960 - }, - { - "epoch": 0.06, - "grad_norm": 29.4704647064209, - "learning_rate": 4.6347670785385884e-05, - "loss": 3.7182, - "step": 3980 - }, - { - "epoch": 0.06, - "grad_norm": 24.089828491210938, - "learning_rate": 4.630542059139924e-05, - "loss": 3.5781, - "step": 4000 - }, - { - "epoch": 0.06, - "grad_norm": 34.60494613647461, - "learning_rate": 4.626294691363213e-05, - "loss": 3.7001, - "step": 4020 - }, - { - "epoch": 0.06, - "grad_norm": 53.43947219848633, - "learning_rate": 4.622025019761336e-05, - "loss": 3.6048, - "step": 4040 - }, - { - "epoch": 0.06, - "grad_norm": 35.322486877441406, - "learning_rate": 4.617733089121127e-05, - "loss": 3.6201, - "step": 4060 - }, - { - "epoch": 0.06, - "grad_norm": 47.170005798339844, - "learning_rate": 4.613418944462907e-05, - "loss": 3.7443, - "step": 4080 - }, - { - "epoch": 0.06, - "grad_norm": 30.616161346435547, - "learning_rate": 4.6090826310400116e-05, - "loss": 3.7685, - "step": 4100 - }, - { - "epoch": 0.06, - "grad_norm": 24.628185272216797, - "learning_rate": 4.6047241943383176e-05, - "loss": 3.6677, - "step": 4120 - }, - { - "epoch": 0.06, - "grad_norm": 38.79618453979492, - "learning_rate": 4.600343680075764e-05, - "loss": 3.744, - "step": 4140 - }, - { - "epoch": 0.06, - "grad_norm": 37.38518524169922, - "learning_rate": 4.595941134201871e-05, - "loss": 3.7101, - "step": 4160 - }, - { - "epoch": 0.06, - "grad_norm": 29.248828887939453, - "learning_rate": 4.5915166028972624e-05, - "loss": 3.7209, - "step": 4180 - }, - { - "epoch": 0.06, - "grad_norm": 45.65785217285156, - "learning_rate": 4.587070132573178e-05, - "loss": 3.7605, - "step": 4200 - }, - { - "epoch": 0.06, - "grad_norm": 24.220314025878906, - "learning_rate": 4.582601769870988e-05, - "loss": 3.6609, - "step": 4220 - }, - { - "epoch": 0.06, - "grad_norm": 27.00070571899414, - "learning_rate": 4.578111561661702e-05, - "loss": 3.6754, - "step": 4240 - }, - { - "epoch": 0.06, - "grad_norm": 75.85283660888672, - "learning_rate": 4.573599555045479e-05, - "loss": 3.6605, - "step": 4260 - }, - { - "epoch": 0.06, - "grad_norm": 29.803096771240234, - "learning_rate": 4.569065797351135e-05, - "loss": 3.6287, - "step": 4280 - }, - { - "epoch": 0.06, - "grad_norm": 26.4781494140625, - "learning_rate": 4.5645103361356415e-05, - "loss": 3.6301, - "step": 4300 - }, - { - "epoch": 0.06, - "grad_norm": 47.245357513427734, - "learning_rate": 4.5599332191836316e-05, - "loss": 3.6776, - "step": 4320 - }, - { - "epoch": 0.06, - "grad_norm": 26.005104064941406, - "learning_rate": 4.555334494506896e-05, - "loss": 3.6756, - "step": 4340 - }, - { - "epoch": 0.06, - "grad_norm": 35.15077590942383, - "learning_rate": 4.5507142103438794e-05, - "loss": 3.7022, - "step": 4360 - }, - { - "epoch": 0.06, - "grad_norm": 29.038782119750977, - "learning_rate": 4.546072415159179e-05, - "loss": 3.6325, - "step": 4380 - }, - { - "epoch": 0.06, - "grad_norm": 30.944393157958984, - "learning_rate": 4.541409157643027e-05, - "loss": 3.6343, - "step": 4400 - }, - { - "epoch": 0.06, - "grad_norm": 31.153432846069336, - "learning_rate": 4.536724486710791e-05, - "loss": 3.7739, - "step": 4420 - }, - { - "epoch": 0.06, - "grad_norm": 40.95075225830078, - "learning_rate": 4.53201845150245e-05, - "loss": 3.6558, - "step": 4440 - }, - { - "epoch": 0.06, - "grad_norm": 30.37499237060547, - "learning_rate": 4.5272911013820876e-05, - "loss": 3.6093, - "step": 4460 - }, - { - "epoch": 0.06, - "grad_norm": 23.894237518310547, - "learning_rate": 4.522542485937369e-05, - "loss": 3.6415, - "step": 4480 - }, - { - "epoch": 0.06, - "grad_norm": 69.29508209228516, - "learning_rate": 4.517772654979023e-05, - "loss": 3.696, - "step": 4500 - }, - { - "epoch": 0.06, - "grad_norm": 31.464527130126953, - "learning_rate": 4.5129816585403206e-05, - "loss": 3.7147, - "step": 4520 - }, - { - "epoch": 0.06, - "grad_norm": 30.76380729675293, - "learning_rate": 4.508169546876547e-05, - "loss": 3.6428, - "step": 4540 - }, - { - "epoch": 0.06, - "grad_norm": 27.94367027282715, - "learning_rate": 4.503336370464476e-05, - "loss": 3.7018, - "step": 4560 - }, - { - "epoch": 0.06, - "grad_norm": 22.166793823242188, - "learning_rate": 4.49848218000184e-05, - "loss": 3.7018, - "step": 4580 - }, - { - "epoch": 0.07, - "grad_norm": 32.058921813964844, - "learning_rate": 4.493607026406802e-05, - "loss": 3.7035, - "step": 4600 - }, - { - "epoch": 0.07, - "grad_norm": 28.55988311767578, - "learning_rate": 4.488710960817416e-05, - "loss": 3.7725, - "step": 4620 - }, - { - "epoch": 0.07, - "grad_norm": 23.51280403137207, - "learning_rate": 4.4837940345910925e-05, - "loss": 3.7238, - "step": 4640 - }, - { - "epoch": 0.07, - "grad_norm": 37.3757209777832, - "learning_rate": 4.4788562993040614e-05, - "loss": 3.701, - "step": 4660 - }, - { - "epoch": 0.07, - "grad_norm": 38.56554412841797, - "learning_rate": 4.473897806750829e-05, - "loss": 3.7174, - "step": 4680 - }, - { - "epoch": 0.07, - "grad_norm": 29.553325653076172, - "learning_rate": 4.4689186089436366e-05, - "loss": 3.627, - "step": 4700 - }, - { - "epoch": 0.07, - "grad_norm": 33.66290283203125, - "learning_rate": 4.463918758111912e-05, - "loss": 3.6307, - "step": 4720 - }, - { - "epoch": 0.07, - "grad_norm": 29.957775115966797, - "learning_rate": 4.4588983067017257e-05, - "loss": 3.6157, - "step": 4740 - }, - { - "epoch": 0.07, - "grad_norm": 35.32748794555664, - "learning_rate": 4.4538573073752365e-05, - "loss": 3.5961, - "step": 4760 - }, - { - "epoch": 0.07, - "grad_norm": 24.597824096679688, - "learning_rate": 4.448795813010142e-05, - "loss": 3.5881, - "step": 4780 - }, - { - "epoch": 0.07, - "grad_norm": 26.248044967651367, - "learning_rate": 4.443713876699124e-05, - "loss": 3.6057, - "step": 4800 - }, - { - "epoch": 0.07, - "grad_norm": 25.942325592041016, - "learning_rate": 4.4386115517492874e-05, - "loss": 3.6286, - "step": 4820 - }, - { - "epoch": 0.07, - "grad_norm": 42.028316497802734, - "learning_rate": 4.43348889168161e-05, - "loss": 3.6306, - "step": 4840 - }, - { - "epoch": 0.07, - "grad_norm": 24.317644119262695, - "learning_rate": 4.4283459502303695e-05, - "loss": 3.5992, - "step": 4860 - }, - { - "epoch": 0.07, - "grad_norm": 43.174903869628906, - "learning_rate": 4.4231827813425885e-05, - "loss": 3.6493, - "step": 4880 - }, - { - "epoch": 0.07, - "grad_norm": 33.58101272583008, - "learning_rate": 4.417999439177466e-05, - "loss": 3.6843, - "step": 4900 - }, - { - "epoch": 0.07, - "grad_norm": 34.096824645996094, - "learning_rate": 4.412795978105807e-05, - "loss": 3.6134, - "step": 4920 - }, - { - "epoch": 0.07, - "grad_norm": 35.04353713989258, - "learning_rate": 4.4075724527094584e-05, - "loss": 3.5916, - "step": 4940 - }, - { - "epoch": 0.07, - "grad_norm": 28.97658920288086, - "learning_rate": 4.402328917780728e-05, - "loss": 3.6362, - "step": 4960 - }, - { - "epoch": 0.07, - "grad_norm": 35.05881118774414, - "learning_rate": 4.397065428321817e-05, - "loss": 3.7566, - "step": 4980 - }, - { - "epoch": 0.07, - "grad_norm": 27.057044982910156, - "learning_rate": 4.391782039544238e-05, - "loss": 3.4967, - "step": 5000 - }, - { - "epoch": 0.07, - "grad_norm": 22.590089797973633, - "learning_rate": 4.386478806868241e-05, - "loss": 3.6759, - "step": 5020 - }, - { - "epoch": 0.07, - "grad_norm": 34.77460479736328, - "learning_rate": 4.3811557859222254e-05, - "loss": 3.6893, - "step": 5040 - }, - { - "epoch": 0.07, - "grad_norm": 24.440248489379883, - "learning_rate": 4.375813032542164e-05, - "loss": 3.7167, - "step": 5060 - }, - { - "epoch": 0.07, - "grad_norm": 42.91717529296875, - "learning_rate": 4.3704506027710105e-05, - "loss": 3.5893, - "step": 5080 - }, - { - "epoch": 0.07, - "grad_norm": 34.991634368896484, - "learning_rate": 4.365068552858115e-05, - "loss": 3.5482, - "step": 5100 - }, - { - "epoch": 0.07, - "grad_norm": 37.62036895751953, - "learning_rate": 4.3596669392586365e-05, - "loss": 3.5972, - "step": 5120 - }, - { - "epoch": 0.07, - "grad_norm": 29.56283950805664, - "learning_rate": 4.354245818632944e-05, - "loss": 3.6804, - "step": 5140 - }, - { - "epoch": 0.07, - "grad_norm": 35.37843322753906, - "learning_rate": 4.348805247846027e-05, - "loss": 3.6491, - "step": 5160 - }, - { - "epoch": 0.07, - "grad_norm": 39.210906982421875, - "learning_rate": 4.343345283966901e-05, - "loss": 3.6268, - "step": 5180 - }, - { - "epoch": 0.07, - "grad_norm": 26.60144805908203, - "learning_rate": 4.337865984268001e-05, - "loss": 3.6277, - "step": 5200 - }, - { - "epoch": 0.07, - "grad_norm": 32.668052673339844, - "learning_rate": 4.33236740622459e-05, - "loss": 3.6159, - "step": 5220 - }, - { - "epoch": 0.07, - "grad_norm": 43.837833404541016, - "learning_rate": 4.326849607514148e-05, - "loss": 3.5939, - "step": 5240 - }, - { - "epoch": 0.07, - "grad_norm": 20.860111236572266, - "learning_rate": 4.321312646015775e-05, - "loss": 3.624, - "step": 5260 - }, - { - "epoch": 0.07, - "grad_norm": 24.005277633666992, - "learning_rate": 4.3157565798095753e-05, - "loss": 3.6098, - "step": 5280 - }, - { - "epoch": 0.08, - "grad_norm": 23.65524673461914, - "learning_rate": 4.3101814671760546e-05, - "loss": 3.6969, - "step": 5300 - }, - { - "epoch": 0.08, - "grad_norm": 40.98033905029297, - "learning_rate": 4.304587366595506e-05, - "loss": 3.8225, - "step": 5320 - }, - { - "epoch": 0.08, - "grad_norm": 28.647207260131836, - "learning_rate": 4.298974336747397e-05, - "loss": 3.6742, - "step": 5340 - }, - { - "epoch": 0.08, - "grad_norm": 20.806941986083984, - "learning_rate": 4.2933424365097564e-05, - "loss": 3.5679, - "step": 5360 - }, - { - "epoch": 0.08, - "grad_norm": 22.459196090698242, - "learning_rate": 4.287691724958551e-05, - "loss": 3.6389, - "step": 5380 - }, - { - "epoch": 0.08, - "grad_norm": 23.558490753173828, - "learning_rate": 4.2820222613670736e-05, - "loss": 3.6654, - "step": 5400 - }, - { - "epoch": 0.08, - "grad_norm": 20.315793991088867, - "learning_rate": 4.276334105205312e-05, - "loss": 3.5976, - "step": 5420 - }, - { - "epoch": 0.08, - "grad_norm": 21.125396728515625, - "learning_rate": 4.2706273161393327e-05, - "loss": 3.5712, - "step": 5440 - }, - { - "epoch": 0.08, - "grad_norm": 25.103483200073242, - "learning_rate": 4.2649019540306545e-05, - "loss": 3.616, - "step": 5460 - }, - { - "epoch": 0.08, - "grad_norm": 23.65394401550293, - "learning_rate": 4.2591580789356156e-05, - "loss": 3.6587, - "step": 5480 - }, - { - "epoch": 0.08, - "grad_norm": 31.216896057128906, - "learning_rate": 4.253395751104748e-05, - "loss": 3.7161, - "step": 5500 - }, - { - "epoch": 0.08, - "grad_norm": 28.144855499267578, - "learning_rate": 4.247615030982144e-05, - "loss": 3.6847, - "step": 5520 - }, - { - "epoch": 0.08, - "grad_norm": 23.597564697265625, - "learning_rate": 4.241815979204822e-05, - "loss": 3.6556, - "step": 5540 - }, - { - "epoch": 0.08, - "grad_norm": 41.00291061401367, - "learning_rate": 4.2359986566020906e-05, - "loss": 3.7665, - "step": 5560 - }, - { - "epoch": 0.08, - "grad_norm": 37.05702209472656, - "learning_rate": 4.230163124194913e-05, - "loss": 3.5916, - "step": 5580 - }, - { - "epoch": 0.08, - "grad_norm": 28.161930084228516, - "learning_rate": 4.224309443195261e-05, - "loss": 3.6887, - "step": 5600 - }, - { - "epoch": 0.08, - "grad_norm": 31.685361862182617, - "learning_rate": 4.2184376750054786e-05, - "loss": 3.5724, - "step": 5620 - }, - { - "epoch": 0.08, - "grad_norm": 38.13533020019531, - "learning_rate": 4.2125478812176364e-05, - "loss": 3.664, - "step": 5640 - }, - { - "epoch": 0.08, - "grad_norm": 20.385272979736328, - "learning_rate": 4.206640123612884e-05, - "loss": 3.73, - "step": 5660 - }, - { - "epoch": 0.08, - "grad_norm": 30.926259994506836, - "learning_rate": 4.200714464160804e-05, - "loss": 3.6472, - "step": 5680 - }, - { - "epoch": 0.08, - "grad_norm": 19.820131301879883, - "learning_rate": 4.194770965018758e-05, - "loss": 3.6226, - "step": 5700 - }, - { - "epoch": 0.08, - "grad_norm": 21.318801879882812, - "learning_rate": 4.188809688531241e-05, - "loss": 3.635, - "step": 5720 - }, - { - "epoch": 0.08, - "grad_norm": 18.304567337036133, - "learning_rate": 4.182830697229223e-05, - "loss": 3.625, - "step": 5740 - }, - { - "epoch": 0.08, - "grad_norm": 24.25802230834961, - "learning_rate": 4.176834053829492e-05, - "loss": 3.5844, - "step": 5760 - }, - { - "epoch": 0.08, - "grad_norm": 53.09843444824219, - "learning_rate": 4.170819821234001e-05, - "loss": 3.7058, - "step": 5780 - }, - { - "epoch": 0.08, - "grad_norm": 39.87876510620117, - "learning_rate": 4.164788062529203e-05, - "loss": 3.725, - "step": 5800 - }, - { - "epoch": 0.08, - "grad_norm": 32.36482620239258, - "learning_rate": 4.1587388409853935e-05, - "loss": 3.5355, - "step": 5820 - }, - { - "epoch": 0.08, - "grad_norm": 28.59760284423828, - "learning_rate": 4.1526722200560445e-05, - "loss": 3.6528, - "step": 5840 - }, - { - "epoch": 0.08, - "grad_norm": 21.3729305267334, - "learning_rate": 4.146588263377137e-05, - "loss": 3.6428, - "step": 5860 - }, - { - "epoch": 0.08, - "grad_norm": 20.160661697387695, - "learning_rate": 4.140487034766499e-05, - "loss": 3.6116, - "step": 5880 - }, - { - "epoch": 0.08, - "grad_norm": 31.58021354675293, - "learning_rate": 4.134368598223132e-05, - "loss": 3.6302, - "step": 5900 - }, - { - "epoch": 0.08, - "grad_norm": 30.793672561645508, - "learning_rate": 4.128233017926538e-05, - "loss": 3.5663, - "step": 5920 - }, - { - "epoch": 0.08, - "grad_norm": 22.589147567749023, - "learning_rate": 4.122080358236055e-05, - "loss": 3.6292, - "step": 5940 - }, - { - "epoch": 0.08, - "grad_norm": 32.27565383911133, - "learning_rate": 4.1159106836901674e-05, - "loss": 3.5806, - "step": 5960 - }, - { - "epoch": 0.08, - "grad_norm": 37.15829849243164, - "learning_rate": 4.109724059005844e-05, - "loss": 3.5662, - "step": 5980 - }, - { - "epoch": 0.08, - "grad_norm": 38.23238754272461, - "learning_rate": 4.10352054907785e-05, - "loss": 3.6842, - "step": 6000 - }, - { - "epoch": 0.09, - "grad_norm": 24.37531089782715, - "learning_rate": 4.0973002189780694e-05, - "loss": 3.6153, - "step": 6020 - }, - { - "epoch": 0.09, - "grad_norm": 24.309982299804688, - "learning_rate": 4.0910631339548206e-05, - "loss": 3.6502, - "step": 6040 - }, - { - "epoch": 0.09, - "grad_norm": 24.007654190063477, - "learning_rate": 4.084809359432175e-05, - "loss": 3.7203, - "step": 6060 - }, - { - "epoch": 0.09, - "grad_norm": 24.977094650268555, - "learning_rate": 4.0785389610092686e-05, - "loss": 3.5413, - "step": 6080 - }, - { - "epoch": 0.09, - "grad_norm": 27.397930145263672, - "learning_rate": 4.072252004459611e-05, - "loss": 3.5612, - "step": 6100 - }, - { - "epoch": 0.09, - "grad_norm": 26.012800216674805, - "learning_rate": 4.065948555730405e-05, - "loss": 3.6385, - "step": 6120 - }, - { - "epoch": 0.09, - "grad_norm": 29.745574951171875, - "learning_rate": 4.0596286809418435e-05, - "loss": 3.6646, - "step": 6140 - }, - { - "epoch": 0.09, - "grad_norm": 30.76190185546875, - "learning_rate": 4.053292446386422e-05, - "loss": 3.6622, - "step": 6160 - }, - { - "epoch": 0.09, - "grad_norm": 27.577564239501953, - "learning_rate": 4.046939918528243e-05, - "loss": 3.701, - "step": 6180 - }, - { - "epoch": 0.09, - "grad_norm": 31.610410690307617, - "learning_rate": 4.0405711640023186e-05, - "loss": 3.5977, - "step": 6200 - }, - { - "epoch": 0.09, - "grad_norm": 28.61423110961914, - "learning_rate": 4.034186249613869e-05, - "loss": 3.7307, - "step": 6220 - }, - { - "epoch": 0.09, - "grad_norm": 44.62327575683594, - "learning_rate": 4.027785242337626e-05, - "loss": 3.7055, - "step": 6240 - }, - { - "epoch": 0.09, - "grad_norm": 32.20371627807617, - "learning_rate": 4.0213682093171254e-05, - "loss": 3.6186, - "step": 6260 - }, - { - "epoch": 0.09, - "grad_norm": 32.36015701293945, - "learning_rate": 4.014935217864009e-05, - "loss": 3.5798, - "step": 6280 - }, - { - "epoch": 0.09, - "grad_norm": 36.1356201171875, - "learning_rate": 4.008486335457312e-05, - "loss": 3.6395, - "step": 6300 - }, - { - "epoch": 0.09, - "grad_norm": 20.485820770263672, - "learning_rate": 4.0020216297427594e-05, - "loss": 3.6075, - "step": 6320 - }, - { - "epoch": 0.09, - "grad_norm": 21.503564834594727, - "learning_rate": 3.995541168532055e-05, - "loss": 3.6099, - "step": 6340 - }, - { - "epoch": 0.09, - "grad_norm": 29.125812530517578, - "learning_rate": 3.9890450198021704e-05, - "loss": 3.6665, - "step": 6360 - }, - { - "epoch": 0.09, - "grad_norm": 24.479976654052734, - "learning_rate": 3.982533251694632e-05, - "loss": 3.7168, - "step": 6380 - }, - { - "epoch": 0.09, - "grad_norm": 36.184410095214844, - "learning_rate": 3.976005932514807e-05, - "loss": 3.5771, - "step": 6400 - }, - { - "epoch": 0.09, - "grad_norm": 28.156030654907227, - "learning_rate": 3.969463130731183e-05, - "loss": 3.6353, - "step": 6420 - }, - { - "epoch": 0.09, - "grad_norm": 25.22379493713379, - "learning_rate": 3.962904914974656e-05, - "loss": 3.5015, - "step": 6440 - }, - { - "epoch": 0.09, - "grad_norm": 31.427339553833008, - "learning_rate": 3.9563313540378055e-05, - "loss": 3.5712, - "step": 6460 - }, - { - "epoch": 0.09, - "grad_norm": 19.2696590423584, - "learning_rate": 3.949742516874175e-05, - "loss": 3.5929, - "step": 6480 - }, - { - "epoch": 0.09, - "grad_norm": 23.234111785888672, - "learning_rate": 3.943138472597549e-05, - "loss": 3.6166, - "step": 6500 - }, - { - "epoch": 0.09, - "grad_norm": 26.726085662841797, - "learning_rate": 3.936519290481226e-05, - "loss": 3.6748, - "step": 6520 - }, - { - "epoch": 0.09, - "grad_norm": 34.712257385253906, - "learning_rate": 3.929885039957296e-05, - "loss": 3.64, - "step": 6540 - }, - { - "epoch": 0.09, - "grad_norm": 25.96158218383789, - "learning_rate": 3.923235790615907e-05, - "loss": 3.6119, - "step": 6560 - }, - { - "epoch": 0.09, - "grad_norm": 34.04408264160156, - "learning_rate": 3.916571612204537e-05, - "loss": 3.6881, - "step": 6580 - }, - { - "epoch": 0.09, - "grad_norm": 20.656030654907227, - "learning_rate": 3.909892574627266e-05, - "loss": 3.6462, - "step": 6600 - }, - { - "epoch": 0.09, - "grad_norm": 23.8648738861084, - "learning_rate": 3.9031987479440367e-05, - "loss": 3.597, - "step": 6620 - }, - { - "epoch": 0.09, - "grad_norm": 34.48773193359375, - "learning_rate": 3.896490202369924e-05, - "loss": 3.5781, - "step": 6640 - }, - { - "epoch": 0.09, - "grad_norm": 26.09375762939453, - "learning_rate": 3.8897670082743955e-05, - "loss": 3.5463, - "step": 6660 - }, - { - "epoch": 0.09, - "grad_norm": 25.49962615966797, - "learning_rate": 3.883029236180577e-05, - "loss": 3.637, - "step": 6680 - }, - { - "epoch": 0.09, - "grad_norm": 37.70731735229492, - "learning_rate": 3.876276956764509e-05, - "loss": 3.6515, - "step": 6700 - }, - { - "epoch": 0.1, - "grad_norm": 53.345558166503906, - "learning_rate": 3.8695102408544076e-05, - "loss": 3.521, - "step": 6720 - }, - { - "epoch": 0.1, - "grad_norm": 34.147884368896484, - "learning_rate": 3.862729159429921e-05, - "loss": 3.6443, - "step": 6740 - }, - { - "epoch": 0.1, - "grad_norm": 29.45001220703125, - "learning_rate": 3.855933783621384e-05, - "loss": 3.5976, - "step": 6760 - }, - { - "epoch": 0.1, - "grad_norm": 29.933969497680664, - "learning_rate": 3.849124184709073e-05, - "loss": 3.6396, - "step": 6780 - }, - { - "epoch": 0.1, - "grad_norm": 38.85334014892578, - "learning_rate": 3.84230043412246e-05, - "loss": 3.6518, - "step": 6800 - }, - { - "epoch": 0.1, - "grad_norm": 34.85492706298828, - "learning_rate": 3.835462603439458e-05, - "loss": 3.6577, - "step": 6820 - }, - { - "epoch": 0.1, - "grad_norm": 29.77360725402832, - "learning_rate": 3.828610764385676e-05, - "loss": 3.6026, - "step": 6840 - }, - { - "epoch": 0.1, - "grad_norm": 38.89609909057617, - "learning_rate": 3.821744988833663e-05, - "loss": 3.6144, - "step": 6860 - }, - { - "epoch": 0.1, - "grad_norm": 25.664960861206055, - "learning_rate": 3.814865348802157e-05, - "loss": 3.5826, - "step": 6880 - }, - { - "epoch": 0.1, - "grad_norm": 31.955894470214844, - "learning_rate": 3.807971916455325e-05, - "loss": 3.6973, - "step": 6900 - }, - { - "epoch": 0.1, - "grad_norm": 23.378131866455078, - "learning_rate": 3.8010647641020115e-05, - "loss": 3.6875, - "step": 6920 - }, - { - "epoch": 0.1, - "grad_norm": 45.89334487915039, - "learning_rate": 3.794143964194976e-05, - "loss": 3.5457, - "step": 6940 - }, - { - "epoch": 0.1, - "grad_norm": 32.45075988769531, - "learning_rate": 3.787209589330134e-05, - "loss": 3.5719, - "step": 6960 - }, - { - "epoch": 0.1, - "grad_norm": 32.06966018676758, - "learning_rate": 3.7802617122457975e-05, - "loss": 3.6324, - "step": 6980 - }, - { - "epoch": 0.1, - "grad_norm": 27.93364715576172, - "learning_rate": 3.773300405821908e-05, - "loss": 3.6093, - "step": 7000 - }, - { - "epoch": 0.1, - "grad_norm": 23.111515045166016, - "learning_rate": 3.766325743079277e-05, - "loss": 3.5292, - "step": 7020 - }, - { - "epoch": 0.1, - "grad_norm": 24.405742645263672, - "learning_rate": 3.759337797178816e-05, - "loss": 3.5969, - "step": 7040 - }, - { - "epoch": 0.1, - "grad_norm": 37.218467712402344, - "learning_rate": 3.752336641420772e-05, - "loss": 3.653, - "step": 7060 - }, - { - "epoch": 0.1, - "grad_norm": 32.396522521972656, - "learning_rate": 3.745322349243954e-05, - "loss": 3.6483, - "step": 7080 - }, - { - "epoch": 0.1, - "grad_norm": 35.53373336791992, - "learning_rate": 3.7382949942249694e-05, - "loss": 3.6356, - "step": 7100 - }, - { - "epoch": 0.1, - "grad_norm": 33.19758987426758, - "learning_rate": 3.731254650077446e-05, - "loss": 3.6017, - "step": 7120 - }, - { - "epoch": 0.1, - "grad_norm": 36.60466003417969, - "learning_rate": 3.7242013906512626e-05, - "loss": 3.6246, - "step": 7140 - }, - { - "epoch": 0.1, - "grad_norm": 21.257328033447266, - "learning_rate": 3.717135289931774e-05, - "loss": 3.6046, - "step": 7160 - }, - { - "epoch": 0.1, - "grad_norm": 25.697444915771484, - "learning_rate": 3.7100564220390326e-05, - "loss": 3.6154, - "step": 7180 - }, - { - "epoch": 0.1, - "grad_norm": 28.491622924804688, - "learning_rate": 3.702964861227013e-05, - "loss": 3.6983, - "step": 7200 - }, - { - "epoch": 0.1, - "grad_norm": 26.819791793823242, - "learning_rate": 3.695860681882832e-05, - "loss": 3.5722, - "step": 7220 - }, - { - "epoch": 0.1, - "grad_norm": 25.864788055419922, - "learning_rate": 3.6887439585259694e-05, - "loss": 3.6825, - "step": 7240 - }, - { - "epoch": 0.1, - "grad_norm": 22.492717742919922, - "learning_rate": 3.681614765807486e-05, - "loss": 3.6377, - "step": 7260 - }, - { - "epoch": 0.1, - "grad_norm": 31.227336883544922, - "learning_rate": 3.6744731785092395e-05, - "loss": 3.5476, - "step": 7280 - }, - { - "epoch": 0.1, - "grad_norm": 29.010467529296875, - "learning_rate": 3.6673192715431015e-05, - "loss": 3.6285, - "step": 7300 - }, - { - "epoch": 0.1, - "grad_norm": 38.40274429321289, - "learning_rate": 3.6601531199501714e-05, - "loss": 3.6941, - "step": 7320 - }, - { - "epoch": 0.1, - "grad_norm": 26.361167907714844, - "learning_rate": 3.652974798899988e-05, - "loss": 3.5772, - "step": 7340 - }, - { - "epoch": 0.1, - "grad_norm": 30.241390228271484, - "learning_rate": 3.645784383689742e-05, - "loss": 3.5177, - "step": 7360 - }, - { - "epoch": 0.1, - "grad_norm": 43.579349517822266, - "learning_rate": 3.6385819497434876e-05, - "loss": 3.7467, - "step": 7380 - }, - { - "epoch": 0.1, - "grad_norm": 47.42546081542969, - "learning_rate": 3.631367572611348e-05, - "loss": 3.6651, - "step": 7400 - }, - { - "epoch": 0.11, - "grad_norm": 29.63494110107422, - "learning_rate": 3.6241413279687254e-05, - "loss": 3.6368, - "step": 7420 - }, - { - "epoch": 0.11, - "grad_norm": 23.63068389892578, - "learning_rate": 3.616903291615506e-05, - "loss": 3.4684, - "step": 7440 - }, - { - "epoch": 0.11, - "grad_norm": 24.25609588623047, - "learning_rate": 3.6096535394752676e-05, - "loss": 3.6177, - "step": 7460 - }, - { - "epoch": 0.11, - "grad_norm": 23.829919815063477, - "learning_rate": 3.6023921475944794e-05, - "loss": 3.6008, - "step": 7480 - }, - { - "epoch": 0.11, - "grad_norm": 23.879764556884766, - "learning_rate": 3.595119192141706e-05, - "loss": 3.6926, - "step": 7500 - }, - { - "epoch": 0.11, - "grad_norm": 22.195941925048828, - "learning_rate": 3.5878347494068084e-05, - "loss": 3.6049, - "step": 7520 - }, - { - "epoch": 0.11, - "grad_norm": 48.3228645324707, - "learning_rate": 3.580538895800144e-05, - "loss": 3.64, - "step": 7540 - }, - { - "epoch": 0.11, - "grad_norm": 33.77362823486328, - "learning_rate": 3.5732317078517654e-05, - "loss": 3.573, - "step": 7560 - }, - { - "epoch": 0.11, - "grad_norm": 29.266658782958984, - "learning_rate": 3.565913262210615e-05, - "loss": 3.6385, - "step": 7580 - }, - { - "epoch": 0.11, - "grad_norm": 42.985694885253906, - "learning_rate": 3.5585836356437264e-05, - "loss": 3.5987, - "step": 7600 - }, - { - "epoch": 0.11, - "grad_norm": 28.579496383666992, - "learning_rate": 3.551242905035412e-05, - "loss": 3.6161, - "step": 7620 - }, - { - "epoch": 0.11, - "grad_norm": 27.196502685546875, - "learning_rate": 3.5438911473864634e-05, - "loss": 3.5763, - "step": 7640 - }, - { - "epoch": 0.11, - "grad_norm": 28.27582359313965, - "learning_rate": 3.5365284398133405e-05, - "loss": 3.6452, - "step": 7660 - }, - { - "epoch": 0.11, - "grad_norm": 27.310009002685547, - "learning_rate": 3.52915485954736e-05, - "loss": 3.6718, - "step": 7680 - }, - { - "epoch": 0.11, - "grad_norm": 18.603565216064453, - "learning_rate": 3.521770483933891e-05, - "loss": 3.7397, - "step": 7700 - }, - { - "epoch": 0.11, - "grad_norm": 26.25426483154297, - "learning_rate": 3.514375390431539e-05, - "loss": 3.6665, - "step": 7720 - }, - { - "epoch": 0.11, - "grad_norm": 29.20294952392578, - "learning_rate": 3.506969656611335e-05, - "loss": 3.551, - "step": 7740 - }, - { - "epoch": 0.11, - "grad_norm": 37.7564697265625, - "learning_rate": 3.4995533601559226e-05, - "loss": 3.58, - "step": 7760 - }, - { - "epoch": 0.11, - "grad_norm": 25.87001609802246, - "learning_rate": 3.4921265788587435e-05, - "loss": 3.5855, - "step": 7780 - }, - { - "epoch": 0.11, - "grad_norm": 26.17401123046875, - "learning_rate": 3.484689390623218e-05, - "loss": 3.5951, - "step": 7800 - }, - { - "epoch": 0.11, - "grad_norm": 29.20701026916504, - "learning_rate": 3.4772418734619324e-05, - "loss": 3.6267, - "step": 7820 - }, - { - "epoch": 0.11, - "grad_norm": 60.92488098144531, - "learning_rate": 3.4697841054958165e-05, - "loss": 3.5733, - "step": 7840 - }, - { - "epoch": 0.11, - "grad_norm": 23.196178436279297, - "learning_rate": 3.462316164953328e-05, - "loss": 3.6283, - "step": 7860 - }, - { - "epoch": 0.11, - "grad_norm": 23.13970184326172, - "learning_rate": 3.45483813016963e-05, - "loss": 3.6558, - "step": 7880 - }, - { - "epoch": 0.11, - "grad_norm": 36.5677375793457, - "learning_rate": 3.447350079585767e-05, - "loss": 3.8141, - "step": 7900 - }, - { - "epoch": 0.11, - "grad_norm": 24.820940017700195, - "learning_rate": 3.4398520917478476e-05, - "loss": 3.6439, - "step": 7920 - }, - { - "epoch": 0.11, - "grad_norm": 19.9990291595459, - "learning_rate": 3.4323442453062174e-05, - "loss": 3.601, - "step": 7940 - }, - { - "epoch": 0.11, - "grad_norm": 20.419004440307617, - "learning_rate": 3.42482661901463e-05, - "loss": 3.4856, - "step": 7960 - }, - { - "epoch": 0.11, - "grad_norm": 24.06426429748535, - "learning_rate": 3.417299291729431e-05, - "loss": 3.679, - "step": 7980 - }, - { - "epoch": 0.11, - "grad_norm": 23.68332862854004, - "learning_rate": 3.409762342408719e-05, - "loss": 3.6538, - "step": 8000 - }, - { - "epoch": 0.11, - "grad_norm": 22.80304527282715, - "learning_rate": 3.402215850111528e-05, - "loss": 3.6685, - "step": 8020 - }, - { - "epoch": 0.11, - "grad_norm": 30.03902244567871, - "learning_rate": 3.3946598939969896e-05, - "loss": 3.633, - "step": 8040 - }, - { - "epoch": 0.11, - "grad_norm": 31.799922943115234, - "learning_rate": 3.38709455332351e-05, - "loss": 3.5756, - "step": 8060 - }, - { - "epoch": 0.11, - "grad_norm": 29.18169403076172, - "learning_rate": 3.379519907447931e-05, - "loss": 3.5886, - "step": 8080 - }, - { - "epoch": 0.11, - "grad_norm": 34.412113189697266, - "learning_rate": 3.3719360358247054e-05, - "loss": 3.5254, - "step": 8100 - }, - { - "epoch": 0.11, - "grad_norm": 38.046695709228516, - "learning_rate": 3.3643430180050574e-05, - "loss": 3.6677, - "step": 8120 - }, - { - "epoch": 0.12, - "grad_norm": 23.16988182067871, - "learning_rate": 3.35674093363615e-05, - "loss": 3.5864, - "step": 8140 - }, - { - "epoch": 0.12, - "grad_norm": 59.21152114868164, - "learning_rate": 3.349129862460251e-05, - "loss": 3.4903, - "step": 8160 - }, - { - "epoch": 0.12, - "grad_norm": 21.080909729003906, - "learning_rate": 3.341509884313897e-05, - "loss": 3.5803, - "step": 8180 - }, - { - "epoch": 0.12, - "grad_norm": 26.221805572509766, - "learning_rate": 3.333881079127052e-05, - "loss": 3.5238, - "step": 8200 - }, - { - "epoch": 0.12, - "grad_norm": 21.8948917388916, - "learning_rate": 3.326243526922272e-05, - "loss": 3.5498, - "step": 8220 - }, - { - "epoch": 0.12, - "grad_norm": 29.98341178894043, - "learning_rate": 3.3185973078138664e-05, - "loss": 3.6218, - "step": 8240 - }, - { - "epoch": 0.12, - "grad_norm": 21.86969757080078, - "learning_rate": 3.310942502007056e-05, - "loss": 3.5104, - "step": 8260 - }, - { - "epoch": 0.12, - "grad_norm": 29.3415584564209, - "learning_rate": 3.303279189797131e-05, - "loss": 3.5253, - "step": 8280 - }, - { - "epoch": 0.12, - "grad_norm": 30.171510696411133, - "learning_rate": 3.29560745156861e-05, - "loss": 3.6886, - "step": 8300 - }, - { - "epoch": 0.12, - "grad_norm": 24.074813842773438, - "learning_rate": 3.287927367794397e-05, - "loss": 3.6401, - "step": 8320 - }, - { - "epoch": 0.12, - "grad_norm": 25.059324264526367, - "learning_rate": 3.2802390190349366e-05, - "loss": 3.5847, - "step": 8340 - }, - { - "epoch": 0.12, - "grad_norm": 19.766672134399414, - "learning_rate": 3.272542485937369e-05, - "loss": 3.5812, - "step": 8360 - }, - { - "epoch": 0.12, - "grad_norm": 25.08376693725586, - "learning_rate": 3.264837849234685e-05, - "loss": 3.55, - "step": 8380 - }, - { - "epoch": 0.12, - "grad_norm": 27.044347763061523, - "learning_rate": 3.2571251897448765e-05, - "loss": 3.5347, - "step": 8400 - }, - { - "epoch": 0.12, - "grad_norm": 23.3479061126709, - "learning_rate": 3.249404588370094e-05, - "loss": 3.5016, - "step": 8420 - }, - { - "epoch": 0.12, - "grad_norm": 25.586896896362305, - "learning_rate": 3.241676126095792e-05, - "loss": 3.537, - "step": 8440 - }, - { - "epoch": 0.12, - "grad_norm": 31.54664421081543, - "learning_rate": 3.233939883989882e-05, - "loss": 3.6093, - "step": 8460 - }, - { - "epoch": 0.12, - "grad_norm": 44.6853141784668, - "learning_rate": 3.226195943201883e-05, - "loss": 3.6135, - "step": 8480 - }, - { - "epoch": 0.12, - "grad_norm": 43.322757720947266, - "learning_rate": 3.218444384962071e-05, - "loss": 3.6048, - "step": 8500 - }, - { - "epoch": 0.12, - "grad_norm": 19.633960723876953, - "learning_rate": 3.210685290580622e-05, - "loss": 3.5767, - "step": 8520 - }, - { - "epoch": 0.12, - "grad_norm": 23.640382766723633, - "learning_rate": 3.202918741446764e-05, - "loss": 3.5961, - "step": 8540 - }, - { - "epoch": 0.12, - "grad_norm": 46.06730270385742, - "learning_rate": 3.1951448190279255e-05, - "loss": 3.5757, - "step": 8560 - }, - { - "epoch": 0.12, - "grad_norm": 24.966190338134766, - "learning_rate": 3.187363604868872e-05, - "loss": 3.5488, - "step": 8580 - }, - { - "epoch": 0.12, - "grad_norm": 61.91409683227539, - "learning_rate": 3.1795751805908573e-05, - "loss": 3.6554, - "step": 8600 - }, - { - "epoch": 0.12, - "grad_norm": 80.80062103271484, - "learning_rate": 3.171779627890769e-05, - "loss": 3.6226, - "step": 8620 - }, - { - "epoch": 0.12, - "grad_norm": 20.951128005981445, - "learning_rate": 3.163977028540263e-05, - "loss": 3.6122, - "step": 8640 - }, - { - "epoch": 0.12, - "grad_norm": 19.875343322753906, - "learning_rate": 3.156167464384917e-05, - "loss": 3.5637, - "step": 8660 - }, - { - "epoch": 0.12, - "grad_norm": 21.2697811126709, - "learning_rate": 3.1483510173433626e-05, - "loss": 3.537, - "step": 8680 - }, - { - "epoch": 0.12, - "grad_norm": 22.24051856994629, - "learning_rate": 3.1405277694064305e-05, - "loss": 3.5661, - "step": 8700 - }, - { - "epoch": 0.12, - "grad_norm": 21.55095863342285, - "learning_rate": 3.1326978026362904e-05, - "loss": 3.5573, - "step": 8720 - }, - { - "epoch": 0.12, - "grad_norm": 32.11522674560547, - "learning_rate": 3.124861199165588e-05, - "loss": 3.5995, - "step": 8740 - }, - { - "epoch": 0.12, - "grad_norm": 22.775867462158203, - "learning_rate": 3.117018041196585e-05, - "loss": 3.6436, - "step": 8760 - }, - { - "epoch": 0.12, - "grad_norm": 23.462509155273438, - "learning_rate": 3.109168411000299e-05, - "loss": 3.601, - "step": 8780 - }, - { - "epoch": 0.12, - "grad_norm": 23.43865203857422, - "learning_rate": 3.101312390915634e-05, - "loss": 3.6081, - "step": 8800 - }, - { - "epoch": 0.12, - "grad_norm": 27.17888832092285, - "learning_rate": 3.0934500633485255e-05, - "loss": 3.6257, - "step": 8820 - }, - { - "epoch": 0.13, - "grad_norm": 31.697662353515625, - "learning_rate": 3.0855815107710666e-05, - "loss": 3.5902, - "step": 8840 - }, - { - "epoch": 0.13, - "grad_norm": 37.07548904418945, - "learning_rate": 3.0777068157206536e-05, - "loss": 3.6514, - "step": 8860 - }, - { - "epoch": 0.13, - "grad_norm": 20.554109573364258, - "learning_rate": 3.069826060799109e-05, - "loss": 3.5068, - "step": 8880 - }, - { - "epoch": 0.13, - "grad_norm": 22.490015029907227, - "learning_rate": 3.061939328671824e-05, - "loss": 3.6488, - "step": 8900 - }, - { - "epoch": 0.13, - "grad_norm": 25.30253791809082, - "learning_rate": 3.0540467020668864e-05, - "loss": 3.5931, - "step": 8920 - }, - { - "epoch": 0.13, - "grad_norm": 22.97053337097168, - "learning_rate": 3.0461482637742135e-05, - "loss": 3.5475, - "step": 8940 - }, - { - "epoch": 0.13, - "grad_norm": 22.68851661682129, - "learning_rate": 3.0382440966446875e-05, - "loss": 3.619, - "step": 8960 - }, - { - "epoch": 0.13, - "grad_norm": 28.575305938720703, - "learning_rate": 3.03033428358928e-05, - "loss": 3.5188, - "step": 8980 - }, - { - "epoch": 0.13, - "grad_norm": 21.909072875976562, - "learning_rate": 3.0224189075781884e-05, - "loss": 3.5988, - "step": 9000 - }, - { - "epoch": 0.13, - "grad_norm": 36.04661560058594, - "learning_rate": 3.014498051639959e-05, - "loss": 3.569, - "step": 9020 - }, - { - "epoch": 0.13, - "grad_norm": 157.5665740966797, - "learning_rate": 3.0065717988606257e-05, - "loss": 3.6474, - "step": 9040 - }, - { - "epoch": 0.13, - "grad_norm": 37.57924270629883, - "learning_rate": 2.9986402323828272e-05, - "loss": 3.5874, - "step": 9060 - }, - { - "epoch": 0.13, - "grad_norm": 26.661418914794922, - "learning_rate": 2.990703435404944e-05, - "loss": 3.5982, - "step": 9080 - }, - { - "epoch": 0.13, - "grad_norm": 38.95368957519531, - "learning_rate": 2.9827614911802203e-05, - "loss": 3.4998, - "step": 9100 - }, - { - "epoch": 0.13, - "grad_norm": 34.97966003417969, - "learning_rate": 2.9748144830158924e-05, - "loss": 3.5486, - "step": 9120 - }, - { - "epoch": 0.13, - "grad_norm": 27.682832717895508, - "learning_rate": 2.9668624942723162e-05, - "loss": 3.6144, - "step": 9140 - }, - { - "epoch": 0.13, - "grad_norm": 29.238054275512695, - "learning_rate": 2.9589056083620902e-05, - "loss": 3.6442, - "step": 9160 - }, - { - "epoch": 0.13, - "grad_norm": 31.821439743041992, - "learning_rate": 2.9509439087491835e-05, - "loss": 3.6221, - "step": 9180 - }, - { - "epoch": 0.13, - "grad_norm": 21.238704681396484, - "learning_rate": 2.9429774789480575e-05, - "loss": 3.6278, - "step": 9200 - }, - { - "epoch": 0.13, - "grad_norm": 28.50370216369629, - "learning_rate": 2.9350064025227897e-05, - "loss": 3.6592, - "step": 9220 - }, - { - "epoch": 0.13, - "grad_norm": 22.938095092773438, - "learning_rate": 2.927030763086201e-05, - "loss": 3.519, - "step": 9240 - }, - { - "epoch": 0.13, - "grad_norm": 27.523639678955078, - "learning_rate": 2.9190506442989752e-05, - "loss": 3.5285, - "step": 9260 - }, - { - "epoch": 0.13, - "grad_norm": 27.63553237915039, - "learning_rate": 2.9110661298687824e-05, - "loss": 3.5641, - "step": 9280 - }, - { - "epoch": 0.13, - "grad_norm": 43.626129150390625, - "learning_rate": 2.9030773035493997e-05, - "loss": 3.5764, - "step": 9300 - }, - { - "epoch": 0.13, - "grad_norm": 20.081253051757812, - "learning_rate": 2.8950842491398357e-05, - "loss": 3.6518, - "step": 9320 - }, - { - "epoch": 0.13, - "grad_norm": 20.68466567993164, - "learning_rate": 2.8870870504834496e-05, - "loss": 3.6206, - "step": 9340 - }, - { - "epoch": 0.13, - "grad_norm": 72.61478424072266, - "learning_rate": 2.8790857914670698e-05, - "loss": 3.5108, - "step": 9360 - }, - { - "epoch": 0.13, - "grad_norm": 23.662805557250977, - "learning_rate": 2.871080556020118e-05, - "loss": 3.6223, - "step": 9380 - }, - { - "epoch": 0.13, - "grad_norm": 25.221176147460938, - "learning_rate": 2.863071428113726e-05, - "loss": 3.644, - "step": 9400 - }, - { - "epoch": 0.13, - "grad_norm": 42.87479782104492, - "learning_rate": 2.8550584917598554e-05, - "loss": 3.7027, - "step": 9420 - }, - { - "epoch": 0.13, - "grad_norm": 30.936325073242188, - "learning_rate": 2.8470418310104173e-05, - "loss": 3.5493, - "step": 9440 - }, - { - "epoch": 0.13, - "grad_norm": 24.074983596801758, - "learning_rate": 2.8390215299563884e-05, - "loss": 3.4781, - "step": 9460 - }, - { - "epoch": 0.13, - "grad_norm": 22.818313598632812, - "learning_rate": 2.8309976727269332e-05, - "loss": 3.5558, - "step": 9480 - }, - { - "epoch": 0.13, - "grad_norm": 33.634605407714844, - "learning_rate": 2.8229703434885163e-05, - "loss": 3.5958, - "step": 9500 - }, - { - "epoch": 0.13, - "grad_norm": 29.69095802307129, - "learning_rate": 2.814939626444023e-05, - "loss": 3.5682, - "step": 9520 - }, - { - "epoch": 0.14, - "grad_norm": 29.696638107299805, - "learning_rate": 2.8069056058318755e-05, - "loss": 3.5676, - "step": 9540 - }, - { - "epoch": 0.14, - "grad_norm": 34.828269958496094, - "learning_rate": 2.7988683659251474e-05, - "loss": 3.482, - "step": 9560 - }, - { - "epoch": 0.14, - "grad_norm": 21.408599853515625, - "learning_rate": 2.7908279910306835e-05, - "loss": 3.5189, - "step": 9580 - }, - { - "epoch": 0.14, - "grad_norm": 21.67983627319336, - "learning_rate": 2.782784565488211e-05, - "loss": 3.5703, - "step": 9600 - }, - { - "epoch": 0.14, - "grad_norm": 24.80797576904297, - "learning_rate": 2.7747381736694572e-05, - "loss": 3.512, - "step": 9620 - }, - { - "epoch": 0.14, - "grad_norm": 35.9987678527832, - "learning_rate": 2.766688899977266e-05, - "loss": 3.5937, - "step": 9640 - }, - { - "epoch": 0.14, - "grad_norm": 24.59494400024414, - "learning_rate": 2.7586368288447095e-05, - "loss": 3.5829, - "step": 9660 - }, - { - "epoch": 0.14, - "grad_norm": 26.161178588867188, - "learning_rate": 2.7505820447342028e-05, - "loss": 3.5978, - "step": 9680 - }, - { - "epoch": 0.14, - "grad_norm": 17.551490783691406, - "learning_rate": 2.7425246321366203e-05, - "loss": 3.527, - "step": 9700 - }, - { - "epoch": 0.14, - "grad_norm": 31.15974998474121, - "learning_rate": 2.7344646755704078e-05, - "loss": 3.6422, - "step": 9720 - }, - { - "epoch": 0.14, - "grad_norm": 24.26822280883789, - "learning_rate": 2.7264022595806948e-05, - "loss": 3.5971, - "step": 9740 - }, - { - "epoch": 0.14, - "grad_norm": 24.76336669921875, - "learning_rate": 2.71833746873841e-05, - "loss": 3.5972, - "step": 9760 - }, - { - "epoch": 0.14, - "grad_norm": 18.041610717773438, - "learning_rate": 2.7102703876393944e-05, - "loss": 3.5832, - "step": 9780 - }, - { - "epoch": 0.14, - "grad_norm": 45.17101287841797, - "learning_rate": 2.7022011009035107e-05, - "loss": 3.5754, - "step": 9800 - }, - { - "epoch": 0.14, - "grad_norm": 30.87299156188965, - "learning_rate": 2.6941296931737585e-05, - "loss": 3.6022, - "step": 9820 - }, - { - "epoch": 0.14, - "grad_norm": 32.659786224365234, - "learning_rate": 2.686056249115385e-05, - "loss": 3.5947, - "step": 9840 - }, - { - "epoch": 0.14, - "grad_norm": 25.66715431213379, - "learning_rate": 2.6779808534149987e-05, - "loss": 3.5997, - "step": 9860 - }, - { - "epoch": 0.14, - "grad_norm": 24.735023498535156, - "learning_rate": 2.6699035907796792e-05, - "loss": 3.5619, - "step": 9880 - }, - { - "epoch": 0.14, - "grad_norm": 24.357643127441406, - "learning_rate": 2.6618245459360897e-05, - "loss": 3.6028, - "step": 9900 - }, - { - "epoch": 0.14, - "grad_norm": 29.255617141723633, - "learning_rate": 2.6537438036295875e-05, - "loss": 3.5231, - "step": 9920 - }, - { - "epoch": 0.14, - "grad_norm": 19.508926391601562, - "learning_rate": 2.6456614486233343e-05, - "loss": 3.5555, - "step": 9940 - }, - { - "epoch": 0.14, - "grad_norm": 29.48908042907715, - "learning_rate": 2.6375775656974123e-05, - "loss": 3.5376, - "step": 9960 - }, - { - "epoch": 0.14, - "grad_norm": 23.857908248901367, - "learning_rate": 2.629492239647926e-05, - "loss": 3.5641, - "step": 9980 - }, - { - "epoch": 0.14, - "grad_norm": 39.08363342285156, - "learning_rate": 2.621405555286121e-05, - "loss": 3.5957, - "step": 10000 - }, - { - "epoch": 0.14, - "grad_norm": 30.3124942779541, - "learning_rate": 2.6133175974374892e-05, - "loss": 3.5933, - "step": 10020 - }, - { - "epoch": 0.14, - "grad_norm": 23.010902404785156, - "learning_rate": 2.6052284509408804e-05, - "loss": 3.573, - "step": 10040 - }, - { - "epoch": 0.14, - "grad_norm": 34.93478775024414, - "learning_rate": 2.5971382006476154e-05, - "loss": 3.5641, - "step": 10060 - }, - { - "epoch": 0.14, - "grad_norm": 28.346033096313477, - "learning_rate": 2.5890469314205897e-05, - "loss": 3.5833, - "step": 10080 - }, - { - "epoch": 0.14, - "grad_norm": 41.69817352294922, - "learning_rate": 2.5809547281333902e-05, - "loss": 3.5718, - "step": 10100 - }, - { - "epoch": 0.14, - "grad_norm": 36.409114837646484, - "learning_rate": 2.5728616756693997e-05, - "loss": 3.5675, - "step": 10120 - }, - { - "epoch": 0.14, - "grad_norm": 23.812952041625977, - "learning_rate": 2.564767858920909e-05, - "loss": 3.6445, - "step": 10140 - }, - { - "epoch": 0.14, - "grad_norm": 31.33305549621582, - "learning_rate": 2.556673362788225e-05, - "loss": 3.5669, - "step": 10160 - }, - { - "epoch": 0.14, - "grad_norm": 34.65876770019531, - "learning_rate": 2.5485782721787837e-05, - "loss": 3.53, - "step": 10180 - }, - { - "epoch": 0.14, - "grad_norm": 21.920583724975586, - "learning_rate": 2.540482672006254e-05, - "loss": 3.5825, - "step": 10200 - }, - { - "epoch": 0.14, - "grad_norm": 48.019004821777344, - "learning_rate": 2.5323866471896512e-05, - "loss": 3.5733, - "step": 10220 - }, - { - "epoch": 0.14, - "grad_norm": 52.00071334838867, - "learning_rate": 2.5242902826524434e-05, - "loss": 3.5487, - "step": 10240 - }, - { - "epoch": 0.15, - "grad_norm": 47.33055877685547, - "learning_rate": 2.5161936633216653e-05, - "loss": 3.5076, - "step": 10260 - }, - { - "epoch": 0.15, - "grad_norm": 20.109745025634766, - "learning_rate": 2.5080968741270223e-05, - "loss": 3.5991, - "step": 10280 - }, - { - "epoch": 0.15, - "grad_norm": 23.458494186401367, - "learning_rate": 2.5e-05, - "loss": 3.6357, - "step": 10300 - }, - { - "epoch": 0.15, - "grad_norm": 23.68842315673828, - "learning_rate": 2.4919031258729786e-05, - "loss": 3.5449, - "step": 10320 - }, - { - "epoch": 0.15, - "grad_norm": 21.289154052734375, - "learning_rate": 2.4838063366783353e-05, - "loss": 3.6704, - "step": 10340 - }, - { - "epoch": 0.15, - "grad_norm": 23.132051467895508, - "learning_rate": 2.4757097173475572e-05, - "loss": 3.6327, - "step": 10360 - }, - { - "epoch": 0.15, - "grad_norm": 29.875104904174805, - "learning_rate": 2.4676133528103497e-05, - "loss": 3.5294, - "step": 10380 - }, - { - "epoch": 0.15, - "grad_norm": 23.41105079650879, - "learning_rate": 2.4595173279937464e-05, - "loss": 3.5995, - "step": 10400 - }, - { - "epoch": 0.15, - "grad_norm": 22.15860939025879, - "learning_rate": 2.451421727821217e-05, - "loss": 3.6109, - "step": 10420 - }, - { - "epoch": 0.15, - "grad_norm": 28.534278869628906, - "learning_rate": 2.443326637211775e-05, - "loss": 3.6389, - "step": 10440 - }, - { - "epoch": 0.15, - "grad_norm": 26.33950424194336, - "learning_rate": 2.435232141079092e-05, - "loss": 3.6083, - "step": 10460 - }, - { - "epoch": 0.15, - "grad_norm": 19.027633666992188, - "learning_rate": 2.4271383243306016e-05, - "loss": 3.5256, - "step": 10480 - }, - { - "epoch": 0.15, - "grad_norm": 28.898550033569336, - "learning_rate": 2.419045271866611e-05, - "loss": 3.61, - "step": 10500 - }, - { - "epoch": 0.15, - "grad_norm": 35.347347259521484, - "learning_rate": 2.410953068579411e-05, - "loss": 3.616, - "step": 10520 - }, - { - "epoch": 0.15, - "grad_norm": 23.184894561767578, - "learning_rate": 2.402861799352386e-05, - "loss": 3.6263, - "step": 10540 - }, - { - "epoch": 0.15, - "grad_norm": 32.66107177734375, - "learning_rate": 2.3947715490591206e-05, - "loss": 3.5446, - "step": 10560 - }, - { - "epoch": 0.15, - "grad_norm": 20.614028930664062, - "learning_rate": 2.3866824025625124e-05, - "loss": 3.5989, - "step": 10580 - }, - { - "epoch": 0.15, - "grad_norm": 25.750699996948242, - "learning_rate": 2.3785944447138802e-05, - "loss": 3.5197, - "step": 10600 - }, - { - "epoch": 0.15, - "grad_norm": 23.97648048400879, - "learning_rate": 2.370507760352074e-05, - "loss": 3.6399, - "step": 10620 - }, - { - "epoch": 0.15, - "grad_norm": 19.600095748901367, - "learning_rate": 2.362422434302588e-05, - "loss": 3.5295, - "step": 10640 - }, - { - "epoch": 0.15, - "grad_norm": 27.21882438659668, - "learning_rate": 2.3543385513766656e-05, - "loss": 3.512, - "step": 10660 - }, - { - "epoch": 0.15, - "grad_norm": 27.75621795654297, - "learning_rate": 2.3462561963704134e-05, - "loss": 3.5351, - "step": 10680 - }, - { - "epoch": 0.15, - "grad_norm": 27.200828552246094, - "learning_rate": 2.338175454063911e-05, - "loss": 3.5038, - "step": 10700 - }, - { - "epoch": 0.15, - "grad_norm": 27.96784782409668, - "learning_rate": 2.3300964092203207e-05, - "loss": 3.6097, - "step": 10720 - }, - { - "epoch": 0.15, - "grad_norm": 28.206979751586914, - "learning_rate": 2.3220191465850015e-05, - "loss": 3.5254, - "step": 10740 - }, - { - "epoch": 0.15, - "grad_norm": 22.781152725219727, - "learning_rate": 2.3139437508846155e-05, - "loss": 3.5857, - "step": 10760 - }, - { - "epoch": 0.15, - "grad_norm": 23.07236099243164, - "learning_rate": 2.305870306826242e-05, - "loss": 3.4872, - "step": 10780 - }, - { - "epoch": 0.15, - "grad_norm": 22.408714294433594, - "learning_rate": 2.29779889909649e-05, - "loss": 3.5115, - "step": 10800 - }, - { - "epoch": 0.15, - "grad_norm": 23.98442268371582, - "learning_rate": 2.289729612360606e-05, - "loss": 3.6297, - "step": 10820 - }, - { - "epoch": 0.15, - "grad_norm": 29.503135681152344, - "learning_rate": 2.2816625312615903e-05, - "loss": 3.6209, - "step": 10840 - }, - { - "epoch": 0.15, - "grad_norm": 30.787010192871094, - "learning_rate": 2.2735977404193058e-05, - "loss": 3.4921, - "step": 10860 - }, - { - "epoch": 0.15, - "grad_norm": 24.088376998901367, - "learning_rate": 2.2655353244295928e-05, - "loss": 3.5582, - "step": 10880 - }, - { - "epoch": 0.15, - "grad_norm": 25.253761291503906, - "learning_rate": 2.25747536786338e-05, - "loss": 3.5297, - "step": 10900 - }, - { - "epoch": 0.15, - "grad_norm": 24.333845138549805, - "learning_rate": 2.2494179552657978e-05, - "loss": 3.6105, - "step": 10920 - }, - { - "epoch": 0.15, - "grad_norm": 32.39179229736328, - "learning_rate": 2.241363171155291e-05, - "loss": 3.6122, - "step": 10940 - }, - { - "epoch": 0.16, - "grad_norm": 31.885894775390625, - "learning_rate": 2.2333111000227342e-05, - "loss": 3.6358, - "step": 10960 - }, - { - "epoch": 0.16, - "grad_norm": 22.056386947631836, - "learning_rate": 2.225261826330543e-05, - "loss": 3.5181, - "step": 10980 - }, - { - "epoch": 0.16, - "grad_norm": 23.47673225402832, - "learning_rate": 2.2172154345117894e-05, - "loss": 3.4853, - "step": 11000 - }, - { - "epoch": 0.16, - "grad_norm": 23.548656463623047, - "learning_rate": 2.2091720089693168e-05, - "loss": 3.5468, - "step": 11020 - }, - { - "epoch": 0.16, - "grad_norm": 16.66544532775879, - "learning_rate": 2.201131634074853e-05, - "loss": 3.626, - "step": 11040 - }, - { - "epoch": 0.16, - "grad_norm": 30.556697845458984, - "learning_rate": 2.1930943941681254e-05, - "loss": 3.5565, - "step": 11060 - }, - { - "epoch": 0.16, - "grad_norm": 62.914642333984375, - "learning_rate": 2.1850603735559778e-05, - "loss": 3.554, - "step": 11080 - }, - { - "epoch": 0.16, - "grad_norm": 25.617481231689453, - "learning_rate": 2.177029656511485e-05, - "loss": 3.5449, - "step": 11100 - }, - { - "epoch": 0.16, - "grad_norm": 33.26127243041992, - "learning_rate": 2.169002327273068e-05, - "loss": 3.6071, - "step": 11120 - }, - { - "epoch": 0.16, - "grad_norm": 21.895418167114258, - "learning_rate": 2.160978470043612e-05, - "loss": 3.4622, - "step": 11140 - }, - { - "epoch": 0.16, - "grad_norm": 25.30924415588379, - "learning_rate": 2.152958168989584e-05, - "loss": 3.5169, - "step": 11160 - }, - { - "epoch": 0.16, - "grad_norm": 28.7779541015625, - "learning_rate": 2.1449415082401455e-05, - "loss": 3.5817, - "step": 11180 - }, - { - "epoch": 0.16, - "grad_norm": 24.38544273376465, - "learning_rate": 2.136928571886275e-05, - "loss": 3.5433, - "step": 11200 - }, - { - "epoch": 0.16, - "grad_norm": 36.38949966430664, - "learning_rate": 2.1289194439798818e-05, - "loss": 3.5653, - "step": 11220 - }, - { - "epoch": 0.16, - "grad_norm": 36.11268615722656, - "learning_rate": 2.12091420853293e-05, - "loss": 3.4839, - "step": 11240 - }, - { - "epoch": 0.16, - "grad_norm": 18.36191749572754, - "learning_rate": 2.1129129495165507e-05, - "loss": 3.5532, - "step": 11260 - }, - { - "epoch": 0.16, - "grad_norm": 27.239763259887695, - "learning_rate": 2.1049157508601642e-05, - "loss": 3.5536, - "step": 11280 - }, - { - "epoch": 0.16, - "grad_norm": 25.459758758544922, - "learning_rate": 2.0969226964506006e-05, - "loss": 3.4878, - "step": 11300 - }, - { - "epoch": 0.16, - "grad_norm": 28.359439849853516, - "learning_rate": 2.0889338701312185e-05, - "loss": 3.563, - "step": 11320 - }, - { - "epoch": 0.16, - "grad_norm": 25.177392959594727, - "learning_rate": 2.0809493557010247e-05, - "loss": 3.6313, - "step": 11340 - }, - { - "epoch": 0.16, - "grad_norm": 26.633609771728516, - "learning_rate": 2.072969236913799e-05, - "loss": 3.6034, - "step": 11360 - }, - { - "epoch": 0.16, - "grad_norm": 19.589900970458984, - "learning_rate": 2.0649935974772105e-05, - "loss": 3.6429, - "step": 11380 - }, - { - "epoch": 0.16, - "grad_norm": 35.16368865966797, - "learning_rate": 2.0570225210519434e-05, - "loss": 3.5154, - "step": 11400 - }, - { - "epoch": 0.16, - "grad_norm": 37.59727478027344, - "learning_rate": 2.0490560912508168e-05, - "loss": 3.5652, - "step": 11420 - }, - { - "epoch": 0.16, - "grad_norm": 37.76837158203125, - "learning_rate": 2.04109439163791e-05, - "loss": 3.6911, - "step": 11440 - }, - { - "epoch": 0.16, - "grad_norm": 27.86673355102539, - "learning_rate": 2.0331375057276844e-05, - "loss": 3.4824, - "step": 11460 - }, - { - "epoch": 0.16, - "grad_norm": 51.12165832519531, - "learning_rate": 2.025185516984108e-05, - "loss": 3.558, - "step": 11480 - }, - { - "epoch": 0.16, - "grad_norm": 22.489160537719727, - "learning_rate": 2.0172385088197803e-05, - "loss": 3.5595, - "step": 11500 - }, - { - "epoch": 0.16, - "grad_norm": 19.6495304107666, - "learning_rate": 2.0092965645950564e-05, - "loss": 3.5679, - "step": 11520 - }, - { - "epoch": 0.16, - "grad_norm": 19.997142791748047, - "learning_rate": 2.001359767617173e-05, - "loss": 3.5332, - "step": 11540 - }, - { - "epoch": 0.16, - "grad_norm": 34.29532241821289, - "learning_rate": 1.9934282011393753e-05, - "loss": 3.4848, - "step": 11560 - }, - { - "epoch": 0.16, - "grad_norm": 20.737041473388672, - "learning_rate": 1.985501948360041e-05, - "loss": 3.4874, - "step": 11580 - }, - { - "epoch": 0.16, - "grad_norm": 30.11549186706543, - "learning_rate": 1.9775810924218125e-05, - "loss": 3.5166, - "step": 11600 - }, - { - "epoch": 0.16, - "grad_norm": 23.56212615966797, - "learning_rate": 1.9696657164107202e-05, - "loss": 3.652, - "step": 11620 - }, - { - "epoch": 0.16, - "grad_norm": 20.44150733947754, - "learning_rate": 1.9617559033553128e-05, - "loss": 3.5137, - "step": 11640 - }, - { - "epoch": 0.17, - "grad_norm": 33.37120819091797, - "learning_rate": 1.9538517362257868e-05, - "loss": 3.5163, - "step": 11660 - }, - { - "epoch": 0.17, - "grad_norm": 29.839820861816406, - "learning_rate": 1.945953297933115e-05, - "loss": 3.5979, - "step": 11680 - }, - { - "epoch": 0.17, - "grad_norm": 25.600812911987305, - "learning_rate": 1.9380606713281775e-05, - "loss": 3.6111, - "step": 11700 - }, - { - "epoch": 0.17, - "grad_norm": 40.76740264892578, - "learning_rate": 1.9301739392008923e-05, - "loss": 3.6727, - "step": 11720 - }, - { - "epoch": 0.17, - "grad_norm": 25.436763763427734, - "learning_rate": 1.9222931842793473e-05, - "loss": 3.6145, - "step": 11740 - }, - { - "epoch": 0.17, - "grad_norm": 19.53345489501953, - "learning_rate": 1.9144184892289337e-05, - "loss": 3.5486, - "step": 11760 - }, - { - "epoch": 0.17, - "grad_norm": 21.103118896484375, - "learning_rate": 1.9065499366514757e-05, - "loss": 3.5796, - "step": 11780 - }, - { - "epoch": 0.17, - "grad_norm": 26.135894775390625, - "learning_rate": 1.8986876090843667e-05, - "loss": 3.5905, - "step": 11800 - }, - { - "epoch": 0.17, - "grad_norm": 32.71371841430664, - "learning_rate": 1.8908315889997007e-05, - "loss": 3.531, - "step": 11820 - }, - { - "epoch": 0.17, - "grad_norm": 23.510149002075195, - "learning_rate": 1.882981958803414e-05, - "loss": 3.5597, - "step": 11840 - }, - { - "epoch": 0.17, - "grad_norm": 23.804306030273438, - "learning_rate": 1.8751388008344117e-05, - "loss": 3.5755, - "step": 11860 - }, - { - "epoch": 0.17, - "grad_norm": 33.7330436706543, - "learning_rate": 1.8673021973637095e-05, - "loss": 3.5092, - "step": 11880 - }, - { - "epoch": 0.17, - "grad_norm": 20.88456153869629, - "learning_rate": 1.859472230593569e-05, - "loss": 3.6001, - "step": 11900 - }, - { - "epoch": 0.17, - "grad_norm": 28.640546798706055, - "learning_rate": 1.8516489826566376e-05, - "loss": 3.5419, - "step": 11920 - }, - { - "epoch": 0.17, - "grad_norm": 25.2142391204834, - "learning_rate": 1.8438325356150826e-05, - "loss": 3.465, - "step": 11940 - }, - { - "epoch": 0.17, - "grad_norm": 27.663267135620117, - "learning_rate": 1.836022971459737e-05, - "loss": 3.5017, - "step": 11960 - }, - { - "epoch": 0.17, - "grad_norm": 31.913984298706055, - "learning_rate": 1.828220372109232e-05, - "loss": 3.5187, - "step": 11980 - }, - { - "epoch": 0.17, - "grad_norm": 29.825590133666992, - "learning_rate": 1.820424819409143e-05, - "loss": 3.5469, - "step": 12000 - }, - { - "epoch": 0.17, - "grad_norm": 18.72800636291504, - "learning_rate": 1.8126363951311287e-05, - "loss": 3.5486, - "step": 12020 - }, - { - "epoch": 0.17, - "grad_norm": 30.366409301757812, - "learning_rate": 1.804855180972075e-05, - "loss": 3.5487, - "step": 12040 - }, - { - "epoch": 0.17, - "grad_norm": 25.00339698791504, - "learning_rate": 1.797081258553236e-05, - "loss": 3.4778, - "step": 12060 - }, - { - "epoch": 0.17, - "grad_norm": 29.204017639160156, - "learning_rate": 1.7893147094193786e-05, - "loss": 3.446, - "step": 12080 - }, - { - "epoch": 0.17, - "grad_norm": 28.64485740661621, - "learning_rate": 1.7815556150379298e-05, - "loss": 3.5421, - "step": 12100 - }, - { - "epoch": 0.17, - "grad_norm": 31.4785213470459, - "learning_rate": 1.7738040567981166e-05, - "loss": 3.5075, - "step": 12120 - }, - { - "epoch": 0.17, - "grad_norm": 28.798315048217773, - "learning_rate": 1.766060116010118e-05, - "loss": 3.5049, - "step": 12140 - }, - { - "epoch": 0.17, - "grad_norm": 27.112850189208984, - "learning_rate": 1.7583238739042086e-05, - "loss": 3.5939, - "step": 12160 - }, - { - "epoch": 0.17, - "grad_norm": 24.396697998046875, - "learning_rate": 1.7505954116299063e-05, - "loss": 3.4596, - "step": 12180 - }, - { - "epoch": 0.17, - "grad_norm": 18.46675682067871, - "learning_rate": 1.7428748102551237e-05, - "loss": 3.4861, - "step": 12200 - }, - { - "epoch": 0.17, - "grad_norm": 25.615234375, - "learning_rate": 1.7351621507653157e-05, - "loss": 3.5211, - "step": 12220 - }, - { - "epoch": 0.17, - "grad_norm": 23.890607833862305, - "learning_rate": 1.7274575140626318e-05, - "loss": 3.5255, - "step": 12240 - }, - { - "epoch": 0.17, - "grad_norm": 19.983030319213867, - "learning_rate": 1.7197609809650643e-05, - "loss": 3.5567, - "step": 12260 - }, - { - "epoch": 0.17, - "grad_norm": 24.144041061401367, - "learning_rate": 1.712072632205604e-05, - "loss": 3.5586, - "step": 12280 - }, - { - "epoch": 0.17, - "grad_norm": 35.812835693359375, - "learning_rate": 1.704392548431391e-05, - "loss": 3.5274, - "step": 12300 - }, - { - "epoch": 0.17, - "grad_norm": 18.759809494018555, - "learning_rate": 1.6967208102028697e-05, - "loss": 3.5823, - "step": 12320 - }, - { - "epoch": 0.17, - "grad_norm": 19.85857391357422, - "learning_rate": 1.6890574979929448e-05, - "loss": 3.5583, - "step": 12340 - }, - { - "epoch": 0.17, - "grad_norm": 21.088096618652344, - "learning_rate": 1.6814026921861335e-05, - "loss": 3.5084, - "step": 12360 - }, - { - "epoch": 0.18, - "grad_norm": 21.760805130004883, - "learning_rate": 1.6737564730777284e-05, - "loss": 3.4753, - "step": 12380 - }, - { - "epoch": 0.18, - "grad_norm": 25.510221481323242, - "learning_rate": 1.666118920872949e-05, - "loss": 3.6024, - "step": 12400 - }, - { - "epoch": 0.18, - "grad_norm": 29.43688201904297, - "learning_rate": 1.658490115686104e-05, - "loss": 3.647, - "step": 12420 - }, - { - "epoch": 0.18, - "grad_norm": 19.17232322692871, - "learning_rate": 1.6508701375397487e-05, - "loss": 3.5505, - "step": 12440 - }, - { - "epoch": 0.18, - "grad_norm": 27.04405975341797, - "learning_rate": 1.64325906636385e-05, - "loss": 3.5158, - "step": 12460 - }, - { - "epoch": 0.18, - "grad_norm": 34.61522674560547, - "learning_rate": 1.635656981994943e-05, - "loss": 3.5723, - "step": 12480 - }, - { - "epoch": 0.18, - "grad_norm": 22.05956268310547, - "learning_rate": 1.6280639641752942e-05, - "loss": 3.5133, - "step": 12500 - }, - { - "epoch": 0.18, - "grad_norm": 26.21821403503418, - "learning_rate": 1.6204800925520685e-05, - "loss": 3.4956, - "step": 12520 - }, - { - "epoch": 0.18, - "grad_norm": 16.636159896850586, - "learning_rate": 1.6129054466764904e-05, - "loss": 3.5843, - "step": 12540 - }, - { - "epoch": 0.18, - "grad_norm": 27.356168746948242, - "learning_rate": 1.60534010600301e-05, - "loss": 3.5189, - "step": 12560 - }, - { - "epoch": 0.18, - "grad_norm": 19.394620895385742, - "learning_rate": 1.5977841498884723e-05, - "loss": 3.5838, - "step": 12580 - }, - { - "epoch": 0.18, - "grad_norm": 31.54738426208496, - "learning_rate": 1.5902376575912815e-05, - "loss": 3.6633, - "step": 12600 - }, - { - "epoch": 0.18, - "grad_norm": 23.533172607421875, - "learning_rate": 1.5827007082705698e-05, - "loss": 3.5234, - "step": 12620 - }, - { - "epoch": 0.18, - "grad_norm": 25.580156326293945, - "learning_rate": 1.5751733809853704e-05, - "loss": 3.5478, - "step": 12640 - }, - { - "epoch": 0.18, - "grad_norm": 28.20244789123535, - "learning_rate": 1.5676557546937838e-05, - "loss": 3.49, - "step": 12660 - }, - { - "epoch": 0.18, - "grad_norm": 26.89322280883789, - "learning_rate": 1.5601479082521526e-05, - "loss": 3.5238, - "step": 12680 - }, - { - "epoch": 0.18, - "grad_norm": 25.817209243774414, - "learning_rate": 1.552649920414233e-05, - "loss": 3.5417, - "step": 12700 - }, - { - "epoch": 0.18, - "grad_norm": 30.55599594116211, - "learning_rate": 1.545161869830371e-05, - "loss": 3.5908, - "step": 12720 - }, - { - "epoch": 0.18, - "grad_norm": 49.497894287109375, - "learning_rate": 1.5376838350466725e-05, - "loss": 3.6647, - "step": 12740 - }, - { - "epoch": 0.18, - "grad_norm": 17.040536880493164, - "learning_rate": 1.5302158945041838e-05, - "loss": 3.5271, - "step": 12760 - }, - { - "epoch": 0.18, - "grad_norm": 31.21510124206543, - "learning_rate": 1.5227581265380685e-05, - "loss": 3.4708, - "step": 12780 - }, - { - "epoch": 0.18, - "grad_norm": 26.194982528686523, - "learning_rate": 1.5153106093767827e-05, - "loss": 3.5831, - "step": 12800 - }, - { - "epoch": 0.18, - "grad_norm": 21.868547439575195, - "learning_rate": 1.5078734211412573e-05, - "loss": 3.532, - "step": 12820 - }, - { - "epoch": 0.18, - "grad_norm": 27.728635787963867, - "learning_rate": 1.5004466398440775e-05, - "loss": 3.5432, - "step": 12840 - }, - { - "epoch": 0.18, - "grad_norm": 22.69495964050293, - "learning_rate": 1.493030343388666e-05, - "loss": 3.5464, - "step": 12860 - }, - { - "epoch": 0.18, - "grad_norm": 22.85190773010254, - "learning_rate": 1.4856246095684622e-05, - "loss": 3.5686, - "step": 12880 - }, - { - "epoch": 0.18, - "grad_norm": 18.40359115600586, - "learning_rate": 1.4782295160661103e-05, - "loss": 3.4922, - "step": 12900 - }, - { - "epoch": 0.18, - "grad_norm": 27.058635711669922, - "learning_rate": 1.4708451404526407e-05, - "loss": 3.5231, - "step": 12920 - }, - { - "epoch": 0.18, - "grad_norm": 28.491573333740234, - "learning_rate": 1.4634715601866606e-05, - "loss": 3.502, - "step": 12940 - }, - { - "epoch": 0.18, - "grad_norm": 26.94320297241211, - "learning_rate": 1.4561088526135375e-05, - "loss": 3.5746, - "step": 12960 - }, - { - "epoch": 0.18, - "grad_norm": 26.503097534179688, - "learning_rate": 1.4487570949645888e-05, - "loss": 3.5195, - "step": 12980 - }, - { - "epoch": 0.18, - "grad_norm": 32.38998794555664, - "learning_rate": 1.4414163643562755e-05, - "loss": 3.5195, - "step": 13000 - }, - { - "epoch": 0.18, - "grad_norm": 29.240285873413086, - "learning_rate": 1.434086737789386e-05, - "loss": 3.6301, - "step": 13020 - }, - { - "epoch": 0.18, - "grad_norm": 19.996788024902344, - "learning_rate": 1.4267682921482356e-05, - "loss": 3.5252, - "step": 13040 - }, - { - "epoch": 0.18, - "grad_norm": 26.969528198242188, - "learning_rate": 1.419461104199856e-05, - "loss": 3.546, - "step": 13060 - }, - { - "epoch": 0.19, - "grad_norm": 39.526832580566406, - "learning_rate": 1.412165250593192e-05, - "loss": 3.5464, - "step": 13080 - }, - { - "epoch": 0.19, - "grad_norm": 22.60038948059082, - "learning_rate": 1.4048808078582942e-05, - "loss": 3.475, - "step": 13100 - }, - { - "epoch": 0.19, - "grad_norm": 20.97502899169922, - "learning_rate": 1.3976078524055203e-05, - "loss": 3.5398, - "step": 13120 - }, - { - "epoch": 0.19, - "grad_norm": 16.191055297851562, - "learning_rate": 1.3903464605247325e-05, - "loss": 3.4869, - "step": 13140 - }, - { - "epoch": 0.19, - "grad_norm": 22.308320999145508, - "learning_rate": 1.3830967083844942e-05, - "loss": 3.4316, - "step": 13160 - }, - { - "epoch": 0.19, - "grad_norm": 23.294443130493164, - "learning_rate": 1.375858672031276e-05, - "loss": 3.6033, - "step": 13180 - }, - { - "epoch": 0.19, - "grad_norm": 24.432270050048828, - "learning_rate": 1.368632427388653e-05, - "loss": 3.4829, - "step": 13200 - }, - { - "epoch": 0.19, - "grad_norm": 24.131166458129883, - "learning_rate": 1.3614180502565135e-05, - "loss": 3.5721, - "step": 13220 - }, - { - "epoch": 0.19, - "grad_norm": 36.706668853759766, - "learning_rate": 1.3542156163102582e-05, - "loss": 3.4877, - "step": 13240 - }, - { - "epoch": 0.19, - "grad_norm": 22.797359466552734, - "learning_rate": 1.3470252011000123e-05, - "loss": 3.539, - "step": 13260 - }, - { - "epoch": 0.19, - "grad_norm": 26.030719757080078, - "learning_rate": 1.3398468800498293e-05, - "loss": 3.5415, - "step": 13280 - }, - { - "epoch": 0.19, - "grad_norm": 24.123130798339844, - "learning_rate": 1.3326807284568984e-05, - "loss": 3.5354, - "step": 13300 - }, - { - "epoch": 0.19, - "grad_norm": 24.965229034423828, - "learning_rate": 1.3255268214907613e-05, - "loss": 3.387, - "step": 13320 - }, - { - "epoch": 0.19, - "grad_norm": 27.477920532226562, - "learning_rate": 1.3183852341925145e-05, - "loss": 3.5484, - "step": 13340 - }, - { - "epoch": 0.19, - "grad_norm": 27.259984970092773, - "learning_rate": 1.3112560414740315e-05, - "loss": 3.5104, - "step": 13360 - }, - { - "epoch": 0.19, - "grad_norm": 34.4488525390625, - "learning_rate": 1.3041393181171688e-05, - "loss": 3.5881, - "step": 13380 - }, - { - "epoch": 0.19, - "grad_norm": 30.281177520751953, - "learning_rate": 1.2970351387729873e-05, - "loss": 3.5851, - "step": 13400 - }, - { - "epoch": 0.19, - "grad_norm": 21.918861389160156, - "learning_rate": 1.2899435779609682e-05, - "loss": 3.5427, - "step": 13420 - }, - { - "epoch": 0.19, - "grad_norm": 30.275283813476562, - "learning_rate": 1.2828647100682261e-05, - "loss": 3.6322, - "step": 13440 - }, - { - "epoch": 0.19, - "grad_norm": 57.30770492553711, - "learning_rate": 1.275798609348738e-05, - "loss": 3.5871, - "step": 13460 - }, - { - "epoch": 0.19, - "grad_norm": 21.865861892700195, - "learning_rate": 1.2687453499225545e-05, - "loss": 3.5117, - "step": 13480 - }, - { - "epoch": 0.19, - "grad_norm": 19.6708927154541, - "learning_rate": 1.2617050057750322e-05, - "loss": 3.5015, - "step": 13500 - }, - { - "epoch": 0.19, - "grad_norm": 27.376646041870117, - "learning_rate": 1.2546776507560468e-05, - "loss": 3.5206, - "step": 13520 - }, - { - "epoch": 0.19, - "grad_norm": 44.89965057373047, - "learning_rate": 1.2476633585792286e-05, - "loss": 3.5766, - "step": 13540 - }, - { - "epoch": 0.19, - "grad_norm": 37.562286376953125, - "learning_rate": 1.2406622028211844e-05, - "loss": 3.5488, - "step": 13560 - }, - { - "epoch": 0.19, - "grad_norm": 41.833892822265625, - "learning_rate": 1.2336742569207235e-05, - "loss": 3.6429, - "step": 13580 - }, - { - "epoch": 0.19, - "grad_norm": 44.58812713623047, - "learning_rate": 1.2266995941780934e-05, - "loss": 3.5362, - "step": 13600 - }, - { - "epoch": 0.19, - "grad_norm": 26.543933868408203, - "learning_rate": 1.2197382877542041e-05, - "loss": 3.5761, - "step": 13620 - }, - { - "epoch": 0.19, - "grad_norm": 25.108768463134766, - "learning_rate": 1.2127904106698666e-05, - "loss": 3.4656, - "step": 13640 - }, - { - "epoch": 0.19, - "grad_norm": 22.213363647460938, - "learning_rate": 1.2058560358050241e-05, - "loss": 3.5438, - "step": 13660 - }, - { - "epoch": 0.19, - "grad_norm": 25.67693519592285, - "learning_rate": 1.1989352358979888e-05, - "loss": 3.5508, - "step": 13680 - }, - { - "epoch": 0.19, - "grad_norm": 23.043434143066406, - "learning_rate": 1.1920280835446748e-05, - "loss": 3.5901, - "step": 13700 - }, - { - "epoch": 0.19, - "grad_norm": 25.011388778686523, - "learning_rate": 1.1851346511978425e-05, - "loss": 3.5773, - "step": 13720 - }, - { - "epoch": 0.19, - "grad_norm": 29.659713745117188, - "learning_rate": 1.1782550111663369e-05, - "loss": 3.5795, - "step": 13740 - }, - { - "epoch": 0.19, - "grad_norm": 28.714496612548828, - "learning_rate": 1.1713892356143239e-05, - "loss": 3.5942, - "step": 13760 - }, - { - "epoch": 0.2, - "grad_norm": 29.76102066040039, - "learning_rate": 1.1645373965605425e-05, - "loss": 3.5008, - "step": 13780 - }, - { - "epoch": 0.2, - "grad_norm": 36.479854583740234, - "learning_rate": 1.1576995658775405e-05, - "loss": 3.4347, - "step": 13800 - }, - { - "epoch": 0.2, - "grad_norm": 22.23845100402832, - "learning_rate": 1.1508758152909273e-05, - "loss": 3.559, - "step": 13820 - }, - { - "epoch": 0.2, - "grad_norm": 52.79865646362305, - "learning_rate": 1.1440662163786167e-05, - "loss": 3.5128, - "step": 13840 - }, - { - "epoch": 0.2, - "grad_norm": 23.489912033081055, - "learning_rate": 1.1372708405700793e-05, - "loss": 3.6525, - "step": 13860 - }, - { - "epoch": 0.2, - "grad_norm": 22.640663146972656, - "learning_rate": 1.1304897591455928e-05, - "loss": 3.5387, - "step": 13880 - }, - { - "epoch": 0.2, - "grad_norm": 23.30765724182129, - "learning_rate": 1.1237230432354912e-05, - "loss": 3.5714, - "step": 13900 - }, - { - "epoch": 0.2, - "grad_norm": 27.42571258544922, - "learning_rate": 1.1169707638194238e-05, - "loss": 3.6333, - "step": 13920 - }, - { - "epoch": 0.2, - "grad_norm": 17.366491317749023, - "learning_rate": 1.1102329917256046e-05, - "loss": 3.5651, - "step": 13940 - }, - { - "epoch": 0.2, - "grad_norm": 17.73781967163086, - "learning_rate": 1.103509797630077e-05, - "loss": 3.6944, - "step": 13960 - }, - { - "epoch": 0.2, - "grad_norm": 23.699871063232422, - "learning_rate": 1.0968012520559634e-05, - "loss": 3.5914, - "step": 13980 - }, - { - "epoch": 0.2, - "grad_norm": 21.75518798828125, - "learning_rate": 1.0901074253727336e-05, - "loss": 3.592, - "step": 14000 - }, - { - "epoch": 0.2, - "grad_norm": 26.731660842895508, - "learning_rate": 1.083428387795463e-05, - "loss": 3.5461, - "step": 14020 - }, - { - "epoch": 0.2, - "grad_norm": 20.746625900268555, - "learning_rate": 1.0767642093840932e-05, - "loss": 3.5951, - "step": 14040 - }, - { - "epoch": 0.2, - "grad_norm": 29.204326629638672, - "learning_rate": 1.0701149600427044e-05, - "loss": 3.591, - "step": 14060 - }, - { - "epoch": 0.2, - "grad_norm": 23.31056022644043, - "learning_rate": 1.0634807095187737e-05, - "loss": 3.4382, - "step": 14080 - }, - { - "epoch": 0.2, - "grad_norm": 20.306123733520508, - "learning_rate": 1.0568615274024522e-05, - "loss": 3.5539, - "step": 14100 - }, - { - "epoch": 0.2, - "grad_norm": 19.775318145751953, - "learning_rate": 1.0502574831258259e-05, - "loss": 3.5532, - "step": 14120 - }, - { - "epoch": 0.2, - "grad_norm": 23.407949447631836, - "learning_rate": 1.043668645962195e-05, - "loss": 3.4811, - "step": 14140 - }, - { - "epoch": 0.2, - "grad_norm": 27.759410858154297, - "learning_rate": 1.0370950850253449e-05, - "loss": 3.6196, - "step": 14160 - }, - { - "epoch": 0.2, - "grad_norm": 35.55162048339844, - "learning_rate": 1.0305368692688174e-05, - "loss": 3.4774, - "step": 14180 - }, - { - "epoch": 0.2, - "grad_norm": 23.29683494567871, - "learning_rate": 1.0239940674851941e-05, - "loss": 3.5437, - "step": 14200 - }, - { - "epoch": 0.2, - "grad_norm": 23.486143112182617, - "learning_rate": 1.0174667483053682e-05, - "loss": 3.671, - "step": 14220 - }, - { - "epoch": 0.2, - "grad_norm": 19.907581329345703, - "learning_rate": 1.0109549801978305e-05, - "loss": 3.4272, - "step": 14240 - }, - { - "epoch": 0.2, - "grad_norm": 35.4050407409668, - "learning_rate": 1.0044588314679451e-05, - "loss": 3.5397, - "step": 14260 - }, - { - "epoch": 0.2, - "grad_norm": 28.344934463500977, - "learning_rate": 9.979783702572412e-06, - "loss": 3.5157, - "step": 14280 - }, - { - "epoch": 0.2, - "grad_norm": 27.65180015563965, - "learning_rate": 9.915136645426884e-06, - "loss": 3.5073, - "step": 14300 - }, - { - "epoch": 0.2, - "grad_norm": 21.25212860107422, - "learning_rate": 9.850647821359918e-06, - "loss": 3.5119, - "step": 14320 - }, - { - "epoch": 0.2, - "grad_norm": 25.691951751708984, - "learning_rate": 9.786317906828747e-06, - "loss": 3.6237, - "step": 14340 - }, - { - "epoch": 0.2, - "grad_norm": 32.24767303466797, - "learning_rate": 9.722147576623743e-06, - "loss": 3.5211, - "step": 14360 - }, - { - "epoch": 0.2, - "grad_norm": 20.08036231994629, - "learning_rate": 9.658137503861314e-06, - "loss": 3.4558, - "step": 14380 - }, - { - "epoch": 0.2, - "grad_norm": 22.380619049072266, - "learning_rate": 9.594288359976817e-06, - "loss": 3.4814, - "step": 14400 - }, - { - "epoch": 0.2, - "grad_norm": 28.68309211730957, - "learning_rate": 9.530600814717575e-06, - "loss": 3.5701, - "step": 14420 - }, - { - "epoch": 0.2, - "grad_norm": 22.603858947753906, - "learning_rate": 9.467075536135787e-06, - "loss": 3.5527, - "step": 14440 - }, - { - "epoch": 0.2, - "grad_norm": 20.50914192199707, - "learning_rate": 9.403713190581576e-06, - "loss": 3.4903, - "step": 14460 - }, - { - "epoch": 0.2, - "grad_norm": 30.767824172973633, - "learning_rate": 9.340514442695952e-06, - "loss": 3.5184, - "step": 14480 - }, - { - "epoch": 0.21, - "grad_norm": 23.36814308166504, - "learning_rate": 9.277479955403887e-06, - "loss": 3.4903, - "step": 14500 - }, - { - "epoch": 0.21, - "grad_norm": 34.33296203613281, - "learning_rate": 9.214610389907327e-06, - "loss": 3.5716, - "step": 14520 - }, - { - "epoch": 0.21, - "grad_norm": 21.11824607849121, - "learning_rate": 9.15190640567825e-06, - "loss": 3.6187, - "step": 14540 - }, - { - "epoch": 0.21, - "grad_norm": 38.17007064819336, - "learning_rate": 9.0893686604518e-06, - "loss": 3.5029, - "step": 14560 - }, - { - "epoch": 0.21, - "grad_norm": 22.7441349029541, - "learning_rate": 9.026997810219312e-06, - "loss": 3.552, - "step": 14580 - }, - { - "epoch": 0.21, - "grad_norm": 27.61566734313965, - "learning_rate": 8.964794509221508e-06, - "loss": 3.5794, - "step": 14600 - }, - { - "epoch": 0.21, - "grad_norm": 28.213449478149414, - "learning_rate": 8.902759409941566e-06, - "loss": 3.6239, - "step": 14620 - }, - { - "epoch": 0.21, - "grad_norm": 99.79093933105469, - "learning_rate": 8.840893163098331e-06, - "loss": 3.5571, - "step": 14640 - }, - { - "epoch": 0.21, - "grad_norm": 22.567502975463867, - "learning_rate": 8.779196417639466e-06, - "loss": 3.6038, - "step": 14660 - }, - { - "epoch": 0.21, - "grad_norm": 16.769285202026367, - "learning_rate": 8.71766982073462e-06, - "loss": 3.5192, - "step": 14680 - }, - { - "epoch": 0.21, - "grad_norm": 21.841182708740234, - "learning_rate": 8.656314017768693e-06, - "loss": 3.4728, - "step": 14700 - }, - { - "epoch": 0.21, - "grad_norm": 29.540746688842773, - "learning_rate": 8.595129652335019e-06, - "loss": 3.4656, - "step": 14720 - }, - { - "epoch": 0.21, - "grad_norm": 31.932985305786133, - "learning_rate": 8.534117366228644e-06, - "loss": 3.5597, - "step": 14740 - }, - { - "epoch": 0.21, - "grad_norm": 22.49396324157715, - "learning_rate": 8.47327779943957e-06, - "loss": 3.5653, - "step": 14760 - }, - { - "epoch": 0.21, - "grad_norm": 26.135406494140625, - "learning_rate": 8.412611590146069e-06, - "loss": 3.5669, - "step": 14780 - }, - { - "epoch": 0.21, - "grad_norm": 26.18607521057129, - "learning_rate": 8.352119374707978e-06, - "loss": 3.4971, - "step": 14800 - }, - { - "epoch": 0.21, - "grad_norm": 32.66505813598633, - "learning_rate": 8.29180178766e-06, - "loss": 3.6041, - "step": 14820 - }, - { - "epoch": 0.21, - "grad_norm": 22.13516616821289, - "learning_rate": 8.23165946170509e-06, - "loss": 3.4845, - "step": 14840 - }, - { - "epoch": 0.21, - "grad_norm": 19.47613525390625, - "learning_rate": 8.171693027707772e-06, - "loss": 3.582, - "step": 14860 - }, - { - "epoch": 0.21, - "grad_norm": 28.822246551513672, - "learning_rate": 8.111903114687591e-06, - "loss": 3.5498, - "step": 14880 - }, - { - "epoch": 0.21, - "grad_norm": 27.130552291870117, - "learning_rate": 8.052290349812419e-06, - "loss": 3.5724, - "step": 14900 - }, - { - "epoch": 0.21, - "grad_norm": 20.185958862304688, - "learning_rate": 7.992855358391967e-06, - "loss": 3.5115, - "step": 14920 - }, - { - "epoch": 0.21, - "grad_norm": 19.644073486328125, - "learning_rate": 7.933598763871155e-06, - "loss": 3.5116, - "step": 14940 - }, - { - "epoch": 0.21, - "grad_norm": 28.123699188232422, - "learning_rate": 7.87452118782363e-06, - "loss": 3.5057, - "step": 14960 - }, - { - "epoch": 0.21, - "grad_norm": 18.76272964477539, - "learning_rate": 7.815623249945214e-06, - "loss": 3.579, - "step": 14980 - }, - { - "epoch": 0.21, - "grad_norm": 24.90170669555664, - "learning_rate": 7.756905568047393e-06, - "loss": 3.4875, - "step": 15000 - }, - { - "epoch": 0.21, - "grad_norm": 28.07253074645996, - "learning_rate": 7.698368758050877e-06, - "loss": 3.4413, - "step": 15020 - }, - { - "epoch": 0.21, - "grad_norm": 25.81130027770996, - "learning_rate": 7.640013433979093e-06, - "loss": 3.5166, - "step": 15040 - }, - { - "epoch": 0.21, - "grad_norm": 19.91429901123047, - "learning_rate": 7.58184020795179e-06, - "loss": 3.6086, - "step": 15060 - }, - { - "epoch": 0.21, - "grad_norm": 22.920394897460938, - "learning_rate": 7.523849690178567e-06, - "loss": 3.4341, - "step": 15080 - }, - { - "epoch": 0.21, - "grad_norm": 32.82981872558594, - "learning_rate": 7.466042488952521e-06, - "loss": 3.5264, - "step": 15100 - }, - { - "epoch": 0.21, - "grad_norm": 23.146589279174805, - "learning_rate": 7.408419210643847e-06, - "loss": 3.4571, - "step": 15120 - }, - { - "epoch": 0.21, - "grad_norm": 16.78236198425293, - "learning_rate": 7.350980459693455e-06, - "loss": 3.5377, - "step": 15140 - }, - { - "epoch": 0.21, - "grad_norm": 19.41143035888672, - "learning_rate": 7.293726838606674e-06, - "loss": 3.5262, - "step": 15160 - }, - { - "epoch": 0.21, - "grad_norm": 35.83265686035156, - "learning_rate": 7.236658947946886e-06, - "loss": 3.5389, - "step": 15180 - }, - { - "epoch": 0.22, - "grad_norm": 18.50286102294922, - "learning_rate": 7.179777386329276e-06, - "loss": 3.4822, - "step": 15200 - }, - { - "epoch": 0.22, - "grad_norm": 22.09296226501465, - "learning_rate": 7.123082750414486e-06, - "loss": 3.6018, - "step": 15220 - }, - { - "epoch": 0.22, - "grad_norm": 20.790420532226562, - "learning_rate": 7.066575634902436e-06, - "loss": 3.5642, - "step": 15240 - }, - { - "epoch": 0.22, - "grad_norm": 29.84691047668457, - "learning_rate": 7.010256632526035e-06, - "loss": 3.6224, - "step": 15260 - }, - { - "epoch": 0.22, - "grad_norm": 45.16112518310547, - "learning_rate": 6.9541263340449496e-06, - "loss": 3.5078, - "step": 15280 - }, - { - "epoch": 0.22, - "grad_norm": 24.578792572021484, - "learning_rate": 6.898185328239468e-06, - "loss": 3.571, - "step": 15300 - }, - { - "epoch": 0.22, - "grad_norm": 28.100582122802734, - "learning_rate": 6.842434201904255e-06, - "loss": 3.4775, - "step": 15320 - }, - { - "epoch": 0.22, - "grad_norm": 31.31978988647461, - "learning_rate": 6.786873539842259e-06, - "loss": 3.586, - "step": 15340 - }, - { - "epoch": 0.22, - "grad_norm": 25.426467895507812, - "learning_rate": 6.731503924858518e-06, - "loss": 3.6732, - "step": 15360 - }, - { - "epoch": 0.22, - "grad_norm": 29.726364135742188, - "learning_rate": 6.676325937754102e-06, - "loss": 3.4458, - "step": 15380 - }, - { - "epoch": 0.22, - "grad_norm": 35.88907241821289, - "learning_rate": 6.621340157319997e-06, - "loss": 3.5081, - "step": 15400 - }, - { - "epoch": 0.22, - "grad_norm": 18.98563003540039, - "learning_rate": 6.566547160330999e-06, - "loss": 3.4117, - "step": 15420 - }, - { - "epoch": 0.22, - "grad_norm": 23.43938446044922, - "learning_rate": 6.511947521539738e-06, - "loss": 3.5529, - "step": 15440 - }, - { - "epoch": 0.22, - "grad_norm": 27.628211975097656, - "learning_rate": 6.457541813670564e-06, - "loss": 3.6043, - "step": 15460 - }, - { - "epoch": 0.22, - "grad_norm": 32.65241241455078, - "learning_rate": 6.403330607413643e-06, - "loss": 3.5273, - "step": 15480 - }, - { - "epoch": 0.22, - "grad_norm": 23.956153869628906, - "learning_rate": 6.349314471418849e-06, - "loss": 3.62, - "step": 15500 - }, - { - "epoch": 0.22, - "grad_norm": 26.069808959960938, - "learning_rate": 6.295493972289904e-06, - "loss": 3.5688, - "step": 15520 - }, - { - "epoch": 0.22, - "grad_norm": 39.316566467285156, - "learning_rate": 6.241869674578363e-06, - "loss": 3.5178, - "step": 15540 - }, - { - "epoch": 0.22, - "grad_norm": 27.55044174194336, - "learning_rate": 6.188442140777742e-06, - "loss": 3.4732, - "step": 15560 - }, - { - "epoch": 0.22, - "grad_norm": 20.663110733032227, - "learning_rate": 6.1352119313175945e-06, - "loss": 3.471, - "step": 15580 - }, - { - "epoch": 0.22, - "grad_norm": 24.35353660583496, - "learning_rate": 6.082179604557617e-06, - "loss": 3.503, - "step": 15600 - }, - { - "epoch": 0.22, - "grad_norm": 21.658618927001953, - "learning_rate": 6.029345716781837e-06, - "loss": 3.5414, - "step": 15620 - }, - { - "epoch": 0.22, - "grad_norm": 29.32859230041504, - "learning_rate": 5.9767108221927216e-06, - "loss": 3.4492, - "step": 15640 - }, - { - "epoch": 0.22, - "grad_norm": 29.497888565063477, - "learning_rate": 5.924275472905424e-06, - "loss": 3.6211, - "step": 15660 - }, - { - "epoch": 0.22, - "grad_norm": 22.027616500854492, - "learning_rate": 5.872040218941929e-06, - "loss": 3.6381, - "step": 15680 - }, - { - "epoch": 0.22, - "grad_norm": 25.56600570678711, - "learning_rate": 5.820005608225346e-06, - "loss": 3.6468, - "step": 15700 - }, - { - "epoch": 0.22, - "grad_norm": 20.85995864868164, - "learning_rate": 5.768172186574122e-06, - "loss": 3.5111, - "step": 15720 - }, - { - "epoch": 0.22, - "grad_norm": 27.828828811645508, - "learning_rate": 5.716540497696307e-06, - "loss": 3.4975, - "step": 15740 - }, - { - "epoch": 0.22, - "grad_norm": 28.20557403564453, - "learning_rate": 5.665111083183905e-06, - "loss": 3.5542, - "step": 15760 - }, - { - "epoch": 0.22, - "grad_norm": 27.977331161499023, - "learning_rate": 5.613884482507123e-06, - "loss": 3.5096, - "step": 15780 - }, - { - "epoch": 0.22, - "grad_norm": 15.945253372192383, - "learning_rate": 5.562861233008774e-06, - "loss": 3.4329, - "step": 15800 - }, - { - "epoch": 0.22, - "grad_norm": 19.057100296020508, - "learning_rate": 5.512041869898585e-06, - "loss": 3.5043, - "step": 15820 - }, - { - "epoch": 0.22, - "grad_norm": 19.960477828979492, - "learning_rate": 5.46142692624764e-06, - "loss": 3.4124, - "step": 15840 - }, - { - "epoch": 0.22, - "grad_norm": 20.818775177001953, - "learning_rate": 5.411016932982752e-06, - "loss": 3.4409, - "step": 15860 - }, - { - "epoch": 0.22, - "grad_norm": 19.86461639404297, - "learning_rate": 5.360812418880884e-06, - "loss": 3.6115, - "step": 15880 - }, - { - "epoch": 0.23, - "grad_norm": 26.129798889160156, - "learning_rate": 5.310813910563644e-06, - "loss": 3.5875, - "step": 15900 - }, - { - "epoch": 0.23, - "grad_norm": 17.876537322998047, - "learning_rate": 5.261021932491714e-06, - "loss": 3.5214, - "step": 15920 - }, - { - "epoch": 0.23, - "grad_norm": 37.67085266113281, - "learning_rate": 5.2114370069593965e-06, - "loss": 3.6228, - "step": 15940 - }, - { - "epoch": 0.23, - "grad_norm": 21.973346710205078, - "learning_rate": 5.162059654089083e-06, - "loss": 3.457, - "step": 15960 - }, - { - "epoch": 0.23, - "grad_norm": 30.133270263671875, - "learning_rate": 5.112890391825845e-06, - "loss": 3.4729, - "step": 15980 - }, - { - "epoch": 0.23, - "grad_norm": 24.22100830078125, - "learning_rate": 5.063929735931985e-06, - "loss": 3.5727, - "step": 16000 - }, - { - "epoch": 0.23, - "grad_norm": 25.7775821685791, - "learning_rate": 5.015178199981602e-06, - "loss": 3.5195, - "step": 16020 - }, - { - "epoch": 0.23, - "grad_norm": 21.86203384399414, - "learning_rate": 4.966636295355253e-06, - "loss": 3.5248, - "step": 16040 - }, - { - "epoch": 0.23, - "grad_norm": 31.954734802246094, - "learning_rate": 4.918304531234533e-06, - "loss": 3.5392, - "step": 16060 - }, - { - "epoch": 0.23, - "grad_norm": 19.251066207885742, - "learning_rate": 4.870183414596794e-06, - "loss": 3.5204, - "step": 16080 - }, - { - "epoch": 0.23, - "grad_norm": 30.7813777923584, - "learning_rate": 4.8222734502097665e-06, - "loss": 3.5081, - "step": 16100 - }, - { - "epoch": 0.23, - "grad_norm": 24.208667755126953, - "learning_rate": 4.7745751406263165e-06, - "loss": 3.5487, - "step": 16120 - }, - { - "epoch": 0.23, - "grad_norm": 24.639707565307617, - "learning_rate": 4.727088986179129e-06, - "loss": 3.5998, - "step": 16140 - }, - { - "epoch": 0.23, - "grad_norm": 23.774940490722656, - "learning_rate": 4.679815484975505e-06, - "loss": 3.4195, - "step": 16160 - }, - { - "epoch": 0.23, - "grad_norm": 20.040874481201172, - "learning_rate": 4.6327551328920945e-06, - "loss": 3.5555, - "step": 16180 - }, - { - "epoch": 0.23, - "grad_norm": 32.899959564208984, - "learning_rate": 4.585908423569724e-06, - "loss": 3.5204, - "step": 16200 - }, - { - "epoch": 0.23, - "grad_norm": 29.705387115478516, - "learning_rate": 4.539275848408217e-06, - "loss": 3.5667, - "step": 16220 - }, - { - "epoch": 0.23, - "grad_norm": 19.238882064819336, - "learning_rate": 4.492857896561204e-06, - "loss": 3.4192, - "step": 16240 - }, - { - "epoch": 0.23, - "grad_norm": 18.87932777404785, - "learning_rate": 4.446655054931051e-06, - "loss": 3.4987, - "step": 16260 - }, - { - "epoch": 0.23, - "grad_norm": 25.21925163269043, - "learning_rate": 4.4006678081636884e-06, - "loss": 3.6039, - "step": 16280 - }, - { - "epoch": 0.23, - "grad_norm": 42.43282699584961, - "learning_rate": 4.35489663864359e-06, - "loss": 3.5736, - "step": 16300 - }, - { - "epoch": 0.23, - "grad_norm": 34.624874114990234, - "learning_rate": 4.309342026488653e-06, - "loss": 3.4077, - "step": 16320 - }, - { - "epoch": 0.23, - "grad_norm": 28.791912078857422, - "learning_rate": 4.264004449545206e-06, - "loss": 3.511, - "step": 16340 - }, - { - "epoch": 0.23, - "grad_norm": 21.95167350769043, - "learning_rate": 4.218884383382987e-06, - "loss": 3.4688, - "step": 16360 - }, - { - "epoch": 0.23, - "grad_norm": 29.78521156311035, - "learning_rate": 4.173982301290122e-06, - "loss": 3.4808, - "step": 16380 - }, - { - "epoch": 0.23, - "grad_norm": 24.426288604736328, - "learning_rate": 4.129298674268225e-06, - "loss": 3.5356, - "step": 16400 - }, - { - "epoch": 0.23, - "grad_norm": 23.966203689575195, - "learning_rate": 4.084833971027379e-06, - "loss": 3.5471, - "step": 16420 - }, - { - "epoch": 0.23, - "grad_norm": 26.148141860961914, - "learning_rate": 4.040588657981301e-06, - "loss": 3.4811, - "step": 16440 - }, - { - "epoch": 0.23, - "grad_norm": 29.300769805908203, - "learning_rate": 3.99656319924237e-06, - "loss": 3.5584, - "step": 16460 - }, - { - "epoch": 0.23, - "grad_norm": 27.95845603942871, - "learning_rate": 3.952758056616826e-06, - "loss": 3.5451, - "step": 16480 - }, - { - "epoch": 0.23, - "grad_norm": 21.04642677307129, - "learning_rate": 3.90917368959989e-06, - "loss": 3.5002, - "step": 16500 - }, - { - "epoch": 0.23, - "grad_norm": 38.936370849609375, - "learning_rate": 3.865810555370936e-06, - "loss": 3.4524, - "step": 16520 - }, - { - "epoch": 0.23, - "grad_norm": 24.49747085571289, - "learning_rate": 3.822669108788738e-06, - "loss": 3.4887, - "step": 16540 - }, - { - "epoch": 0.23, - "grad_norm": 36.478424072265625, - "learning_rate": 3.7797498023866396e-06, - "loss": 3.5807, - "step": 16560 - }, - { - "epoch": 0.23, - "grad_norm": 19.034469604492188, - "learning_rate": 3.737053086367873e-06, - "loss": 3.5806, - "step": 16580 - }, - { - "epoch": 0.23, - "grad_norm": 26.125469207763672, - "learning_rate": 3.694579408600771e-06, - "loss": 3.4561, - "step": 16600 - }, - { - "epoch": 0.24, - "grad_norm": 30.372791290283203, - "learning_rate": 3.6523292146141227e-06, - "loss": 3.5875, - "step": 16620 - }, - { - "epoch": 0.24, - "grad_norm": 25.58843421936035, - "learning_rate": 3.6103029475924726e-06, - "loss": 3.498, - "step": 16640 - }, - { - "epoch": 0.24, - "grad_norm": 27.708513259887695, - "learning_rate": 3.56850104837147e-06, - "loss": 3.5339, - "step": 16660 - }, - { - "epoch": 0.24, - "grad_norm": 28.574857711791992, - "learning_rate": 3.5269239554332563e-06, - "loss": 3.5488, - "step": 16680 - }, - { - "epoch": 0.24, - "grad_norm": 37.658775329589844, - "learning_rate": 3.4855721049018688e-06, - "loss": 3.5008, - "step": 16700 - }, - { - "epoch": 0.24, - "grad_norm": 31.66521644592285, - "learning_rate": 3.4444459305386507e-06, - "loss": 3.4864, - "step": 16720 - }, - { - "epoch": 0.24, - "grad_norm": 18.813400268554688, - "learning_rate": 3.403545863737706e-06, - "loss": 3.5685, - "step": 16740 - }, - { - "epoch": 0.24, - "grad_norm": 24.066808700561523, - "learning_rate": 3.3628723335213885e-06, - "loss": 3.5549, - "step": 16760 - }, - { - "epoch": 0.24, - "grad_norm": 23.029767990112305, - "learning_rate": 3.322425766535778e-06, - "loss": 3.4389, - "step": 16780 - }, - { - "epoch": 0.24, - "grad_norm": 25.6854190826416, - "learning_rate": 3.2822065870462217e-06, - "loss": 3.4405, - "step": 16800 - }, - { - "epoch": 0.24, - "grad_norm": 26.14007568359375, - "learning_rate": 3.2422152169328922e-06, - "loss": 3.5291, - "step": 16820 - }, - { - "epoch": 0.24, - "grad_norm": 22.264413833618164, - "learning_rate": 3.2024520756863243e-06, - "loss": 3.613, - "step": 16840 - }, - { - "epoch": 0.24, - "grad_norm": 32.82612991333008, - "learning_rate": 3.1629175804030658e-06, - "loss": 3.4603, - "step": 16860 - }, - { - "epoch": 0.24, - "grad_norm": 32.21546173095703, - "learning_rate": 3.1236121457812544e-06, - "loss": 3.5886, - "step": 16880 - }, - { - "epoch": 0.24, - "grad_norm": 18.181438446044922, - "learning_rate": 3.08453618411631e-06, - "loss": 3.4568, - "step": 16900 - }, - { - "epoch": 0.24, - "grad_norm": 23.75358772277832, - "learning_rate": 3.0456901052965724e-06, - "loss": 3.5491, - "step": 16920 - }, - { - "epoch": 0.24, - "grad_norm": 17.43839454650879, - "learning_rate": 3.0070743167990273e-06, - "loss": 3.5776, - "step": 16940 - }, - { - "epoch": 0.24, - "grad_norm": 20.617218017578125, - "learning_rate": 2.9686892236850337e-06, - "loss": 3.539, - "step": 16960 - }, - { - "epoch": 0.24, - "grad_norm": 19.83597755432129, - "learning_rate": 2.93053522859604e-06, - "loss": 3.3575, - "step": 16980 - }, - { - "epoch": 0.24, - "grad_norm": 18.4063720703125, - "learning_rate": 2.892612731749414e-06, - "loss": 3.3658, - "step": 17000 - }, - { - "epoch": 0.24, - "grad_norm": 23.77143096923828, - "learning_rate": 2.85492213093419e-06, - "loss": 3.4393, - "step": 17020 - }, - { - "epoch": 0.24, - "grad_norm": 22.04786491394043, - "learning_rate": 2.8174638215069493e-06, - "loss": 3.5262, - "step": 17040 - }, - { - "epoch": 0.24, - "grad_norm": 24.277013778686523, - "learning_rate": 2.780238196387619e-06, - "loss": 3.4419, - "step": 17060 - }, - { - "epoch": 0.24, - "grad_norm": 30.128318786621094, - "learning_rate": 2.743245646055398e-06, - "loss": 3.5387, - "step": 17080 - }, - { - "epoch": 0.24, - "grad_norm": 26.737049102783203, - "learning_rate": 2.7064865585446434e-06, - "loss": 3.4134, - "step": 17100 - }, - { - "epoch": 0.24, - "grad_norm": 26.093942642211914, - "learning_rate": 2.6699613194407725e-06, - "loss": 3.5691, - "step": 17120 - }, - { - "epoch": 0.24, - "grad_norm": 30.150657653808594, - "learning_rate": 2.6336703118762766e-06, - "loss": 3.4658, - "step": 17140 - }, - { - "epoch": 0.24, - "grad_norm": 21.17641258239746, - "learning_rate": 2.597613916526637e-06, - "loss": 3.4942, - "step": 17160 - }, - { - "epoch": 0.24, - "grad_norm": 28.02484130859375, - "learning_rate": 2.5617925116063924e-06, - "loss": 3.4448, - "step": 17180 - }, - { - "epoch": 0.24, - "grad_norm": 24.14384651184082, - "learning_rate": 2.52620647286512e-06, - "loss": 3.5448, - "step": 17200 - }, - { - "epoch": 0.24, - "grad_norm": 25.69285774230957, - "learning_rate": 2.4908561735835306e-06, - "loss": 3.5668, - "step": 17220 - }, - { - "epoch": 0.24, - "grad_norm": 19.125316619873047, - "learning_rate": 2.4557419845695427e-06, - "loss": 3.5204, - "step": 17240 - }, - { - "epoch": 0.24, - "grad_norm": 23.64023208618164, - "learning_rate": 2.420864274154372e-06, - "loss": 3.4345, - "step": 17260 - }, - { - "epoch": 0.24, - "grad_norm": 24.39943504333496, - "learning_rate": 2.3862234081887036e-06, - "loss": 3.5515, - "step": 17280 - }, - { - "epoch": 0.24, - "grad_norm": 24.969629287719727, - "learning_rate": 2.351819750038828e-06, - "loss": 3.4973, - "step": 17300 - }, - { - "epoch": 0.25, - "grad_norm": 18.182634353637695, - "learning_rate": 2.317653660582844e-06, - "loss": 3.6065, - "step": 17320 - }, - { - "epoch": 0.25, - "grad_norm": 27.141944885253906, - "learning_rate": 2.2837254982068567e-06, - "loss": 3.5106, - "step": 17340 - }, - { - "epoch": 0.25, - "grad_norm": 20.061452865600586, - "learning_rate": 2.250035618801241e-06, - "loss": 3.4274, - "step": 17360 - }, - { - "epoch": 0.25, - "grad_norm": 39.53497314453125, - "learning_rate": 2.2165843757568805e-06, - "loss": 3.4597, - "step": 17380 - }, - { - "epoch": 0.25, - "grad_norm": 14.67487907409668, - "learning_rate": 2.183372119961499e-06, - "loss": 3.5732, - "step": 17400 - }, - { - "epoch": 0.25, - "grad_norm": 14.984709739685059, - "learning_rate": 2.15039919979593e-06, - "loss": 3.4735, - "step": 17420 - }, - { - "epoch": 0.25, - "grad_norm": 30.988039016723633, - "learning_rate": 2.117665961130513e-06, - "loss": 3.4269, - "step": 17440 - }, - { - "epoch": 0.25, - "grad_norm": 23.8664493560791, - "learning_rate": 2.0851727473214315e-06, - "loss": 3.4997, - "step": 17460 - }, - { - "epoch": 0.25, - "grad_norm": 24.271230697631836, - "learning_rate": 2.05291989920712e-06, - "loss": 3.5919, - "step": 17480 - }, - { - "epoch": 0.25, - "grad_norm": 32.17240524291992, - "learning_rate": 2.020907755104698e-06, - "loss": 3.4734, - "step": 17500 - }, - { - "epoch": 0.25, - "grad_norm": 24.72242546081543, - "learning_rate": 1.9891366508064003e-06, - "loss": 3.5043, - "step": 17520 - }, - { - "epoch": 0.25, - "grad_norm": 29.70708656311035, - "learning_rate": 1.957606919576088e-06, - "loss": 3.4543, - "step": 17540 - }, - { - "epoch": 0.25, - "grad_norm": 29.549745559692383, - "learning_rate": 1.926318892145712e-06, - "loss": 3.4355, - "step": 17560 - }, - { - "epoch": 0.25, - "grad_norm": 25.912363052368164, - "learning_rate": 1.8952728967118804e-06, - "loss": 3.4614, - "step": 17580 - }, - { - "epoch": 0.25, - "grad_norm": 22.835115432739258, - "learning_rate": 1.864469258932397e-06, - "loss": 3.5498, - "step": 17600 - }, - { - "epoch": 0.25, - "grad_norm": 20.103981018066406, - "learning_rate": 1.8339083019228404e-06, - "loss": 3.5791, - "step": 17620 - }, - { - "epoch": 0.25, - "grad_norm": 24.153532028198242, - "learning_rate": 1.803590346253195e-06, - "loss": 3.495, - "step": 17640 - }, - { - "epoch": 0.25, - "grad_norm": 19.70048713684082, - "learning_rate": 1.7735157099444593e-06, - "loss": 3.5439, - "step": 17660 - }, - { - "epoch": 0.25, - "grad_norm": 23.056358337402344, - "learning_rate": 1.7436847084653456e-06, - "loss": 3.4222, - "step": 17680 - }, - { - "epoch": 0.25, - "grad_norm": 25.633689880371094, - "learning_rate": 1.7140976547289438e-06, - "loss": 3.5387, - "step": 17700 - }, - { - "epoch": 0.25, - "grad_norm": 30.34889030456543, - "learning_rate": 1.6847548590894435e-06, - "loss": 3.5579, - "step": 17720 - }, - { - "epoch": 0.25, - "grad_norm": 19.06514549255371, - "learning_rate": 1.6556566293388892e-06, - "loss": 3.4082, - "step": 17740 - }, - { - "epoch": 0.25, - "grad_norm": 16.91566276550293, - "learning_rate": 1.626803270703936e-06, - "loss": 3.5513, - "step": 17760 - }, - { - "epoch": 0.25, - "grad_norm": 26.17884635925293, - "learning_rate": 1.5981950858426714e-06, - "loss": 3.5068, - "step": 17780 - }, - { - "epoch": 0.25, - "grad_norm": 35.77287292480469, - "learning_rate": 1.5698323748414124e-06, - "loss": 3.4825, - "step": 17800 - }, - { - "epoch": 0.25, - "grad_norm": 25.684925079345703, - "learning_rate": 1.5417154352115742e-06, - "loss": 3.5529, - "step": 17820 - }, - { - "epoch": 0.25, - "grad_norm": 23.964488983154297, - "learning_rate": 1.5138445618865544e-06, - "loss": 3.549, - "step": 17840 - }, - { - "epoch": 0.25, - "grad_norm": 20.69983673095703, - "learning_rate": 1.4862200472186199e-06, - "loss": 3.5607, - "step": 17860 - }, - { - "epoch": 0.25, - "grad_norm": 24.382530212402344, - "learning_rate": 1.458842180975864e-06, - "loss": 3.4468, - "step": 17880 - }, - { - "epoch": 0.25, - "grad_norm": 20.305166244506836, - "learning_rate": 1.4317112503391432e-06, - "loss": 3.5468, - "step": 17900 - }, - { - "epoch": 0.25, - "grad_norm": 20.76270294189453, - "learning_rate": 1.4048275398990896e-06, - "loss": 3.5828, - "step": 17920 - }, - { - "epoch": 0.25, - "grad_norm": 31.468564987182617, - "learning_rate": 1.3781913316530948e-06, - "loss": 3.6117, - "step": 17940 - }, - { - "epoch": 0.25, - "grad_norm": 22.185617446899414, - "learning_rate": 1.351802905002386e-06, - "loss": 3.4663, - "step": 17960 - }, - { - "epoch": 0.25, - "grad_norm": 27.454687118530273, - "learning_rate": 1.32566253674907e-06, - "loss": 3.4419, - "step": 17980 - }, - { - "epoch": 0.25, - "grad_norm": 20.76512336730957, - "learning_rate": 1.2997705010932393e-06, - "loss": 3.5315, - "step": 18000 - }, - { - "epoch": 0.26, - "grad_norm": 27.795419692993164, - "learning_rate": 1.274127069630096e-06, - "loss": 3.5435, - "step": 18020 - }, - { - "epoch": 0.26, - "grad_norm": 45.871864318847656, - "learning_rate": 1.2487325113471032e-06, - "loss": 3.3871, - "step": 18040 - }, - { - "epoch": 0.26, - "grad_norm": 15.510208129882812, - "learning_rate": 1.2235870926211619e-06, - "loss": 3.5862, - "step": 18060 - }, - { - "epoch": 0.26, - "grad_norm": 26.943269729614258, - "learning_rate": 1.1986910772158104e-06, - "loss": 3.5032, - "step": 18080 - }, - { - "epoch": 0.26, - "grad_norm": 28.423053741455078, - "learning_rate": 1.1740447262784781e-06, - "loss": 3.4936, - "step": 18100 - }, - { - "epoch": 0.26, - "grad_norm": 25.210853576660156, - "learning_rate": 1.1496482983377189e-06, - "loss": 3.4515, - "step": 18120 - }, - { - "epoch": 0.26, - "grad_norm": 32.88740921020508, - "learning_rate": 1.125502049300517e-06, - "loss": 3.5196, - "step": 18140 - }, - { - "epoch": 0.26, - "grad_norm": 20.562488555908203, - "learning_rate": 1.1016062324496008e-06, - "loss": 3.4467, - "step": 18160 - }, - { - "epoch": 0.26, - "grad_norm": 21.112634658813477, - "learning_rate": 1.0779610984407773e-06, - "loss": 3.5286, - "step": 18180 - }, - { - "epoch": 0.26, - "grad_norm": 29.323238372802734, - "learning_rate": 1.0545668953003241e-06, - "loss": 3.4971, - "step": 18200 - }, - { - "epoch": 0.26, - "grad_norm": 24.024930953979492, - "learning_rate": 1.0314238684223515e-06, - "loss": 3.5919, - "step": 18220 - }, - { - "epoch": 0.26, - "grad_norm": 29.396581649780273, - "learning_rate": 1.0085322605662666e-06, - "loss": 3.4255, - "step": 18240 - }, - { - "epoch": 0.26, - "grad_norm": 19.502662658691406, - "learning_rate": 9.858923118542002e-07, - "loss": 3.464, - "step": 18260 - }, - { - "epoch": 0.26, - "grad_norm": 20.03078269958496, - "learning_rate": 9.635042597685023e-07, - "loss": 3.4305, - "step": 18280 - }, - { - "epoch": 0.26, - "grad_norm": 18.905967712402344, - "learning_rate": 9.413683391492456e-07, - "loss": 3.6401, - "step": 18300 - }, - { - "epoch": 0.26, - "grad_norm": 23.61101531982422, - "learning_rate": 9.194847821917623e-07, - "loss": 3.5543, - "step": 18320 - }, - { - "epoch": 0.26, - "grad_norm": 18.563806533813477, - "learning_rate": 8.978538184442137e-07, - "loss": 3.4395, - "step": 18340 - }, - { - "epoch": 0.26, - "grad_norm": 21.695003509521484, - "learning_rate": 8.764756748051662e-07, - "loss": 3.4193, - "step": 18360 - }, - { - "epoch": 0.26, - "grad_norm": 21.57720947265625, - "learning_rate": 8.553505755212382e-07, - "loss": 3.5357, - "step": 18380 - }, - { - "epoch": 0.26, - "grad_norm": 30.37428855895996, - "learning_rate": 8.344787421847217e-07, - "loss": 3.5414, - "step": 18400 - }, - { - "epoch": 0.26, - "grad_norm": 42.314064025878906, - "learning_rate": 8.138603937312722e-07, - "loss": 3.5528, - "step": 18420 - }, - { - "epoch": 0.26, - "grad_norm": 22.21116065979004, - "learning_rate": 7.934957464376058e-07, - "loss": 3.6419, - "step": 18440 - }, - { - "epoch": 0.26, - "grad_norm": 26.877450942993164, - "learning_rate": 7.733850139192395e-07, - "loss": 3.5869, - "step": 18460 - }, - { - "epoch": 0.26, - "grad_norm": 21.281030654907227, - "learning_rate": 7.535284071282455e-07, - "loss": 3.6047, - "step": 18480 - }, - { - "epoch": 0.26, - "grad_norm": 20.147789001464844, - "learning_rate": 7.339261343510206e-07, - "loss": 3.4247, - "step": 18500 - }, - { - "epoch": 0.26, - "grad_norm": 19.394601821899414, - "learning_rate": 7.145784012061424e-07, - "loss": 3.5844, - "step": 18520 - }, - { - "epoch": 0.26, - "grad_norm": 22.156579971313477, - "learning_rate": 6.954854106421715e-07, - "loss": 3.5348, - "step": 18540 - }, - { - "epoch": 0.26, - "grad_norm": 28.641721725463867, - "learning_rate": 6.766473629355452e-07, - "loss": 3.5451, - "step": 18560 - }, - { - "epoch": 0.26, - "grad_norm": 19.47591209411621, - "learning_rate": 6.580644556884702e-07, - "loss": 3.5458, - "step": 18580 } ], "logging_steps": 20, - "max_steps": 20000, + "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, - "total_flos": 4.037882943504384e+16, + "total_flos": 49799375290368.0, "train_batch_size": 8, "trial_name": null, "trial_params": null