{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 94748, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004221724996833706, "grad_norm": 99.84131622314453, "learning_rate": 2.0474934036939315e-07, "loss": 12.0647, "step": 100 }, { "epoch": 0.008443449993667413, "grad_norm": 95.49885559082031, "learning_rate": 4.1583113456464383e-07, "loss": 11.7727, "step": 200 }, { "epoch": 0.012665174990501118, "grad_norm": 66.75164794921875, "learning_rate": 6.269129287598945e-07, "loss": 11.1315, "step": 300 }, { "epoch": 0.016886899987334825, "grad_norm": 65.58353424072266, "learning_rate": 8.379947229551452e-07, "loss": 10.8852, "step": 400 }, { "epoch": 0.02110862498416853, "grad_norm": 66.87042999267578, "learning_rate": 1.0469656992084432e-06, "loss": 9.9168, "step": 500 }, { "epoch": 0.02110862498416853, "eval_loss": 10.42082691192627, "eval_runtime": 383.256, "eval_samples_per_second": 494.424, "eval_steps_per_second": 15.452, "step": 500 }, { "epoch": 0.025330349981002236, "grad_norm": 112.13134765625, "learning_rate": 1.258047493403694e-06, "loss": 9.4099, "step": 600 }, { "epoch": 0.029552074977835945, "grad_norm": 74.77991485595703, "learning_rate": 1.4691292875989447e-06, "loss": 8.5361, "step": 700 }, { "epoch": 0.03377379997466965, "grad_norm": 66.13243103027344, "learning_rate": 1.6802110817941955e-06, "loss": 7.7286, "step": 800 }, { "epoch": 0.03799552497150336, "grad_norm": 43.571102142333984, "learning_rate": 1.8912928759894462e-06, "loss": 7.0852, "step": 900 }, { "epoch": 0.04221724996833706, "grad_norm": 39.42892074584961, "learning_rate": 2.1023746701846966e-06, "loss": 6.3646, "step": 1000 }, { "epoch": 0.04221724996833706, "eval_loss": 6.634986400604248, "eval_runtime": 371.6377, "eval_samples_per_second": 509.881, "eval_steps_per_second": 15.935, "step": 1000 }, { "epoch": 0.04643897496517077, "grad_norm": 34.04639434814453, 
"learning_rate": 2.3134564643799472e-06, "loss": 6.1673, "step": 1100 }, { "epoch": 0.05066069996200447, "grad_norm": 19.675539016723633, "learning_rate": 2.5245382585751983e-06, "loss": 5.5683, "step": 1200 }, { "epoch": 0.05488242495883818, "grad_norm": 18.309850692749023, "learning_rate": 2.7356200527704485e-06, "loss": 5.4462, "step": 1300 }, { "epoch": 0.05910414995567189, "grad_norm": 21.344575881958008, "learning_rate": 2.9467018469656995e-06, "loss": 5.303, "step": 1400 }, { "epoch": 0.06332587495250559, "grad_norm": 18.55843162536621, "learning_rate": 3.15778364116095e-06, "loss": 5.1935, "step": 1500 }, { "epoch": 0.06332587495250559, "eval_loss": 5.242936134338379, "eval_runtime": 373.9106, "eval_samples_per_second": 506.782, "eval_steps_per_second": 15.838, "step": 1500 }, { "epoch": 0.0675475999493393, "grad_norm": 15.893339157104492, "learning_rate": 3.3688654353562008e-06, "loss": 5.1856, "step": 1600 }, { "epoch": 0.07176932494617301, "grad_norm": 34.76871871948242, "learning_rate": 3.5799472295514514e-06, "loss": 5.0136, "step": 1700 }, { "epoch": 0.07599104994300672, "grad_norm": 20.7496280670166, "learning_rate": 3.7910290237467025e-06, "loss": 5.0667, "step": 1800 }, { "epoch": 0.08021277493984041, "grad_norm": 30.79672622680664, "learning_rate": 4.002110817941952e-06, "loss": 4.9982, "step": 1900 }, { "epoch": 0.08443449993667412, "grad_norm": 18.434511184692383, "learning_rate": 4.213192612137204e-06, "loss": 5.0429, "step": 2000 }, { "epoch": 0.08443449993667412, "eval_loss": 5.009856224060059, "eval_runtime": 383.4068, "eval_samples_per_second": 494.23, "eval_steps_per_second": 15.446, "step": 2000 }, { "epoch": 0.08865622493350783, "grad_norm": 23.91575050354004, "learning_rate": 4.424274406332454e-06, "loss": 4.8719, "step": 2100 }, { "epoch": 0.09287794993034154, "grad_norm": 16.657917022705078, "learning_rate": 4.635356200527705e-06, "loss": 4.8579, "step": 2200 }, { "epoch": 0.09709967492717525, "grad_norm": 20.16551399230957, 
"learning_rate": 4.846437994722956e-06, "loss": 4.9282, "step": 2300 }, { "epoch": 0.10132139992400895, "grad_norm": 26.069272994995117, "learning_rate": 5.057519788918206e-06, "loss": 4.9848, "step": 2400 }, { "epoch": 0.10554312492084265, "grad_norm": 20.215301513671875, "learning_rate": 5.268601583113458e-06, "loss": 4.8974, "step": 2500 }, { "epoch": 0.10554312492084265, "eval_loss": 4.907773971557617, "eval_runtime": 383.6054, "eval_samples_per_second": 493.974, "eval_steps_per_second": 15.438, "step": 2500 }, { "epoch": 0.10976484991767636, "grad_norm": 16.090208053588867, "learning_rate": 5.4796833773087075e-06, "loss": 4.9103, "step": 2600 }, { "epoch": 0.11398657491451007, "grad_norm": 14.869135856628418, "learning_rate": 5.690765171503958e-06, "loss": 4.7459, "step": 2700 }, { "epoch": 0.11820829991134378, "grad_norm": 40.26353454589844, "learning_rate": 5.90184696569921e-06, "loss": 4.8084, "step": 2800 }, { "epoch": 0.12243002490817748, "grad_norm": 12.258187294006348, "learning_rate": 6.112928759894459e-06, "loss": 4.8221, "step": 2900 }, { "epoch": 0.12665174990501119, "grad_norm": 14.802818298339844, "learning_rate": 6.32401055408971e-06, "loss": 4.7622, "step": 3000 }, { "epoch": 0.12665174990501119, "eval_loss": 4.81698751449585, "eval_runtime": 385.8745, "eval_samples_per_second": 491.069, "eval_steps_per_second": 15.347, "step": 3000 }, { "epoch": 0.1308734749018449, "grad_norm": 34.78262710571289, "learning_rate": 6.535092348284961e-06, "loss": 4.7004, "step": 3100 }, { "epoch": 0.1350951998986786, "grad_norm": 13.215925216674805, "learning_rate": 6.746174142480212e-06, "loss": 4.6912, "step": 3200 }, { "epoch": 0.1393169248955123, "grad_norm": 26.275604248046875, "learning_rate": 6.957255936675462e-06, "loss": 4.6595, "step": 3300 }, { "epoch": 0.14353864989234602, "grad_norm": 17.464338302612305, "learning_rate": 7.1683377308707125e-06, "loss": 4.7322, "step": 3400 }, { "epoch": 0.14776037488917973, "grad_norm": 17.785873413085938, 
"learning_rate": 7.379419525065964e-06, "loss": 4.7575, "step": 3500 }, { "epoch": 0.14776037488917973, "eval_loss": 4.719913959503174, "eval_runtime": 380.6996, "eval_samples_per_second": 497.744, "eval_steps_per_second": 15.556, "step": 3500 }, { "epoch": 0.15198209988601344, "grad_norm": 33.81704330444336, "learning_rate": 7.590501319261215e-06, "loss": 4.6443, "step": 3600 }, { "epoch": 0.15620382488284712, "grad_norm": 37.59410095214844, "learning_rate": 7.801583113456465e-06, "loss": 4.6638, "step": 3700 }, { "epoch": 0.16042554987968083, "grad_norm": 19.028879165649414, "learning_rate": 8.012664907651716e-06, "loss": 4.5958, "step": 3800 }, { "epoch": 0.16464727487651454, "grad_norm": 17.87994956970215, "learning_rate": 8.223746701846966e-06, "loss": 4.6285, "step": 3900 }, { "epoch": 0.16886899987334825, "grad_norm": 24.185325622558594, "learning_rate": 8.434828496042217e-06, "loss": 4.6347, "step": 4000 }, { "epoch": 0.16886899987334825, "eval_loss": 4.655384063720703, "eval_runtime": 389.5193, "eval_samples_per_second": 486.474, "eval_steps_per_second": 15.203, "step": 4000 }, { "epoch": 0.17309072487018196, "grad_norm": 22.38553237915039, "learning_rate": 8.645910290237468e-06, "loss": 4.6558, "step": 4100 }, { "epoch": 0.17731244986701566, "grad_norm": 17.568811416625977, "learning_rate": 8.856992084432718e-06, "loss": 4.6712, "step": 4200 }, { "epoch": 0.18153417486384937, "grad_norm": 21.295473098754883, "learning_rate": 9.068073878627969e-06, "loss": 4.6126, "step": 4300 }, { "epoch": 0.18575589986068308, "grad_norm": 43.39851760864258, "learning_rate": 9.27915567282322e-06, "loss": 4.6219, "step": 4400 }, { "epoch": 0.1899776248575168, "grad_norm": 17.622400283813477, "learning_rate": 9.488126649076517e-06, "loss": 4.6101, "step": 4500 }, { "epoch": 0.1899776248575168, "eval_loss": 4.6206207275390625, "eval_runtime": 383.5173, "eval_samples_per_second": 494.087, "eval_steps_per_second": 15.441, "step": 4500 }, { "epoch": 0.1941993498543505, 
"grad_norm": 16.083688735961914, "learning_rate": 9.699208443271768e-06, "loss": 4.7682, "step": 4600 }, { "epoch": 0.19842107485118418, "grad_norm": 61.793235778808594, "learning_rate": 9.91029023746702e-06, "loss": 4.5385, "step": 4700 }, { "epoch": 0.2026427998480179, "grad_norm": 19.626968383789062, "learning_rate": 1.012137203166227e-05, "loss": 4.6744, "step": 4800 }, { "epoch": 0.2068645248448516, "grad_norm": 13.591843605041504, "learning_rate": 1.033245382585752e-05, "loss": 4.5383, "step": 4900 }, { "epoch": 0.2110862498416853, "grad_norm": 26.316524505615234, "learning_rate": 1.0543535620052772e-05, "loss": 4.6095, "step": 5000 }, { "epoch": 0.2110862498416853, "eval_loss": 4.626244068145752, "eval_runtime": 385.0461, "eval_samples_per_second": 492.125, "eval_steps_per_second": 15.38, "step": 5000 }, { "epoch": 0.21530797483851902, "grad_norm": 35.92019271850586, "learning_rate": 1.0754617414248022e-05, "loss": 4.6807, "step": 5100 }, { "epoch": 0.21952969983535273, "grad_norm": 16.565446853637695, "learning_rate": 1.0965699208443273e-05, "loss": 4.4866, "step": 5200 }, { "epoch": 0.22375142483218644, "grad_norm": 71.62289428710938, "learning_rate": 1.1176781002638524e-05, "loss": 4.5353, "step": 5300 }, { "epoch": 0.22797314982902014, "grad_norm": 85.97599029541016, "learning_rate": 1.1387862796833773e-05, "loss": 4.5285, "step": 5400 }, { "epoch": 0.23219487482585385, "grad_norm": 114.70708465576172, "learning_rate": 1.1598944591029025e-05, "loss": 4.5416, "step": 5500 }, { "epoch": 0.23219487482585385, "eval_loss": 4.591431617736816, "eval_runtime": 389.2061, "eval_samples_per_second": 486.865, "eval_steps_per_second": 15.216, "step": 5500 }, { "epoch": 0.23641659982268756, "grad_norm": 18.64859962463379, "learning_rate": 1.1810026385224276e-05, "loss": 4.623, "step": 5600 }, { "epoch": 0.24063832481952124, "grad_norm": 40.123992919921875, "learning_rate": 1.2021108179419525e-05, "loss": 4.5337, "step": 5700 }, { "epoch": 0.24486004981635495, 
"grad_norm": 19.153076171875, "learning_rate": 1.2232189973614777e-05, "loss": 4.5726, "step": 5800 }, { "epoch": 0.24908177481318866, "grad_norm": 16.48304557800293, "learning_rate": 1.2443271767810027e-05, "loss": 4.5467, "step": 5900 }, { "epoch": 0.25330349981002237, "grad_norm": 35.746307373046875, "learning_rate": 1.2654353562005276e-05, "loss": 4.3986, "step": 6000 }, { "epoch": 0.25330349981002237, "eval_loss": 4.601134300231934, "eval_runtime": 382.9558, "eval_samples_per_second": 494.812, "eval_steps_per_second": 15.464, "step": 6000 }, { "epoch": 0.2575252248068561, "grad_norm": 15.414148330688477, "learning_rate": 1.2863324538258577e-05, "loss": 4.559, "step": 6100 }, { "epoch": 0.2617469498036898, "grad_norm": 18.076143264770508, "learning_rate": 1.3074406332453826e-05, "loss": 4.6066, "step": 6200 }, { "epoch": 0.2659686748005235, "grad_norm": 19.2498836517334, "learning_rate": 1.3285488126649078e-05, "loss": 4.4445, "step": 6300 }, { "epoch": 0.2701903997973572, "grad_norm": 17.76839828491211, "learning_rate": 1.3496569920844329e-05, "loss": 4.4518, "step": 6400 }, { "epoch": 0.2744121247941909, "grad_norm": 17.81798553466797, "learning_rate": 1.3707651715039578e-05, "loss": 4.4761, "step": 6500 }, { "epoch": 0.2744121247941909, "eval_loss": 4.5092549324035645, "eval_runtime": 383.1229, "eval_samples_per_second": 494.596, "eval_steps_per_second": 15.457, "step": 6500 }, { "epoch": 0.2786338497910246, "grad_norm": 26.38512420654297, "learning_rate": 1.391873350923483e-05, "loss": 4.3362, "step": 6600 }, { "epoch": 0.28285557478785833, "grad_norm": 22.481346130371094, "learning_rate": 1.4129815303430081e-05, "loss": 4.4936, "step": 6700 }, { "epoch": 0.28707729978469204, "grad_norm": 34.47163772583008, "learning_rate": 1.434089709762533e-05, "loss": 4.2397, "step": 6800 }, { "epoch": 0.29129902478152575, "grad_norm": 14.515748023986816, "learning_rate": 1.4551978891820582e-05, "loss": 4.5243, "step": 6900 }, { "epoch": 0.29552074977835946, "grad_norm": 
13.278151512145996, "learning_rate": 1.4763060686015833e-05, "loss": 4.496, "step": 7000 }, { "epoch": 0.29552074977835946, "eval_loss": 4.396859645843506, "eval_runtime": 380.6896, "eval_samples_per_second": 497.757, "eval_steps_per_second": 15.556, "step": 7000 }, { "epoch": 0.29974247477519317, "grad_norm": 28.896297454833984, "learning_rate": 1.4974142480211082e-05, "loss": 4.2558, "step": 7100 }, { "epoch": 0.3039641997720269, "grad_norm": 41.2579231262207, "learning_rate": 1.5185224274406334e-05, "loss": 4.4691, "step": 7200 }, { "epoch": 0.30818592476886053, "grad_norm": 23.71632957458496, "learning_rate": 1.5396306068601585e-05, "loss": 4.4819, "step": 7300 }, { "epoch": 0.31240764976569424, "grad_norm": 31.86453628540039, "learning_rate": 1.5607387862796834e-05, "loss": 4.3785, "step": 7400 }, { "epoch": 0.31662937476252795, "grad_norm": 27.288312911987305, "learning_rate": 1.5818469656992086e-05, "loss": 4.4214, "step": 7500 }, { "epoch": 0.31662937476252795, "eval_loss": 4.419884204864502, "eval_runtime": 381.558, "eval_samples_per_second": 496.624, "eval_steps_per_second": 15.521, "step": 7500 }, { "epoch": 0.32085109975936166, "grad_norm": 31.071884155273438, "learning_rate": 1.6029551451187338e-05, "loss": 4.4935, "step": 7600 }, { "epoch": 0.32507282475619537, "grad_norm": 18.19314956665039, "learning_rate": 1.6240633245382587e-05, "loss": 4.4238, "step": 7700 }, { "epoch": 0.3292945497530291, "grad_norm": 23.160940170288086, "learning_rate": 1.645171503957784e-05, "loss": 4.5361, "step": 7800 }, { "epoch": 0.3335162747498628, "grad_norm": 30.73809242248535, "learning_rate": 1.666279683377309e-05, "loss": 4.4284, "step": 7900 }, { "epoch": 0.3377379997466965, "grad_norm": 17.99860954284668, "learning_rate": 1.6873878627968337e-05, "loss": 4.3918, "step": 8000 }, { "epoch": 0.3377379997466965, "eval_loss": 4.397606372833252, "eval_runtime": 384.2042, "eval_samples_per_second": 493.204, "eval_steps_per_second": 15.414, "step": 8000 }, { "epoch": 
0.3419597247435302, "grad_norm": 35.046939849853516, "learning_rate": 1.708496042216359e-05, "loss": 4.4622, "step": 8100 }, { "epoch": 0.3461814497403639, "grad_norm": 15.606463432312012, "learning_rate": 1.7296042216358842e-05, "loss": 4.4128, "step": 8200 }, { "epoch": 0.3504031747371976, "grad_norm": 22.16493034362793, "learning_rate": 1.750712401055409e-05, "loss": 4.1565, "step": 8300 }, { "epoch": 0.35462489973403133, "grad_norm": 53.230464935302734, "learning_rate": 1.7718205804749343e-05, "loss": 4.3241, "step": 8400 }, { "epoch": 0.35884662473086504, "grad_norm": 24.896638870239258, "learning_rate": 1.7929287598944592e-05, "loss": 4.2764, "step": 8500 }, { "epoch": 0.35884662473086504, "eval_loss": 4.426083564758301, "eval_runtime": 374.4401, "eval_samples_per_second": 506.065, "eval_steps_per_second": 15.816, "step": 8500 }, { "epoch": 0.36306834972769875, "grad_norm": 37.623687744140625, "learning_rate": 1.814036939313984e-05, "loss": 4.2101, "step": 8600 }, { "epoch": 0.36729007472453246, "grad_norm": 15.538634300231934, "learning_rate": 1.8351451187335093e-05, "loss": 4.4044, "step": 8700 }, { "epoch": 0.37151179972136616, "grad_norm": 26.525089263916016, "learning_rate": 1.8562532981530342e-05, "loss": 4.254, "step": 8800 }, { "epoch": 0.3757335247181999, "grad_norm": 23.026073455810547, "learning_rate": 1.8771503957783643e-05, "loss": 4.362, "step": 8900 }, { "epoch": 0.3799552497150336, "grad_norm": 38.66673278808594, "learning_rate": 1.8982585751978892e-05, "loss": 4.3424, "step": 9000 }, { "epoch": 0.3799552497150336, "eval_loss": 4.440927505493164, "eval_runtime": 383.0189, "eval_samples_per_second": 494.73, "eval_steps_per_second": 15.461, "step": 9000 }, { "epoch": 0.3841769747118673, "grad_norm": 33.175357818603516, "learning_rate": 1.9193667546174144e-05, "loss": 4.3383, "step": 9100 }, { "epoch": 0.388398699708701, "grad_norm": 26.07857894897461, "learning_rate": 1.9404749340369397e-05, "loss": 4.4713, "step": 9200 }, { "epoch": 
0.39262042470553465, "grad_norm": 20.96491813659668, "learning_rate": 1.9615831134564646e-05, "loss": 4.2773, "step": 9300 }, { "epoch": 0.39684214970236836, "grad_norm": 36.000953674316406, "learning_rate": 1.9826912928759895e-05, "loss": 4.2842, "step": 9400 }, { "epoch": 0.40106387469920207, "grad_norm": 48.96345901489258, "learning_rate": 1.99957782651015e-05, "loss": 4.3301, "step": 9500 }, { "epoch": 0.40106387469920207, "eval_loss": 4.345411777496338, "eval_runtime": 388.2926, "eval_samples_per_second": 488.011, "eval_steps_per_second": 15.251, "step": 9500 }, { "epoch": 0.4052855996960358, "grad_norm": 34.45381546020508, "learning_rate": 1.9972324182332042e-05, "loss": 4.3224, "step": 9600 }, { "epoch": 0.4095073246928695, "grad_norm": 20.29124641418457, "learning_rate": 1.9948870099562583e-05, "loss": 4.3878, "step": 9700 }, { "epoch": 0.4137290496897032, "grad_norm": 15.433843612670898, "learning_rate": 1.9925416016793124e-05, "loss": 4.3614, "step": 9800 }, { "epoch": 0.4179507746865369, "grad_norm": 22.69325065612793, "learning_rate": 1.9901961934023665e-05, "loss": 4.3423, "step": 9900 }, { "epoch": 0.4221724996833706, "grad_norm": 15.297199249267578, "learning_rate": 1.987850785125421e-05, "loss": 4.3576, "step": 10000 }, { "epoch": 0.4221724996833706, "eval_loss": 4.354933261871338, "eval_runtime": 386.7965, "eval_samples_per_second": 489.898, "eval_steps_per_second": 15.31, "step": 10000 }, { "epoch": 0.4263942246802043, "grad_norm": 18.692224502563477, "learning_rate": 1.985505376848475e-05, "loss": 4.1451, "step": 10100 }, { "epoch": 0.43061594967703803, "grad_norm": 16.447420120239258, "learning_rate": 1.983159968571529e-05, "loss": 4.3326, "step": 10200 }, { "epoch": 0.43483767467387174, "grad_norm": 21.373384475708008, "learning_rate": 1.9808145602945835e-05, "loss": 4.2761, "step": 10300 }, { "epoch": 0.43905939967070545, "grad_norm": 16.10767364501953, "learning_rate": 1.9784691520176376e-05, "loss": 4.2421, "step": 10400 }, { "epoch": 
0.44328112466753916, "grad_norm": 33.177181243896484, "learning_rate": 1.9761237437406917e-05, "loss": 4.262, "step": 10500 }, { "epoch": 0.44328112466753916, "eval_loss": 4.349300861358643, "eval_runtime": 386.4801, "eval_samples_per_second": 490.299, "eval_steps_per_second": 15.323, "step": 10500 }, { "epoch": 0.44750284966437287, "grad_norm": 29.280973434448242, "learning_rate": 1.973778335463746e-05, "loss": 4.1227, "step": 10600 }, { "epoch": 0.4517245746612066, "grad_norm": 26.586437225341797, "learning_rate": 1.9714329271868002e-05, "loss": 4.2365, "step": 10700 }, { "epoch": 0.4559462996580403, "grad_norm": 34.612091064453125, "learning_rate": 1.9690875189098543e-05, "loss": 4.3528, "step": 10800 }, { "epoch": 0.460168024654874, "grad_norm": 25.977516174316406, "learning_rate": 1.9667421106329084e-05, "loss": 4.077, "step": 10900 }, { "epoch": 0.4643897496517077, "grad_norm": 17.730405807495117, "learning_rate": 1.964396702355963e-05, "loss": 4.0878, "step": 11000 }, { "epoch": 0.4643897496517077, "eval_loss": 4.334869384765625, "eval_runtime": 381.6891, "eval_samples_per_second": 496.454, "eval_steps_per_second": 15.515, "step": 11000 }, { "epoch": 0.4686114746485414, "grad_norm": 23.120807647705078, "learning_rate": 1.9620747481617864e-05, "loss": 4.4246, "step": 11100 }, { "epoch": 0.4728331996453751, "grad_norm": 62.60495376586914, "learning_rate": 1.9597293398848408e-05, "loss": 4.1019, "step": 11200 }, { "epoch": 0.47705492464220883, "grad_norm": 38.34320068359375, "learning_rate": 1.957383931607895e-05, "loss": 4.2565, "step": 11300 }, { "epoch": 0.4812766496390425, "grad_norm": 19.053138732910156, "learning_rate": 1.955038523330949e-05, "loss": 4.3177, "step": 11400 }, { "epoch": 0.4854983746358762, "grad_norm": 25.41574478149414, "learning_rate": 1.952693115054003e-05, "loss": 4.1283, "step": 11500 }, { "epoch": 0.4854983746358762, "eval_loss": 4.423605918884277, "eval_runtime": 385.1053, "eval_samples_per_second": 492.05, "eval_steps_per_second": 
15.378, "step": 11500 }, { "epoch": 0.4897200996327099, "grad_norm": 16.951583862304688, "learning_rate": 1.9503477067770572e-05, "loss": 4.2232, "step": 11600 }, { "epoch": 0.4939418246295436, "grad_norm": 19.988862991333008, "learning_rate": 1.9480022985001116e-05, "loss": 4.2347, "step": 11700 }, { "epoch": 0.4981635496263773, "grad_norm": 29.073562622070312, "learning_rate": 1.9456568902231657e-05, "loss": 4.082, "step": 11800 }, { "epoch": 0.502385274623211, "grad_norm": 57.14900207519531, "learning_rate": 1.94331148194622e-05, "loss": 4.2026, "step": 11900 }, { "epoch": 0.5066069996200447, "grad_norm": 19.27651596069336, "learning_rate": 1.9409660736692742e-05, "loss": 4.2687, "step": 12000 }, { "epoch": 0.5066069996200447, "eval_loss": 4.269065856933594, "eval_runtime": 389.1571, "eval_samples_per_second": 486.927, "eval_steps_per_second": 15.218, "step": 12000 }, { "epoch": 0.5108287246168784, "grad_norm": 47.312129974365234, "learning_rate": 1.9386206653923283e-05, "loss": 4.302, "step": 12100 }, { "epoch": 0.5150504496137122, "grad_norm": 51.988243103027344, "learning_rate": 1.9362752571153824e-05, "loss": 4.0474, "step": 12200 }, { "epoch": 0.5192721746105459, "grad_norm": 25.95807456970215, "learning_rate": 1.933929848838437e-05, "loss": 4.1286, "step": 12300 }, { "epoch": 0.5234938996073796, "grad_norm": 29.327302932739258, "learning_rate": 1.931584440561491e-05, "loss": 4.3888, "step": 12400 }, { "epoch": 0.5277156246042133, "grad_norm": 21.580862045288086, "learning_rate": 1.929239032284545e-05, "loss": 4.2339, "step": 12500 }, { "epoch": 0.5277156246042133, "eval_loss": 4.2413506507873535, "eval_runtime": 385.1064, "eval_samples_per_second": 492.048, "eval_steps_per_second": 15.378, "step": 12500 }, { "epoch": 0.531937349601047, "grad_norm": 57.45950698852539, "learning_rate": 1.926893624007599e-05, "loss": 4.1976, "step": 12600 }, { "epoch": 0.5361590745978807, "grad_norm": 40.33968734741211, "learning_rate": 1.9245482157306535e-05, "loss": 4.1851, 
"step": 12700 }, { "epoch": 0.5403807995947144, "grad_norm": 33.016944885253906, "learning_rate": 1.9222028074537076e-05, "loss": 4.3969, "step": 12800 }, { "epoch": 0.5446025245915481, "grad_norm": 33.44650650024414, "learning_rate": 1.919857399176762e-05, "loss": 4.5229, "step": 12900 }, { "epoch": 0.5488242495883818, "grad_norm": 26.207275390625, "learning_rate": 1.917511990899816e-05, "loss": 4.2242, "step": 13000 }, { "epoch": 0.5488242495883818, "eval_loss": 4.138918399810791, "eval_runtime": 383.7736, "eval_samples_per_second": 493.757, "eval_steps_per_second": 15.431, "step": 13000 }, { "epoch": 0.5530459745852155, "grad_norm": 17.512197494506836, "learning_rate": 1.9151665826228703e-05, "loss": 4.2804, "step": 13100 }, { "epoch": 0.5572676995820492, "grad_norm": 27.008798599243164, "learning_rate": 1.9128211743459243e-05, "loss": 4.2097, "step": 13200 }, { "epoch": 0.561489424578883, "grad_norm": 66.86945343017578, "learning_rate": 1.9104757660689784e-05, "loss": 3.9226, "step": 13300 }, { "epoch": 0.5657111495757167, "grad_norm": 49.8387451171875, "learning_rate": 1.9081538118748023e-05, "loss": 4.2274, "step": 13400 }, { "epoch": 0.5699328745725504, "grad_norm": 134.98756408691406, "learning_rate": 1.9058084035978564e-05, "loss": 4.0309, "step": 13500 }, { "epoch": 0.5699328745725504, "eval_loss": 4.242140293121338, "eval_runtime": 388.1121, "eval_samples_per_second": 488.238, "eval_steps_per_second": 15.258, "step": 13500 }, { "epoch": 0.5741545995693841, "grad_norm": 16.55082130432129, "learning_rate": 1.9034629953209108e-05, "loss": 4.3429, "step": 13600 }, { "epoch": 0.5783763245662178, "grad_norm": 17.452939987182617, "learning_rate": 1.901117587043965e-05, "loss": 4.0352, "step": 13700 }, { "epoch": 0.5825980495630515, "grad_norm": 36.04256057739258, "learning_rate": 1.898772178767019e-05, "loss": 4.2926, "step": 13800 }, { "epoch": 0.5868197745598852, "grad_norm": 38.001712799072266, "learning_rate": 1.896426770490073e-05, "loss": 4.3063, "step": 
13900 }, { "epoch": 0.5910414995567189, "grad_norm": 30.819929122924805, "learning_rate": 1.894104816295897e-05, "loss": 4.3172, "step": 14000 }, { "epoch": 0.5910414995567189, "eval_loss": 4.2266845703125, "eval_runtime": 374.8984, "eval_samples_per_second": 505.446, "eval_steps_per_second": 15.796, "step": 14000 }, { "epoch": 0.5952632245535526, "grad_norm": 22.755001068115234, "learning_rate": 1.891759408018951e-05, "loss": 4.0057, "step": 14100 }, { "epoch": 0.5994849495503863, "grad_norm": 109.07671356201172, "learning_rate": 1.8894139997420055e-05, "loss": 4.2081, "step": 14200 }, { "epoch": 0.60370667454722, "grad_norm": 32.722652435302734, "learning_rate": 1.8870685914650596e-05, "loss": 4.2408, "step": 14300 }, { "epoch": 0.6079283995440538, "grad_norm": 17.725366592407227, "learning_rate": 1.8847231831881137e-05, "loss": 4.1066, "step": 14400 }, { "epoch": 0.6121501245408874, "grad_norm": 18.536985397338867, "learning_rate": 1.8823777749111678e-05, "loss": 4.1997, "step": 14500 }, { "epoch": 0.6121501245408874, "eval_loss": 4.1797871589660645, "eval_runtime": 387.8769, "eval_samples_per_second": 488.534, "eval_steps_per_second": 15.268, "step": 14500 }, { "epoch": 0.6163718495377211, "grad_norm": 18.541019439697266, "learning_rate": 1.880032366634222e-05, "loss": 4.2364, "step": 14600 }, { "epoch": 0.6205935745345548, "grad_norm": 20.86994743347168, "learning_rate": 1.877686958357276e-05, "loss": 4.1135, "step": 14700 }, { "epoch": 0.6248152995313885, "grad_norm": 19.748693466186523, "learning_rate": 1.8753415500803304e-05, "loss": 4.0561, "step": 14800 }, { "epoch": 0.6290370245282222, "grad_norm": 16.203369140625, "learning_rate": 1.8729961418033845e-05, "loss": 4.0347, "step": 14900 }, { "epoch": 0.6332587495250559, "grad_norm": 38.8477668762207, "learning_rate": 1.870650733526439e-05, "loss": 4.1979, "step": 15000 }, { "epoch": 0.6332587495250559, "eval_loss": 4.240939140319824, "eval_runtime": 382.8365, "eval_samples_per_second": 494.966, 
"eval_steps_per_second": 15.469, "step": 15000 }, { "epoch": 0.6374804745218896, "grad_norm": 44.9690055847168, "learning_rate": 1.868305325249493e-05, "loss": 4.0132, "step": 15100 }, { "epoch": 0.6417021995187233, "grad_norm": 21.35906410217285, "learning_rate": 1.865959916972547e-05, "loss": 4.1131, "step": 15200 }, { "epoch": 0.645923924515557, "grad_norm": 26.089805603027344, "learning_rate": 1.8636145086956012e-05, "loss": 3.8049, "step": 15300 }, { "epoch": 0.6501456495123907, "grad_norm": 17.96413230895996, "learning_rate": 1.8612691004186556e-05, "loss": 3.9468, "step": 15400 }, { "epoch": 0.6543673745092244, "grad_norm": 23.05537223815918, "learning_rate": 1.8589236921417097e-05, "loss": 4.17, "step": 15500 }, { "epoch": 0.6543673745092244, "eval_loss": 4.1938157081604, "eval_runtime": 389.3344, "eval_samples_per_second": 486.705, "eval_steps_per_second": 15.211, "step": 15500 }, { "epoch": 0.6585890995060582, "grad_norm": 29.09811782836914, "learning_rate": 1.8565782838647638e-05, "loss": 4.2369, "step": 15600 }, { "epoch": 0.6628108245028919, "grad_norm": 54.29764938354492, "learning_rate": 1.854232875587818e-05, "loss": 4.159, "step": 15700 }, { "epoch": 0.6670325494997256, "grad_norm": 30.67020606994629, "learning_rate": 1.8518874673108723e-05, "loss": 4.1172, "step": 15800 }, { "epoch": 0.6712542744965593, "grad_norm": 50.64274597167969, "learning_rate": 1.8495420590339268e-05, "loss": 4.01, "step": 15900 }, { "epoch": 0.675475999493393, "grad_norm": 48.833839416503906, "learning_rate": 1.847196650756981e-05, "loss": 4.0204, "step": 16000 }, { "epoch": 0.675475999493393, "eval_loss": 4.279551982879639, "eval_runtime": 380.0, "eval_samples_per_second": 498.661, "eval_steps_per_second": 15.584, "step": 16000 }, { "epoch": 0.6796977244902267, "grad_norm": 32.197509765625, "learning_rate": 1.844851242480035e-05, "loss": 4.0013, "step": 16100 }, { "epoch": 0.6839194494870604, "grad_norm": 27.845190048217773, "learning_rate": 1.842505834203089e-05, "loss": 
4.0174, "step": 16200 }, { "epoch": 0.6881411744838941, "grad_norm": 57.353271484375, "learning_rate": 1.840160425926143e-05, "loss": 4.0616, "step": 16300 }, { "epoch": 0.6923628994807278, "grad_norm": 49.316341400146484, "learning_rate": 1.8378150176491972e-05, "loss": 3.9944, "step": 16400 }, { "epoch": 0.6965846244775615, "grad_norm": 69.53150939941406, "learning_rate": 1.8354696093722516e-05, "loss": 4.05, "step": 16500 }, { "epoch": 0.6965846244775615, "eval_loss": 4.213247776031494, "eval_runtime": 390.3133, "eval_samples_per_second": 485.484, "eval_steps_per_second": 15.172, "step": 16500 }, { "epoch": 0.7008063494743952, "grad_norm": 101.47056579589844, "learning_rate": 1.8331242010953057e-05, "loss": 4.0769, "step": 16600 }, { "epoch": 0.705028074471229, "grad_norm": 21.874292373657227, "learning_rate": 1.8307787928183602e-05, "loss": 4.1289, "step": 16700 }, { "epoch": 0.7092497994680627, "grad_norm": 28.484107971191406, "learning_rate": 1.8284333845414143e-05, "loss": 4.0941, "step": 16800 }, { "epoch": 0.7134715244648964, "grad_norm": 20.411182403564453, "learning_rate": 1.8260879762644684e-05, "loss": 4.2556, "step": 16900 }, { "epoch": 0.7176932494617301, "grad_norm": 21.108642578125, "learning_rate": 1.8237425679875224e-05, "loss": 4.3075, "step": 17000 }, { "epoch": 0.7176932494617301, "eval_loss": 4.128803253173828, "eval_runtime": 387.8228, "eval_samples_per_second": 488.602, "eval_steps_per_second": 15.27, "step": 17000 }, { "epoch": 0.7219149744585638, "grad_norm": 24.48760986328125, "learning_rate": 1.821397159710577e-05, "loss": 4.0751, "step": 17100 }, { "epoch": 0.7261366994553975, "grad_norm": 28.520254135131836, "learning_rate": 1.819051751433631e-05, "loss": 4.0711, "step": 17200 }, { "epoch": 0.7303584244522312, "grad_norm": 37.351932525634766, "learning_rate": 1.816706343156685e-05, "loss": 3.9483, "step": 17300 }, { "epoch": 0.7345801494490649, "grad_norm": 32.99684143066406, "learning_rate": 1.814360934879739e-05, "loss": 4.3186, 
"step": 17400 }, { "epoch": 0.7388018744458986, "grad_norm": 24.938167572021484, "learning_rate": 1.8120155266027936e-05, "loss": 3.932, "step": 17500 }, { "epoch": 0.7388018744458986, "eval_loss": 4.114780902862549, "eval_runtime": 379.5561, "eval_samples_per_second": 499.244, "eval_steps_per_second": 15.602, "step": 17500 }, { "epoch": 0.7430235994427323, "grad_norm": 19.03070068359375, "learning_rate": 1.8096701183258477e-05, "loss": 3.8774, "step": 17600 }, { "epoch": 0.747245324439566, "grad_norm": 20.461793899536133, "learning_rate": 1.807324710048902e-05, "loss": 4.2312, "step": 17700 }, { "epoch": 0.7514670494363997, "grad_norm": 50.202266693115234, "learning_rate": 1.8049793017719562e-05, "loss": 3.9327, "step": 17800 }, { "epoch": 0.7556887744332335, "grad_norm": 28.14083480834961, "learning_rate": 1.8026338934950103e-05, "loss": 4.2264, "step": 17900 }, { "epoch": 0.7599104994300672, "grad_norm": 58.97740173339844, "learning_rate": 1.8002884852180644e-05, "loss": 3.9723, "step": 18000 }, { "epoch": 0.7599104994300672, "eval_loss": 4.106083869934082, "eval_runtime": 385.336, "eval_samples_per_second": 491.755, "eval_steps_per_second": 15.368, "step": 18000 }, { "epoch": 0.7641322244269009, "grad_norm": 25.112369537353516, "learning_rate": 1.7979430769411185e-05, "loss": 4.1206, "step": 18100 }, { "epoch": 0.7683539494237346, "grad_norm": 29.05199432373047, "learning_rate": 1.795597668664173e-05, "loss": 4.1744, "step": 18200 }, { "epoch": 0.7725756744205683, "grad_norm": 44.24822998046875, "learning_rate": 1.793252260387227e-05, "loss": 3.89, "step": 18300 }, { "epoch": 0.776797399417402, "grad_norm": 44.473838806152344, "learning_rate": 1.790930306193051e-05, "loss": 4.1414, "step": 18400 }, { "epoch": 0.7810191244142357, "grad_norm": 29.81984519958496, "learning_rate": 1.788584897916105e-05, "loss": 4.0286, "step": 18500 }, { "epoch": 0.7810191244142357, "eval_loss": 4.140493392944336, "eval_runtime": 383.1405, "eval_samples_per_second": 494.573, 
"eval_steps_per_second": 15.456, "step": 18500 }, { "epoch": 0.7852408494110693, "grad_norm": 67.35831451416016, "learning_rate": 1.786239489639159e-05, "loss": 3.885, "step": 18600 }, { "epoch": 0.789462574407903, "grad_norm": 65.76871490478516, "learning_rate": 1.783894081362213e-05, "loss": 4.3785, "step": 18700 }, { "epoch": 0.7936842994047367, "grad_norm": 22.527624130249023, "learning_rate": 1.7815486730852672e-05, "loss": 3.9304, "step": 18800 }, { "epoch": 0.7979060244015704, "grad_norm": 62.7253532409668, "learning_rate": 1.7792032648083217e-05, "loss": 4.0831, "step": 18900 }, { "epoch": 0.8021277493984041, "grad_norm": 23.038379669189453, "learning_rate": 1.7768578565313758e-05, "loss": 4.1698, "step": 19000 }, { "epoch": 0.8021277493984041, "eval_loss": 4.099842071533203, "eval_runtime": 383.8583, "eval_samples_per_second": 493.648, "eval_steps_per_second": 15.428, "step": 19000 }, { "epoch": 0.8063494743952379, "grad_norm": 49.628074645996094, "learning_rate": 1.7745124482544302e-05, "loss": 3.9876, "step": 19100 }, { "epoch": 0.8105711993920716, "grad_norm": 81.16788482666016, "learning_rate": 1.7721670399774843e-05, "loss": 3.9194, "step": 19200 }, { "epoch": 0.8147929243889053, "grad_norm": 62.49740982055664, "learning_rate": 1.7698216317005384e-05, "loss": 3.9222, "step": 19300 }, { "epoch": 0.819014649385739, "grad_norm": 33.98008728027344, "learning_rate": 1.7674762234235928e-05, "loss": 4.1863, "step": 19400 }, { "epoch": 0.8232363743825727, "grad_norm": 22.858428955078125, "learning_rate": 1.765130815146647e-05, "loss": 4.0315, "step": 19500 }, { "epoch": 0.8232363743825727, "eval_loss": 4.077751636505127, "eval_runtime": 385.7439, "eval_samples_per_second": 491.235, "eval_steps_per_second": 15.352, "step": 19500 }, { "epoch": 0.8274580993794064, "grad_norm": 24.006502151489258, "learning_rate": 1.762785406869701e-05, "loss": 3.9286, "step": 19600 }, { "epoch": 0.8316798243762401, "grad_norm": 27.51500701904297, "learning_rate": 
1.760439998592755e-05, "loss": 3.9605, "step": 19700 }, { "epoch": 0.8359015493730738, "grad_norm": 77.57249450683594, "learning_rate": 1.7580945903158092e-05, "loss": 4.1991, "step": 19800 }, { "epoch": 0.8401232743699075, "grad_norm": 30.322465896606445, "learning_rate": 1.7557491820388636e-05, "loss": 4.0311, "step": 19900 }, { "epoch": 0.8443449993667412, "grad_norm": 27.484439849853516, "learning_rate": 1.7534037737619177e-05, "loss": 3.7869, "step": 20000 }, { "epoch": 0.8443449993667412, "eval_loss": 4.174862384796143, "eval_runtime": 387.7755, "eval_samples_per_second": 488.662, "eval_steps_per_second": 15.272, "step": 20000 }, { "epoch": 0.8485667243635749, "grad_norm": 23.302629470825195, "learning_rate": 1.751058365484972e-05, "loss": 3.9232, "step": 20100 }, { "epoch": 0.8527884493604087, "grad_norm": 23.86292839050293, "learning_rate": 1.7487129572080262e-05, "loss": 4.034, "step": 20200 }, { "epoch": 0.8570101743572424, "grad_norm": 21.296226501464844, "learning_rate": 1.7463675489310803e-05, "loss": 4.2625, "step": 20300 }, { "epoch": 0.8612318993540761, "grad_norm": 30.49806022644043, "learning_rate": 1.7440221406541344e-05, "loss": 3.983, "step": 20400 }, { "epoch": 0.8654536243509098, "grad_norm": 27.714088439941406, "learning_rate": 1.741700186459958e-05, "loss": 4.2154, "step": 20500 }, { "epoch": 0.8654536243509098, "eval_loss": 4.1056694984436035, "eval_runtime": 381.9065, "eval_samples_per_second": 496.171, "eval_steps_per_second": 15.506, "step": 20500 }, { "epoch": 0.8696753493477435, "grad_norm": 41.86836624145508, "learning_rate": 1.7393547781830124e-05, "loss": 4.1696, "step": 20600 }, { "epoch": 0.8738970743445772, "grad_norm": 155.70619201660156, "learning_rate": 1.7370093699060665e-05, "loss": 3.8989, "step": 20700 }, { "epoch": 0.8781187993414109, "grad_norm": 29.960683822631836, "learning_rate": 1.734663961629121e-05, "loss": 3.9004, "step": 20800 }, { "epoch": 0.8823405243382446, "grad_norm": 35.051795959472656, "learning_rate": 
1.732318553352175e-05, "loss": 4.2134, "step": 20900 }, { "epoch": 0.8865622493350783, "grad_norm": 68.12458801269531, "learning_rate": 1.729973145075229e-05, "loss": 3.9789, "step": 21000 }, { "epoch": 0.8865622493350783, "eval_loss": 4.088019371032715, "eval_runtime": 387.7304, "eval_samples_per_second": 488.718, "eval_steps_per_second": 15.273, "step": 21000 }, { "epoch": 0.890783974331912, "grad_norm": 32.287498474121094, "learning_rate": 1.727627736798283e-05, "loss": 4.2438, "step": 21100 }, { "epoch": 0.8950056993287457, "grad_norm": 53.939903259277344, "learning_rate": 1.7252823285213376e-05, "loss": 3.9271, "step": 21200 }, { "epoch": 0.8992274243255794, "grad_norm": 23.49491310119629, "learning_rate": 1.7229369202443917e-05, "loss": 3.9693, "step": 21300 }, { "epoch": 0.9034491493224132, "grad_norm": 66.63202667236328, "learning_rate": 1.7205915119674458e-05, "loss": 4.0197, "step": 21400 }, { "epoch": 0.9076708743192469, "grad_norm": 38.183998107910156, "learning_rate": 1.7182461036905002e-05, "loss": 4.1802, "step": 21500 }, { "epoch": 0.9076708743192469, "eval_loss": 4.014532089233398, "eval_runtime": 389.5421, "eval_samples_per_second": 486.446, "eval_steps_per_second": 15.202, "step": 21500 }, { "epoch": 0.9118925993160806, "grad_norm": 25.80255126953125, "learning_rate": 1.7159006954135543e-05, "loss": 3.8818, "step": 21600 }, { "epoch": 0.9161143243129143, "grad_norm": 105.8138198852539, "learning_rate": 1.7135552871366084e-05, "loss": 4.1069, "step": 21700 }, { "epoch": 0.920336049309748, "grad_norm": 30.18889617919922, "learning_rate": 1.7112098788596628e-05, "loss": 3.7999, "step": 21800 }, { "epoch": 0.9245577743065817, "grad_norm": 35.612117767333984, "learning_rate": 1.708864470582717e-05, "loss": 3.8949, "step": 21900 }, { "epoch": 0.9287794993034154, "grad_norm": 40.16869354248047, "learning_rate": 1.706519062305771e-05, "loss": 3.9893, "step": 22000 }, { "epoch": 0.9287794993034154, "eval_loss": 4.131321430206299, "eval_runtime": 385.7035, 
"eval_samples_per_second": 491.287, "eval_steps_per_second": 15.354, "step": 22000 }, { "epoch": 0.9330012243002491, "grad_norm": 44.418731689453125, "learning_rate": 1.704173654028825e-05, "loss": 4.0918, "step": 22100 }, { "epoch": 0.9372229492970828, "grad_norm": 75.3628921508789, "learning_rate": 1.7018282457518792e-05, "loss": 4.0451, "step": 22200 }, { "epoch": 0.9414446742939165, "grad_norm": 49.9603271484375, "learning_rate": 1.6994828374749336e-05, "loss": 3.9312, "step": 22300 }, { "epoch": 0.9456663992907502, "grad_norm": 17.13250160217285, "learning_rate": 1.6971374291979877e-05, "loss": 4.117, "step": 22400 }, { "epoch": 0.949888124287584, "grad_norm": 22.806734085083008, "learning_rate": 1.694792020921042e-05, "loss": 3.883, "step": 22500 }, { "epoch": 0.949888124287584, "eval_loss": 4.10904598236084, "eval_runtime": 385.0496, "eval_samples_per_second": 492.121, "eval_steps_per_second": 15.38, "step": 22500 }, { "epoch": 0.9541098492844177, "grad_norm": 20.560272216796875, "learning_rate": 1.6924700667268657e-05, "loss": 3.6942, "step": 22600 }, { "epoch": 0.9583315742812513, "grad_norm": 29.096519470214844, "learning_rate": 1.6901246584499198e-05, "loss": 4.1196, "step": 22700 }, { "epoch": 0.962553299278085, "grad_norm": 37.949188232421875, "learning_rate": 1.687779250172974e-05, "loss": 3.9292, "step": 22800 }, { "epoch": 0.9667750242749187, "grad_norm": 62.616546630859375, "learning_rate": 1.685433841896028e-05, "loss": 3.9081, "step": 22900 }, { "epoch": 0.9709967492717524, "grad_norm": 25.162185668945312, "learning_rate": 1.6830884336190824e-05, "loss": 3.8169, "step": 23000 }, { "epoch": 0.9709967492717524, "eval_loss": 4.123219966888428, "eval_runtime": 384.0529, "eval_samples_per_second": 493.398, "eval_steps_per_second": 15.42, "step": 23000 }, { "epoch": 0.9752184742685861, "grad_norm": 28.75921630859375, "learning_rate": 1.6807664794249062e-05, "loss": 3.8342, "step": 23100 }, { "epoch": 0.9794401992654198, "grad_norm": 49.7982177734375, 
"learning_rate": 1.6784210711479603e-05, "loss": 4.078, "step": 23200 }, { "epoch": 0.9836619242622535, "grad_norm": 20.842620849609375, "learning_rate": 1.6760756628710144e-05, "loss": 4.0002, "step": 23300 }, { "epoch": 0.9878836492590872, "grad_norm": 27.644798278808594, "learning_rate": 1.6737302545940685e-05, "loss": 3.9373, "step": 23400 }, { "epoch": 0.9921053742559209, "grad_norm": 45.985050201416016, "learning_rate": 1.6713848463171226e-05, "loss": 3.8344, "step": 23500 }, { "epoch": 0.9921053742559209, "eval_loss": 4.156482696533203, "eval_runtime": 388.5562, "eval_samples_per_second": 487.68, "eval_steps_per_second": 15.241, "step": 23500 }, { "epoch": 0.9963270992527546, "grad_norm": 21.684967041015625, "learning_rate": 1.6690394380401767e-05, "loss": 4.2827, "step": 23600 }, { "epoch": 1.0005488242495884, "grad_norm": 25.846668243408203, "learning_rate": 1.666694029763231e-05, "loss": 4.0298, "step": 23700 }, { "epoch": 1.004770549246422, "grad_norm": 29.205032348632812, "learning_rate": 1.6643486214862856e-05, "loss": 3.9967, "step": 23800 }, { "epoch": 1.0089922742432558, "grad_norm": 25.228853225708008, "learning_rate": 1.6620032132093397e-05, "loss": 3.7508, "step": 23900 }, { "epoch": 1.0132139992400895, "grad_norm": 39.203895568847656, "learning_rate": 1.6596578049323938e-05, "loss": 3.8919, "step": 24000 }, { "epoch": 1.0132139992400895, "eval_loss": 4.079037666320801, "eval_runtime": 386.829, "eval_samples_per_second": 489.857, "eval_steps_per_second": 15.309, "step": 24000 }, { "epoch": 1.0174357242369232, "grad_norm": 29.192537307739258, "learning_rate": 1.657312396655448e-05, "loss": 4.0181, "step": 24100 }, { "epoch": 1.021657449233757, "grad_norm": 29.643613815307617, "learning_rate": 1.6549669883785023e-05, "loss": 3.7934, "step": 24200 }, { "epoch": 1.0258791742305906, "grad_norm": 37.858001708984375, "learning_rate": 1.6526215801015564e-05, "loss": 3.8986, "step": 24300 }, { "epoch": 1.0301008992274243, "grad_norm": 31.105730056762695, 
"learning_rate": 1.6502761718246105e-05, "loss": 3.9275, "step": 24400 }, { "epoch": 1.034322624224258, "grad_norm": 36.47043991088867, "learning_rate": 1.6479307635476646e-05, "loss": 3.6911, "step": 24500 }, { "epoch": 1.034322624224258, "eval_loss": 4.160192012786865, "eval_runtime": 385.9759, "eval_samples_per_second": 490.94, "eval_steps_per_second": 15.343, "step": 24500 }, { "epoch": 1.0385443492210917, "grad_norm": 36.14301681518555, "learning_rate": 1.645585355270719e-05, "loss": 3.5855, "step": 24600 }, { "epoch": 1.0427660742179254, "grad_norm": 37.279823303222656, "learning_rate": 1.643239946993773e-05, "loss": 3.7875, "step": 24700 }, { "epoch": 1.0469877992147592, "grad_norm": 18.79073715209961, "learning_rate": 1.6408945387168275e-05, "loss": 3.7999, "step": 24800 }, { "epoch": 1.0512095242115929, "grad_norm": 38.917659759521484, "learning_rate": 1.6385491304398816e-05, "loss": 3.7718, "step": 24900 }, { "epoch": 1.0554312492084266, "grad_norm": 23.614336013793945, "learning_rate": 1.6362037221629357e-05, "loss": 3.8362, "step": 25000 }, { "epoch": 1.0554312492084266, "eval_loss": 4.038120746612549, "eval_runtime": 386.8592, "eval_samples_per_second": 489.819, "eval_steps_per_second": 15.308, "step": 25000 }, { "epoch": 1.0596529742052603, "grad_norm": 70.36714172363281, "learning_rate": 1.6338583138859898e-05, "loss": 3.8076, "step": 25100 }, { "epoch": 1.063874699202094, "grad_norm": 20.698545455932617, "learning_rate": 1.631512905609044e-05, "loss": 3.8875, "step": 25200 }, { "epoch": 1.0680964241989277, "grad_norm": 25.684085845947266, "learning_rate": 1.629167497332098e-05, "loss": 3.9675, "step": 25300 }, { "epoch": 1.0723181491957614, "grad_norm": 24.55357551574707, "learning_rate": 1.6268220890551524e-05, "loss": 3.8451, "step": 25400 }, { "epoch": 1.0765398741925951, "grad_norm": 41.31557846069336, "learning_rate": 1.6244766807782065e-05, "loss": 3.4346, "step": 25500 }, { "epoch": 1.0765398741925951, "eval_loss": 4.199647903442383, 
"eval_runtime": 384.2225, "eval_samples_per_second": 493.18, "eval_steps_per_second": 15.413, "step": 25500 }, { "epoch": 1.0807615991894288, "grad_norm": 24.676942825317383, "learning_rate": 1.622131272501261e-05, "loss": 4.0584, "step": 25600 }, { "epoch": 1.0849833241862625, "grad_norm": 53.181884765625, "learning_rate": 1.619785864224315e-05, "loss": 3.602, "step": 25700 }, { "epoch": 1.0892050491830962, "grad_norm": 50.667518615722656, "learning_rate": 1.617440455947369e-05, "loss": 3.673, "step": 25800 }, { "epoch": 1.09342677417993, "grad_norm": 145.48304748535156, "learning_rate": 1.6150950476704235e-05, "loss": 3.976, "step": 25900 }, { "epoch": 1.0976484991767637, "grad_norm": 20.84932518005371, "learning_rate": 1.6127496393934776e-05, "loss": 3.8768, "step": 26000 }, { "epoch": 1.0976484991767637, "eval_loss": 3.9983105659484863, "eval_runtime": 381.5211, "eval_samples_per_second": 496.672, "eval_steps_per_second": 15.522, "step": 26000 }, { "epoch": 1.1018702241735974, "grad_norm": 40.64849090576172, "learning_rate": 1.6104042311165317e-05, "loss": 3.7575, "step": 26100 }, { "epoch": 1.106091949170431, "grad_norm": 35.289878845214844, "learning_rate": 1.6080588228395858e-05, "loss": 3.8101, "step": 26200 }, { "epoch": 1.1103136741672648, "grad_norm": 48.63871383666992, "learning_rate": 1.60571341456264e-05, "loss": 4.104, "step": 26300 }, { "epoch": 1.1145353991640985, "grad_norm": 107.08880615234375, "learning_rate": 1.6033680062856943e-05, "loss": 3.7139, "step": 26400 }, { "epoch": 1.1187571241609322, "grad_norm": 24.861068725585938, "learning_rate": 1.6010225980087484e-05, "loss": 4.0391, "step": 26500 }, { "epoch": 1.1187571241609322, "eval_loss": 4.00176477432251, "eval_runtime": 390.396, "eval_samples_per_second": 485.381, "eval_steps_per_second": 15.169, "step": 26500 }, { "epoch": 1.122978849157766, "grad_norm": 19.068403244018555, "learning_rate": 1.598677189731803e-05, "loss": 3.8449, "step": 26600 }, { "epoch": 1.1272005741545996, 
"grad_norm": 37.11012649536133, "learning_rate": 1.596331781454857e-05, "loss": 3.7146, "step": 26700 }, { "epoch": 1.1314222991514333, "grad_norm": 37.64760971069336, "learning_rate": 1.593986373177911e-05, "loss": 4.0576, "step": 26800 }, { "epoch": 1.135644024148267, "grad_norm": 32.245750427246094, "learning_rate": 1.591640964900965e-05, "loss": 3.8831, "step": 26900 }, { "epoch": 1.1398657491451007, "grad_norm": 27.54793930053711, "learning_rate": 1.5892955566240192e-05, "loss": 3.8161, "step": 27000 }, { "epoch": 1.1398657491451007, "eval_loss": 4.0019025802612305, "eval_runtime": 388.1527, "eval_samples_per_second": 488.187, "eval_steps_per_second": 15.257, "step": 27000 }, { "epoch": 1.1440874741419345, "grad_norm": 73.00877380371094, "learning_rate": 1.5869501483470737e-05, "loss": 3.9283, "step": 27100 }, { "epoch": 1.1483091991387682, "grad_norm": 180.4134063720703, "learning_rate": 1.5846281941528975e-05, "loss": 3.8637, "step": 27200 }, { "epoch": 1.1525309241356019, "grad_norm": 28.230194091796875, "learning_rate": 1.5822827858759516e-05, "loss": 3.701, "step": 27300 }, { "epoch": 1.1567526491324356, "grad_norm": 29.84754753112793, "learning_rate": 1.5799373775990057e-05, "loss": 3.9364, "step": 27400 }, { "epoch": 1.1609743741292693, "grad_norm": 27.61314582824707, "learning_rate": 1.5775919693220598e-05, "loss": 3.7305, "step": 27500 }, { "epoch": 1.1609743741292693, "eval_loss": 3.995920419692993, "eval_runtime": 389.5266, "eval_samples_per_second": 486.465, "eval_steps_per_second": 15.203, "step": 27500 }, { "epoch": 1.165196099126103, "grad_norm": 50.62400817871094, "learning_rate": 1.575246561045114e-05, "loss": 3.8542, "step": 27600 }, { "epoch": 1.1694178241229367, "grad_norm": 50.76478576660156, "learning_rate": 1.5729011527681683e-05, "loss": 3.7249, "step": 27700 }, { "epoch": 1.1736395491197704, "grad_norm": 30.597715377807617, "learning_rate": 1.5705557444912224e-05, "loss": 3.7223, "step": 27800 }, { "epoch": 1.1778612741166041, 
"grad_norm": 121.53684997558594, "learning_rate": 1.5682103362142765e-05, "loss": 3.9777, "step": 27900 }, { "epoch": 1.1820829991134378, "grad_norm": 161.595947265625, "learning_rate": 1.565864927937331e-05, "loss": 3.8036, "step": 28000 }, { "epoch": 1.1820829991134378, "eval_loss": 4.054747104644775, "eval_runtime": 387.9897, "eval_samples_per_second": 488.392, "eval_steps_per_second": 15.263, "step": 28000 }, { "epoch": 1.1863047241102715, "grad_norm": 35.772239685058594, "learning_rate": 1.563519519660385e-05, "loss": 3.8635, "step": 28100 }, { "epoch": 1.1905264491071053, "grad_norm": 20.932790756225586, "learning_rate": 1.561174111383439e-05, "loss": 3.8523, "step": 28200 }, { "epoch": 1.194748174103939, "grad_norm": 20.581287384033203, "learning_rate": 1.5588287031064936e-05, "loss": 3.6757, "step": 28300 }, { "epoch": 1.1989698991007725, "grad_norm": 62.859352111816406, "learning_rate": 1.5564832948295477e-05, "loss": 3.7519, "step": 28400 }, { "epoch": 1.2031916240976064, "grad_norm": 29.448869705200195, "learning_rate": 1.5541378865526017e-05, "loss": 3.983, "step": 28500 }, { "epoch": 1.2031916240976064, "eval_loss": 4.038938045501709, "eval_runtime": 387.2247, "eval_samples_per_second": 489.357, "eval_steps_per_second": 15.293, "step": 28500 }, { "epoch": 1.2074133490944399, "grad_norm": 28.24205780029297, "learning_rate": 1.551792478275656e-05, "loss": 3.8288, "step": 28600 }, { "epoch": 1.2116350740912738, "grad_norm": 36.54047393798828, "learning_rate": 1.54944706999871e-05, "loss": 3.8074, "step": 28700 }, { "epoch": 1.2158567990881073, "grad_norm": 64.28271484375, "learning_rate": 1.5471016617217644e-05, "loss": 3.714, "step": 28800 }, { "epoch": 1.2200785240849412, "grad_norm": 43.12767028808594, "learning_rate": 1.5447562534448185e-05, "loss": 3.6594, "step": 28900 }, { "epoch": 1.2243002490817747, "grad_norm": 206.6219940185547, "learning_rate": 1.542410845167873e-05, "loss": 3.9452, "step": 29000 }, { "epoch": 1.2243002490817747, "eval_loss": 
4.0273637771606445, "eval_runtime": 384.7693, "eval_samples_per_second": 492.48, "eval_steps_per_second": 15.391, "step": 29000 }, { "epoch": 1.2285219740786086, "grad_norm": 82.68762969970703, "learning_rate": 1.540065436890927e-05, "loss": 3.9906, "step": 29100 }, { "epoch": 1.2327436990754421, "grad_norm": 93.16060638427734, "learning_rate": 1.537720028613981e-05, "loss": 3.9826, "step": 29200 }, { "epoch": 1.236965424072276, "grad_norm": 28.004316329956055, "learning_rate": 1.5353980744198046e-05, "loss": 3.8635, "step": 29300 }, { "epoch": 1.2411871490691095, "grad_norm": 26.93125343322754, "learning_rate": 1.533052666142859e-05, "loss": 3.9888, "step": 29400 }, { "epoch": 1.2454088740659432, "grad_norm": 55.691139221191406, "learning_rate": 1.530707257865913e-05, "loss": 3.7248, "step": 29500 }, { "epoch": 1.2454088740659432, "eval_loss": 4.028724193572998, "eval_runtime": 387.9576, "eval_samples_per_second": 488.432, "eval_steps_per_second": 15.265, "step": 29500 }, { "epoch": 1.249630599062777, "grad_norm": 52.49585723876953, "learning_rate": 1.5283618495889675e-05, "loss": 3.7484, "step": 29600 }, { "epoch": 1.253852324059611, "grad_norm": 27.375572204589844, "learning_rate": 1.5260164413120216e-05, "loss": 3.9694, "step": 29700 }, { "epoch": 1.2580740490564444, "grad_norm": 27.40184211730957, "learning_rate": 1.5236710330350757e-05, "loss": 4.059, "step": 29800 }, { "epoch": 1.2622957740532783, "grad_norm": 38.194427490234375, "learning_rate": 1.5213256247581298e-05, "loss": 3.9358, "step": 29900 }, { "epoch": 1.2665174990501118, "grad_norm": 30.75528907775879, "learning_rate": 1.518980216481184e-05, "loss": 3.8575, "step": 30000 }, { "epoch": 1.2665174990501118, "eval_loss": 3.9483540058135986, "eval_runtime": 386.0272, "eval_samples_per_second": 490.875, "eval_steps_per_second": 15.341, "step": 30000 }, { "epoch": 1.2707392240469455, "grad_norm": 41.0136604309082, "learning_rate": 1.5166348082042384e-05, "loss": 3.8382, "step": 30100 }, { "epoch": 
1.2749609490437792, "grad_norm": 27.01508903503418, "learning_rate": 1.5142893999272924e-05, "loss": 3.73, "step": 30200 }, { "epoch": 1.279182674040613, "grad_norm": 19.524795532226562, "learning_rate": 1.5119674457331161e-05, "loss": 4.0439, "step": 30300 }, { "epoch": 1.2834043990374466, "grad_norm": 22.745532989501953, "learning_rate": 1.5096220374561702e-05, "loss": 3.8426, "step": 30400 }, { "epoch": 1.2876261240342803, "grad_norm": 36.96416091918945, "learning_rate": 1.5072766291792245e-05, "loss": 3.7062, "step": 30500 }, { "epoch": 1.2876261240342803, "eval_loss": 4.0188140869140625, "eval_runtime": 385.3455, "eval_samples_per_second": 491.743, "eval_steps_per_second": 15.368, "step": 30500 }, { "epoch": 1.291847849031114, "grad_norm": 29.24265480041504, "learning_rate": 1.5049312209022786e-05, "loss": 3.8926, "step": 30600 }, { "epoch": 1.2960695740279478, "grad_norm": 38.205142974853516, "learning_rate": 1.502585812625333e-05, "loss": 4.0276, "step": 30700 }, { "epoch": 1.3002912990247815, "grad_norm": 52.72880172729492, "learning_rate": 1.5002404043483871e-05, "loss": 3.6359, "step": 30800 }, { "epoch": 1.3045130240216152, "grad_norm": 26.97931480407715, "learning_rate": 1.4978949960714414e-05, "loss": 4.0006, "step": 30900 }, { "epoch": 1.3087347490184489, "grad_norm": 19.663036346435547, "learning_rate": 1.4955495877944955e-05, "loss": 3.8485, "step": 31000 }, { "epoch": 1.3087347490184489, "eval_loss": 4.001898288726807, "eval_runtime": 389.5661, "eval_samples_per_second": 486.416, "eval_steps_per_second": 15.202, "step": 31000 }, { "epoch": 1.3129564740152826, "grad_norm": 55.0498161315918, "learning_rate": 1.4932041795175496e-05, "loss": 3.7892, "step": 31100 }, { "epoch": 1.3171781990121163, "grad_norm": 28.248655319213867, "learning_rate": 1.4908587712406038e-05, "loss": 3.5783, "step": 31200 }, { "epoch": 1.32139992400895, "grad_norm": 46.059234619140625, "learning_rate": 1.488513362963658e-05, "loss": 4.0018, "step": 31300 }, { "epoch": 
1.3256216490057837, "grad_norm": 36.40607833862305, "learning_rate": 1.4861679546867123e-05, "loss": 3.9542, "step": 31400 }, { "epoch": 1.3298433740026174, "grad_norm": 44.54596710205078, "learning_rate": 1.4838225464097664e-05, "loss": 3.7739, "step": 31500 }, { "epoch": 1.3298433740026174, "eval_loss": 3.987475633621216, "eval_runtime": 389.5011, "eval_samples_per_second": 486.497, "eval_steps_per_second": 15.204, "step": 31500 }, { "epoch": 1.3340650989994511, "grad_norm": 58.875572204589844, "learning_rate": 1.4814771381328205e-05, "loss": 3.8806, "step": 31600 }, { "epoch": 1.3382868239962848, "grad_norm": 32.03676986694336, "learning_rate": 1.4791317298558748e-05, "loss": 4.176, "step": 31700 }, { "epoch": 1.3425085489931186, "grad_norm": 30.134937286376953, "learning_rate": 1.4767863215789289e-05, "loss": 3.826, "step": 31800 }, { "epoch": 1.3467302739899523, "grad_norm": 27.317060470581055, "learning_rate": 1.4744409133019833e-05, "loss": 3.8514, "step": 31900 }, { "epoch": 1.350951998986786, "grad_norm": 28.351472854614258, "learning_rate": 1.4720955050250374e-05, "loss": 3.8261, "step": 32000 }, { "epoch": 1.350951998986786, "eval_loss": 3.9716382026672363, "eval_runtime": 389.2558, "eval_samples_per_second": 486.803, "eval_steps_per_second": 15.214, "step": 32000 }, { "epoch": 1.3551737239836197, "grad_norm": 113.39342498779297, "learning_rate": 1.4697500967480915e-05, "loss": 3.8825, "step": 32100 }, { "epoch": 1.3593954489804534, "grad_norm": 36.84119415283203, "learning_rate": 1.4674046884711458e-05, "loss": 3.6388, "step": 32200 }, { "epoch": 1.363617173977287, "grad_norm": 26.57939910888672, "learning_rate": 1.4650592801941998e-05, "loss": 3.7851, "step": 32300 }, { "epoch": 1.3678388989741208, "grad_norm": 73.95629119873047, "learning_rate": 1.4627138719172543e-05, "loss": 3.5687, "step": 32400 }, { "epoch": 1.3720606239709545, "grad_norm": 36.70028305053711, "learning_rate": 1.4603684636403084e-05, "loss": 3.5408, "step": 32500 }, { "epoch": 
1.3720606239709545, "eval_loss": 3.9371213912963867, "eval_runtime": 388.4609, "eval_samples_per_second": 487.799, "eval_steps_per_second": 15.245, "step": 32500 }, { "epoch": 1.3762823489677882, "grad_norm": 30.751720428466797, "learning_rate": 1.4580230553633625e-05, "loss": 3.6995, "step": 32600 }, { "epoch": 1.380504073964622, "grad_norm": 36.78146743774414, "learning_rate": 1.4556776470864167e-05, "loss": 3.882, "step": 32700 }, { "epoch": 1.3847257989614556, "grad_norm": 25.60002899169922, "learning_rate": 1.4533322388094708e-05, "loss": 3.8703, "step": 32800 }, { "epoch": 1.3889475239582894, "grad_norm": 67.25765991210938, "learning_rate": 1.4509868305325249e-05, "loss": 3.806, "step": 32900 }, { "epoch": 1.393169248955123, "grad_norm": 28.73509979248047, "learning_rate": 1.4486414222555793e-05, "loss": 3.7826, "step": 33000 }, { "epoch": 1.393169248955123, "eval_loss": 3.8901188373565674, "eval_runtime": 387.4878, "eval_samples_per_second": 489.024, "eval_steps_per_second": 15.283, "step": 33000 }, { "epoch": 1.3973909739519568, "grad_norm": 23.818187713623047, "learning_rate": 1.4462960139786334e-05, "loss": 3.7853, "step": 33100 }, { "epoch": 1.4016126989487905, "grad_norm": 55.28591537475586, "learning_rate": 1.4439506057016877e-05, "loss": 3.5745, "step": 33200 }, { "epoch": 1.4058344239456242, "grad_norm": 32.88700485229492, "learning_rate": 1.4416051974247418e-05, "loss": 3.5884, "step": 33300 }, { "epoch": 1.410056148942458, "grad_norm": 60.0154914855957, "learning_rate": 1.4392597891477959e-05, "loss": 3.8678, "step": 33400 }, { "epoch": 1.4142778739392916, "grad_norm": 51.1928825378418, "learning_rate": 1.4369143808708501e-05, "loss": 4.0917, "step": 33500 }, { "epoch": 1.4142778739392916, "eval_loss": 3.9331603050231934, "eval_runtime": 387.4535, "eval_samples_per_second": 489.068, "eval_steps_per_second": 15.284, "step": 33500 }, { "epoch": 1.4184995989361253, "grad_norm": 37.41315841674805, "learning_rate": 1.4345689725939044e-05, "loss": 
3.7125, "step": 33600 }, { "epoch": 1.422721323932959, "grad_norm": 22.55731201171875, "learning_rate": 1.4322470183997281e-05, "loss": 3.7298, "step": 33700 }, { "epoch": 1.4269430489297927, "grad_norm": 37.30392837524414, "learning_rate": 1.4299016101227824e-05, "loss": 3.9447, "step": 33800 }, { "epoch": 1.4311647739266264, "grad_norm": 36.30218505859375, "learning_rate": 1.4275562018458364e-05, "loss": 3.7176, "step": 33900 }, { "epoch": 1.4353864989234602, "grad_norm": 28.75307846069336, "learning_rate": 1.4252107935688905e-05, "loss": 3.6765, "step": 34000 }, { "epoch": 1.4353864989234602, "eval_loss": 4.030208110809326, "eval_runtime": 387.4948, "eval_samples_per_second": 489.016, "eval_steps_per_second": 15.283, "step": 34000 }, { "epoch": 1.4396082239202939, "grad_norm": 80.53971099853516, "learning_rate": 1.4228653852919448e-05, "loss": 3.9847, "step": 34100 }, { "epoch": 1.4438299489171276, "grad_norm": 62.22883605957031, "learning_rate": 1.420519977014999e-05, "loss": 3.7364, "step": 34200 }, { "epoch": 1.4480516739139613, "grad_norm": 20.756181716918945, "learning_rate": 1.4181745687380533e-05, "loss": 3.8246, "step": 34300 }, { "epoch": 1.452273398910795, "grad_norm": 39.283870697021484, "learning_rate": 1.4158291604611074e-05, "loss": 3.575, "step": 34400 }, { "epoch": 1.4564951239076287, "grad_norm": 21.12331771850586, "learning_rate": 1.4134837521841615e-05, "loss": 3.814, "step": 34500 }, { "epoch": 1.4564951239076287, "eval_loss": 3.9518651962280273, "eval_runtime": 385.0216, "eval_samples_per_second": 492.157, "eval_steps_per_second": 15.381, "step": 34500 }, { "epoch": 1.4607168489044624, "grad_norm": 101.86262512207031, "learning_rate": 1.4111383439072158e-05, "loss": 3.8708, "step": 34600 }, { "epoch": 1.4649385739012961, "grad_norm": 22.11623191833496, "learning_rate": 1.4087929356302699e-05, "loss": 3.7277, "step": 34700 }, { "epoch": 1.4691602988981298, "grad_norm": 21.435291290283203, "learning_rate": 1.4064475273533243e-05, "loss": 
3.7758, "step": 34800 }, { "epoch": 1.4733820238949635, "grad_norm": 42.297630310058594, "learning_rate": 1.4041021190763784e-05, "loss": 3.6727, "step": 34900 }, { "epoch": 1.4776037488917972, "grad_norm": 133.29806518554688, "learning_rate": 1.4017567107994325e-05, "loss": 3.773, "step": 35000 }, { "epoch": 1.4776037488917972, "eval_loss": 3.952765464782715, "eval_runtime": 386.7776, "eval_samples_per_second": 489.922, "eval_steps_per_second": 15.311, "step": 35000 }, { "epoch": 1.481825473888631, "grad_norm": 22.62911605834961, "learning_rate": 1.3994113025224867e-05, "loss": 4.0004, "step": 35100 }, { "epoch": 1.4860471988854647, "grad_norm": 37.50686264038086, "learning_rate": 1.3970658942455408e-05, "loss": 3.8468, "step": 35200 }, { "epoch": 1.4902689238822984, "grad_norm": 38.95034408569336, "learning_rate": 1.394720485968595e-05, "loss": 3.6814, "step": 35300 }, { "epoch": 1.494490648879132, "grad_norm": 48.52857208251953, "learning_rate": 1.3923750776916494e-05, "loss": 3.8993, "step": 35400 }, { "epoch": 1.4987123738759658, "grad_norm": 41.82535934448242, "learning_rate": 1.3900296694147035e-05, "loss": 3.8841, "step": 35500 }, { "epoch": 1.4987123738759658, "eval_loss": 3.9401652812957764, "eval_runtime": 386.7397, "eval_samples_per_second": 489.97, "eval_steps_per_second": 15.313, "step": 35500 }, { "epoch": 1.5029340988727995, "grad_norm": 81.59280395507812, "learning_rate": 1.3876842611377577e-05, "loss": 3.8272, "step": 35600 }, { "epoch": 1.507155823869633, "grad_norm": 100.15462493896484, "learning_rate": 1.3853388528608118e-05, "loss": 3.584, "step": 35700 }, { "epoch": 1.511377548866467, "grad_norm": 27.60382843017578, "learning_rate": 1.3829934445838659e-05, "loss": 3.8424, "step": 35800 }, { "epoch": 1.5155992738633004, "grad_norm": 40.822444915771484, "learning_rate": 1.3806480363069203e-05, "loss": 3.7274, "step": 35900 }, { "epoch": 1.5198209988601343, "grad_norm": 31.995193481445312, "learning_rate": 1.3783026280299744e-05, "loss": 3.9671, 
"step": 36000 }, { "epoch": 1.5198209988601343, "eval_loss": 3.9035112857818604, "eval_runtime": 389.4034, "eval_samples_per_second": 486.619, "eval_steps_per_second": 15.208, "step": 36000 }, { "epoch": 1.5240427238569678, "grad_norm": 31.883012771606445, "learning_rate": 1.3759572197530287e-05, "loss": 3.7078, "step": 36100 }, { "epoch": 1.5282644488538017, "grad_norm": 33.22710418701172, "learning_rate": 1.3736118114760828e-05, "loss": 3.7524, "step": 36200 }, { "epoch": 1.5324861738506352, "grad_norm": 22.334789276123047, "learning_rate": 1.3712664031991369e-05, "loss": 3.6992, "step": 36300 }, { "epoch": 1.5367078988474692, "grad_norm": 35.32610321044922, "learning_rate": 1.3689209949221911e-05, "loss": 3.8152, "step": 36400 }, { "epoch": 1.5409296238443027, "grad_norm": 46.904090881347656, "learning_rate": 1.3665755866452454e-05, "loss": 3.9007, "step": 36500 }, { "epoch": 1.5409296238443027, "eval_loss": 3.9784727096557617, "eval_runtime": 387.4936, "eval_samples_per_second": 489.017, "eval_steps_per_second": 15.283, "step": 36500 }, { "epoch": 1.5451513488411366, "grad_norm": 105.48990631103516, "learning_rate": 1.3642301783682997e-05, "loss": 3.6302, "step": 36600 }, { "epoch": 1.54937307383797, "grad_norm": 35.433895111083984, "learning_rate": 1.3618847700913538e-05, "loss": 3.6208, "step": 36700 }, { "epoch": 1.553594798834804, "grad_norm": 43.27334976196289, "learning_rate": 1.359539361814408e-05, "loss": 3.6039, "step": 36800 }, { "epoch": 1.5578165238316375, "grad_norm": 44.104942321777344, "learning_rate": 1.3571939535374621e-05, "loss": 3.7039, "step": 36900 }, { "epoch": 1.5620382488284714, "grad_norm": 59.13047790527344, "learning_rate": 1.3548485452605162e-05, "loss": 3.7069, "step": 37000 }, { "epoch": 1.5620382488284714, "eval_loss": 3.9214632511138916, "eval_runtime": 388.3204, "eval_samples_per_second": 487.976, "eval_steps_per_second": 15.25, "step": 37000 }, { "epoch": 1.566259973825305, "grad_norm": 43.375633239746094, "learning_rate": 
1.3525031369835706e-05, "loss": 3.7246, "step": 37100 }, { "epoch": 1.5704816988221388, "grad_norm": 266.7533874511719, "learning_rate": 1.3501577287066247e-05, "loss": 3.7269, "step": 37200 }, { "epoch": 1.5747034238189723, "grad_norm": 58.76247787475586, "learning_rate": 1.347812320429679e-05, "loss": 3.6822, "step": 37300 }, { "epoch": 1.5789251488158063, "grad_norm": 65.02640533447266, "learning_rate": 1.345466912152733e-05, "loss": 3.7083, "step": 37400 }, { "epoch": 1.5831468738126397, "grad_norm": 57.553504943847656, "learning_rate": 1.3431215038757872e-05, "loss": 3.7095, "step": 37500 }, { "epoch": 1.5831468738126397, "eval_loss": 3.966798782348633, "eval_runtime": 389.3808, "eval_samples_per_second": 486.647, "eval_steps_per_second": 15.209, "step": 37500 }, { "epoch": 1.5873685988094737, "grad_norm": 25.69357681274414, "learning_rate": 1.3407760955988416e-05, "loss": 3.4556, "step": 37600 }, { "epoch": 1.5915903238063072, "grad_norm": 30.168212890625, "learning_rate": 1.3384306873218957e-05, "loss": 4.0595, "step": 37700 }, { "epoch": 1.595812048803141, "grad_norm": 22.801198959350586, "learning_rate": 1.3361087331277194e-05, "loss": 3.6583, "step": 37800 }, { "epoch": 1.6000337737999746, "grad_norm": 33.38633346557617, "learning_rate": 1.3337633248507735e-05, "loss": 3.5662, "step": 37900 }, { "epoch": 1.6042554987968085, "grad_norm": 85.28038024902344, "learning_rate": 1.3314179165738277e-05, "loss": 3.6365, "step": 38000 }, { "epoch": 1.6042554987968085, "eval_loss": 3.9034698009490967, "eval_runtime": 383.621, "eval_samples_per_second": 493.954, "eval_steps_per_second": 15.437, "step": 38000 }, { "epoch": 1.608477223793642, "grad_norm": 180.01327514648438, "learning_rate": 1.3290725082968818e-05, "loss": 3.6313, "step": 38100 }, { "epoch": 1.612698948790476, "grad_norm": 32.02112579345703, "learning_rate": 1.326727100019936e-05, "loss": 3.8767, "step": 38200 }, { "epoch": 1.6169206737873094, "grad_norm": 75.3433837890625, "learning_rate": 
1.3243816917429904e-05, "loss": 3.9992, "step": 38300 }, { "epoch": 1.6211423987841433, "grad_norm": 37.75492477416992, "learning_rate": 1.3220362834660444e-05, "loss": 3.554, "step": 38400 }, { "epoch": 1.6253641237809768, "grad_norm": 230.31956481933594, "learning_rate": 1.3196908751890987e-05, "loss": 3.6862, "step": 38500 }, { "epoch": 1.6253641237809768, "eval_loss": 3.890026330947876, "eval_runtime": 388.3824, "eval_samples_per_second": 487.898, "eval_steps_per_second": 15.248, "step": 38500 }, { "epoch": 1.6295858487778108, "grad_norm": 41.58514404296875, "learning_rate": 1.3173689209949222e-05, "loss": 3.7638, "step": 38600 }, { "epoch": 1.6338075737746443, "grad_norm": 39.31863784790039, "learning_rate": 1.3150235127179765e-05, "loss": 3.6716, "step": 38700 }, { "epoch": 1.6380292987714782, "grad_norm": 32.94029998779297, "learning_rate": 1.3126781044410306e-05, "loss": 3.8667, "step": 38800 }, { "epoch": 1.6422510237683117, "grad_norm": 40.90266418457031, "learning_rate": 1.3103326961640848e-05, "loss": 3.5304, "step": 38900 }, { "epoch": 1.6464727487651456, "grad_norm": 31.46969985961914, "learning_rate": 1.3079872878871391e-05, "loss": 3.955, "step": 39000 }, { "epoch": 1.6464727487651456, "eval_loss": 3.889432668685913, "eval_runtime": 389.0277, "eval_samples_per_second": 487.089, "eval_steps_per_second": 15.223, "step": 39000 }, { "epoch": 1.650694473761979, "grad_norm": 22.939836502075195, "learning_rate": 1.3056418796101934e-05, "loss": 3.4049, "step": 39100 }, { "epoch": 1.654916198758813, "grad_norm": 77.34811401367188, "learning_rate": 1.3032964713332475e-05, "loss": 3.663, "step": 39200 }, { "epoch": 1.6591379237556465, "grad_norm": 145.7189178466797, "learning_rate": 1.3009510630563016e-05, "loss": 4.0267, "step": 39300 }, { "epoch": 1.6633596487524802, "grad_norm": 20.231035232543945, "learning_rate": 1.2986056547793558e-05, "loss": 3.8868, "step": 39400 }, { "epoch": 1.667581373749314, "grad_norm": 43.548316955566406, "learning_rate": 
1.29626024650241e-05, "loss": 3.8984, "step": 39500 }, { "epoch": 1.667581373749314, "eval_loss": 3.9277007579803467, "eval_runtime": 389.9149, "eval_samples_per_second": 485.98, "eval_steps_per_second": 15.188, "step": 39500 }, { "epoch": 1.6718030987461476, "grad_norm": 75.36473846435547, "learning_rate": 1.2939148382254643e-05, "loss": 3.575, "step": 39600 }, { "epoch": 1.6760248237429813, "grad_norm": 82.61229705810547, "learning_rate": 1.2915694299485184e-05, "loss": 3.6966, "step": 39700 }, { "epoch": 1.680246548739815, "grad_norm": 23.42165184020996, "learning_rate": 1.2892240216715725e-05, "loss": 4.0533, "step": 39800 }, { "epoch": 1.6844682737366488, "grad_norm": 39.809532165527344, "learning_rate": 1.2868786133946268e-05, "loss": 3.6106, "step": 39900 }, { "epoch": 1.6886899987334825, "grad_norm": 30.228683471679688, "learning_rate": 1.2845332051176809e-05, "loss": 3.6468, "step": 40000 }, { "epoch": 1.6886899987334825, "eval_loss": 3.9425179958343506, "eval_runtime": 389.7162, "eval_samples_per_second": 486.228, "eval_steps_per_second": 15.196, "step": 40000 }, { "epoch": 1.6929117237303162, "grad_norm": 29.66598892211914, "learning_rate": 1.2821877968407353e-05, "loss": 3.7145, "step": 40100 }, { "epoch": 1.6971334487271499, "grad_norm": 32.41665267944336, "learning_rate": 1.2798423885637894e-05, "loss": 3.6602, "step": 40200 }, { "epoch": 1.7013551737239836, "grad_norm": 54.770538330078125, "learning_rate": 1.2774969802868435e-05, "loss": 3.5531, "step": 40300 }, { "epoch": 1.7055768987208173, "grad_norm": 21.214088439941406, "learning_rate": 1.2751515720098978e-05, "loss": 3.7857, "step": 40400 }, { "epoch": 1.709798623717651, "grad_norm": 18.517242431640625, "learning_rate": 1.2728061637329518e-05, "loss": 3.5586, "step": 40500 }, { "epoch": 1.709798623717651, "eval_loss": 3.8483657836914062, "eval_runtime": 386.5428, "eval_samples_per_second": 490.22, "eval_steps_per_second": 15.32, "step": 40500 }, { "epoch": 1.7140203487144847, "grad_norm": 
101.35408782958984, "learning_rate": 1.270460755456006e-05, "loss": 3.7711, "step": 40600 }, { "epoch": 1.7182420737113184, "grad_norm": 32.46183395385742, "learning_rate": 1.2681153471790604e-05, "loss": 3.7135, "step": 40700 }, { "epoch": 1.7224637987081521, "grad_norm": 38.50857162475586, "learning_rate": 1.2657699389021145e-05, "loss": 3.8785, "step": 40800 }, { "epoch": 1.7266855237049858, "grad_norm": 21.530080795288086, "learning_rate": 1.2634245306251687e-05, "loss": 3.5577, "step": 40900 }, { "epoch": 1.7309072487018196, "grad_norm": 33.64255142211914, "learning_rate": 1.2610791223482228e-05, "loss": 3.5783, "step": 41000 }, { "epoch": 1.7309072487018196, "eval_loss": 3.9012844562530518, "eval_runtime": 387.6772, "eval_samples_per_second": 488.786, "eval_steps_per_second": 15.276, "step": 41000 }, { "epoch": 1.7351289736986533, "grad_norm": 175.7689971923828, "learning_rate": 1.2587337140712769e-05, "loss": 3.7346, "step": 41100 }, { "epoch": 1.739350698695487, "grad_norm": 27.353748321533203, "learning_rate": 1.2563883057943313e-05, "loss": 3.5098, "step": 41200 }, { "epoch": 1.7435724236923207, "grad_norm": 78.1789321899414, "learning_rate": 1.2540428975173854e-05, "loss": 4.0181, "step": 41300 }, { "epoch": 1.7477941486891544, "grad_norm": 56.047279357910156, "learning_rate": 1.2516974892404397e-05, "loss": 3.8404, "step": 41400 }, { "epoch": 1.752015873685988, "grad_norm": 24.14201545715332, "learning_rate": 1.2493520809634938e-05, "loss": 3.6327, "step": 41500 }, { "epoch": 1.752015873685988, "eval_loss": 3.868438243865967, "eval_runtime": 388.5976, "eval_samples_per_second": 487.628, "eval_steps_per_second": 15.239, "step": 41500 }, { "epoch": 1.7562375986828218, "grad_norm": 21.085111618041992, "learning_rate": 1.2470066726865479e-05, "loss": 3.7503, "step": 41600 }, { "epoch": 1.7604593236796555, "grad_norm": 23.111595153808594, "learning_rate": 1.2446612644096021e-05, "loss": 3.45, "step": 41700 }, { "epoch": 1.7646810486764892, "grad_norm": 
90.75397491455078, "learning_rate": 1.2423158561326564e-05, "loss": 3.9138, "step": 41800 }, { "epoch": 1.768902773673323, "grad_norm": 55.01781463623047, "learning_rate": 1.2399704478557107e-05, "loss": 3.6061, "step": 41900 }, { "epoch": 1.7731244986701566, "grad_norm": 19.09023094177246, "learning_rate": 1.2376250395787648e-05, "loss": 3.6603, "step": 42000 }, { "epoch": 1.7731244986701566, "eval_loss": 3.795603036880493, "eval_runtime": 387.4527, "eval_samples_per_second": 489.069, "eval_steps_per_second": 15.284, "step": 42000 }, { "epoch": 1.7773462236669904, "grad_norm": 36.818756103515625, "learning_rate": 1.2352796313018189e-05, "loss": 3.6722, "step": 42100 }, { "epoch": 1.781567948663824, "grad_norm": 35.12385559082031, "learning_rate": 1.2329342230248731e-05, "loss": 3.678, "step": 42200 }, { "epoch": 1.7857896736606578, "grad_norm": 47.89170837402344, "learning_rate": 1.2305888147479272e-05, "loss": 3.5802, "step": 42300 }, { "epoch": 1.7900113986574915, "grad_norm": 53.81658172607422, "learning_rate": 1.2282434064709816e-05, "loss": 3.8253, "step": 42400 }, { "epoch": 1.7942331236543252, "grad_norm": 31.743684768676758, "learning_rate": 1.2258979981940357e-05, "loss": 3.7815, "step": 42500 }, { "epoch": 1.7942331236543252, "eval_loss": 3.8192477226257324, "eval_runtime": 388.2855, "eval_samples_per_second": 488.02, "eval_steps_per_second": 15.252, "step": 42500 }, { "epoch": 1.798454848651159, "grad_norm": 29.357864379882812, "learning_rate": 1.2235760439998594e-05, "loss": 3.7021, "step": 42600 }, { "epoch": 1.8026765736479926, "grad_norm": 25.66359519958496, "learning_rate": 1.2212306357229135e-05, "loss": 3.4263, "step": 42700 }, { "epoch": 1.8068982986448263, "grad_norm": 52.50632095336914, "learning_rate": 1.2188852274459678e-05, "loss": 3.8781, "step": 42800 }, { "epoch": 1.81112002364166, "grad_norm": 26.848169326782227, "learning_rate": 1.2165398191690219e-05, "loss": 3.5784, "step": 42900 }, { "epoch": 1.8153417486384937, "grad_norm": 
51.33030700683594, "learning_rate": 1.2141944108920763e-05, "loss": 3.9405, "step": 43000 }, { "epoch": 1.8153417486384937, "eval_loss": 3.810027837753296, "eval_runtime": 388.9312, "eval_samples_per_second": 487.21, "eval_steps_per_second": 15.226, "step": 43000 }, { "epoch": 1.8195634736353274, "grad_norm": 53.251548767089844, "learning_rate": 1.2118490026151304e-05, "loss": 3.5516, "step": 43100 }, { "epoch": 1.8237851986321612, "grad_norm": 50.611228942871094, "learning_rate": 1.2095035943381845e-05, "loss": 3.8322, "step": 43200 }, { "epoch": 1.8280069236289949, "grad_norm": 61.101463317871094, "learning_rate": 1.2071581860612387e-05, "loss": 3.7948, "step": 43300 }, { "epoch": 1.8322286486258283, "grad_norm": 25.454072952270508, "learning_rate": 1.2048127777842928e-05, "loss": 3.6175, "step": 43400 }, { "epoch": 1.8364503736226623, "grad_norm": 36.41463851928711, "learning_rate": 1.202467369507347e-05, "loss": 3.5256, "step": 43500 }, { "epoch": 1.8364503736226623, "eval_loss": 3.855207920074463, "eval_runtime": 389.0478, "eval_samples_per_second": 487.064, "eval_steps_per_second": 15.222, "step": 43500 }, { "epoch": 1.8406720986194958, "grad_norm": 85.75359344482422, "learning_rate": 1.2001219612304014e-05, "loss": 3.8199, "step": 43600 }, { "epoch": 1.8448938236163297, "grad_norm": 90.45889282226562, "learning_rate": 1.1977765529534555e-05, "loss": 3.6168, "step": 43700 }, { "epoch": 1.8491155486131632, "grad_norm": 40.536827087402344, "learning_rate": 1.1954311446765097e-05, "loss": 3.5648, "step": 43800 }, { "epoch": 1.8533372736099971, "grad_norm": 34.12901306152344, "learning_rate": 1.1930857363995638e-05, "loss": 3.5584, "step": 43900 }, { "epoch": 1.8575589986068306, "grad_norm": 33.35273742675781, "learning_rate": 1.1907403281226179e-05, "loss": 3.7623, "step": 44000 }, { "epoch": 1.8575589986068306, "eval_loss": 3.8202288150787354, "eval_runtime": 389.4093, "eval_samples_per_second": 486.611, "eval_steps_per_second": 15.208, "step": 44000 }, { 
"epoch": 1.8617807236036645, "grad_norm": 37.19559860229492, "learning_rate": 1.1883949198456722e-05, "loss": 3.7884, "step": 44100 }, { "epoch": 1.866002448600498, "grad_norm": 63.01133346557617, "learning_rate": 1.1860495115687264e-05, "loss": 3.6241, "step": 44200 }, { "epoch": 1.870224173597332, "grad_norm": 58.19880294799805, "learning_rate": 1.1837041032917807e-05, "loss": 3.4533, "step": 44300 }, { "epoch": 1.8744458985941654, "grad_norm": 46.244720458984375, "learning_rate": 1.1813586950148348e-05, "loss": 3.575, "step": 44400 }, { "epoch": 1.8786676235909994, "grad_norm": 34.9444465637207, "learning_rate": 1.1790132867378889e-05, "loss": 3.6981, "step": 44500 }, { "epoch": 1.8786676235909994, "eval_loss": 3.9079577922821045, "eval_runtime": 388.7641, "eval_samples_per_second": 487.419, "eval_steps_per_second": 15.233, "step": 44500 }, { "epoch": 1.8828893485878329, "grad_norm": 43.4091682434082, "learning_rate": 1.1766913325437126e-05, "loss": 3.6384, "step": 44600 }, { "epoch": 1.8871110735846668, "grad_norm": 32.79497528076172, "learning_rate": 1.1743459242667668e-05, "loss": 3.8267, "step": 44700 }, { "epoch": 1.8913327985815003, "grad_norm": 59.50571823120117, "learning_rate": 1.1720005159898211e-05, "loss": 3.5696, "step": 44800 }, { "epoch": 1.8955545235783342, "grad_norm": 57.57821273803711, "learning_rate": 1.1696551077128754e-05, "loss": 3.5189, "step": 44900 }, { "epoch": 1.8997762485751677, "grad_norm": 51.14540481567383, "learning_rate": 1.1673096994359294e-05, "loss": 3.7528, "step": 45000 }, { "epoch": 1.8997762485751677, "eval_loss": 3.875887155532837, "eval_runtime": 387.2131, "eval_samples_per_second": 489.371, "eval_steps_per_second": 15.294, "step": 45000 }, { "epoch": 1.9039979735720016, "grad_norm": 23.407438278198242, "learning_rate": 1.1649642911589835e-05, "loss": 3.7572, "step": 45100 }, { "epoch": 1.908219698568835, "grad_norm": 19.387798309326172, "learning_rate": 1.1626188828820378e-05, "loss": 3.7283, "step": 45200 }, { 
"epoch": 1.912441423565669, "grad_norm": 44.595035552978516, "learning_rate": 1.1602734746050919e-05, "loss": 3.6185, "step": 45300 }, { "epoch": 1.9166631485625025, "grad_norm": 40.92133331298828, "learning_rate": 1.1579280663281463e-05, "loss": 3.5348, "step": 45400 }, { "epoch": 1.9208848735593365, "grad_norm": 272.9943542480469, "learning_rate": 1.1555826580512004e-05, "loss": 3.5366, "step": 45500 }, { "epoch": 1.9208848735593365, "eval_loss": 3.971277952194214, "eval_runtime": 389.7703, "eval_samples_per_second": 486.161, "eval_steps_per_second": 15.194, "step": 45500 }, { "epoch": 1.92510659855617, "grad_norm": 30.447141647338867, "learning_rate": 1.1532372497742545e-05, "loss": 3.8358, "step": 45600 }, { "epoch": 1.9293283235530039, "grad_norm": 40.50982666015625, "learning_rate": 1.1508918414973088e-05, "loss": 3.7831, "step": 45700 }, { "epoch": 1.9335500485498374, "grad_norm": 91.95133209228516, "learning_rate": 1.1485464332203629e-05, "loss": 3.7524, "step": 45800 }, { "epoch": 1.9377717735466713, "grad_norm": 89.98445892333984, "learning_rate": 1.1462010249434173e-05, "loss": 3.4533, "step": 45900 }, { "epoch": 1.9419934985435048, "grad_norm": 33.398841857910156, "learning_rate": 1.1438556166664714e-05, "loss": 3.4622, "step": 46000 }, { "epoch": 1.9419934985435048, "eval_loss": 3.890702962875366, "eval_runtime": 388.4518, "eval_samples_per_second": 487.811, "eval_steps_per_second": 15.245, "step": 46000 }, { "epoch": 1.9462152235403387, "grad_norm": 52.59076690673828, "learning_rate": 1.1415102083895255e-05, "loss": 3.7096, "step": 46100 }, { "epoch": 1.9504369485371722, "grad_norm": 77.39093780517578, "learning_rate": 1.1391648001125797e-05, "loss": 3.5447, "step": 46200 }, { "epoch": 1.9546586735340061, "grad_norm": 57.93257522583008, "learning_rate": 1.1368193918356338e-05, "loss": 3.601, "step": 46300 }, { "epoch": 1.9588803985308396, "grad_norm": 63.35990524291992, "learning_rate": 1.134473983558688e-05, "loss": 3.6369, "step": 46400 }, { 
"epoch": 1.9631021235276735, "grad_norm": 74.04061889648438, "learning_rate": 1.1321285752817424e-05, "loss": 3.8619, "step": 46500 }, { "epoch": 1.9631021235276735, "eval_loss": 3.8416244983673096, "eval_runtime": 388.455, "eval_samples_per_second": 487.807, "eval_steps_per_second": 15.245, "step": 46500 }, { "epoch": 1.967323848524507, "grad_norm": 32.15678405761719, "learning_rate": 1.1297831670047964e-05, "loss": 3.6623, "step": 46600 }, { "epoch": 1.971545573521341, "grad_norm": 134.01588439941406, "learning_rate": 1.1274612128106201e-05, "loss": 3.8225, "step": 46700 }, { "epoch": 1.9757672985181745, "grad_norm": 38.84286880493164, "learning_rate": 1.1251158045336742e-05, "loss": 3.9126, "step": 46800 }, { "epoch": 1.9799890235150084, "grad_norm": 27.815914154052734, "learning_rate": 1.1227703962567285e-05, "loss": 3.8429, "step": 46900 }, { "epoch": 1.9842107485118419, "grad_norm": 66.18096160888672, "learning_rate": 1.1204249879797826e-05, "loss": 3.6607, "step": 47000 }, { "epoch": 1.9842107485118419, "eval_loss": 3.8103389739990234, "eval_runtime": 388.5622, "eval_samples_per_second": 487.672, "eval_steps_per_second": 15.241, "step": 47000 }, { "epoch": 1.9884324735086758, "grad_norm": 61.14945602416992, "learning_rate": 1.1181030337856063e-05, "loss": 3.5708, "step": 47100 }, { "epoch": 1.9926541985055093, "grad_norm": 35.80445861816406, "learning_rate": 1.1157576255086604e-05, "loss": 3.6346, "step": 47200 }, { "epoch": 1.9968759235023432, "grad_norm": 47.520687103271484, "learning_rate": 1.1134122172317148e-05, "loss": 3.4577, "step": 47300 }, { "epoch": 2.0010976484991767, "grad_norm": 72.3710708618164, "learning_rate": 1.1110668089547689e-05, "loss": 3.4326, "step": 47400 }, { "epoch": 2.0053193734960106, "grad_norm": 62.152923583984375, "learning_rate": 1.1087214006778232e-05, "loss": 3.5431, "step": 47500 }, { "epoch": 2.0053193734960106, "eval_loss": 3.831125497817993, "eval_runtime": 383.5804, "eval_samples_per_second": 494.006, 
"eval_steps_per_second": 15.439, "step": 47500 }, { "epoch": 2.009541098492844, "grad_norm": 37.57729721069336, "learning_rate": 1.1063759924008772e-05, "loss": 3.4852, "step": 47600 }, { "epoch": 2.013762823489678, "grad_norm": 19.60400390625, "learning_rate": 1.1040305841239313e-05, "loss": 3.4037, "step": 47700 }, { "epoch": 2.0179845484865115, "grad_norm": 43.43424987792969, "learning_rate": 1.1016851758469858e-05, "loss": 3.5685, "step": 47800 }, { "epoch": 2.0222062734833455, "grad_norm": 48.39424514770508, "learning_rate": 1.0993397675700399e-05, "loss": 3.2866, "step": 47900 }, { "epoch": 2.026427998480179, "grad_norm": 45.38421630859375, "learning_rate": 1.0969943592930941e-05, "loss": 3.3943, "step": 48000 }, { "epoch": 2.026427998480179, "eval_loss": 3.944741725921631, "eval_runtime": 389.6599, "eval_samples_per_second": 486.298, "eval_steps_per_second": 15.198, "step": 48000 }, { "epoch": 2.030649723477013, "grad_norm": 34.202125549316406, "learning_rate": 1.0946489510161482e-05, "loss": 3.3675, "step": 48100 }, { "epoch": 2.0348714484738464, "grad_norm": 33.32945251464844, "learning_rate": 1.0923035427392023e-05, "loss": 3.7605, "step": 48200 }, { "epoch": 2.0390931734706803, "grad_norm": 22.147314071655273, "learning_rate": 1.0899581344622566e-05, "loss": 4.0646, "step": 48300 }, { "epoch": 2.043314898467514, "grad_norm": 50.3154411315918, "learning_rate": 1.0876127261853108e-05, "loss": 3.4634, "step": 48400 }, { "epoch": 2.0475366234643477, "grad_norm": 27.445606231689453, "learning_rate": 1.0852673179083651e-05, "loss": 3.5041, "step": 48500 }, { "epoch": 2.0475366234643477, "eval_loss": 3.8681693077087402, "eval_runtime": 388.4463, "eval_samples_per_second": 487.818, "eval_steps_per_second": 15.245, "step": 48500 }, { "epoch": 2.051758348461181, "grad_norm": 58.96662521362305, "learning_rate": 1.0829219096314192e-05, "loss": 3.6862, "step": 48600 }, { "epoch": 2.055980073458015, "grad_norm": 18.482248306274414, "learning_rate": 
1.0805765013544733e-05, "loss": 3.5486, "step": 48700 }, { "epoch": 2.0602017984548486, "grad_norm": 33.20754623413086, "learning_rate": 1.0782310930775275e-05, "loss": 3.6148, "step": 48800 }, { "epoch": 2.0644235234516826, "grad_norm": 67.11410522460938, "learning_rate": 1.0758856848005816e-05, "loss": 3.3776, "step": 48900 }, { "epoch": 2.068645248448516, "grad_norm": 52.77125930786133, "learning_rate": 1.073540276523636e-05, "loss": 3.4514, "step": 49000 }, { "epoch": 2.068645248448516, "eval_loss": 3.8840792179107666, "eval_runtime": 387.6216, "eval_samples_per_second": 488.856, "eval_steps_per_second": 15.278, "step": 49000 }, { "epoch": 2.07286697344535, "grad_norm": 64.37210845947266, "learning_rate": 1.0711948682466902e-05, "loss": 3.4575, "step": 49100 }, { "epoch": 2.0770886984421835, "grad_norm": 84.62837219238281, "learning_rate": 1.0688494599697443e-05, "loss": 3.4984, "step": 49200 }, { "epoch": 2.0813104234390174, "grad_norm": 43.95246124267578, "learning_rate": 1.0665040516927985e-05, "loss": 3.3978, "step": 49300 }, { "epoch": 2.085532148435851, "grad_norm": 58.663265228271484, "learning_rate": 1.0641586434158526e-05, "loss": 3.644, "step": 49400 }, { "epoch": 2.0897538734326844, "grad_norm": 78.60660552978516, "learning_rate": 1.061813235138907e-05, "loss": 3.6412, "step": 49500 }, { "epoch": 2.0897538734326844, "eval_loss": 3.8865935802459717, "eval_runtime": 388.5454, "eval_samples_per_second": 487.693, "eval_steps_per_second": 15.241, "step": 49500 }, { "epoch": 2.0939755984295183, "grad_norm": 71.96966552734375, "learning_rate": 1.0594678268619611e-05, "loss": 3.322, "step": 49600 }, { "epoch": 2.0981973234263522, "grad_norm": 26.565078735351562, "learning_rate": 1.0571224185850152e-05, "loss": 3.7186, "step": 49700 }, { "epoch": 2.1024190484231857, "grad_norm": 47.89260482788086, "learning_rate": 1.0547770103080695e-05, "loss": 3.3604, "step": 49800 }, { "epoch": 2.106640773420019, "grad_norm": 26.684965133666992, "learning_rate": 
1.0524316020311236e-05, "loss": 3.7262, "step": 49900 }, { "epoch": 2.110862498416853, "grad_norm": 34.246253967285156, "learning_rate": 1.0500861937541777e-05, "loss": 3.541, "step": 50000 }, { "epoch": 2.110862498416853, "eval_loss": 3.857327938079834, "eval_runtime": 387.0381, "eval_samples_per_second": 489.593, "eval_steps_per_second": 15.301, "step": 50000 }, { "epoch": 2.1150842234136866, "grad_norm": 79.23475646972656, "learning_rate": 1.0477642395600014e-05, "loss": 3.4695, "step": 50100 }, { "epoch": 2.1193059484105206, "grad_norm": 28.924774169921875, "learning_rate": 1.0454188312830558e-05, "loss": 3.3366, "step": 50200 }, { "epoch": 2.123527673407354, "grad_norm": 154.0482635498047, "learning_rate": 1.0430734230061099e-05, "loss": 3.6556, "step": 50300 }, { "epoch": 2.127749398404188, "grad_norm": 21.952199935913086, "learning_rate": 1.0407280147291641e-05, "loss": 3.4016, "step": 50400 }, { "epoch": 2.1319711234010215, "grad_norm": 34.361366271972656, "learning_rate": 1.0383826064522182e-05, "loss": 3.6733, "step": 50500 }, { "epoch": 2.1319711234010215, "eval_loss": 3.9452035427093506, "eval_runtime": 388.1734, "eval_samples_per_second": 488.161, "eval_steps_per_second": 15.256, "step": 50500 }, { "epoch": 2.1361928483978554, "grad_norm": 25.264711380004883, "learning_rate": 1.0360371981752723e-05, "loss": 3.7553, "step": 50600 }, { "epoch": 2.140414573394689, "grad_norm": 62.976287841796875, "learning_rate": 1.0336917898983266e-05, "loss": 3.6238, "step": 50700 }, { "epoch": 2.144636298391523, "grad_norm": 29.24872398376465, "learning_rate": 1.0313463816213809e-05, "loss": 3.4566, "step": 50800 }, { "epoch": 2.1488580233883563, "grad_norm": 40.195960998535156, "learning_rate": 1.0290244274272045e-05, "loss": 3.7676, "step": 50900 }, { "epoch": 2.1530797483851902, "grad_norm": 53.48100280761719, "learning_rate": 1.0266790191502586e-05, "loss": 3.4482, "step": 51000 }, { "epoch": 2.1530797483851902, "eval_loss": 3.895794153213501, "eval_runtime": 
389.8627, "eval_samples_per_second": 486.046, "eval_steps_per_second": 15.19, "step": 51000 }, { "epoch": 2.1573014733820237, "grad_norm": 60.159095764160156, "learning_rate": 1.0243336108733129e-05, "loss": 3.409, "step": 51100 }, { "epoch": 2.1615231983788576, "grad_norm": 67.60205078125, "learning_rate": 1.021988202596367e-05, "loss": 3.9074, "step": 51200 }, { "epoch": 2.165744923375691, "grad_norm": 74.41675567626953, "learning_rate": 1.019642794319421e-05, "loss": 3.613, "step": 51300 }, { "epoch": 2.169966648372525, "grad_norm": 143.5885009765625, "learning_rate": 1.0172973860424755e-05, "loss": 3.4835, "step": 51400 }, { "epoch": 2.1741883733693586, "grad_norm": 71.70709991455078, "learning_rate": 1.0149519777655296e-05, "loss": 3.5494, "step": 51500 }, { "epoch": 2.1741883733693586, "eval_loss": 3.788289785385132, "eval_runtime": 387.1462, "eval_samples_per_second": 489.456, "eval_steps_per_second": 15.297, "step": 51500 }, { "epoch": 2.1784100983661925, "grad_norm": 30.032886505126953, "learning_rate": 1.0126065694885839e-05, "loss": 3.5205, "step": 51600 }, { "epoch": 2.182631823363026, "grad_norm": 53.41360092163086, "learning_rate": 1.010261161211638e-05, "loss": 3.4232, "step": 51700 }, { "epoch": 2.18685354835986, "grad_norm": 69.20714569091797, "learning_rate": 1.007915752934692e-05, "loss": 3.6539, "step": 51800 }, { "epoch": 2.1910752733566934, "grad_norm": 113.37935638427734, "learning_rate": 1.0055703446577463e-05, "loss": 3.6586, "step": 51900 }, { "epoch": 2.1952969983535273, "grad_norm": 72.74465942382812, "learning_rate": 1.0032249363808006e-05, "loss": 3.3321, "step": 52000 }, { "epoch": 2.1952969983535273, "eval_loss": 3.893357992172241, "eval_runtime": 388.1116, "eval_samples_per_second": 488.238, "eval_steps_per_second": 15.258, "step": 52000 }, { "epoch": 2.199518723350361, "grad_norm": 71.91123962402344, "learning_rate": 1.0008795281038548e-05, "loss": 3.7712, "step": 52100 }, { "epoch": 2.2037404483471947, "grad_norm": 
35.67560577392578, "learning_rate": 9.98534119826909e-06, "loss": 3.4973, "step": 52200 }, { "epoch": 2.2079621733440282, "grad_norm": 122.19819641113281, "learning_rate": 9.96188711549963e-06, "loss": 3.5404, "step": 52300 }, { "epoch": 2.212183898340862, "grad_norm": 42.20289993286133, "learning_rate": 9.938433032730173e-06, "loss": 3.4815, "step": 52400 }, { "epoch": 2.2164056233376956, "grad_norm": 36.77101516723633, "learning_rate": 9.914978949960715e-06, "loss": 3.5586, "step": 52500 }, { "epoch": 2.2164056233376956, "eval_loss": 3.8740973472595215, "eval_runtime": 384.4478, "eval_samples_per_second": 492.891, "eval_steps_per_second": 15.404, "step": 52500 }, { "epoch": 2.2206273483345296, "grad_norm": 30.4133243560791, "learning_rate": 9.891524867191256e-06, "loss": 3.5463, "step": 52600 }, { "epoch": 2.224849073331363, "grad_norm": 47.02263259887695, "learning_rate": 9.868070784421799e-06, "loss": 3.537, "step": 52700 }, { "epoch": 2.229070798328197, "grad_norm": 26.324251174926758, "learning_rate": 9.844616701652342e-06, "loss": 3.5299, "step": 52800 }, { "epoch": 2.2332925233250305, "grad_norm": 203.6620635986328, "learning_rate": 9.821162618882883e-06, "loss": 3.5967, "step": 52900 }, { "epoch": 2.2375142483218644, "grad_norm": 29.926071166992188, "learning_rate": 9.797708536113425e-06, "loss": 3.5156, "step": 53000 }, { "epoch": 2.2375142483218644, "eval_loss": 3.8976686000823975, "eval_runtime": 389.0602, "eval_samples_per_second": 487.048, "eval_steps_per_second": 15.221, "step": 53000 }, { "epoch": 2.241735973318698, "grad_norm": 35.688720703125, "learning_rate": 9.774254453343966e-06, "loss": 3.6079, "step": 53100 }, { "epoch": 2.245957698315532, "grad_norm": 47.552772521972656, "learning_rate": 9.750800370574509e-06, "loss": 3.4492, "step": 53200 }, { "epoch": 2.2501794233123653, "grad_norm": 217.60997009277344, "learning_rate": 9.727346287805051e-06, "loss": 3.4611, "step": 53300 }, { "epoch": 2.2544011483091992, "grad_norm": 38.56391906738281, 
"learning_rate": 9.703892205035592e-06, "loss": 3.1611, "step": 53400 }, { "epoch": 2.2586228733060327, "grad_norm": 36.49523162841797, "learning_rate": 9.680438122266135e-06, "loss": 3.6554, "step": 53500 }, { "epoch": 2.2586228733060327, "eval_loss": 3.844381809234619, "eval_runtime": 385.0727, "eval_samples_per_second": 492.092, "eval_steps_per_second": 15.379, "step": 53500 }, { "epoch": 2.2628445983028667, "grad_norm": 111.7251968383789, "learning_rate": 9.656984039496676e-06, "loss": 3.473, "step": 53600 }, { "epoch": 2.2670663232997, "grad_norm": 26.387981414794922, "learning_rate": 9.633529956727218e-06, "loss": 3.8362, "step": 53700 }, { "epoch": 2.271288048296534, "grad_norm": 51.86467742919922, "learning_rate": 9.610310414785454e-06, "loss": 3.8243, "step": 53800 }, { "epoch": 2.2755097732933676, "grad_norm": 48.3861198425293, "learning_rate": 9.586856332015996e-06, "loss": 3.2957, "step": 53900 }, { "epoch": 2.2797314982902015, "grad_norm": 96.6748046875, "learning_rate": 9.563402249246539e-06, "loss": 3.4227, "step": 54000 }, { "epoch": 2.2797314982902015, "eval_loss": 3.865082025527954, "eval_runtime": 388.8342, "eval_samples_per_second": 487.331, "eval_steps_per_second": 15.23, "step": 54000 }, { "epoch": 2.283953223287035, "grad_norm": 145.63818359375, "learning_rate": 9.53994816647708e-06, "loss": 3.6771, "step": 54100 }, { "epoch": 2.288174948283869, "grad_norm": 27.48724365234375, "learning_rate": 9.516494083707622e-06, "loss": 3.5287, "step": 54200 }, { "epoch": 2.2923966732807024, "grad_norm": 33.88056564331055, "learning_rate": 9.493040000938163e-06, "loss": 3.6393, "step": 54300 }, { "epoch": 2.2966183982775363, "grad_norm": 46.24734878540039, "learning_rate": 9.469585918168706e-06, "loss": 3.6447, "step": 54400 }, { "epoch": 2.30084012327437, "grad_norm": 65.59355926513672, "learning_rate": 9.446131835399249e-06, "loss": 3.1714, "step": 54500 }, { "epoch": 2.30084012327437, "eval_loss": 3.8695130348205566, "eval_runtime": 386.6075, 
"eval_samples_per_second": 490.138, "eval_steps_per_second": 15.318, "step": 54500 }, { "epoch": 2.3050618482712038, "grad_norm": 152.84823608398438, "learning_rate": 9.42267775262979e-06, "loss": 3.5703, "step": 54600 }, { "epoch": 2.3092835732680372, "grad_norm": 49.877586364746094, "learning_rate": 9.39922366986033e-06, "loss": 3.6058, "step": 54700 }, { "epoch": 2.313505298264871, "grad_norm": 35.91408920288086, "learning_rate": 9.375769587090873e-06, "loss": 3.3485, "step": 54800 }, { "epoch": 2.3177270232617047, "grad_norm": 42.26518630981445, "learning_rate": 9.352315504321416e-06, "loss": 3.4143, "step": 54900 }, { "epoch": 2.3219487482585386, "grad_norm": 75.06291961669922, "learning_rate": 9.328861421551958e-06, "loss": 3.461, "step": 55000 }, { "epoch": 2.3219487482585386, "eval_loss": 3.828523874282837, "eval_runtime": 385.5046, "eval_samples_per_second": 491.54, "eval_steps_per_second": 15.362, "step": 55000 }, { "epoch": 2.326170473255372, "grad_norm": 119.58143615722656, "learning_rate": 9.3054073387825e-06, "loss": 3.7676, "step": 55100 }, { "epoch": 2.330392198252206, "grad_norm": 151.1800079345703, "learning_rate": 9.28195325601304e-06, "loss": 3.7192, "step": 55200 }, { "epoch": 2.3346139232490395, "grad_norm": 32.434814453125, "learning_rate": 9.258499173243583e-06, "loss": 3.3955, "step": 55300 }, { "epoch": 2.3388356482458734, "grad_norm": 17.332210540771484, "learning_rate": 9.235045090474125e-06, "loss": 3.5533, "step": 55400 }, { "epoch": 2.343057373242707, "grad_norm": 46.541019439697266, "learning_rate": 9.211591007704666e-06, "loss": 3.7335, "step": 55500 }, { "epoch": 2.343057373242707, "eval_loss": 3.7877399921417236, "eval_runtime": 386.5024, "eval_samples_per_second": 490.271, "eval_steps_per_second": 15.322, "step": 55500 }, { "epoch": 2.347279098239541, "grad_norm": 48.02127456665039, "learning_rate": 9.188136924935209e-06, "loss": 3.3743, "step": 55600 }, { "epoch": 2.3515008232363743, "grad_norm": 84.77007293701172, 
"learning_rate": 9.164682842165752e-06, "loss": 3.5598, "step": 55700 }, { "epoch": 2.3557225482332083, "grad_norm": 52.11927795410156, "learning_rate": 9.141228759396292e-06, "loss": 3.5939, "step": 55800 }, { "epoch": 2.3599442732300417, "grad_norm": 56.104732513427734, "learning_rate": 9.117774676626835e-06, "loss": 3.6577, "step": 55900 }, { "epoch": 2.3641659982268757, "grad_norm": 29.97869300842285, "learning_rate": 9.094320593857376e-06, "loss": 3.407, "step": 56000 }, { "epoch": 2.3641659982268757, "eval_loss": 3.789689540863037, "eval_runtime": 382.469, "eval_samples_per_second": 495.441, "eval_steps_per_second": 15.484, "step": 56000 }, { "epoch": 2.368387723223709, "grad_norm": 37.954345703125, "learning_rate": 9.070866511087919e-06, "loss": 3.5838, "step": 56100 }, { "epoch": 2.372609448220543, "grad_norm": 47.05314636230469, "learning_rate": 9.047412428318461e-06, "loss": 3.5182, "step": 56200 }, { "epoch": 2.3768311732173766, "grad_norm": 72.84585571289062, "learning_rate": 9.023958345549002e-06, "loss": 3.7881, "step": 56300 }, { "epoch": 2.3810528982142105, "grad_norm": 164.88848876953125, "learning_rate": 9.000504262779543e-06, "loss": 3.772, "step": 56400 }, { "epoch": 2.385274623211044, "grad_norm": 56.11479568481445, "learning_rate": 8.977050180010086e-06, "loss": 3.5779, "step": 56500 }, { "epoch": 2.385274623211044, "eval_loss": 3.8028056621551514, "eval_runtime": 387.2356, "eval_samples_per_second": 489.343, "eval_steps_per_second": 15.293, "step": 56500 }, { "epoch": 2.389496348207878, "grad_norm": 31.131439208984375, "learning_rate": 8.953596097240628e-06, "loss": 3.4765, "step": 56600 }, { "epoch": 2.3937180732047114, "grad_norm": 41.05427169799805, "learning_rate": 8.930142014471171e-06, "loss": 3.5122, "step": 56700 }, { "epoch": 2.397939798201545, "grad_norm": 51.679901123046875, "learning_rate": 8.906687931701712e-06, "loss": 3.8646, "step": 56800 }, { "epoch": 2.402161523198379, "grad_norm": 21.275638580322266, "learning_rate": 
8.883233848932253e-06, "loss": 3.4861, "step": 56900 }, { "epoch": 2.4063832481952128, "grad_norm": 42.77296829223633, "learning_rate": 8.859779766162795e-06, "loss": 3.4486, "step": 57000 }, { "epoch": 2.4063832481952128, "eval_loss": 3.8668415546417236, "eval_runtime": 386.271, "eval_samples_per_second": 490.565, "eval_steps_per_second": 15.331, "step": 57000 }, { "epoch": 2.4106049731920463, "grad_norm": 60.467735290527344, "learning_rate": 8.836325683393338e-06, "loss": 3.4319, "step": 57100 }, { "epoch": 2.4148266981888797, "grad_norm": 38.16925811767578, "learning_rate": 8.812871600623879e-06, "loss": 3.5801, "step": 57200 }, { "epoch": 2.4190484231857137, "grad_norm": 35.175376892089844, "learning_rate": 8.789417517854422e-06, "loss": 3.4412, "step": 57300 }, { "epoch": 2.4232701481825476, "grad_norm": 37.78458023071289, "learning_rate": 8.765963435084963e-06, "loss": 3.4917, "step": 57400 }, { "epoch": 2.427491873179381, "grad_norm": 310.4344177246094, "learning_rate": 8.742509352315505e-06, "loss": 3.8994, "step": 57500 }, { "epoch": 2.427491873179381, "eval_loss": 3.7693347930908203, "eval_runtime": 385.0963, "eval_samples_per_second": 492.061, "eval_steps_per_second": 15.378, "step": 57500 }, { "epoch": 2.4317135981762146, "grad_norm": 87.53755950927734, "learning_rate": 8.719055269546048e-06, "loss": 3.4321, "step": 57600 }, { "epoch": 2.4359353231730485, "grad_norm": 30.359472274780273, "learning_rate": 8.695601186776589e-06, "loss": 3.4605, "step": 57700 }, { "epoch": 2.4401570481698824, "grad_norm": 47.97589874267578, "learning_rate": 8.67214710400713e-06, "loss": 3.5348, "step": 57800 }, { "epoch": 2.444378773166716, "grad_norm": 18.004453659057617, "learning_rate": 8.648693021237672e-06, "loss": 3.6156, "step": 57900 }, { "epoch": 2.4486004981635494, "grad_norm": 74.85140991210938, "learning_rate": 8.625238938468215e-06, "loss": 3.6726, "step": 58000 }, { "epoch": 2.4486004981635494, "eval_loss": 3.7412655353546143, "eval_runtime": 389.1795, 
"eval_samples_per_second": 486.899, "eval_steps_per_second": 15.217, "step": 58000 }, { "epoch": 2.4528222231603833, "grad_norm": 59.58810043334961, "learning_rate": 8.601784855698756e-06, "loss": 3.4447, "step": 58100 }, { "epoch": 2.4570439481572173, "grad_norm": 55.2222785949707, "learning_rate": 8.578330772929298e-06, "loss": 3.5318, "step": 58200 }, { "epoch": 2.4612656731540508, "grad_norm": 30.20258903503418, "learning_rate": 8.55487669015984e-06, "loss": 3.4284, "step": 58300 }, { "epoch": 2.4654873981508842, "grad_norm": 75.82894134521484, "learning_rate": 8.531422607390382e-06, "loss": 3.3426, "step": 58400 }, { "epoch": 2.469709123147718, "grad_norm": 81.5134506225586, "learning_rate": 8.507968524620925e-06, "loss": 3.5549, "step": 58500 }, { "epoch": 2.469709123147718, "eval_loss": 3.8397915363311768, "eval_runtime": 387.2912, "eval_samples_per_second": 489.273, "eval_steps_per_second": 15.291, "step": 58500 }, { "epoch": 2.473930848144552, "grad_norm": 43.514183044433594, "learning_rate": 8.484514441851465e-06, "loss": 3.7305, "step": 58600 }, { "epoch": 2.4781525731413856, "grad_norm": 44.70104217529297, "learning_rate": 8.461060359082008e-06, "loss": 3.4777, "step": 58700 }, { "epoch": 2.482374298138219, "grad_norm": 41.51865768432617, "learning_rate": 8.43760627631255e-06, "loss": 3.625, "step": 58800 }, { "epoch": 2.486596023135053, "grad_norm": 54.522674560546875, "learning_rate": 8.414152193543092e-06, "loss": 3.8084, "step": 58900 }, { "epoch": 2.4908177481318865, "grad_norm": 118.72567749023438, "learning_rate": 8.390698110773634e-06, "loss": 3.6772, "step": 59000 }, { "epoch": 2.4908177481318865, "eval_loss": 3.780479669570923, "eval_runtime": 389.0844, "eval_samples_per_second": 487.018, "eval_steps_per_second": 15.22, "step": 59000 }, { "epoch": 2.4950394731287204, "grad_norm": 34.044559478759766, "learning_rate": 8.367478568831871e-06, "loss": 3.4634, "step": 59100 }, { "epoch": 2.499261198125554, "grad_norm": 93.83566284179688, 
"learning_rate": 8.344024486062412e-06, "loss": 3.5926, "step": 59200 }, { "epoch": 2.503482923122388, "grad_norm": 53.263832092285156, "learning_rate": 8.320570403292953e-06, "loss": 3.303, "step": 59300 }, { "epoch": 2.507704648119222, "grad_norm": 65.40522766113281, "learning_rate": 8.297116320523496e-06, "loss": 3.5749, "step": 59400 }, { "epoch": 2.5119263731160553, "grad_norm": 65.15473937988281, "learning_rate": 8.273662237754038e-06, "loss": 3.7852, "step": 59500 }, { "epoch": 2.5119263731160553, "eval_loss": 3.825472593307495, "eval_runtime": 390.0313, "eval_samples_per_second": 485.835, "eval_steps_per_second": 15.183, "step": 59500 }, { "epoch": 2.5161480981128888, "grad_norm": 29.49171257019043, "learning_rate": 8.25020815498458e-06, "loss": 3.6317, "step": 59600 }, { "epoch": 2.5203698231097227, "grad_norm": 31.727855682373047, "learning_rate": 8.226754072215122e-06, "loss": 3.3228, "step": 59700 }, { "epoch": 2.5245915481065566, "grad_norm": 30.480257034301758, "learning_rate": 8.203299989445663e-06, "loss": 3.4541, "step": 59800 }, { "epoch": 2.52881327310339, "grad_norm": 29.858154296875, "learning_rate": 8.179845906676205e-06, "loss": 3.5879, "step": 59900 }, { "epoch": 2.5330349981002236, "grad_norm": 38.494544982910156, "learning_rate": 8.156391823906748e-06, "loss": 3.6403, "step": 60000 }, { "epoch": 2.5330349981002236, "eval_loss": 3.7743747234344482, "eval_runtime": 377.1382, "eval_samples_per_second": 502.445, "eval_steps_per_second": 15.702, "step": 60000 }, { "epoch": 2.5372567230970575, "grad_norm": 48.94626998901367, "learning_rate": 8.132937741137289e-06, "loss": 3.5289, "step": 60100 }, { "epoch": 2.541478448093891, "grad_norm": 41.8023567199707, "learning_rate": 8.109483658367832e-06, "loss": 3.4, "step": 60200 }, { "epoch": 2.545700173090725, "grad_norm": 64.16112518310547, "learning_rate": 8.086029575598372e-06, "loss": 3.4026, "step": 60300 }, { "epoch": 2.5499218980875584, "grad_norm": 33.682769775390625, "learning_rate": 
8.062575492828915e-06, "loss": 3.6348, "step": 60400 }, { "epoch": 2.5541436230843924, "grad_norm": 36.693504333496094, "learning_rate": 8.039121410059458e-06, "loss": 3.3963, "step": 60500 }, { "epoch": 2.5541436230843924, "eval_loss": 3.7308504581451416, "eval_runtime": 377.3946, "eval_samples_per_second": 502.103, "eval_steps_per_second": 15.692, "step": 60500 }, { "epoch": 2.558365348081226, "grad_norm": 25.65692138671875, "learning_rate": 8.015667327289999e-06, "loss": 3.2838, "step": 60600 }, { "epoch": 2.5625870730780598, "grad_norm": 62.07941436767578, "learning_rate": 7.99221324452054e-06, "loss": 3.8537, "step": 60700 }, { "epoch": 2.5668087980748933, "grad_norm": 46.93730163574219, "learning_rate": 7.968759161751082e-06, "loss": 3.3861, "step": 60800 }, { "epoch": 2.571030523071727, "grad_norm": 55.51591491699219, "learning_rate": 7.945305078981625e-06, "loss": 3.5289, "step": 60900 }, { "epoch": 2.5752522480685607, "grad_norm": 33.335594177246094, "learning_rate": 7.921850996212166e-06, "loss": 3.7611, "step": 61000 }, { "epoch": 2.5752522480685607, "eval_loss": 3.7403736114501953, "eval_runtime": 385.32, "eval_samples_per_second": 491.776, "eval_steps_per_second": 15.369, "step": 61000 }, { "epoch": 2.5794739730653946, "grad_norm": 30.4799861907959, "learning_rate": 7.898396913442708e-06, "loss": 3.3036, "step": 61100 }, { "epoch": 2.583695698062228, "grad_norm": 41.96533203125, "learning_rate": 7.87494283067325e-06, "loss": 3.4874, "step": 61200 }, { "epoch": 2.587917423059062, "grad_norm": 38.74420928955078, "learning_rate": 7.851488747903792e-06, "loss": 3.3885, "step": 61300 }, { "epoch": 2.5921391480558955, "grad_norm": 109.50426483154297, "learning_rate": 7.828034665134334e-06, "loss": 3.6008, "step": 61400 }, { "epoch": 2.5963608730527294, "grad_norm": 45.021488189697266, "learning_rate": 7.804580582364875e-06, "loss": 3.7175, "step": 61500 }, { "epoch": 2.5963608730527294, "eval_loss": 3.768545389175415, "eval_runtime": 386.8942, 
"eval_samples_per_second": 489.775, "eval_steps_per_second": 15.307, "step": 61500 }, { "epoch": 2.600582598049563, "grad_norm": 62.09092330932617, "learning_rate": 7.781361040423112e-06, "loss": 3.589, "step": 61600 }, { "epoch": 2.604804323046397, "grad_norm": 62.47654724121094, "learning_rate": 7.757906957653653e-06, "loss": 3.6725, "step": 61700 }, { "epoch": 2.6090260480432304, "grad_norm": 46.53767776489258, "learning_rate": 7.734452874884196e-06, "loss": 3.3397, "step": 61800 }, { "epoch": 2.6132477730400643, "grad_norm": 46.46326446533203, "learning_rate": 7.710998792114738e-06, "loss": 3.562, "step": 61900 }, { "epoch": 2.6174694980368978, "grad_norm": 48.67976379394531, "learning_rate": 7.687544709345281e-06, "loss": 3.5421, "step": 62000 }, { "epoch": 2.6174694980368978, "eval_loss": 3.755615711212158, "eval_runtime": 381.9504, "eval_samples_per_second": 496.114, "eval_steps_per_second": 15.505, "step": 62000 }, { "epoch": 2.6216912230337317, "grad_norm": 29.504335403442383, "learning_rate": 7.664090626575822e-06, "loss": 3.462, "step": 62100 }, { "epoch": 2.625912948030565, "grad_norm": 89.24791717529297, "learning_rate": 7.640636543806363e-06, "loss": 3.329, "step": 62200 }, { "epoch": 2.630134673027399, "grad_norm": 97.40950012207031, "learning_rate": 7.617182461036906e-06, "loss": 3.4274, "step": 62300 }, { "epoch": 2.6343563980242326, "grad_norm": 57.1513671875, "learning_rate": 7.593728378267447e-06, "loss": 3.4622, "step": 62400 }, { "epoch": 2.6385781230210665, "grad_norm": 39.16256332397461, "learning_rate": 7.570274295497989e-06, "loss": 3.2924, "step": 62500 }, { "epoch": 2.6385781230210665, "eval_loss": 3.8314058780670166, "eval_runtime": 383.303, "eval_samples_per_second": 494.363, "eval_steps_per_second": 15.45, "step": 62500 }, { "epoch": 2.6427998480179, "grad_norm": 72.36564636230469, "learning_rate": 7.546820212728532e-06, "loss": 3.5465, "step": 62600 }, { "epoch": 2.647021573014734, "grad_norm": 82.64096069335938, "learning_rate": 
7.5233661299590735e-06, "loss": 3.3618, "step": 62700 }, { "epoch": 2.6512432980115674, "grad_norm": 41.66115188598633, "learning_rate": 7.499912047189614e-06, "loss": 3.2383, "step": 62800 }, { "epoch": 2.6554650230084014, "grad_norm": 114.26473999023438, "learning_rate": 7.476457964420157e-06, "loss": 3.3542, "step": 62900 }, { "epoch": 2.659686748005235, "grad_norm": 45.76057434082031, "learning_rate": 7.453003881650699e-06, "loss": 3.5209, "step": 63000 }, { "epoch": 2.659686748005235, "eval_loss": 3.7941720485687256, "eval_runtime": 378.5969, "eval_samples_per_second": 500.509, "eval_steps_per_second": 15.642, "step": 63000 }, { "epoch": 2.663908473002069, "grad_norm": 44.19281768798828, "learning_rate": 7.4295497988812406e-06, "loss": 3.4416, "step": 63100 }, { "epoch": 2.6681301979989023, "grad_norm": 29.7802677154541, "learning_rate": 7.406095716111783e-06, "loss": 3.5881, "step": 63200 }, { "epoch": 2.672351922995736, "grad_norm": 23.825647354125977, "learning_rate": 7.382641633342324e-06, "loss": 3.7347, "step": 63300 }, { "epoch": 2.6765736479925697, "grad_norm": 120.45719909667969, "learning_rate": 7.359187550572866e-06, "loss": 3.815, "step": 63400 }, { "epoch": 2.6807953729894036, "grad_norm": 76.13922882080078, "learning_rate": 7.3357334678034085e-06, "loss": 3.4667, "step": 63500 }, { "epoch": 2.6807953729894036, "eval_loss": 3.779139995574951, "eval_runtime": 386.2584, "eval_samples_per_second": 490.581, "eval_steps_per_second": 15.332, "step": 63500 }, { "epoch": 2.685017097986237, "grad_norm": 67.92314147949219, "learning_rate": 7.31227938503395e-06, "loss": 3.342, "step": 63600 }, { "epoch": 2.689238822983071, "grad_norm": 49.04058837890625, "learning_rate": 7.288825302264493e-06, "loss": 3.6695, "step": 63700 }, { "epoch": 2.6934605479799045, "grad_norm": 90.01438903808594, "learning_rate": 7.265371219495035e-06, "loss": 3.4863, "step": 63800 }, { "epoch": 2.6976822729767385, "grad_norm": 103.12335968017578, "learning_rate": 
7.241917136725576e-06, "loss": 3.741, "step": 63900 }, { "epoch": 2.701903997973572, "grad_norm": 26.381654739379883, "learning_rate": 7.2186975947838125e-06, "loss": 3.6267, "step": 64000 }, { "epoch": 2.701903997973572, "eval_loss": 3.8010313510894775, "eval_runtime": 378.1577, "eval_samples_per_second": 501.09, "eval_steps_per_second": 15.66, "step": 64000 }, { "epoch": 2.7061257229704054, "grad_norm": 39.231876373291016, "learning_rate": 7.195243512014355e-06, "loss": 3.2939, "step": 64100 }, { "epoch": 2.7103474479672394, "grad_norm": 79.92709350585938, "learning_rate": 7.171789429244896e-06, "loss": 3.4422, "step": 64200 }, { "epoch": 2.7145691729640733, "grad_norm": 55.60668182373047, "learning_rate": 7.148335346475438e-06, "loss": 3.7278, "step": 64300 }, { "epoch": 2.718790897960907, "grad_norm": 67.64916229248047, "learning_rate": 7.1248812637059804e-06, "loss": 3.8691, "step": 64400 }, { "epoch": 2.7230126229577403, "grad_norm": 75.73139953613281, "learning_rate": 7.101427180936522e-06, "loss": 3.611, "step": 64500 }, { "epoch": 2.7230126229577403, "eval_loss": 3.717172145843506, "eval_runtime": 384.6215, "eval_samples_per_second": 492.669, "eval_steps_per_second": 15.397, "step": 64500 }, { "epoch": 2.727234347954574, "grad_norm": 47.768821716308594, "learning_rate": 7.077973098167064e-06, "loss": 3.5474, "step": 64600 }, { "epoch": 2.731456072951408, "grad_norm": 112.48494720458984, "learning_rate": 7.054519015397607e-06, "loss": 3.5087, "step": 64700 }, { "epoch": 2.7356777979482416, "grad_norm": 64.66181945800781, "learning_rate": 7.0310649326281475e-06, "loss": 3.4489, "step": 64800 }, { "epoch": 2.739899522945075, "grad_norm": 35.16935348510742, "learning_rate": 7.007610849858689e-06, "loss": 3.6549, "step": 64900 }, { "epoch": 2.744121247941909, "grad_norm": 36.89544677734375, "learning_rate": 6.984156767089232e-06, "loss": 3.3956, "step": 65000 }, { "epoch": 2.744121247941909, "eval_loss": 3.735604763031006, "eval_runtime": 383.8224, 
"eval_samples_per_second": 493.694, "eval_steps_per_second": 15.429, "step": 65000 }, { "epoch": 2.748342972938743, "grad_norm": 37.28645706176758, "learning_rate": 6.960702684319774e-06, "loss": 3.5116, "step": 65100 }, { "epoch": 2.7525646979355765, "grad_norm": 85.3537826538086, "learning_rate": 6.937248601550316e-06, "loss": 3.1777, "step": 65200 }, { "epoch": 2.75678642293241, "grad_norm": 79.33281707763672, "learning_rate": 6.913794518780857e-06, "loss": 3.7644, "step": 65300 }, { "epoch": 2.761008147929244, "grad_norm": 58.84652328491211, "learning_rate": 6.890340436011399e-06, "loss": 3.6376, "step": 65400 }, { "epoch": 2.765229872926078, "grad_norm": 95.42400360107422, "learning_rate": 6.866886353241942e-06, "loss": 3.6153, "step": 65500 }, { "epoch": 2.765229872926078, "eval_loss": 3.7485992908477783, "eval_runtime": 386.3393, "eval_samples_per_second": 490.478, "eval_steps_per_second": 15.328, "step": 65500 }, { "epoch": 2.7694515979229113, "grad_norm": 43.77649688720703, "learning_rate": 6.843432270472483e-06, "loss": 3.2564, "step": 65600 }, { "epoch": 2.7736733229197448, "grad_norm": 26.90199089050293, "learning_rate": 6.819978187703024e-06, "loss": 3.5547, "step": 65700 }, { "epoch": 2.7778950479165787, "grad_norm": 65.34386444091797, "learning_rate": 6.796524104933567e-06, "loss": 3.3283, "step": 65800 }, { "epoch": 2.7821167729134126, "grad_norm": 39.57930374145508, "learning_rate": 6.773070022164109e-06, "loss": 3.4592, "step": 65900 }, { "epoch": 2.786338497910246, "grad_norm": 53.023067474365234, "learning_rate": 6.7496159393946505e-06, "loss": 3.7505, "step": 66000 }, { "epoch": 2.786338497910246, "eval_loss": 3.725229263305664, "eval_runtime": 384.6647, "eval_samples_per_second": 492.613, "eval_steps_per_second": 15.395, "step": 66000 }, { "epoch": 2.7905602229070796, "grad_norm": 30.69059181213379, "learning_rate": 6.7263963974528865e-06, "loss": 3.3761, "step": 66100 }, { "epoch": 2.7947819479039135, "grad_norm": 67.33950805664062, 
"learning_rate": 6.702942314683429e-06, "loss": 3.6223, "step": 66200 }, { "epoch": 2.7990036729007475, "grad_norm": 72.26821899414062, "learning_rate": 6.679488231913971e-06, "loss": 3.4702, "step": 66300 }, { "epoch": 2.803225397897581, "grad_norm": 42.78792190551758, "learning_rate": 6.656034149144513e-06, "loss": 3.8666, "step": 66400 }, { "epoch": 2.8074471228944144, "grad_norm": 36.370338439941406, "learning_rate": 6.632580066375055e-06, "loss": 3.2927, "step": 66500 }, { "epoch": 2.8074471228944144, "eval_loss": 3.7299137115478516, "eval_runtime": 379.7934, "eval_samples_per_second": 498.932, "eval_steps_per_second": 15.593, "step": 66500 }, { "epoch": 2.8116688478912484, "grad_norm": 108.68196868896484, "learning_rate": 6.609125983605596e-06, "loss": 3.5424, "step": 66600 }, { "epoch": 2.8158905728880823, "grad_norm": 43.846248626708984, "learning_rate": 6.585671900836138e-06, "loss": 3.5487, "step": 66700 }, { "epoch": 2.820112297884916, "grad_norm": 39.46868133544922, "learning_rate": 6.562217818066681e-06, "loss": 3.3343, "step": 66800 }, { "epoch": 2.8243340228817493, "grad_norm": 176.4261016845703, "learning_rate": 6.538763735297222e-06, "loss": 3.3005, "step": 66900 }, { "epoch": 2.828555747878583, "grad_norm": 49.91089630126953, "learning_rate": 6.515309652527765e-06, "loss": 3.5036, "step": 67000 }, { "epoch": 2.828555747878583, "eval_loss": 3.7751986980438232, "eval_runtime": 382.91, "eval_samples_per_second": 494.871, "eval_steps_per_second": 15.466, "step": 67000 }, { "epoch": 2.832777472875417, "grad_norm": 42.26047897338867, "learning_rate": 6.491855569758306e-06, "loss": 3.4419, "step": 67100 }, { "epoch": 2.8369991978722506, "grad_norm": 61.47079849243164, "learning_rate": 6.468401486988848e-06, "loss": 3.3805, "step": 67200 }, { "epoch": 2.841220922869084, "grad_norm": 36.838199615478516, "learning_rate": 6.44494740421939e-06, "loss": 3.3591, "step": 67300 }, { "epoch": 2.845442647865918, "grad_norm": 38.40938186645508, "learning_rate": 
6.421493321449932e-06, "loss": 3.738, "step": 67400 }, { "epoch": 2.849664372862752, "grad_norm": 72.58192443847656, "learning_rate": 6.398039238680474e-06, "loss": 3.268, "step": 67500 }, { "epoch": 2.849664372862752, "eval_loss": 3.765678882598877, "eval_runtime": 368.4593, "eval_samples_per_second": 514.279, "eval_steps_per_second": 16.072, "step": 67500 }, { "epoch": 2.8538860978595855, "grad_norm": 70.88404083251953, "learning_rate": 6.3745851559110165e-06, "loss": 3.4224, "step": 67600 }, { "epoch": 2.858107822856419, "grad_norm": 32.05725860595703, "learning_rate": 6.3511310731415574e-06, "loss": 3.5734, "step": 67700 }, { "epoch": 2.862329547853253, "grad_norm": 29.63147735595703, "learning_rate": 6.327676990372099e-06, "loss": 3.3804, "step": 67800 }, { "epoch": 2.866551272850087, "grad_norm": 31.01226043701172, "learning_rate": 6.304222907602642e-06, "loss": 3.594, "step": 67900 }, { "epoch": 2.8707729978469203, "grad_norm": 41.206783294677734, "learning_rate": 6.280768824833184e-06, "loss": 3.6526, "step": 68000 }, { "epoch": 2.8707729978469203, "eval_loss": 3.779634714126587, "eval_runtime": 382.5774, "eval_samples_per_second": 495.301, "eval_steps_per_second": 15.479, "step": 68000 }, { "epoch": 2.874994722843754, "grad_norm": null, "learning_rate": 6.25754928289142e-06, "loss": 3.7921, "step": 68100 }, { "epoch": 2.8792164478405877, "grad_norm": 196.81781005859375, "learning_rate": 6.2340952001219614e-06, "loss": 3.352, "step": 68200 }, { "epoch": 2.883438172837421, "grad_norm": 62.34221267700195, "learning_rate": 6.210641117352504e-06, "loss": 3.7122, "step": 68300 }, { "epoch": 2.887659897834255, "grad_norm": 45.04619216918945, "learning_rate": 6.187187034583045e-06, "loss": 3.5739, "step": 68400 }, { "epoch": 2.8918816228310886, "grad_norm": 29.019325256347656, "learning_rate": 6.163732951813588e-06, "loss": 3.3912, "step": 68500 }, { "epoch": 2.8918816228310886, "eval_loss": 3.7358908653259277, "eval_runtime": 382.9272, 
"eval_samples_per_second": 494.849, "eval_steps_per_second": 15.465, "step": 68500 }, { "epoch": 2.8961033478279226, "grad_norm": 36.82758331298828, "learning_rate": 6.140278869044129e-06, "loss": 3.8863, "step": 68600 }, { "epoch": 2.900325072824756, "grad_norm": 45.10524368286133, "learning_rate": 6.116824786274671e-06, "loss": 3.6851, "step": 68700 }, { "epoch": 2.90454679782159, "grad_norm": 47.0596923828125, "learning_rate": 6.093370703505214e-06, "loss": 3.1867, "step": 68800 }, { "epoch": 2.9087685228184235, "grad_norm": 97.21186828613281, "learning_rate": 6.0699166207357555e-06, "loss": 3.2456, "step": 68900 }, { "epoch": 2.9129902478152574, "grad_norm": 42.81410598754883, "learning_rate": 6.0464625379662964e-06, "loss": 3.447, "step": 69000 }, { "epoch": 2.9129902478152574, "eval_loss": 3.7272026538848877, "eval_runtime": 373.7122, "eval_samples_per_second": 507.051, "eval_steps_per_second": 15.846, "step": 69000 }, { "epoch": 2.917211972812091, "grad_norm": 21.994388580322266, "learning_rate": 6.023008455196839e-06, "loss": 3.3142, "step": 69100 }, { "epoch": 2.921433697808925, "grad_norm": 75.22715759277344, "learning_rate": 5.999554372427381e-06, "loss": 3.8019, "step": 69200 }, { "epoch": 2.9256554228057583, "grad_norm": 108.84042358398438, "learning_rate": 5.976100289657923e-06, "loss": 3.6041, "step": 69300 }, { "epoch": 2.9298771478025922, "grad_norm": 49.068809509277344, "learning_rate": 5.952646206888465e-06, "loss": 3.6291, "step": 69400 }, { "epoch": 2.9340988727994257, "grad_norm": 27.290891647338867, "learning_rate": 5.929192124119006e-06, "loss": 3.5412, "step": 69500 }, { "epoch": 2.9340988727994257, "eval_loss": 3.704101085662842, "eval_runtime": 378.324, "eval_samples_per_second": 500.87, "eval_steps_per_second": 15.653, "step": 69500 }, { "epoch": 2.9383205977962596, "grad_norm": 33.8206901550293, "learning_rate": 5.905738041349548e-06, "loss": 3.5873, "step": 69600 }, { "epoch": 2.942542322793093, "grad_norm": 52.327754974365234, 
"learning_rate": 5.8822839585800905e-06, "loss": 3.6207, "step": 69700 }, { "epoch": 2.946764047789927, "grad_norm": 193.724853515625, "learning_rate": 5.858829875810632e-06, "loss": 3.5858, "step": 69800 }, { "epoch": 2.9509857727867606, "grad_norm": 51.89821243286133, "learning_rate": 5.835375793041173e-06, "loss": 3.5341, "step": 69900 }, { "epoch": 2.9552074977835945, "grad_norm": 42.79822540283203, "learning_rate": 5.811921710271716e-06, "loss": 3.6644, "step": 70000 }, { "epoch": 2.9552074977835945, "eval_loss": 3.740612268447876, "eval_runtime": 378.139, "eval_samples_per_second": 501.115, "eval_steps_per_second": 15.661, "step": 70000 }, { "epoch": 2.959429222780428, "grad_norm": 31.204317092895508, "learning_rate": 5.788467627502258e-06, "loss": 3.4947, "step": 70100 }, { "epoch": 2.963650947777262, "grad_norm": 43.13151931762695, "learning_rate": 5.765013544732799e-06, "loss": 3.5763, "step": 70200 }, { "epoch": 2.9678726727740954, "grad_norm": 91.4846420288086, "learning_rate": 5.741559461963342e-06, "loss": 3.6131, "step": 70300 }, { "epoch": 2.9720943977709293, "grad_norm": 72.16215515136719, "learning_rate": 5.718105379193884e-06, "loss": 3.509, "step": 70400 }, { "epoch": 2.976316122767763, "grad_norm": 80.8546371459961, "learning_rate": 5.694651296424426e-06, "loss": 3.4352, "step": 70500 }, { "epoch": 2.976316122767763, "eval_loss": 3.7115588188171387, "eval_runtime": 375.398, "eval_samples_per_second": 504.774, "eval_steps_per_second": 15.775, "step": 70500 }, { "epoch": 2.9805378477645967, "grad_norm": 30.867778778076172, "learning_rate": 5.671197213654967e-06, "loss": 3.3115, "step": 70600 }, { "epoch": 2.9847595727614302, "grad_norm": 77.84353637695312, "learning_rate": 5.647743130885509e-06, "loss": 3.2393, "step": 70700 }, { "epoch": 2.988981297758264, "grad_norm": 87.11060333251953, "learning_rate": 5.624289048116052e-06, "loss": 3.3738, "step": 70800 }, { "epoch": 2.9932030227550976, "grad_norm": 54.43430709838867, "learning_rate": 
5.6008349653465935e-06, "loss": 3.424, "step": 70900 }, { "epoch": 2.9974247477519316, "grad_norm": 48.77461242675781, "learning_rate": 5.5773808825771344e-06, "loss": 3.7252, "step": 71000 }, { "epoch": 2.9974247477519316, "eval_loss": 3.7152390480041504, "eval_runtime": 381.7179, "eval_samples_per_second": 496.416, "eval_steps_per_second": 15.514, "step": 71000 }, { "epoch": 3.001646472748765, "grad_norm": 83.55622100830078, "learning_rate": 5.553926799807677e-06, "loss": 3.6013, "step": 71100 }, { "epoch": 3.005868197745599, "grad_norm": 26.548662185668945, "learning_rate": 5.530472717038219e-06, "loss": 3.407, "step": 71200 }, { "epoch": 3.0100899227424325, "grad_norm": 256.9148864746094, "learning_rate": 5.507018634268761e-06, "loss": 3.2695, "step": 71300 }, { "epoch": 3.0143116477392664, "grad_norm": 70.4573974609375, "learning_rate": 5.483564551499303e-06, "loss": 3.3632, "step": 71400 }, { "epoch": 3.0185333727361, "grad_norm": 34.62995529174805, "learning_rate": 5.460110468729844e-06, "loss": 2.95, "step": 71500 }, { "epoch": 3.0185333727361, "eval_loss": 3.7984957695007324, "eval_runtime": 381.4288, "eval_samples_per_second": 496.793, "eval_steps_per_second": 15.526, "step": 71500 }, { "epoch": 3.022755097732934, "grad_norm": 47.09511184692383, "learning_rate": 5.436656385960386e-06, "loss": 3.164, "step": 71600 }, { "epoch": 3.0269768227297673, "grad_norm": 43.297088623046875, "learning_rate": 5.4132023031909285e-06, "loss": 3.4829, "step": 71700 }, { "epoch": 3.0311985477266012, "grad_norm": 119.86432647705078, "learning_rate": 5.38974822042147e-06, "loss": 3.7491, "step": 71800 }, { "epoch": 3.0354202727234347, "grad_norm": 35.53313064575195, "learning_rate": 5.366294137652012e-06, "loss": 3.7257, "step": 71900 }, { "epoch": 3.0396419977202687, "grad_norm": 82.5821304321289, "learning_rate": 5.342840054882555e-06, "loss": 3.701, "step": 72000 }, { "epoch": 3.0396419977202687, "eval_loss": 3.749661684036255, "eval_runtime": 378.3724, 
"eval_samples_per_second": 500.806, "eval_steps_per_second": 15.651, "step": 72000 }, { "epoch": 3.043863722717102, "grad_norm": 46.78045654296875, "learning_rate": 5.319385972113096e-06, "loss": 3.105, "step": 72100 }, { "epoch": 3.048085447713936, "grad_norm": 24.64872169494629, "learning_rate": 5.295931889343638e-06, "loss": 3.43, "step": 72200 }, { "epoch": 3.0523071727107696, "grad_norm": 54.81727981567383, "learning_rate": 5.272946888229569e-06, "loss": 3.3014, "step": 72300 }, { "epoch": 3.0565288977076035, "grad_norm": 35.038734436035156, "learning_rate": 5.249492805460111e-06, "loss": 3.2578, "step": 72400 }, { "epoch": 3.060750622704437, "grad_norm": 63.87615203857422, "learning_rate": 5.226038722690653e-06, "loss": 3.4971, "step": 72500 }, { "epoch": 3.060750622704437, "eval_loss": 3.7966361045837402, "eval_runtime": 384.8477, "eval_samples_per_second": 492.379, "eval_steps_per_second": 15.388, "step": 72500 }, { "epoch": 3.064972347701271, "grad_norm": 41.326568603515625, "learning_rate": 5.202584639921195e-06, "loss": 3.4542, "step": 72600 }, { "epoch": 3.0691940726981044, "grad_norm": 38.29585647583008, "learning_rate": 5.179130557151737e-06, "loss": 3.4634, "step": 72700 }, { "epoch": 3.0734157976949383, "grad_norm": 131.59951782226562, "learning_rate": 5.155676474382278e-06, "loss": 3.6576, "step": 72800 }, { "epoch": 3.077637522691772, "grad_norm": 33.464576721191406, "learning_rate": 5.13222239161282e-06, "loss": 3.38, "step": 72900 }, { "epoch": 3.0818592476886058, "grad_norm": 56.84671401977539, "learning_rate": 5.108768308843363e-06, "loss": 3.4615, "step": 73000 }, { "epoch": 3.0818592476886058, "eval_loss": 3.733682870864868, "eval_runtime": 379.8356, "eval_samples_per_second": 498.876, "eval_steps_per_second": 15.591, "step": 73000 }, { "epoch": 3.0860809726854392, "grad_norm": 78.62255096435547, "learning_rate": 5.0853142260739044e-06, "loss": 3.6838, "step": 73100 }, { "epoch": 3.090302697682273, "grad_norm": 40.18800735473633, 
"learning_rate": 5.061860143304445e-06, "loss": 3.2183, "step": 73200 }, { "epoch": 3.0945244226791067, "grad_norm": 72.73197174072266, "learning_rate": 5.038406060534988e-06, "loss": 3.3281, "step": 73300 }, { "epoch": 3.0987461476759406, "grad_norm": 91.64218139648438, "learning_rate": 5.01495197776553e-06, "loss": 3.2304, "step": 73400 }, { "epoch": 3.102967872672774, "grad_norm": 43.35820007324219, "learning_rate": 4.9914978949960715e-06, "loss": 3.3314, "step": 73500 }, { "epoch": 3.102967872672774, "eval_loss": 3.7586066722869873, "eval_runtime": 377.7973, "eval_samples_per_second": 501.568, "eval_steps_per_second": 15.675, "step": 73500 }, { "epoch": 3.107189597669608, "grad_norm": 175.2445526123047, "learning_rate": 4.968043812226614e-06, "loss": 3.0549, "step": 73600 }, { "epoch": 3.1114113226664415, "grad_norm": 57.96905517578125, "learning_rate": 4.944589729457155e-06, "loss": 3.4578, "step": 73700 }, { "epoch": 3.1156330476632754, "grad_norm": 32.305484771728516, "learning_rate": 4.921135646687698e-06, "loss": 3.3797, "step": 73800 }, { "epoch": 3.119854772660109, "grad_norm": 32.543643951416016, "learning_rate": 4.8976815639182395e-06, "loss": 3.4435, "step": 73900 }, { "epoch": 3.124076497656943, "grad_norm": 62.69993591308594, "learning_rate": 4.874227481148781e-06, "loss": 3.1522, "step": 74000 }, { "epoch": 3.124076497656943, "eval_loss": 3.7354512214660645, "eval_runtime": 379.0534, "eval_samples_per_second": 499.906, "eval_steps_per_second": 15.623, "step": 74000 }, { "epoch": 3.1282982226537763, "grad_norm": 42.34346008300781, "learning_rate": 4.850773398379323e-06, "loss": 3.3775, "step": 74100 }, { "epoch": 3.13251994765061, "grad_norm": 88.97834014892578, "learning_rate": 4.827319315609866e-06, "loss": 3.5751, "step": 74200 }, { "epoch": 3.1367416726474437, "grad_norm": 99.53689575195312, "learning_rate": 4.803865232840407e-06, "loss": 3.5017, "step": 74300 }, { "epoch": 3.1409633976442777, "grad_norm": 29.37308120727539, "learning_rate": 
4.780411150070949e-06, "loss": 3.3353, "step": 74400 }, { "epoch": 3.145185122641111, "grad_norm": 80.75165557861328, "learning_rate": 4.757191608129185e-06, "loss": 3.5746, "step": 74500 }, { "epoch": 3.145185122641111, "eval_loss": 3.7411203384399414, "eval_runtime": 374.8408, "eval_samples_per_second": 505.524, "eval_steps_per_second": 15.799, "step": 74500 }, { "epoch": 3.1494068476379447, "grad_norm": 26.067644119262695, "learning_rate": 4.733737525359727e-06, "loss": 3.7003, "step": 74600 }, { "epoch": 3.1536285726347786, "grad_norm": 32.2315788269043, "learning_rate": 4.71028344259027e-06, "loss": 3.0499, "step": 74700 }, { "epoch": 3.157850297631612, "grad_norm": 49.030643463134766, "learning_rate": 4.686829359820811e-06, "loss": 3.3735, "step": 74800 }, { "epoch": 3.162072022628446, "grad_norm": 103.1226577758789, "learning_rate": 4.663375277051353e-06, "loss": 3.5844, "step": 74900 }, { "epoch": 3.1662937476252795, "grad_norm": 54.65353775024414, "learning_rate": 4.639921194281895e-06, "loss": 3.3551, "step": 75000 }, { "epoch": 3.1662937476252795, "eval_loss": 3.8240253925323486, "eval_runtime": 380.1742, "eval_samples_per_second": 498.432, "eval_steps_per_second": 15.577, "step": 75000 }, { "epoch": 3.1705154726221134, "grad_norm": 48.09339904785156, "learning_rate": 4.616467111512437e-06, "loss": 3.262, "step": 75100 }, { "epoch": 3.174737197618947, "grad_norm": 37.72649383544922, "learning_rate": 4.5930130287429785e-06, "loss": 3.4301, "step": 75200 }, { "epoch": 3.178958922615781, "grad_norm": 46.34507369995117, "learning_rate": 4.56955894597352e-06, "loss": 3.464, "step": 75300 }, { "epoch": 3.1831806476126143, "grad_norm": 68.04910278320312, "learning_rate": 4.546104863204063e-06, "loss": 3.4751, "step": 75400 }, { "epoch": 3.1874023726094483, "grad_norm": 44.80384826660156, "learning_rate": 4.522650780434605e-06, "loss": 3.5351, "step": 75500 }, { "epoch": 3.1874023726094483, "eval_loss": 3.7464685440063477, "eval_runtime": 387.0901, 
"eval_samples_per_second": 489.527, "eval_steps_per_second": 15.299, "step": 75500 }, { "epoch": 3.1916240976062817, "grad_norm": 51.414955139160156, "learning_rate": 4.499196697665146e-06, "loss": 3.2933, "step": 75600 }, { "epoch": 3.1958458226031157, "grad_norm": 78.93827056884766, "learning_rate": 4.475742614895688e-06, "loss": 4.0448, "step": 75700 }, { "epoch": 3.200067547599949, "grad_norm": 28.45920753479004, "learning_rate": 4.452288532126231e-06, "loss": 3.4882, "step": 75800 }, { "epoch": 3.204289272596783, "grad_norm": 56.69211959838867, "learning_rate": 4.428834449356772e-06, "loss": 3.615, "step": 75900 }, { "epoch": 3.2085109975936166, "grad_norm": 43.08451843261719, "learning_rate": 4.405380366587314e-06, "loss": 3.1492, "step": 76000 }, { "epoch": 3.2085109975936166, "eval_loss": 3.7142016887664795, "eval_runtime": 383.7896, "eval_samples_per_second": 493.737, "eval_steps_per_second": 15.43, "step": 76000 }, { "epoch": 3.2127327225904505, "grad_norm": 43.29754638671875, "learning_rate": 4.381926283817856e-06, "loss": 3.0458, "step": 76100 }, { "epoch": 3.216954447587284, "grad_norm": 87.42881774902344, "learning_rate": 4.358472201048398e-06, "loss": 3.6002, "step": 76200 }, { "epoch": 3.221176172584118, "grad_norm": 119.25186920166016, "learning_rate": 4.33501811827894e-06, "loss": 3.3197, "step": 76300 }, { "epoch": 3.2253978975809514, "grad_norm": 70.65145111083984, "learning_rate": 4.3115640355094814e-06, "loss": 3.3113, "step": 76400 }, { "epoch": 3.2296196225777853, "grad_norm": 48.13566207885742, "learning_rate": 4.288109952740024e-06, "loss": 3.3607, "step": 76500 }, { "epoch": 3.2296196225777853, "eval_loss": 3.777985095977783, "eval_runtime": 381.0294, "eval_samples_per_second": 497.313, "eval_steps_per_second": 15.542, "step": 76500 }, { "epoch": 3.233841347574619, "grad_norm": 43.97998809814453, "learning_rate": 4.264655869970565e-06, "loss": 3.3242, "step": 76600 }, { "epoch": 3.2380630725714528, "grad_norm": 97.08489990234375, 
"learning_rate": 4.241201787201108e-06, "loss": 3.6477, "step": 76700 }, { "epoch": 3.2422847975682862, "grad_norm": 63.02432632446289, "learning_rate": 4.217747704431649e-06, "loss": 3.1657, "step": 76800 }, { "epoch": 3.24650652256512, "grad_norm": 34.82364273071289, "learning_rate": 4.194528162489886e-06, "loss": 3.0839, "step": 76900 }, { "epoch": 3.2507282475619537, "grad_norm": 37.6974983215332, "learning_rate": 4.171074079720427e-06, "loss": 3.599, "step": 77000 }, { "epoch": 3.2507282475619537, "eval_loss": 3.7618165016174316, "eval_runtime": 380.91, "eval_samples_per_second": 497.469, "eval_steps_per_second": 15.547, "step": 77000 }, { "epoch": 3.2549499725587876, "grad_norm": 44.467830657958984, "learning_rate": 4.14761999695097e-06, "loss": 3.1563, "step": 77100 }, { "epoch": 3.259171697555621, "grad_norm": 25.788896560668945, "learning_rate": 4.124165914181512e-06, "loss": 3.1867, "step": 77200 }, { "epoch": 3.263393422552455, "grad_norm": 57.96255874633789, "learning_rate": 4.100711831412053e-06, "loss": 3.5676, "step": 77300 }, { "epoch": 3.2676151475492885, "grad_norm": 58.24321746826172, "learning_rate": 4.077257748642595e-06, "loss": 3.6313, "step": 77400 }, { "epoch": 3.2718368725461224, "grad_norm": 48.765167236328125, "learning_rate": 4.053803665873137e-06, "loss": 3.2504, "step": 77500 }, { "epoch": 3.2718368725461224, "eval_loss": 3.7300686836242676, "eval_runtime": 379.0002, "eval_samples_per_second": 499.976, "eval_steps_per_second": 15.625, "step": 77500 }, { "epoch": 3.276058597542956, "grad_norm": 89.91081237792969, "learning_rate": 4.0303495831036795e-06, "loss": 3.2488, "step": 77600 }, { "epoch": 3.28028032253979, "grad_norm": 85.31146240234375, "learning_rate": 4.0068955003342205e-06, "loss": 3.0412, "step": 77700 }, { "epoch": 3.2845020475366233, "grad_norm": 81.51136016845703, "learning_rate": 3.983441417564763e-06, "loss": 3.1514, "step": 77800 }, { "epoch": 3.2887237725334573, "grad_norm": 33.587486267089844, "learning_rate": 
3.959987334795305e-06, "loss": 2.9742, "step": 77900 }, { "epoch": 3.2929454975302908, "grad_norm": 58.87001037597656, "learning_rate": 3.936533252025847e-06, "loss": 3.395, "step": 78000 }, { "epoch": 3.2929454975302908, "eval_loss": 3.757880210876465, "eval_runtime": 387.0641, "eval_samples_per_second": 489.56, "eval_steps_per_second": 15.3, "step": 78000 }, { "epoch": 3.2971672225271247, "grad_norm": 101.9001693725586, "learning_rate": 3.913079169256388e-06, "loss": 3.5513, "step": 78100 }, { "epoch": 3.301388947523958, "grad_norm": 59.32830047607422, "learning_rate": 3.889625086486931e-06, "loss": 3.3194, "step": 78200 }, { "epoch": 3.305610672520792, "grad_norm": 40.857460021972656, "learning_rate": 3.866171003717473e-06, "loss": 3.2702, "step": 78300 }, { "epoch": 3.3098323975176256, "grad_norm": 32.93022155761719, "learning_rate": 3.8427169209480146e-06, "loss": 3.322, "step": 78400 }, { "epoch": 3.3140541225144595, "grad_norm": 50.25773239135742, "learning_rate": 3.819262838178556e-06, "loss": 3.5357, "step": 78500 }, { "epoch": 3.3140541225144595, "eval_loss": 3.782740354537964, "eval_runtime": 385.7272, "eval_samples_per_second": 491.257, "eval_steps_per_second": 15.353, "step": 78500 }, { "epoch": 3.318275847511293, "grad_norm": 136.00146484375, "learning_rate": 3.7958087554090985e-06, "loss": 3.3831, "step": 78600 }, { "epoch": 3.322497572508127, "grad_norm": 56.89426803588867, "learning_rate": 3.77235467263964e-06, "loss": 3.3878, "step": 78700 }, { "epoch": 3.3267192975049604, "grad_norm": 39.877349853515625, "learning_rate": 3.748900589870182e-06, "loss": 3.2869, "step": 78800 }, { "epoch": 3.3309410225017944, "grad_norm": 70.3539047241211, "learning_rate": 3.725446507100724e-06, "loss": 3.7636, "step": 78900 }, { "epoch": 3.335162747498628, "grad_norm": 72.76617431640625, "learning_rate": 3.701992424331266e-06, "loss": 3.4089, "step": 79000 }, { "epoch": 3.335162747498628, "eval_loss": 3.7984070777893066, "eval_runtime": 375.5303, 
"eval_samples_per_second": 504.596, "eval_steps_per_second": 15.77, "step": 79000 }, { "epoch": 3.3393844724954618, "grad_norm": 37.42483901977539, "learning_rate": 3.6785383415618074e-06, "loss": 3.3371, "step": 79100 }, { "epoch": 3.3436061974922953, "grad_norm": 54.95014953613281, "learning_rate": 3.6550842587923496e-06, "loss": 3.5966, "step": 79200 }, { "epoch": 3.347827922489129, "grad_norm": 231.54039001464844, "learning_rate": 3.6316301760228918e-06, "loss": 3.8318, "step": 79300 }, { "epoch": 3.3520496474859627, "grad_norm": 22.132776260375977, "learning_rate": 3.6084106340811283e-06, "loss": 3.4452, "step": 79400 }, { "epoch": 3.3562713724827966, "grad_norm": 46.317466735839844, "learning_rate": 3.5849565513116696e-06, "loss": 3.1789, "step": 79500 }, { "epoch": 3.3562713724827966, "eval_loss": 3.728987693786621, "eval_runtime": 381.2454, "eval_samples_per_second": 497.032, "eval_steps_per_second": 15.533, "step": 79500 }, { "epoch": 3.36049309747963, "grad_norm": 18.88166046142578, "learning_rate": 3.561502468542212e-06, "loss": 3.1829, "step": 79600 }, { "epoch": 3.364714822476464, "grad_norm": 61.83921813964844, "learning_rate": 3.5380483857727536e-06, "loss": 3.4624, "step": 79700 }, { "epoch": 3.3689365474732975, "grad_norm": 43.87940216064453, "learning_rate": 3.5145943030032958e-06, "loss": 3.3163, "step": 79800 }, { "epoch": 3.3731582724701314, "grad_norm": 35.34148406982422, "learning_rate": 3.4911402202338375e-06, "loss": 3.2591, "step": 79900 }, { "epoch": 3.377379997466965, "grad_norm": 38.877281188964844, "learning_rate": 3.4676861374643793e-06, "loss": 3.2375, "step": 80000 }, { "epoch": 3.377379997466965, "eval_loss": 3.715250015258789, "eval_runtime": 385.7979, "eval_samples_per_second": 491.166, "eval_steps_per_second": 15.35, "step": 80000 }, { "epoch": 3.381601722463799, "grad_norm": 94.71104431152344, "learning_rate": 3.4442320546949215e-06, "loss": 3.0596, "step": 80100 }, { "epoch": 3.3858234474606324, "grad_norm": 87.31482696533203, 
"learning_rate": 3.420777971925463e-06, "loss": 3.2673, "step": 80200 }, { "epoch": 3.3900451724574663, "grad_norm": 61.90350341796875, "learning_rate": 3.397323889156005e-06, "loss": 3.8284, "step": 80300 }, { "epoch": 3.3942668974542998, "grad_norm": 36.948333740234375, "learning_rate": 3.3738698063865473e-06, "loss": 3.2518, "step": 80400 }, { "epoch": 3.3984886224511337, "grad_norm": 74.06956481933594, "learning_rate": 3.3504157236170886e-06, "loss": 3.4214, "step": 80500 }, { "epoch": 3.3984886224511337, "eval_loss": 3.7571513652801514, "eval_runtime": 365.367, "eval_samples_per_second": 518.632, "eval_steps_per_second": 16.208, "step": 80500 }, { "epoch": 3.402710347447967, "grad_norm": 243.53871154785156, "learning_rate": 3.326961640847631e-06, "loss": 3.3534, "step": 80600 }, { "epoch": 3.406932072444801, "grad_norm": 23.158308029174805, "learning_rate": 3.303507558078173e-06, "loss": 3.7609, "step": 80700 }, { "epoch": 3.4111537974416346, "grad_norm": 70.86226654052734, "learning_rate": 3.2800534753087148e-06, "loss": 3.7096, "step": 80800 }, { "epoch": 3.4153755224384685, "grad_norm": 36.22040557861328, "learning_rate": 3.2565993925392565e-06, "loss": 2.9755, "step": 80900 }, { "epoch": 3.419597247435302, "grad_norm": 110.31868743896484, "learning_rate": 3.2331453097697983e-06, "loss": 3.4585, "step": 81000 }, { "epoch": 3.419597247435302, "eval_loss": 3.736179828643799, "eval_runtime": 373.3817, "eval_samples_per_second": 507.499, "eval_steps_per_second": 15.86, "step": 81000 }, { "epoch": 3.423818972432136, "grad_norm": 60.848907470703125, "learning_rate": 3.2096912270003405e-06, "loss": 3.5315, "step": 81100 }, { "epoch": 3.4280406974289694, "grad_norm": 27.560346603393555, "learning_rate": 3.186237144230882e-06, "loss": 3.4276, "step": 81200 }, { "epoch": 3.4322624224258034, "grad_norm": 46.1323127746582, "learning_rate": 3.162783061461424e-06, "loss": 3.5303, "step": 81300 }, { "epoch": 3.436484147422637, "grad_norm": 55.932132720947266, 
"learning_rate": 3.1393289786919662e-06, "loss": 3.2272, "step": 81400 }, { "epoch": 3.440705872419471, "grad_norm": 36.39140701293945, "learning_rate": 3.1158748959225076e-06, "loss": 2.9614, "step": 81500 }, { "epoch": 3.440705872419471, "eval_loss": 3.7592084407806396, "eval_runtime": 380.4214, "eval_samples_per_second": 498.108, "eval_steps_per_second": 15.567, "step": 81500 }, { "epoch": 3.4449275974163043, "grad_norm": 63.25979232788086, "learning_rate": 3.0924208131530498e-06, "loss": 3.3272, "step": 81600 }, { "epoch": 3.449149322413138, "grad_norm": 60.168540954589844, "learning_rate": 3.068966730383592e-06, "loss": 3.548, "step": 81700 }, { "epoch": 3.4533710474099717, "grad_norm": 65.39545440673828, "learning_rate": 3.0455126476141338e-06, "loss": 3.5806, "step": 81800 }, { "epoch": 3.457592772406805, "grad_norm": 71.17179107666016, "learning_rate": 3.0220585648446755e-06, "loss": 3.2915, "step": 81900 }, { "epoch": 3.461814497403639, "grad_norm": 83.21878814697266, "learning_rate": 2.9986044820752173e-06, "loss": 3.4571, "step": 82000 }, { "epoch": 3.461814497403639, "eval_loss": 3.7446765899658203, "eval_runtime": 371.3966, "eval_samples_per_second": 510.212, "eval_steps_per_second": 15.945, "step": 82000 }, { "epoch": 3.466036222400473, "grad_norm": 40.86414337158203, "learning_rate": 2.9751503993057595e-06, "loss": 3.2471, "step": 82100 }, { "epoch": 3.4702579473973065, "grad_norm": 69.95441436767578, "learning_rate": 2.9516963165363013e-06, "loss": 3.3675, "step": 82200 }, { "epoch": 3.47447967239414, "grad_norm": 87.65948486328125, "learning_rate": 2.928242233766843e-06, "loss": 3.039, "step": 82300 }, { "epoch": 3.478701397390974, "grad_norm": 42.40869140625, "learning_rate": 2.9047881509973852e-06, "loss": 3.1737, "step": 82400 }, { "epoch": 3.482923122387808, "grad_norm": 95.52741241455078, "learning_rate": 2.8813340682279274e-06, "loss": 3.5937, "step": 82500 }, { "epoch": 3.482923122387808, "eval_loss": 3.7526960372924805, "eval_runtime": 
382.5828, "eval_samples_per_second": 495.294, "eval_steps_per_second": 15.479, "step": 82500 }, { "epoch": 3.4871448473846414, "grad_norm": 99.28910064697266, "learning_rate": 2.858114526286164e-06, "loss": 3.3723, "step": 82600 }, { "epoch": 3.491366572381475, "grad_norm": 84.63138580322266, "learning_rate": 2.8346604435167053e-06, "loss": 3.5835, "step": 82700 }, { "epoch": 3.495588297378309, "grad_norm": 30.03119468688965, "learning_rate": 2.8112063607472475e-06, "loss": 3.3739, "step": 82800 }, { "epoch": 3.4998100223751427, "grad_norm": 45.82896041870117, "learning_rate": 2.7877522779777892e-06, "loss": 3.3891, "step": 82900 }, { "epoch": 3.504031747371976, "grad_norm": 64.97034454345703, "learning_rate": 2.764298195208331e-06, "loss": 3.5204, "step": 83000 }, { "epoch": 3.504031747371976, "eval_loss": 3.7397613525390625, "eval_runtime": 380.231, "eval_samples_per_second": 498.358, "eval_steps_per_second": 15.575, "step": 83000 }, { "epoch": 3.5082534723688097, "grad_norm": 23.58553123474121, "learning_rate": 2.7408441124388728e-06, "loss": 3.0925, "step": 83100 }, { "epoch": 3.5124751973656436, "grad_norm": 46.03030014038086, "learning_rate": 2.717390029669415e-06, "loss": 3.2285, "step": 83200 }, { "epoch": 3.5166969223624776, "grad_norm": 51.1907844543457, "learning_rate": 2.693935946899957e-06, "loss": 3.4032, "step": 83300 }, { "epoch": 3.520918647359311, "grad_norm": 72.13794708251953, "learning_rate": 2.6704818641304985e-06, "loss": 3.5367, "step": 83400 }, { "epoch": 3.5251403723561445, "grad_norm": 31.403751373291016, "learning_rate": 2.6470277813610407e-06, "loss": 3.1513, "step": 83500 }, { "epoch": 3.5251403723561445, "eval_loss": 3.7454099655151367, "eval_runtime": 387.0416, "eval_samples_per_second": 489.588, "eval_steps_per_second": 15.301, "step": 83500 }, { "epoch": 3.5293620973529785, "grad_norm": 45.70579147338867, "learning_rate": 2.623573698591583e-06, "loss": 3.292, "step": 83600 }, { "epoch": 3.5335838223498124, "grad_norm": 
53.186622619628906, "learning_rate": 2.6001196158221242e-06, "loss": 3.2018, "step": 83700 }, { "epoch": 3.537805547346646, "grad_norm": 60.18638610839844, "learning_rate": 2.5766655330526664e-06, "loss": 3.4814, "step": 83800 }, { "epoch": 3.5420272723434794, "grad_norm": 63.47237014770508, "learning_rate": 2.5532114502832082e-06, "loss": 3.2591, "step": 83900 }, { "epoch": 3.5462489973403133, "grad_norm": 35.679569244384766, "learning_rate": 2.52975736751375e-06, "loss": 3.179, "step": 84000 }, { "epoch": 3.5462489973403133, "eval_loss": 3.7721922397613525, "eval_runtime": 368.2527, "eval_samples_per_second": 514.568, "eval_steps_per_second": 16.081, "step": 84000 }, { "epoch": 3.5504707223371472, "grad_norm": 70.08448791503906, "learning_rate": 2.5063032847442918e-06, "loss": 3.3408, "step": 84100 }, { "epoch": 3.5546924473339807, "grad_norm": 30.69338607788086, "learning_rate": 2.482849201974834e-06, "loss": 3.6131, "step": 84200 }, { "epoch": 3.558914172330814, "grad_norm": 57.080509185791016, "learning_rate": 2.4593951192053757e-06, "loss": 3.2299, "step": 84300 }, { "epoch": 3.563135897327648, "grad_norm": 76.74966430664062, "learning_rate": 2.435941036435918e-06, "loss": 3.3005, "step": 84400 }, { "epoch": 3.567357622324482, "grad_norm": 33.93446731567383, "learning_rate": 2.4124869536664597e-06, "loss": 3.4731, "step": 84500 }, { "epoch": 3.567357622324482, "eval_loss": 3.7457754611968994, "eval_runtime": 380.3578, "eval_samples_per_second": 498.191, "eval_steps_per_second": 15.57, "step": 84500 }, { "epoch": 3.5715793473213155, "grad_norm": 48.096961975097656, "learning_rate": 2.3890328708970015e-06, "loss": 3.2288, "step": 84600 }, { "epoch": 3.575801072318149, "grad_norm": 107.95574188232422, "learning_rate": 2.3655787881275437e-06, "loss": 3.4384, "step": 84700 }, { "epoch": 3.580022797314983, "grad_norm": 213.74327087402344, "learning_rate": 2.3421247053580854e-06, "loss": 3.6438, "step": 84800 }, { "epoch": 3.5842445223118164, "grad_norm": 
42.909175872802734, "learning_rate": 2.318670622588627e-06, "loss": 3.3293, "step": 84900 }, { "epoch": 3.5884662473086504, "grad_norm": 32.89263916015625, "learning_rate": 2.2952165398191694e-06, "loss": 3.3555, "step": 85000 }, { "epoch": 3.5884662473086504, "eval_loss": 3.7533552646636963, "eval_runtime": 386.4534, "eval_samples_per_second": 490.333, "eval_steps_per_second": 15.324, "step": 85000 }, { "epoch": 3.592687972305484, "grad_norm": 52.0558967590332, "learning_rate": 2.271762457049711e-06, "loss": 3.4791, "step": 85100 }, { "epoch": 3.596909697302318, "grad_norm": 36.7622184753418, "learning_rate": 2.248308374280253e-06, "loss": 3.1024, "step": 85200 }, { "epoch": 3.6011314222991513, "grad_norm": 37.515411376953125, "learning_rate": 2.2248542915107947e-06, "loss": 3.4605, "step": 85300 }, { "epoch": 3.605353147295985, "grad_norm": 35.729393005371094, "learning_rate": 2.201400208741337e-06, "loss": 3.4317, "step": 85400 }, { "epoch": 3.6095748722928187, "grad_norm": 117.54667663574219, "learning_rate": 2.1779461259718787e-06, "loss": 3.2913, "step": 85500 }, { "epoch": 3.6095748722928187, "eval_loss": 3.703676223754883, "eval_runtime": 374.3622, "eval_samples_per_second": 506.17, "eval_steps_per_second": 15.819, "step": 85500 }, { "epoch": 3.6137965972896526, "grad_norm": 60.1458854675293, "learning_rate": 2.1544920432024205e-06, "loss": 3.3377, "step": 85600 }, { "epoch": 3.618018322286486, "grad_norm": 74.54237365722656, "learning_rate": 2.1310379604329627e-06, "loss": 3.2746, "step": 85700 }, { "epoch": 3.62224004728332, "grad_norm": 82.62213134765625, "learning_rate": 2.1075838776635044e-06, "loss": 3.4173, "step": 85800 }, { "epoch": 3.6264617722801535, "grad_norm": 41.210052490234375, "learning_rate": 2.0841297948940466e-06, "loss": 3.4623, "step": 85900 }, { "epoch": 3.6306834972769875, "grad_norm": 24.477449417114258, "learning_rate": 2.0606757121245884e-06, "loss": 3.2596, "step": 86000 }, { "epoch": 3.6306834972769875, "eval_loss": 
3.705226421356201, "eval_runtime": 377.8636, "eval_samples_per_second": 501.48, "eval_steps_per_second": 15.672, "step": 86000 }, { "epoch": 3.634905222273821, "grad_norm": 44.822235107421875, "learning_rate": 2.03722162935513e-06, "loss": 3.2322, "step": 86100 }, { "epoch": 3.639126947270655, "grad_norm": 116.65343475341797, "learning_rate": 2.013767546585672e-06, "loss": 3.2082, "step": 86200 }, { "epoch": 3.6433486722674884, "grad_norm": 38.853790283203125, "learning_rate": 1.9903134638162137e-06, "loss": 3.4993, "step": 86300 }, { "epoch": 3.6475703972643223, "grad_norm": 45.420650482177734, "learning_rate": 1.966859381046756e-06, "loss": 3.3922, "step": 86400 }, { "epoch": 3.651792122261156, "grad_norm": 57.968753814697266, "learning_rate": 1.9434052982772977e-06, "loss": 3.2275, "step": 86500 }, { "epoch": 3.651792122261156, "eval_loss": 3.679260492324829, "eval_runtime": 381.7795, "eval_samples_per_second": 496.336, "eval_steps_per_second": 15.512, "step": 86500 }, { "epoch": 3.6560138472579897, "grad_norm": 44.483787536621094, "learning_rate": 1.91995121550784e-06, "loss": 3.3031, "step": 86600 }, { "epoch": 3.660235572254823, "grad_norm": 57.32283020019531, "learning_rate": 1.8967316735660761e-06, "loss": 3.2876, "step": 86700 }, { "epoch": 3.664457297251657, "grad_norm": 63.705631256103516, "learning_rate": 1.8732775907966181e-06, "loss": 3.5403, "step": 86800 }, { "epoch": 3.6686790222484906, "grad_norm": 39.64052200317383, "learning_rate": 1.8498235080271601e-06, "loss": 3.3889, "step": 86900 }, { "epoch": 3.6729007472453246, "grad_norm": 67.5136489868164, "learning_rate": 1.8263694252577019e-06, "loss": 3.3938, "step": 87000 }, { "epoch": 3.6729007472453246, "eval_loss": 3.6927380561828613, "eval_runtime": 390.1224, "eval_samples_per_second": 485.722, "eval_steps_per_second": 15.18, "step": 87000 }, { "epoch": 3.677122472242158, "grad_norm": 65.40144348144531, "learning_rate": 1.8029153424882437e-06, "loss": 3.3169, "step": 87100 }, { "epoch": 
3.681344197238992, "grad_norm": 39.77958297729492, "learning_rate": 1.7794612597187858e-06, "loss": 3.1372, "step": 87200 }, { "epoch": 3.6855659222358255, "grad_norm": 38.851959228515625, "learning_rate": 1.7560071769493276e-06, "loss": 3.0958, "step": 87300 }, { "epoch": 3.6897876472326594, "grad_norm": 75.6167221069336, "learning_rate": 1.7325530941798696e-06, "loss": 3.1186, "step": 87400 }, { "epoch": 3.694009372229493, "grad_norm": 61.1735725402832, "learning_rate": 1.7090990114104114e-06, "loss": 3.5419, "step": 87500 }, { "epoch": 3.694009372229493, "eval_loss": 3.6852128505706787, "eval_runtime": 386.9458, "eval_samples_per_second": 489.709, "eval_steps_per_second": 15.304, "step": 87500 }, { "epoch": 3.698231097226327, "grad_norm": 66.27415466308594, "learning_rate": 1.6856449286409531e-06, "loss": 3.6208, "step": 87600 }, { "epoch": 3.7024528222231603, "grad_norm": 75.61920928955078, "learning_rate": 1.6621908458714953e-06, "loss": 3.3815, "step": 87700 }, { "epoch": 3.7066745472199942, "grad_norm": 77.92952728271484, "learning_rate": 1.6387367631020371e-06, "loss": 2.9388, "step": 87800 }, { "epoch": 3.7108962722168277, "grad_norm": 47.41593551635742, "learning_rate": 1.615282680332579e-06, "loss": 3.2111, "step": 87900 }, { "epoch": 3.7151179972136616, "grad_norm": 106.80548095703125, "learning_rate": 1.5918285975631209e-06, "loss": 3.4742, "step": 88000 }, { "epoch": 3.7151179972136616, "eval_loss": 3.690481424331665, "eval_runtime": 376.2552, "eval_samples_per_second": 503.624, "eval_steps_per_second": 15.739, "step": 88000 }, { "epoch": 3.719339722210495, "grad_norm": 99.46328735351562, "learning_rate": 1.5683745147936626e-06, "loss": 3.2668, "step": 88100 }, { "epoch": 3.723561447207329, "grad_norm": 40.66118240356445, "learning_rate": 1.5451549728518993e-06, "loss": 3.439, "step": 88200 }, { "epoch": 3.7277831722041626, "grad_norm": 47.70708084106445, "learning_rate": 1.5217008900824411e-06, "loss": 3.3342, "step": 88300 }, { "epoch": 
3.7320048972009965, "grad_norm": 79.89665222167969, "learning_rate": 1.498246807312983e-06, "loss": 3.5079, "step": 88400 }, { "epoch": 3.73622662219783, "grad_norm": 39.58213806152344, "learning_rate": 1.474792724543525e-06, "loss": 3.4446, "step": 88500 }, { "epoch": 3.73622662219783, "eval_loss": 3.7126083374023438, "eval_runtime": 378.45, "eval_samples_per_second": 500.703, "eval_steps_per_second": 15.648, "step": 88500 }, { "epoch": 3.740448347194664, "grad_norm": 60.13318634033203, "learning_rate": 1.4513386417740668e-06, "loss": 3.3036, "step": 88600 }, { "epoch": 3.7446700721914974, "grad_norm": 95.5160903930664, "learning_rate": 1.4278845590046088e-06, "loss": 3.323, "step": 88700 }, { "epoch": 3.748891797188331, "grad_norm": 29.227277755737305, "learning_rate": 1.4044304762351508e-06, "loss": 3.2921, "step": 88800 }, { "epoch": 3.753113522185165, "grad_norm": 46.99550247192383, "learning_rate": 1.381210934293387e-06, "loss": 3.3972, "step": 88900 }, { "epoch": 3.7573352471819987, "grad_norm": 44.87736892700195, "learning_rate": 1.3577568515239293e-06, "loss": 3.3132, "step": 89000 }, { "epoch": 3.7573352471819987, "eval_loss": 3.7031056880950928, "eval_runtime": 387.1006, "eval_samples_per_second": 489.514, "eval_steps_per_second": 15.298, "step": 89000 }, { "epoch": 3.7615569721788322, "grad_norm": 249.27423095703125, "learning_rate": 1.334302768754471e-06, "loss": 3.6181, "step": 89100 }, { "epoch": 3.7657786971756657, "grad_norm": 56.87501907348633, "learning_rate": 1.310848685985013e-06, "loss": 3.41, "step": 89200 }, { "epoch": 3.7700004221724996, "grad_norm": 30.149934768676758, "learning_rate": 1.2873946032155548e-06, "loss": 3.2602, "step": 89300 }, { "epoch": 3.7742221471693336, "grad_norm": 81.41632080078125, "learning_rate": 1.2639405204460966e-06, "loss": 3.3742, "step": 89400 }, { "epoch": 3.778443872166167, "grad_norm": 79.90999603271484, "learning_rate": 1.2404864376766386e-06, "loss": 3.2929, "step": 89500 }, { "epoch": 3.778443872166167, 
"eval_loss": 3.7064220905303955, "eval_runtime": 380.9556, "eval_samples_per_second": 497.41, "eval_steps_per_second": 15.545, "step": 89500 }, { "epoch": 3.7826655971630005, "grad_norm": 86.91078186035156, "learning_rate": 1.2170323549071805e-06, "loss": 3.1366, "step": 89600 }, { "epoch": 3.7868873221598345, "grad_norm": 55.73985290527344, "learning_rate": 1.1935782721377225e-06, "loss": 3.5312, "step": 89700 }, { "epoch": 3.7911090471566684, "grad_norm": 47.79535675048828, "learning_rate": 1.1701241893682643e-06, "loss": 3.2735, "step": 89800 }, { "epoch": 3.795330772153502, "grad_norm": 122.197021484375, "learning_rate": 1.1466701065988063e-06, "loss": 3.3797, "step": 89900 }, { "epoch": 3.7995524971503354, "grad_norm": 83.0975341796875, "learning_rate": 1.1232160238293483e-06, "loss": 3.3003, "step": 90000 }, { "epoch": 3.7995524971503354, "eval_loss": 3.713467597961426, "eval_runtime": 391.6827, "eval_samples_per_second": 483.787, "eval_steps_per_second": 15.119, "step": 90000 }, { "epoch": 3.8037742221471693, "grad_norm": 27.892484664916992, "learning_rate": 1.09976194105989e-06, "loss": 3.483, "step": 90100 }, { "epoch": 3.8079959471440032, "grad_norm": 29.49579620361328, "learning_rate": 1.076307858290432e-06, "loss": 3.2309, "step": 90200 }, { "epoch": 3.8122176721408367, "grad_norm": 48.57194900512695, "learning_rate": 1.0528537755209738e-06, "loss": 3.3767, "step": 90300 }, { "epoch": 3.81643939713767, "grad_norm": 34.58513259887695, "learning_rate": 1.0293996927515158e-06, "loss": 2.8296, "step": 90400 }, { "epoch": 3.820661122134504, "grad_norm": 70.50397491455078, "learning_rate": 1.0059456099820578e-06, "loss": 3.361, "step": 90500 }, { "epoch": 3.820661122134504, "eval_loss": 3.714541435241699, "eval_runtime": 387.3586, "eval_samples_per_second": 489.188, "eval_steps_per_second": 15.288, "step": 90500 }, { "epoch": 3.824882847131338, "grad_norm": 49.018592834472656, "learning_rate": 9.824915272125995e-07, "loss": 3.3726, "step": 90600 }, { "epoch": 
3.8291045721281716, "grad_norm": 63.764713287353516, "learning_rate": 9.590374444431415e-07, "loss": 3.2925, "step": 90700 }, { "epoch": 3.833326297125005, "grad_norm": 32.11079788208008, "learning_rate": 9.355833616736834e-07, "loss": 3.5113, "step": 90800 }, { "epoch": 3.837548022121839, "grad_norm": 65.86510467529297, "learning_rate": 9.121292789042254e-07, "loss": 3.6037, "step": 90900 }, { "epoch": 3.841769747118673, "grad_norm": 80.28820037841797, "learning_rate": 8.886751961347673e-07, "loss": 3.0925, "step": 91000 }, { "epoch": 3.841769747118673, "eval_loss": 3.7223033905029297, "eval_runtime": 379.9027, "eval_samples_per_second": 498.788, "eval_steps_per_second": 15.588, "step": 91000 }, { "epoch": 3.8459914721155064, "grad_norm": 44.35951232910156, "learning_rate": 8.652211133653092e-07, "loss": 3.4363, "step": 91100 }, { "epoch": 3.85021319711234, "grad_norm": 31.515623092651367, "learning_rate": 8.41767030595851e-07, "loss": 3.3181, "step": 91200 }, { "epoch": 3.854434922109174, "grad_norm": 80.0140609741211, "learning_rate": 8.183129478263929e-07, "loss": 3.4216, "step": 91300 }, { "epoch": 3.8586566471060078, "grad_norm": 32.14973068237305, "learning_rate": 7.948588650569349e-07, "loss": 3.1301, "step": 91400 }, { "epoch": 3.8628783721028412, "grad_norm": 47.468631744384766, "learning_rate": 7.714047822874768e-07, "loss": 3.5791, "step": 91500 }, { "epoch": 3.8628783721028412, "eval_loss": 3.714407205581665, "eval_runtime": 386.9507, "eval_samples_per_second": 489.703, "eval_steps_per_second": 15.304, "step": 91500 }, { "epoch": 3.8671000970996747, "grad_norm": 17.921409606933594, "learning_rate": 7.479506995180187e-07, "loss": 3.0492, "step": 91600 }, { "epoch": 3.8713218220965087, "grad_norm": 46.39325714111328, "learning_rate": 7.244966167485605e-07, "loss": 3.4513, "step": 91700 }, { "epoch": 3.8755435470933426, "grad_norm": 73.21107482910156, "learning_rate": 7.010425339791024e-07, "loss": 3.7442, "step": 91800 }, { "epoch": 3.879765272090176, 
"grad_norm": 101.89634704589844, "learning_rate": 6.775884512096444e-07, "loss": 3.1566, "step": 91900 }, { "epoch": 3.8839869970870096, "grad_norm": 69.1541748046875, "learning_rate": 6.541343684401863e-07, "loss": 3.3871, "step": 92000 }, { "epoch": 3.8839869970870096, "eval_loss": 3.7024970054626465, "eval_runtime": 386.5745, "eval_samples_per_second": 490.18, "eval_steps_per_second": 15.319, "step": 92000 }, { "epoch": 3.8882087220838435, "grad_norm": 64.59981536865234, "learning_rate": 6.306802856707282e-07, "loss": 3.3478, "step": 92100 }, { "epoch": 3.8924304470806774, "grad_norm": 48.25294494628906, "learning_rate": 6.072262029012701e-07, "loss": 3.2922, "step": 92200 }, { "epoch": 3.896652172077511, "grad_norm": 33.20170974731445, "learning_rate": 5.83772120131812e-07, "loss": 3.0988, "step": 92300 }, { "epoch": 3.9008738970743444, "grad_norm": 59.5956916809082, "learning_rate": 5.603180373623539e-07, "loss": 3.4383, "step": 92400 }, { "epoch": 3.9050956220711783, "grad_norm": 29.315317153930664, "learning_rate": 5.368639545928959e-07, "loss": 3.175, "step": 92500 }, { "epoch": 3.9050956220711783, "eval_loss": 3.702639579772949, "eval_runtime": 374.6028, "eval_samples_per_second": 505.845, "eval_steps_per_second": 15.809, "step": 92500 }, { "epoch": 3.9093173470680123, "grad_norm": 86.02645111083984, "learning_rate": 5.134098718234377e-07, "loss": 3.3831, "step": 92600 }, { "epoch": 3.9135390720648457, "grad_norm": 53.895973205566406, "learning_rate": 4.899557890539796e-07, "loss": 3.3871, "step": 92700 }, { "epoch": 3.9177607970616792, "grad_norm": 41.91706085205078, "learning_rate": 4.6650170628452154e-07, "loss": 3.5747, "step": 92800 }, { "epoch": 3.921982522058513, "grad_norm": 35.26768112182617, "learning_rate": 4.4304762351506336e-07, "loss": 3.272, "step": 92900 }, { "epoch": 3.9262042470553467, "grad_norm": 29.90506362915039, "learning_rate": 4.195935407456053e-07, "loss": 3.4294, "step": 93000 }, { "epoch": 3.9262042470553467, "eval_loss": 
3.6876306533813477, "eval_runtime": 371.6904, "eval_samples_per_second": 509.809, "eval_steps_per_second": 15.933, "step": 93000 }, { "epoch": 3.9304259720521806, "grad_norm": 28.437841415405273, "learning_rate": 3.961394579761472e-07, "loss": 3.6332, "step": 93100 }, { "epoch": 3.934647697049014, "grad_norm": 40.40605163574219, "learning_rate": 3.726853752066891e-07, "loss": 3.626, "step": 93200 }, { "epoch": 3.938869422045848, "grad_norm": 53.085670471191406, "learning_rate": 3.4923129243723103e-07, "loss": 3.5402, "step": 93300 }, { "epoch": 3.9430911470426815, "grad_norm": 51.06597900390625, "learning_rate": 3.2577720966777297e-07, "loss": 3.348, "step": 93400 }, { "epoch": 3.9473128720395154, "grad_norm": 48.22876739501953, "learning_rate": 3.0232312689831484e-07, "loss": 3.2556, "step": 93500 }, { "epoch": 3.9473128720395154, "eval_loss": 3.6902225017547607, "eval_runtime": 371.965, "eval_samples_per_second": 509.432, "eval_steps_per_second": 15.921, "step": 93500 }, { "epoch": 3.951534597036349, "grad_norm": 43.102745056152344, "learning_rate": 2.788690441288568e-07, "loss": 3.5298, "step": 93600 }, { "epoch": 3.955756322033183, "grad_norm": 38.394798278808594, "learning_rate": 2.5541496135939865e-07, "loss": 3.5247, "step": 93700 }, { "epoch": 3.9599780470300163, "grad_norm": 33.49041748046875, "learning_rate": 2.3196087858994056e-07, "loss": 3.0763, "step": 93800 }, { "epoch": 3.9641997720268503, "grad_norm": 36.5673828125, "learning_rate": 2.0850679582048246e-07, "loss": 3.2457, "step": 93900 }, { "epoch": 3.9684214970236837, "grad_norm": 31.246381759643555, "learning_rate": 1.8528725387871894e-07, "loss": 3.1805, "step": 94000 }, { "epoch": 3.9684214970236837, "eval_loss": 3.689732551574707, "eval_runtime": 382.9877, "eval_samples_per_second": 494.77, "eval_steps_per_second": 15.463, "step": 94000 }, { "epoch": 3.9726432220205177, "grad_norm": 51.14909362792969, "learning_rate": 1.6183317110926085e-07, "loss": 3.3959, "step": 94100 }, { "epoch": 
3.976864947017351, "grad_norm": 30.674938201904297, "learning_rate": 1.3837908833980275e-07, "loss": 3.205, "step": 94200 }, { "epoch": 3.981086672014185, "grad_norm": 92.1559829711914, "learning_rate": 1.1492500557034467e-07, "loss": 3.3307, "step": 94300 }, { "epoch": 3.9853083970110186, "grad_norm": 105.24581909179688, "learning_rate": 9.147092280088657e-08, "loss": 3.0448, "step": 94400 }, { "epoch": 3.9895301220078525, "grad_norm": 43.431854248046875, "learning_rate": 6.801684003142848e-08, "loss": 3.0447, "step": 94500 }, { "epoch": 3.9895301220078525, "eval_loss": 3.690552234649658, "eval_runtime": 386.4079, "eval_samples_per_second": 490.391, "eval_steps_per_second": 15.326, "step": 94500 }, { "epoch": 3.993751847004686, "grad_norm": 53.77738952636719, "learning_rate": 4.4562757261970375e-08, "loss": 3.3314, "step": 94600 }, { "epoch": 3.99797357200152, "grad_norm": 63.30742645263672, "learning_rate": 2.1108674492512286e-08, "loss": 3.4516, "step": 94700 } ], "logging_steps": 100, "max_steps": 94748, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }