diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6095 @@ +{ + "best_metric": 0.4145541489124298, + "best_model_checkpoint": "saves/CADICA_qwenvl_stenosis_detect_scale4/lora/sft/checkpoint-2350", + "epoch": 0.8756116404841617, + "eval_steps": 50, + "global_step": 3400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012876641771825909, + "grad_norm": 13.245840411597928, + "learning_rate": 2.9411764705882355e-06, + "loss": 2.8889, + "num_input_tokens_seen": 52840, + "step": 5 + }, + { + "epoch": 0.0025753283543651817, + "grad_norm": 12.237619501215374, + "learning_rate": 5.882352941176471e-06, + "loss": 2.8165, + "num_input_tokens_seen": 105528, + "step": 10 + }, + { + "epoch": 0.0038629925315477724, + "grad_norm": 16.29688816410412, + "learning_rate": 8.823529411764707e-06, + "loss": 2.8363, + "num_input_tokens_seen": 158768, + "step": 15 + }, + { + "epoch": 0.0051506567087303634, + "grad_norm": 11.576419511120797, + "learning_rate": 1.1764705882352942e-05, + "loss": 2.6853, + "num_input_tokens_seen": 210816, + "step": 20 + }, + { + "epoch": 0.006438320885912954, + "grad_norm": 6.9672256792859, + "learning_rate": 1.4705882352941177e-05, + "loss": 2.2992, + "num_input_tokens_seen": 262936, + "step": 25 + }, + { + "epoch": 0.007725985063095545, + "grad_norm": 3.1837818528204305, + "learning_rate": 1.7647058823529414e-05, + "loss": 1.8923, + "num_input_tokens_seen": 315264, + "step": 30 + }, + { + "epoch": 0.009013649240278136, + "grad_norm": 2.835950303969337, + "learning_rate": 2.058823529411765e-05, + "loss": 1.6984, + "num_input_tokens_seen": 367840, + "step": 35 + }, + { + "epoch": 0.010301313417460727, + "grad_norm": 2.223740001042382, + "learning_rate": 2.3529411764705884e-05, + "loss": 1.6434, + "num_input_tokens_seen": 420112, + "step": 40 + }, + { + "epoch": 0.011588977594643318, + "grad_norm": 1.9880935044313244, + "learning_rate": 2.647058823529412e-05, + "loss": 1.4659, + "num_input_tokens_seen": 472728, + "step": 45 + }, + { + "epoch": 0.012876641771825908, + "grad_norm": 1.7151131700495934, + "learning_rate": 2.9411764705882354e-05, + "loss": 1.3506, + "num_input_tokens_seen": 524648, + "step": 50 + }, + { + "epoch": 0.012876641771825908, + "eval_loss": 1.1727452278137207, + "eval_runtime": 66.3207, + "eval_samples_per_second": 1.809, + "eval_steps_per_second": 0.452, + "num_input_tokens_seen": 524648, + "step": 50 + }, + { + "epoch": 0.014164305949008499, + "grad_norm": 1.47475981537851, + "learning_rate": 3.235294117647059e-05, + "loss": 1.1455, + "num_input_tokens_seen": 576472, + "step": 55 + }, + { + "epoch": 0.01545197012619109, + "grad_norm": 1.7476693647440722, + "learning_rate": 3.529411764705883e-05, + "loss": 0.9971, + "num_input_tokens_seen": 628056, + "step": 60 + }, + { + "epoch": 0.01673963430337368, + "grad_norm": 1.3384365493212875, + "learning_rate": 3.8235294117647055e-05, + "loss": 0.9073, + "num_input_tokens_seen": 680448, + "step": 65 + }, + { + "epoch": 0.018027298480556272, + "grad_norm": 0.9014358219807773, + "learning_rate": 4.11764705882353e-05, + "loss": 0.8386, + "num_input_tokens_seen": 733664, + "step": 70 + }, + { + "epoch": 0.01931496265773886, + "grad_norm": 0.8007820009902022, + "learning_rate": 4.411764705882353e-05, + "loss": 0.7827, + "num_input_tokens_seen": 786096, + "step": 75 + }, + { + "epoch": 0.020602626834921454, + "grad_norm": 0.6701003454307716, + "learning_rate": 
4.705882352941177e-05, + "loss": 0.7814, + "num_input_tokens_seen": 838192, + "step": 80 + }, + { + "epoch": 0.021890291012104043, + "grad_norm": 0.8973165751658843, + "learning_rate": 5e-05, + "loss": 0.7297, + "num_input_tokens_seen": 890112, + "step": 85 + }, + { + "epoch": 0.023177955189286635, + "grad_norm": 0.9060968630490469, + "learning_rate": 5.294117647058824e-05, + "loss": 0.7894, + "num_input_tokens_seen": 943472, + "step": 90 + }, + { + "epoch": 0.024465619366469224, + "grad_norm": 0.9520214202472889, + "learning_rate": 5.588235294117647e-05, + "loss": 0.7758, + "num_input_tokens_seen": 996872, + "step": 95 + }, + { + "epoch": 0.025753283543651816, + "grad_norm": 0.8226006535044261, + "learning_rate": 5.882352941176471e-05, + "loss": 0.7577, + "num_input_tokens_seen": 1049816, + "step": 100 + }, + { + "epoch": 0.025753283543651816, + "eval_loss": 0.7517351508140564, + "eval_runtime": 38.7829, + "eval_samples_per_second": 3.094, + "eval_steps_per_second": 0.774, + "num_input_tokens_seen": 1049816, + "step": 100 + }, + { + "epoch": 0.027040947720834405, + "grad_norm": 0.7251208491150668, + "learning_rate": 6.176470588235295e-05, + "loss": 0.7579, + "num_input_tokens_seen": 1102584, + "step": 105 + }, + { + "epoch": 0.028328611898016998, + "grad_norm": 0.8217419839297042, + "learning_rate": 6.470588235294118e-05, + "loss": 0.7659, + "num_input_tokens_seen": 1155512, + "step": 110 + }, + { + "epoch": 0.029616276075199587, + "grad_norm": 0.6768053879888967, + "learning_rate": 6.764705882352942e-05, + "loss": 0.7469, + "num_input_tokens_seen": 1207976, + "step": 115 + }, + { + "epoch": 0.03090394025238218, + "grad_norm": 1.9562630849642013, + "learning_rate": 7.058823529411765e-05, + "loss": 0.7353, + "num_input_tokens_seen": 1259776, + "step": 120 + }, + { + "epoch": 0.03219160442956477, + "grad_norm": 0.6439041597153087, + "learning_rate": 7.352941176470589e-05, + "loss": 0.7537, + "num_input_tokens_seen": 1312760, + "step": 125 + }, + { + "epoch": 0.03347926860674736, + "grad_norm": 0.6124318582166212, + "learning_rate": 7.647058823529411e-05, + "loss": 0.7669, + "num_input_tokens_seen": 1365616, + "step": 130 + }, + { + "epoch": 0.03476693278392995, + "grad_norm": 0.7593534002488418, + "learning_rate": 7.941176470588235e-05, + "loss": 0.722, + "num_input_tokens_seen": 1417544, + "step": 135 + }, + { + "epoch": 0.036054596961112545, + "grad_norm": 0.7827834651032061, + "learning_rate": 8.23529411764706e-05, + "loss": 0.7502, + "num_input_tokens_seen": 1469856, + "step": 140 + }, + { + "epoch": 0.037342261138295134, + "grad_norm": 0.5444126155596626, + "learning_rate": 8.529411764705883e-05, + "loss": 0.7174, + "num_input_tokens_seen": 1521496, + "step": 145 + }, + { + "epoch": 0.03862992531547772, + "grad_norm": 0.40878703812837747, + "learning_rate": 8.823529411764706e-05, + "loss": 0.7018, + "num_input_tokens_seen": 1573376, + "step": 150 + }, + { + "epoch": 0.03862992531547772, + "eval_loss": 0.7309949994087219, + "eval_runtime": 38.2005, + "eval_samples_per_second": 3.141, + "eval_steps_per_second": 0.785, + "num_input_tokens_seen": 1573376, + "step": 150 + }, + { + "epoch": 0.03991758949266031, + "grad_norm": 0.5536144453733772, + "learning_rate": 9.11764705882353e-05, + "loss": 0.738, + "num_input_tokens_seen": 1626136, + "step": 155 + }, + { + "epoch": 0.04120525366984291, + "grad_norm": 0.5151715191704441, + "learning_rate": 9.411764705882353e-05, + "loss": 0.7579, + "num_input_tokens_seen": 1678760, + "step": 160 + }, + { + "epoch": 0.042492917847025496, + "grad_norm": 
0.5209077394596254, + "learning_rate": 9.705882352941177e-05, + "loss": 0.7502, + "num_input_tokens_seen": 1731240, + "step": 165 + }, + { + "epoch": 0.043780582024208085, + "grad_norm": 0.721213601237688, + "learning_rate": 0.0001, + "loss": 0.7448, + "num_input_tokens_seen": 1783816, + "step": 170 + }, + { + "epoch": 0.045068246201390674, + "grad_norm": 0.48666007914879555, + "learning_rate": 9.999940874631277e-05, + "loss": 0.6648, + "num_input_tokens_seen": 1834592, + "step": 175 + }, + { + "epoch": 0.04635591037857327, + "grad_norm": 0.5136600613696797, + "learning_rate": 9.999763499923432e-05, + "loss": 0.7759, + "num_input_tokens_seen": 1888176, + "step": 180 + }, + { + "epoch": 0.04764357455575586, + "grad_norm": 0.6706281530046975, + "learning_rate": 9.999467880071402e-05, + "loss": 0.7167, + "num_input_tokens_seen": 1940280, + "step": 185 + }, + { + "epoch": 0.04893123873293845, + "grad_norm": 0.5159139445497618, + "learning_rate": 9.999054022066641e-05, + "loss": 0.7483, + "num_input_tokens_seen": 1993096, + "step": 190 + }, + { + "epoch": 0.050218902910121044, + "grad_norm": 0.40251006129746847, + "learning_rate": 9.998521935696953e-05, + "loss": 0.7464, + "num_input_tokens_seen": 2045648, + "step": 195 + }, + { + "epoch": 0.05150656708730363, + "grad_norm": 0.4811730853311867, + "learning_rate": 9.997871633546257e-05, + "loss": 0.7594, + "num_input_tokens_seen": 2099008, + "step": 200 + }, + { + "epoch": 0.05150656708730363, + "eval_loss": 0.7274295687675476, + "eval_runtime": 38.079, + "eval_samples_per_second": 3.151, + "eval_steps_per_second": 0.788, + "num_input_tokens_seen": 2099008, + "step": 200 + }, + { + "epoch": 0.05279423126448622, + "grad_norm": 0.591934959695668, + "learning_rate": 9.997103130994296e-05, + "loss": 0.706, + "num_input_tokens_seen": 2151680, + "step": 205 + }, + { + "epoch": 0.05408189544166881, + "grad_norm": 0.48253717444489286, + "learning_rate": 9.996216446216267e-05, + "loss": 0.7186, + "num_input_tokens_seen": 2203784, + "step": 210 + }, + { + "epoch": 0.055369559618851406, + "grad_norm": 0.5274315079401322, + "learning_rate": 9.995211600182397e-05, + "loss": 0.7009, + "num_input_tokens_seen": 2255632, + "step": 215 + }, + { + "epoch": 0.056657223796033995, + "grad_norm": 0.32879215224292613, + "learning_rate": 9.994088616657444e-05, + "loss": 0.6801, + "num_input_tokens_seen": 2308096, + "step": 220 + }, + { + "epoch": 0.057944887973216584, + "grad_norm": 0.37171195071448215, + "learning_rate": 9.992847522200133e-05, + "loss": 0.7569, + "num_input_tokens_seen": 2361168, + "step": 225 + }, + { + "epoch": 0.05923255215039917, + "grad_norm": 0.4120941016934064, + "learning_rate": 9.99148834616253e-05, + "loss": 0.7402, + "num_input_tokens_seen": 2413896, + "step": 230 + }, + { + "epoch": 0.06052021632758177, + "grad_norm": 0.5998680948310651, + "learning_rate": 9.990011120689351e-05, + "loss": 0.7191, + "num_input_tokens_seen": 2466136, + "step": 235 + }, + { + "epoch": 0.06180788050476436, + "grad_norm": 0.538488141249078, + "learning_rate": 9.988415880717194e-05, + "loss": 0.7274, + "num_input_tokens_seen": 2518848, + "step": 240 + }, + { + "epoch": 0.06309554468194695, + "grad_norm": 0.4393093124760277, + "learning_rate": 9.986702663973722e-05, + "loss": 0.7704, + "num_input_tokens_seen": 2572384, + "step": 245 + }, + { + "epoch": 0.06438320885912954, + "grad_norm": 0.6116643616510118, + "learning_rate": 9.98487151097676e-05, + "loss": 0.7346, + "num_input_tokens_seen": 2625352, + "step": 250 + }, + { + "epoch": 0.06438320885912954, + 
"eval_loss": 0.7181503176689148, + "eval_runtime": 38.0986, + "eval_samples_per_second": 3.15, + "eval_steps_per_second": 0.787, + "num_input_tokens_seen": 2625352, + "step": 250 + }, + { + "epoch": 0.06567087303631212, + "grad_norm": 0.41200227731339506, + "learning_rate": 9.98292246503335e-05, + "loss": 0.7408, + "num_input_tokens_seen": 2678216, + "step": 255 + }, + { + "epoch": 0.06695853721349472, + "grad_norm": 0.44521059732114987, + "learning_rate": 9.980855572238714e-05, + "loss": 0.7044, + "num_input_tokens_seen": 2730664, + "step": 260 + }, + { + "epoch": 0.06824620139067732, + "grad_norm": 0.571896859428363, + "learning_rate": 9.978670881475172e-05, + "loss": 0.7334, + "num_input_tokens_seen": 2783584, + "step": 265 + }, + { + "epoch": 0.0695338655678599, + "grad_norm": 0.3907697039722125, + "learning_rate": 9.976368444410985e-05, + "loss": 0.7075, + "num_input_tokens_seen": 2836152, + "step": 270 + }, + { + "epoch": 0.0708215297450425, + "grad_norm": 0.4507806825752261, + "learning_rate": 9.973948315499126e-05, + "loss": 0.7039, + "num_input_tokens_seen": 2887808, + "step": 275 + }, + { + "epoch": 0.07210919392222509, + "grad_norm": 0.41330504132984697, + "learning_rate": 9.971410551976002e-05, + "loss": 0.6953, + "num_input_tokens_seen": 2939656, + "step": 280 + }, + { + "epoch": 0.07339685809940767, + "grad_norm": 0.4625671909482009, + "learning_rate": 9.968755213860094e-05, + "loss": 0.7022, + "num_input_tokens_seen": 2991632, + "step": 285 + }, + { + "epoch": 0.07468452227659027, + "grad_norm": 0.6553627840267285, + "learning_rate": 9.96598236395054e-05, + "loss": 0.6796, + "num_input_tokens_seen": 3043616, + "step": 290 + }, + { + "epoch": 0.07597218645377285, + "grad_norm": 0.5157886895754477, + "learning_rate": 9.96309206782565e-05, + "loss": 0.7346, + "num_input_tokens_seen": 3096920, + "step": 295 + }, + { + "epoch": 0.07725985063095545, + "grad_norm": 0.5672965149433489, + "learning_rate": 9.960084393841355e-05, + "loss": 0.6815, + "num_input_tokens_seen": 3149032, + "step": 300 + }, + { + "epoch": 0.07725985063095545, + "eval_loss": 0.7073924541473389, + "eval_runtime": 38.1842, + "eval_samples_per_second": 3.143, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 3149032, + "step": 300 + }, + { + "epoch": 0.07854751480813804, + "grad_norm": 0.4479276285203507, + "learning_rate": 9.956959413129585e-05, + "loss": 0.7208, + "num_input_tokens_seen": 3201560, + "step": 305 + }, + { + "epoch": 0.07983517898532062, + "grad_norm": 0.368457437106614, + "learning_rate": 9.953717199596598e-05, + "loss": 0.7144, + "num_input_tokens_seen": 3254632, + "step": 310 + }, + { + "epoch": 0.08112284316250322, + "grad_norm": 0.5531413254856732, + "learning_rate": 9.95035782992122e-05, + "loss": 0.6861, + "num_input_tokens_seen": 3306432, + "step": 315 + }, + { + "epoch": 0.08241050733968582, + "grad_norm": 0.41513991799613037, + "learning_rate": 9.94688138355304e-05, + "loss": 0.6836, + "num_input_tokens_seen": 3358392, + "step": 320 + }, + { + "epoch": 0.0836981715168684, + "grad_norm": 0.47052274706452957, + "learning_rate": 9.943287942710527e-05, + "loss": 0.7353, + "num_input_tokens_seen": 3411424, + "step": 325 + }, + { + "epoch": 0.08498583569405099, + "grad_norm": 0.6322586593511644, + "learning_rate": 9.939577592379088e-05, + "loss": 0.6774, + "num_input_tokens_seen": 3462992, + "step": 330 + }, + { + "epoch": 0.08627349987123359, + "grad_norm": 0.4129597798905344, + "learning_rate": 9.935750420309055e-05, + "loss": 0.7331, + "num_input_tokens_seen": 3516136, + "step": 
335 + }, + { + "epoch": 0.08756116404841617, + "grad_norm": 0.4031509882699161, + "learning_rate": 9.931806517013612e-05, + "loss": 0.6939, + "num_input_tokens_seen": 3568360, + "step": 340 + }, + { + "epoch": 0.08884882822559877, + "grad_norm": 0.4444358747076587, + "learning_rate": 9.927745975766654e-05, + "loss": 0.7158, + "num_input_tokens_seen": 3620696, + "step": 345 + }, + { + "epoch": 0.09013649240278135, + "grad_norm": 0.5290547365449167, + "learning_rate": 9.923568892600578e-05, + "loss": 0.6932, + "num_input_tokens_seen": 3673152, + "step": 350 + }, + { + "epoch": 0.09013649240278135, + "eval_loss": 0.7044599056243896, + "eval_runtime": 38.2709, + "eval_samples_per_second": 3.136, + "eval_steps_per_second": 0.784, + "num_input_tokens_seen": 3673152, + "step": 350 + }, + { + "epoch": 0.09142415657996394, + "grad_norm": 0.47530311368359207, + "learning_rate": 9.91927536630402e-05, + "loss": 0.6778, + "num_input_tokens_seen": 3725296, + "step": 355 + }, + { + "epoch": 0.09271182075714654, + "grad_norm": 0.38913022785688944, + "learning_rate": 9.91486549841951e-05, + "loss": 0.6857, + "num_input_tokens_seen": 3777552, + "step": 360 + }, + { + "epoch": 0.09399948493432912, + "grad_norm": 0.4834773141333328, + "learning_rate": 9.91033939324107e-05, + "loss": 0.7184, + "num_input_tokens_seen": 3830200, + "step": 365 + }, + { + "epoch": 0.09528714911151172, + "grad_norm": 0.5862045807150876, + "learning_rate": 9.905697157811761e-05, + "loss": 0.7196, + "num_input_tokens_seen": 3883200, + "step": 370 + }, + { + "epoch": 0.09657481328869431, + "grad_norm": 0.4576971522205563, + "learning_rate": 9.900938901921131e-05, + "loss": 0.6914, + "num_input_tokens_seen": 3935576, + "step": 375 + }, + { + "epoch": 0.0978624774658769, + "grad_norm": 0.49551517524520683, + "learning_rate": 9.896064738102635e-05, + "loss": 0.6681, + "num_input_tokens_seen": 3987624, + "step": 380 + }, + { + "epoch": 0.09915014164305949, + "grad_norm": 0.8198390819787913, + "learning_rate": 9.891074781630966e-05, + "loss": 0.6723, + "num_input_tokens_seen": 4039680, + "step": 385 + }, + { + "epoch": 0.10043780582024209, + "grad_norm": 0.7034626469978683, + "learning_rate": 9.885969150519331e-05, + "loss": 0.6498, + "num_input_tokens_seen": 4091216, + "step": 390 + }, + { + "epoch": 0.10172546999742467, + "grad_norm": 0.8838075623197742, + "learning_rate": 9.88074796551666e-05, + "loss": 0.7311, + "num_input_tokens_seen": 4144264, + "step": 395 + }, + { + "epoch": 0.10301313417460727, + "grad_norm": 0.7342758386202114, + "learning_rate": 9.875411350104744e-05, + "loss": 0.7089, + "num_input_tokens_seen": 4197072, + "step": 400 + }, + { + "epoch": 0.10301313417460727, + "eval_loss": 0.6847750544548035, + "eval_runtime": 37.9238, + "eval_samples_per_second": 3.164, + "eval_steps_per_second": 0.791, + "num_input_tokens_seen": 4197072, + "step": 400 + }, + { + "epoch": 0.10430079835178985, + "grad_norm": 0.8113533605928532, + "learning_rate": 9.86995943049533e-05, + "loss": 0.7021, + "num_input_tokens_seen": 4249656, + "step": 405 + }, + { + "epoch": 0.10558846252897244, + "grad_norm": 1.1772677082041305, + "learning_rate": 9.864392335627117e-05, + "loss": 0.6943, + "num_input_tokens_seen": 4302944, + "step": 410 + }, + { + "epoch": 0.10687612670615504, + "grad_norm": 1.6493280510697776, + "learning_rate": 9.858710197162721e-05, + "loss": 0.7146, + "num_input_tokens_seen": 4355480, + "step": 415 + }, + { + "epoch": 0.10816379088333762, + "grad_norm": 3.0159798803441715, + "learning_rate": 9.852913149485556e-05, + "loss": 
0.6312, + "num_input_tokens_seen": 4407688, + "step": 420 + }, + { + "epoch": 0.10945145506052022, + "grad_norm": 1.7981196843056153, + "learning_rate": 9.847001329696653e-05, + "loss": 0.6877, + "num_input_tokens_seen": 4459736, + "step": 425 + }, + { + "epoch": 0.11073911923770281, + "grad_norm": 1.5783278376799834, + "learning_rate": 9.840974877611422e-05, + "loss": 0.6975, + "num_input_tokens_seen": 4512928, + "step": 430 + }, + { + "epoch": 0.1120267834148854, + "grad_norm": 3.306646516615779, + "learning_rate": 9.834833935756344e-05, + "loss": 0.651, + "num_input_tokens_seen": 4565840, + "step": 435 + }, + { + "epoch": 0.11331444759206799, + "grad_norm": 2.3184973874904005, + "learning_rate": 9.828578649365601e-05, + "loss": 0.685, + "num_input_tokens_seen": 4618168, + "step": 440 + }, + { + "epoch": 0.11460211176925057, + "grad_norm": 1.602690016495642, + "learning_rate": 9.822209166377635e-05, + "loss": 0.6258, + "num_input_tokens_seen": 4669784, + "step": 445 + }, + { + "epoch": 0.11588977594643317, + "grad_norm": 2.6770797227308196, + "learning_rate": 9.815725637431662e-05, + "loss": 0.6732, + "num_input_tokens_seen": 4722528, + "step": 450 + }, + { + "epoch": 0.11588977594643317, + "eval_loss": 0.6526497006416321, + "eval_runtime": 39.085, + "eval_samples_per_second": 3.07, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 4722528, + "step": 450 + }, + { + "epoch": 0.11717744012361576, + "grad_norm": 2.1823349329218074, + "learning_rate": 9.809128215864097e-05, + "loss": 0.6544, + "num_input_tokens_seen": 4774400, + "step": 455 + }, + { + "epoch": 0.11846510430079835, + "grad_norm": 1.434521593914191, + "learning_rate": 9.802417057704931e-05, + "loss": 0.652, + "num_input_tokens_seen": 4826704, + "step": 460 + }, + { + "epoch": 0.11975276847798094, + "grad_norm": 2.399754385687283, + "learning_rate": 9.795592321674045e-05, + "loss": 0.6582, + "num_input_tokens_seen": 4880072, + "step": 465 + }, + { + "epoch": 0.12104043265516354, + "grad_norm": 3.9235176077985536, + "learning_rate": 9.788654169177453e-05, + "loss": 0.6506, + "num_input_tokens_seen": 4931968, + "step": 470 + }, + { + "epoch": 0.12232809683234612, + "grad_norm": 3.659330745777227, + "learning_rate": 9.781602764303487e-05, + "loss": 0.6551, + "num_input_tokens_seen": 4983656, + "step": 475 + }, + { + "epoch": 0.12361576100952872, + "grad_norm": 1.9670601503398757, + "learning_rate": 9.774438273818911e-05, + "loss": 0.6978, + "num_input_tokens_seen": 5036528, + "step": 480 + }, + { + "epoch": 0.12490342518671131, + "grad_norm": 1.308580869419328, + "learning_rate": 9.767160867164979e-05, + "loss": 0.6407, + "num_input_tokens_seen": 5088768, + "step": 485 + }, + { + "epoch": 0.1261910893638939, + "grad_norm": 1.7349486072682865, + "learning_rate": 9.759770716453436e-05, + "loss": 0.6641, + "num_input_tokens_seen": 5142080, + "step": 490 + }, + { + "epoch": 0.1274787535410765, + "grad_norm": 2.993327939872198, + "learning_rate": 9.752267996462434e-05, + "loss": 0.6588, + "num_input_tokens_seen": 5194432, + "step": 495 + }, + { + "epoch": 0.12876641771825909, + "grad_norm": 2.6430988002320976, + "learning_rate": 9.744652884632406e-05, + "loss": 0.6304, + "num_input_tokens_seen": 5246640, + "step": 500 + }, + { + "epoch": 0.12876641771825909, + "eval_loss": 0.6272165775299072, + "eval_runtime": 39.4177, + "eval_samples_per_second": 3.044, + "eval_steps_per_second": 0.761, + "num_input_tokens_seen": 5246640, + "step": 500 + }, + { + "epoch": 0.13005408189544168, + "grad_norm": 2.6047672112920286, + 
"learning_rate": 9.736925561061871e-05, + "loss": 0.5741, + "num_input_tokens_seen": 5299024, + "step": 505 + }, + { + "epoch": 0.13134174607262425, + "grad_norm": 2.4706517190834063, + "learning_rate": 9.729086208503174e-05, + "loss": 0.6535, + "num_input_tokens_seen": 5352664, + "step": 510 + }, + { + "epoch": 0.13262941024980685, + "grad_norm": 2.031672226684599, + "learning_rate": 9.721135012358156e-05, + "loss": 0.6081, + "num_input_tokens_seen": 5406008, + "step": 515 + }, + { + "epoch": 0.13391707442698944, + "grad_norm": 2.773997809426142, + "learning_rate": 9.713072160673777e-05, + "loss": 0.6792, + "num_input_tokens_seen": 5459368, + "step": 520 + }, + { + "epoch": 0.13520473860417204, + "grad_norm": 5.083057729524855, + "learning_rate": 9.704897844137673e-05, + "loss": 0.6821, + "num_input_tokens_seen": 5512960, + "step": 525 + }, + { + "epoch": 0.13649240278135463, + "grad_norm": 3.0440654843385584, + "learning_rate": 9.696612256073633e-05, + "loss": 0.5835, + "num_input_tokens_seen": 5565368, + "step": 530 + }, + { + "epoch": 0.1377800669585372, + "grad_norm": 3.7400231170971323, + "learning_rate": 9.688215592437039e-05, + "loss": 0.6129, + "num_input_tokens_seen": 5618008, + "step": 535 + }, + { + "epoch": 0.1390677311357198, + "grad_norm": 6.340287952379529, + "learning_rate": 9.679708051810221e-05, + "loss": 0.5765, + "num_input_tokens_seen": 5670072, + "step": 540 + }, + { + "epoch": 0.1403553953129024, + "grad_norm": 3.6351560550229207, + "learning_rate": 9.67108983539777e-05, + "loss": 0.6325, + "num_input_tokens_seen": 5722936, + "step": 545 + }, + { + "epoch": 0.141643059490085, + "grad_norm": 3.8363425916745117, + "learning_rate": 9.662361147021779e-05, + "loss": 0.5596, + "num_input_tokens_seen": 5774880, + "step": 550 + }, + { + "epoch": 0.141643059490085, + "eval_loss": 0.5832681059837341, + "eval_runtime": 38.2495, + "eval_samples_per_second": 3.137, + "eval_steps_per_second": 0.784, + "num_input_tokens_seen": 5774880, + "step": 550 + }, + { + "epoch": 0.14293072366726758, + "grad_norm": 3.911447203674744, + "learning_rate": 9.653522193117013e-05, + "loss": 0.5073, + "num_input_tokens_seen": 5826608, + "step": 555 + }, + { + "epoch": 0.14421838784445018, + "grad_norm": 3.3501835856945763, + "learning_rate": 9.644573182726035e-05, + "loss": 0.5652, + "num_input_tokens_seen": 5879776, + "step": 560 + }, + { + "epoch": 0.14550605202163275, + "grad_norm": 8.75758822201328, + "learning_rate": 9.63551432749426e-05, + "loss": 0.5727, + "num_input_tokens_seen": 5932888, + "step": 565 + }, + { + "epoch": 0.14679371619881534, + "grad_norm": 4.351029258458384, + "learning_rate": 9.626345841664953e-05, + "loss": 0.6251, + "num_input_tokens_seen": 5984648, + "step": 570 + }, + { + "epoch": 0.14808138037599794, + "grad_norm": 7.617020699535255, + "learning_rate": 9.617067942074153e-05, + "loss": 0.6508, + "num_input_tokens_seen": 6037000, + "step": 575 + }, + { + "epoch": 0.14936904455318054, + "grad_norm": 7.293430172750479, + "learning_rate": 9.607680848145558e-05, + "loss": 0.6686, + "num_input_tokens_seen": 6090512, + "step": 580 + }, + { + "epoch": 0.15065670873036313, + "grad_norm": 3.3635276124166653, + "learning_rate": 9.598184781885318e-05, + "loss": 0.5793, + "num_input_tokens_seen": 6143320, + "step": 585 + }, + { + "epoch": 0.1519443729075457, + "grad_norm": 2.7589160396339407, + "learning_rate": 9.588579967876806e-05, + "loss": 0.5954, + "num_input_tokens_seen": 6195720, + "step": 590 + }, + { + "epoch": 0.1532320370847283, + "grad_norm": 1.582169884399532, + 
"learning_rate": 9.578866633275288e-05, + "loss": 0.5644, + "num_input_tokens_seen": 6247592, + "step": 595 + }, + { + "epoch": 0.1545197012619109, + "grad_norm": 3.891844940061855, + "learning_rate": 9.569045007802559e-05, + "loss": 0.5794, + "num_input_tokens_seen": 6299656, + "step": 600 + }, + { + "epoch": 0.1545197012619109, + "eval_loss": 0.6039358973503113, + "eval_runtime": 38.3138, + "eval_samples_per_second": 3.132, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 6299656, + "step": 600 + }, + { + "epoch": 0.1558073654390935, + "grad_norm": 5.90634634073773, + "learning_rate": 9.55911532374151e-05, + "loss": 0.6106, + "num_input_tokens_seen": 6351680, + "step": 605 + }, + { + "epoch": 0.15709502961627608, + "grad_norm": 3.5429043559071034, + "learning_rate": 9.549077815930636e-05, + "loss": 0.5812, + "num_input_tokens_seen": 6403648, + "step": 610 + }, + { + "epoch": 0.15838269379345868, + "grad_norm": 2.8753548663225144, + "learning_rate": 9.538932721758474e-05, + "loss": 0.5992, + "num_input_tokens_seen": 6456328, + "step": 615 + }, + { + "epoch": 0.15967035797064125, + "grad_norm": 2.4013005755622467, + "learning_rate": 9.528680281157999e-05, + "loss": 0.587, + "num_input_tokens_seen": 6509024, + "step": 620 + }, + { + "epoch": 0.16095802214782384, + "grad_norm": 3.860358696946306, + "learning_rate": 9.518320736600943e-05, + "loss": 0.5836, + "num_input_tokens_seen": 6561336, + "step": 625 + }, + { + "epoch": 0.16224568632500644, + "grad_norm": 3.187917212328382, + "learning_rate": 9.507854333092063e-05, + "loss": 0.5913, + "num_input_tokens_seen": 6614024, + "step": 630 + }, + { + "epoch": 0.16353335050218903, + "grad_norm": 3.5342177024321586, + "learning_rate": 9.497281318163346e-05, + "loss": 0.5693, + "num_input_tokens_seen": 6666416, + "step": 635 + }, + { + "epoch": 0.16482101467937163, + "grad_norm": 3.90374612709263, + "learning_rate": 9.486601941868154e-05, + "loss": 0.572, + "num_input_tokens_seen": 6718200, + "step": 640 + }, + { + "epoch": 0.1661086788565542, + "grad_norm": 4.4270591027201665, + "learning_rate": 9.475816456775313e-05, + "loss": 0.6111, + "num_input_tokens_seen": 6771256, + "step": 645 + }, + { + "epoch": 0.1673963430337368, + "grad_norm": 5.04761388655614, + "learning_rate": 9.464925117963133e-05, + "loss": 0.5959, + "num_input_tokens_seen": 6824008, + "step": 650 + }, + { + "epoch": 0.1673963430337368, + "eval_loss": 0.5542036890983582, + "eval_runtime": 68.9048, + "eval_samples_per_second": 1.742, + "eval_steps_per_second": 0.435, + "num_input_tokens_seen": 6824008, + "step": 650 + }, + { + "epoch": 0.1686840072109194, + "grad_norm": 3.428410481447858, + "learning_rate": 9.453928183013385e-05, + "loss": 0.5344, + "num_input_tokens_seen": 6875432, + "step": 655 + }, + { + "epoch": 0.16997167138810199, + "grad_norm": 2.9137495299009846, + "learning_rate": 9.442825912005202e-05, + "loss": 0.56, + "num_input_tokens_seen": 6927768, + "step": 660 + }, + { + "epoch": 0.17125933556528458, + "grad_norm": 4.2956604210715925, + "learning_rate": 9.431618567508933e-05, + "loss": 0.5701, + "num_input_tokens_seen": 6980544, + "step": 665 + }, + { + "epoch": 0.17254699974246718, + "grad_norm": 4.3977584083656405, + "learning_rate": 9.420306414579925e-05, + "loss": 0.5604, + "num_input_tokens_seen": 7032584, + "step": 670 + }, + { + "epoch": 0.17383466391964975, + "grad_norm": 4.48381006313936, + "learning_rate": 9.408889720752266e-05, + "loss": 0.5763, + "num_input_tokens_seen": 7085048, + "step": 675 + }, + { + "epoch": 0.17512232809683234, + 
"grad_norm": 2.189534287393346, + "learning_rate": 9.397368756032445e-05, + "loss": 0.5962, + "num_input_tokens_seen": 7137952, + "step": 680 + }, + { + "epoch": 0.17640999227401494, + "grad_norm": 3.34591241093722, + "learning_rate": 9.385743792892982e-05, + "loss": 0.5935, + "num_input_tokens_seen": 7190584, + "step": 685 + }, + { + "epoch": 0.17769765645119753, + "grad_norm": 2.7509902524242507, + "learning_rate": 9.374015106265968e-05, + "loss": 0.5267, + "num_input_tokens_seen": 7243440, + "step": 690 + }, + { + "epoch": 0.17898532062838013, + "grad_norm": 2.322454948468365, + "learning_rate": 9.362182973536569e-05, + "loss": 0.5351, + "num_input_tokens_seen": 7295568, + "step": 695 + }, + { + "epoch": 0.1802729848055627, + "grad_norm": 3.4615171229405046, + "learning_rate": 9.35024767453647e-05, + "loss": 0.5014, + "num_input_tokens_seen": 7347040, + "step": 700 + }, + { + "epoch": 0.1802729848055627, + "eval_loss": 0.5440100431442261, + "eval_runtime": 39.1181, + "eval_samples_per_second": 3.068, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 7347040, + "step": 700 + }, + { + "epoch": 0.1815606489827453, + "grad_norm": 4.815426816055898, + "learning_rate": 9.338209491537257e-05, + "loss": 0.543, + "num_input_tokens_seen": 7399584, + "step": 705 + }, + { + "epoch": 0.1828483131599279, + "grad_norm": 7.294932559918336, + "learning_rate": 9.326068709243727e-05, + "loss": 0.4995, + "num_input_tokens_seen": 7452928, + "step": 710 + }, + { + "epoch": 0.18413597733711048, + "grad_norm": 3.6946433405013495, + "learning_rate": 9.313825614787177e-05, + "loss": 0.5109, + "num_input_tokens_seen": 7505112, + "step": 715 + }, + { + "epoch": 0.18542364151429308, + "grad_norm": 4.339671310261357, + "learning_rate": 9.301480497718593e-05, + "loss": 0.4932, + "num_input_tokens_seen": 7557608, + "step": 720 + }, + { + "epoch": 0.18671130569147568, + "grad_norm": 11.604530853746237, + "learning_rate": 9.289033650001817e-05, + "loss": 0.5573, + "num_input_tokens_seen": 7610048, + "step": 725 + }, + { + "epoch": 0.18799896986865824, + "grad_norm": 5.990020165378009, + "learning_rate": 9.276485366006634e-05, + "loss": 0.5305, + "num_input_tokens_seen": 7662056, + "step": 730 + }, + { + "epoch": 0.18928663404584084, + "grad_norm": 4.709895983169237, + "learning_rate": 9.263835942501807e-05, + "loss": 0.5369, + "num_input_tokens_seen": 7713656, + "step": 735 + }, + { + "epoch": 0.19057429822302344, + "grad_norm": 4.873824727341975, + "learning_rate": 9.251085678648072e-05, + "loss": 0.5397, + "num_input_tokens_seen": 7765992, + "step": 740 + }, + { + "epoch": 0.19186196240020603, + "grad_norm": 3.288968567031419, + "learning_rate": 9.238234875991046e-05, + "loss": 0.5116, + "num_input_tokens_seen": 7818448, + "step": 745 + }, + { + "epoch": 0.19314962657738863, + "grad_norm": 4.778741391076671, + "learning_rate": 9.225283838454111e-05, + "loss": 0.541, + "num_input_tokens_seen": 7870520, + "step": 750 + }, + { + "epoch": 0.19314962657738863, + "eval_loss": 0.5273815989494324, + "eval_runtime": 39.1812, + "eval_samples_per_second": 3.063, + "eval_steps_per_second": 0.766, + "num_input_tokens_seen": 7870520, + "step": 750 + }, + { + "epoch": 0.1944372907545712, + "grad_norm": 4.544356566141105, + "learning_rate": 9.21223287233121e-05, + "loss": 0.4961, + "num_input_tokens_seen": 7922736, + "step": 755 + }, + { + "epoch": 0.1957249549317538, + "grad_norm": 7.025876813077666, + "learning_rate": 9.199082286279622e-05, + "loss": 0.4956, + "num_input_tokens_seen": 7975304, + "step": 760 + }, + { + 
"epoch": 0.1970126191089364, + "grad_norm": 4.9360968239249985, + "learning_rate": 9.185832391312644e-05, + "loss": 0.4997, + "num_input_tokens_seen": 8027448, + "step": 765 + }, + { + "epoch": 0.19830028328611898, + "grad_norm": 10.528361984915874, + "learning_rate": 9.172483500792244e-05, + "loss": 0.5214, + "num_input_tokens_seen": 8080944, + "step": 770 + }, + { + "epoch": 0.19958794746330158, + "grad_norm": 9.264531258094065, + "learning_rate": 9.159035930421658e-05, + "loss": 0.6098, + "num_input_tokens_seen": 8133392, + "step": 775 + }, + { + "epoch": 0.20087561164048418, + "grad_norm": 1.9709167614209242, + "learning_rate": 9.145489998237902e-05, + "loss": 0.5046, + "num_input_tokens_seen": 8185360, + "step": 780 + }, + { + "epoch": 0.20216327581766674, + "grad_norm": 7.5915211434567595, + "learning_rate": 9.131846024604274e-05, + "loss": 0.5803, + "num_input_tokens_seen": 8237672, + "step": 785 + }, + { + "epoch": 0.20345093999484934, + "grad_norm": 3.251682970663388, + "learning_rate": 9.11810433220276e-05, + "loss": 0.5365, + "num_input_tokens_seen": 8289688, + "step": 790 + }, + { + "epoch": 0.20473860417203193, + "grad_norm": 4.341533737034294, + "learning_rate": 9.104265246026415e-05, + "loss": 0.5259, + "num_input_tokens_seen": 8341624, + "step": 795 + }, + { + "epoch": 0.20602626834921453, + "grad_norm": 5.463180544339495, + "learning_rate": 9.090329093371666e-05, + "loss": 0.5291, + "num_input_tokens_seen": 8393696, + "step": 800 + }, + { + "epoch": 0.20602626834921453, + "eval_loss": 0.5219093561172485, + "eval_runtime": 39.7455, + "eval_samples_per_second": 3.019, + "eval_steps_per_second": 0.755, + "num_input_tokens_seen": 8393696, + "step": 800 + }, + { + "epoch": 0.20731393252639713, + "grad_norm": 4.254130676908817, + "learning_rate": 9.076296203830579e-05, + "loss": 0.5449, + "num_input_tokens_seen": 8446496, + "step": 805 + }, + { + "epoch": 0.2086015967035797, + "grad_norm": 5.6525741285524145, + "learning_rate": 9.062166909283062e-05, + "loss": 0.5625, + "num_input_tokens_seen": 8499544, + "step": 810 + }, + { + "epoch": 0.2098892608807623, + "grad_norm": 3.8041246225911345, + "learning_rate": 9.047941543889014e-05, + "loss": 0.5564, + "num_input_tokens_seen": 8552568, + "step": 815 + }, + { + "epoch": 0.2111769250579449, + "grad_norm": 3.803732280546421, + "learning_rate": 9.033620444080428e-05, + "loss": 0.5487, + "num_input_tokens_seen": 8605560, + "step": 820 + }, + { + "epoch": 0.21246458923512748, + "grad_norm": 2.8518948364927925, + "learning_rate": 9.019203948553422e-05, + "loss": 0.5719, + "num_input_tokens_seen": 8657704, + "step": 825 + }, + { + "epoch": 0.21375225341231008, + "grad_norm": 3.939376115862177, + "learning_rate": 9.004692398260244e-05, + "loss": 0.5235, + "num_input_tokens_seen": 8711088, + "step": 830 + }, + { + "epoch": 0.21503991758949267, + "grad_norm": 6.635912128499916, + "learning_rate": 8.9900861364012e-05, + "loss": 0.5566, + "num_input_tokens_seen": 8763712, + "step": 835 + }, + { + "epoch": 0.21632758176667524, + "grad_norm": 3.7547407090496687, + "learning_rate": 8.975385508416532e-05, + "loss": 0.482, + "num_input_tokens_seen": 8815760, + "step": 840 + }, + { + "epoch": 0.21761524594385784, + "grad_norm": 4.093006904445721, + "learning_rate": 8.960590861978265e-05, + "loss": 0.5046, + "num_input_tokens_seen": 8867720, + "step": 845 + }, + { + "epoch": 0.21890291012104043, + "grad_norm": 11.397392997722068, + "learning_rate": 8.945702546981969e-05, + "loss": 0.5063, + "num_input_tokens_seen": 8919608, + "step": 850 + }, + { + 
"epoch": 0.21890291012104043, + "eval_loss": 0.5525640249252319, + "eval_runtime": 39.0469, + "eval_samples_per_second": 3.073, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 8919608, + "step": 850 + }, + { + "epoch": 0.22019057429822303, + "grad_norm": 4.339535962830116, + "learning_rate": 8.930720915538487e-05, + "loss": 0.5853, + "num_input_tokens_seen": 8971048, + "step": 855 + }, + { + "epoch": 0.22147823847540563, + "grad_norm": 6.118436891847819, + "learning_rate": 8.915646321965614e-05, + "loss": 0.5534, + "num_input_tokens_seen": 9022936, + "step": 860 + }, + { + "epoch": 0.2227659026525882, + "grad_norm": 3.3997835203618667, + "learning_rate": 8.900479122779712e-05, + "loss": 0.5623, + "num_input_tokens_seen": 9075336, + "step": 865 + }, + { + "epoch": 0.2240535668297708, + "grad_norm": 4.188326935911128, + "learning_rate": 8.885219676687277e-05, + "loss": 0.5561, + "num_input_tokens_seen": 9127688, + "step": 870 + }, + { + "epoch": 0.22534123100695339, + "grad_norm": 5.220175192497493, + "learning_rate": 8.869868344576459e-05, + "loss": 0.5449, + "num_input_tokens_seen": 9180624, + "step": 875 + }, + { + "epoch": 0.22662889518413598, + "grad_norm": 2.2022914161050577, + "learning_rate": 8.854425489508532e-05, + "loss": 0.5062, + "num_input_tokens_seen": 9233176, + "step": 880 + }, + { + "epoch": 0.22791655936131858, + "grad_norm": 4.62379059067999, + "learning_rate": 8.838891476709288e-05, + "loss": 0.5033, + "num_input_tokens_seen": 9286688, + "step": 885 + }, + { + "epoch": 0.22920422353850115, + "grad_norm": 3.639684630492015, + "learning_rate": 8.823266673560426e-05, + "loss": 0.4845, + "num_input_tokens_seen": 9339600, + "step": 890 + }, + { + "epoch": 0.23049188771568374, + "grad_norm": 4.131757647310936, + "learning_rate": 8.807551449590846e-05, + "loss": 0.5595, + "num_input_tokens_seen": 9391536, + "step": 895 + }, + { + "epoch": 0.23177955189286634, + "grad_norm": 4.771128685196347, + "learning_rate": 8.791746176467907e-05, + "loss": 0.5251, + "num_input_tokens_seen": 9443616, + "step": 900 + }, + { + "epoch": 0.23177955189286634, + "eval_loss": 0.49604204297065735, + "eval_runtime": 39.5289, + "eval_samples_per_second": 3.036, + "eval_steps_per_second": 0.759, + "num_input_tokens_seen": 9443616, + "step": 900 + }, + { + "epoch": 0.23306721607004893, + "grad_norm": 6.849781513397169, + "learning_rate": 8.775851227988656e-05, + "loss": 0.5774, + "num_input_tokens_seen": 9497304, + "step": 905 + }, + { + "epoch": 0.23435488024723153, + "grad_norm": 2.526801567699946, + "learning_rate": 8.759866980070963e-05, + "loss": 0.5441, + "num_input_tokens_seen": 9549416, + "step": 910 + }, + { + "epoch": 0.23564254442441412, + "grad_norm": 3.1008408808291503, + "learning_rate": 8.743793810744654e-05, + "loss": 0.4898, + "num_input_tokens_seen": 9601800, + "step": 915 + }, + { + "epoch": 0.2369302086015967, + "grad_norm": 4.120824184689494, + "learning_rate": 8.727632100142551e-05, + "loss": 0.4681, + "num_input_tokens_seen": 9653600, + "step": 920 + }, + { + "epoch": 0.2382178727787793, + "grad_norm": 5.251488809494114, + "learning_rate": 8.711382230491493e-05, + "loss": 0.4946, + "num_input_tokens_seen": 9707224, + "step": 925 + }, + { + "epoch": 0.23950553695596188, + "grad_norm": 6.885034741125289, + "learning_rate": 8.695044586103296e-05, + "loss": 0.5517, + "num_input_tokens_seen": 9760096, + "step": 930 + }, + { + "epoch": 0.24079320113314448, + "grad_norm": 4.6246077239626855, + "learning_rate": 8.678619553365659e-05, + "loss": 0.6064, + 
"num_input_tokens_seen": 9812672, + "step": 935 + }, + { + "epoch": 0.24208086531032708, + "grad_norm": 5.621020693846077, + "learning_rate": 8.662107520733027e-05, + "loss": 0.5398, + "num_input_tokens_seen": 9866200, + "step": 940 + }, + { + "epoch": 0.24336852948750964, + "grad_norm": 3.1921985322817092, + "learning_rate": 8.64550887871741e-05, + "loss": 0.5068, + "num_input_tokens_seen": 9918160, + "step": 945 + }, + { + "epoch": 0.24465619366469224, + "grad_norm": 2.3689648161336465, + "learning_rate": 8.628824019879137e-05, + "loss": 0.5862, + "num_input_tokens_seen": 9970600, + "step": 950 + }, + { + "epoch": 0.24465619366469224, + "eval_loss": 0.5085262656211853, + "eval_runtime": 39.0437, + "eval_samples_per_second": 3.073, + "eval_steps_per_second": 0.768, + "num_input_tokens_seen": 9970600, + "step": 950 + }, + { + "epoch": 0.24594385784187484, + "grad_norm": 2.8827978223065363, + "learning_rate": 8.612053338817581e-05, + "loss": 0.4549, + "num_input_tokens_seen": 10022248, + "step": 955 + }, + { + "epoch": 0.24723152201905743, + "grad_norm": 6.662877258417003, + "learning_rate": 8.595197232161824e-05, + "loss": 0.4791, + "num_input_tokens_seen": 10075280, + "step": 960 + }, + { + "epoch": 0.24851918619624003, + "grad_norm": 8.140970355143077, + "learning_rate": 8.578256098561275e-05, + "loss": 0.4833, + "num_input_tokens_seen": 10128392, + "step": 965 + }, + { + "epoch": 0.24980685037342262, + "grad_norm": 3.243184767888501, + "learning_rate": 8.561230338676239e-05, + "loss": 0.4672, + "num_input_tokens_seen": 10180720, + "step": 970 + }, + { + "epoch": 0.2510945145506052, + "grad_norm": 6.588760068173114, + "learning_rate": 8.544120355168451e-05, + "loss": 0.5205, + "num_input_tokens_seen": 10233256, + "step": 975 + }, + { + "epoch": 0.2523821787277878, + "grad_norm": 2.6240987196110837, + "learning_rate": 8.526926552691544e-05, + "loss": 0.5124, + "num_input_tokens_seen": 10284928, + "step": 980 + }, + { + "epoch": 0.2536698429049704, + "grad_norm": 8.242761558538728, + "learning_rate": 8.509649337881483e-05, + "loss": 0.5034, + "num_input_tokens_seen": 10338208, + "step": 985 + }, + { + "epoch": 0.254957507082153, + "grad_norm": 8.922137566500533, + "learning_rate": 8.492289119346943e-05, + "loss": 0.5226, + "num_input_tokens_seen": 10390224, + "step": 990 + }, + { + "epoch": 0.25624517125933555, + "grad_norm": 4.922275874717211, + "learning_rate": 8.474846307659658e-05, + "loss": 0.5399, + "num_input_tokens_seen": 10443080, + "step": 995 + }, + { + "epoch": 0.25753283543651817, + "grad_norm": 6.866585614783304, + "learning_rate": 8.457321315344694e-05, + "loss": 0.483, + "num_input_tokens_seen": 10495592, + "step": 1000 + }, + { + "epoch": 0.25753283543651817, + "eval_loss": 0.5305114388465881, + "eval_runtime": 38.9297, + "eval_samples_per_second": 3.082, + "eval_steps_per_second": 0.771, + "num_input_tokens_seen": 10495592, + "step": 1000 + }, + { + "epoch": 0.25882049961370074, + "grad_norm": 8.233033578002926, + "learning_rate": 8.439714556870704e-05, + "loss": 0.568, + "num_input_tokens_seen": 10548136, + "step": 1005 + }, + { + "epoch": 0.26010816379088336, + "grad_norm": 5.3701298824478485, + "learning_rate": 8.422026448640124e-05, + "loss": 0.4335, + "num_input_tokens_seen": 10600048, + "step": 1010 + }, + { + "epoch": 0.26139582796806593, + "grad_norm": 5.491882026124958, + "learning_rate": 8.40425740897932e-05, + "loss": 0.5385, + "num_input_tokens_seen": 10652160, + "step": 1015 + }, + { + "epoch": 0.2626834921452485, + "grad_norm": 5.479941792055548, + 
"learning_rate": 8.386407858128706e-05, + "loss": 0.5171, + "num_input_tokens_seen": 10705208, + "step": 1020 + }, + { + "epoch": 0.2639711563224311, + "grad_norm": 3.489116106033337, + "learning_rate": 8.368478218232787e-05, + "loss": 0.5201, + "num_input_tokens_seen": 10758688, + "step": 1025 + }, + { + "epoch": 0.2652588204996137, + "grad_norm": 5.923123692460237, + "learning_rate": 8.350468913330192e-05, + "loss": 0.5521, + "num_input_tokens_seen": 10811408, + "step": 1030 + }, + { + "epoch": 0.2665464846767963, + "grad_norm": 2.7605406738569824, + "learning_rate": 8.33238036934364e-05, + "loss": 0.4938, + "num_input_tokens_seen": 10864144, + "step": 1035 + }, + { + "epoch": 0.2678341488539789, + "grad_norm": 5.500647711838314, + "learning_rate": 8.31421301406986e-05, + "loss": 0.4828, + "num_input_tokens_seen": 10916952, + "step": 1040 + }, + { + "epoch": 0.26912181303116145, + "grad_norm": 6.823855575342733, + "learning_rate": 8.29596727716949e-05, + "loss": 0.5491, + "num_input_tokens_seen": 10968824, + "step": 1045 + }, + { + "epoch": 0.2704094772083441, + "grad_norm": 5.409054743152559, + "learning_rate": 8.277643590156894e-05, + "loss": 0.4628, + "num_input_tokens_seen": 11021656, + "step": 1050 + }, + { + "epoch": 0.2704094772083441, + "eval_loss": 0.5039986371994019, + "eval_runtime": 40.3009, + "eval_samples_per_second": 2.978, + "eval_steps_per_second": 0.744, + "num_input_tokens_seen": 11021656, + "step": 1050 + }, + { + "epoch": 0.27169714138552664, + "grad_norm": 3.2588151986321994, + "learning_rate": 8.259242386389973e-05, + "loss": 0.4586, + "num_input_tokens_seen": 11074336, + "step": 1055 + }, + { + "epoch": 0.27298480556270927, + "grad_norm": 12.995641199019554, + "learning_rate": 8.240764101059912e-05, + "loss": 0.4939, + "num_input_tokens_seen": 11126776, + "step": 1060 + }, + { + "epoch": 0.27427246973989183, + "grad_norm": 8.713479932798109, + "learning_rate": 8.222209171180883e-05, + "loss": 0.4978, + "num_input_tokens_seen": 11179680, + "step": 1065 + }, + { + "epoch": 0.2755601339170744, + "grad_norm": 3.6728132957332016, + "learning_rate": 8.203578035579715e-05, + "loss": 0.5695, + "num_input_tokens_seen": 11231616, + "step": 1070 + }, + { + "epoch": 0.276847798094257, + "grad_norm": 9.661110166832387, + "learning_rate": 8.184871134885513e-05, + "loss": 0.4635, + "num_input_tokens_seen": 11283720, + "step": 1075 + }, + { + "epoch": 0.2781354622714396, + "grad_norm": 5.4096015474623576, + "learning_rate": 8.166088911519235e-05, + "loss": 0.4974, + "num_input_tokens_seen": 11336144, + "step": 1080 + }, + { + "epoch": 0.2794231264486222, + "grad_norm": 5.353663008589148, + "learning_rate": 8.147231809683236e-05, + "loss": 0.4439, + "num_input_tokens_seen": 11389128, + "step": 1085 + }, + { + "epoch": 0.2807107906258048, + "grad_norm": 3.863008112890598, + "learning_rate": 8.128300275350756e-05, + "loss": 0.4368, + "num_input_tokens_seen": 11441864, + "step": 1090 + }, + { + "epoch": 0.2819984548029874, + "grad_norm": 5.545035623030093, + "learning_rate": 8.109294756255375e-05, + "loss": 0.4895, + "num_input_tokens_seen": 11494880, + "step": 1095 + }, + { + "epoch": 0.28328611898017, + "grad_norm": 5.124762488175073, + "learning_rate": 8.090215701880419e-05, + "loss": 0.4825, + "num_input_tokens_seen": 11547008, + "step": 1100 + }, + { + "epoch": 0.28328611898017, + "eval_loss": 0.4798590838909149, + "eval_runtime": 40.6942, + "eval_samples_per_second": 2.949, + "eval_steps_per_second": 0.737, + "num_input_tokens_seen": 11547008, + "step": 1100 + }, + { + "epoch": 
0.28457378315735254, + "grad_norm": 11.308296783543483, + "learning_rate": 8.07106356344834e-05, + "loss": 0.4927, + "num_input_tokens_seen": 11600032, + "step": 1105 + }, + { + "epoch": 0.28586144733453517, + "grad_norm": 4.902660398367944, + "learning_rate": 8.051838793910038e-05, + "loss": 0.4353, + "num_input_tokens_seen": 11652120, + "step": 1110 + }, + { + "epoch": 0.28714911151171774, + "grad_norm": 4.185631754620407, + "learning_rate": 8.032541847934146e-05, + "loss": 0.4891, + "num_input_tokens_seen": 11705184, + "step": 1115 + }, + { + "epoch": 0.28843677568890036, + "grad_norm": 6.049695709018542, + "learning_rate": 8.013173181896283e-05, + "loss": 0.4497, + "num_input_tokens_seen": 11758032, + "step": 1120 + }, + { + "epoch": 0.28972443986608293, + "grad_norm": 4.598736726589848, + "learning_rate": 7.993733253868256e-05, + "loss": 0.4927, + "num_input_tokens_seen": 11810736, + "step": 1125 + }, + { + "epoch": 0.2910121040432655, + "grad_norm": 41.010822412039396, + "learning_rate": 7.974222523607236e-05, + "loss": 0.4853, + "num_input_tokens_seen": 11863152, + "step": 1130 + }, + { + "epoch": 0.2922997682204481, + "grad_norm": 5.591270811303827, + "learning_rate": 7.954641452544865e-05, + "loss": 0.4458, + "num_input_tokens_seen": 11914536, + "step": 1135 + }, + { + "epoch": 0.2935874323976307, + "grad_norm": 4.526048407550314, + "learning_rate": 7.934990503776363e-05, + "loss": 0.3976, + "num_input_tokens_seen": 11966064, + "step": 1140 + }, + { + "epoch": 0.2948750965748133, + "grad_norm": 4.778105875378293, + "learning_rate": 7.915270142049566e-05, + "loss": 0.508, + "num_input_tokens_seen": 12018928, + "step": 1145 + }, + { + "epoch": 0.2961627607519959, + "grad_norm": 8.075837130866274, + "learning_rate": 7.89548083375394e-05, + "loss": 0.4553, + "num_input_tokens_seen": 12071088, + "step": 1150 + }, + { + "epoch": 0.2961627607519959, + "eval_loss": 0.45381438732147217, + "eval_runtime": 38.3303, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 12071088, + "step": 1150 + }, + { + "epoch": 0.29745042492917845, + "grad_norm": 5.66991445612284, + "learning_rate": 7.875623046909544e-05, + "loss": 0.4192, + "num_input_tokens_seen": 12122128, + "step": 1155 + }, + { + "epoch": 0.29873808910636107, + "grad_norm": 11.08291356725024, + "learning_rate": 7.855697251155967e-05, + "loss": 0.433, + "num_input_tokens_seen": 12174288, + "step": 1160 + }, + { + "epoch": 0.30002575328354364, + "grad_norm": 8.191495602021662, + "learning_rate": 7.835703917741212e-05, + "loss": 0.4817, + "num_input_tokens_seen": 12227008, + "step": 1165 + }, + { + "epoch": 0.30131341746072626, + "grad_norm": 7.763763600628314, + "learning_rate": 7.81564351951057e-05, + "loss": 0.485, + "num_input_tokens_seen": 12280168, + "step": 1170 + }, + { + "epoch": 0.30260108163790883, + "grad_norm": 5.347838532189795, + "learning_rate": 7.795516530895414e-05, + "loss": 0.4532, + "num_input_tokens_seen": 12333072, + "step": 1175 + }, + { + "epoch": 0.3038887458150914, + "grad_norm": 7.959591215701365, + "learning_rate": 7.775323427901993e-05, + "loss": 0.4643, + "num_input_tokens_seen": 12386208, + "step": 1180 + }, + { + "epoch": 0.305176409992274, + "grad_norm": 6.676689561663868, + "learning_rate": 7.755064688100171e-05, + "loss": 0.4577, + "num_input_tokens_seen": 12439304, + "step": 1185 + }, + { + "epoch": 0.3064640741694566, + "grad_norm": 6.976246725003336, + "learning_rate": 7.734740790612136e-05, + "loss": 0.4666, + "num_input_tokens_seen": 12491360, + "step": 
1190 + }, + { + "epoch": 0.3077517383466392, + "grad_norm": 6.034570050567919, + "learning_rate": 7.714352216101055e-05, + "loss": 0.407, + "num_input_tokens_seen": 12544264, + "step": 1195 + }, + { + "epoch": 0.3090394025238218, + "grad_norm": 4.583037231101643, + "learning_rate": 7.693899446759727e-05, + "loss": 0.454, + "num_input_tokens_seen": 12596208, + "step": 1200 + }, + { + "epoch": 0.3090394025238218, + "eval_loss": 0.49250805377960205, + "eval_runtime": 38.6863, + "eval_samples_per_second": 3.102, + "eval_steps_per_second": 0.775, + "num_input_tokens_seen": 12596208, + "step": 1200 + }, + { + "epoch": 0.31032706670100435, + "grad_norm": 4.0964966925406365, + "learning_rate": 7.673382966299163e-05, + "loss": 0.5226, + "num_input_tokens_seen": 12648936, + "step": 1205 + }, + { + "epoch": 0.311614730878187, + "grad_norm": 7.87992303723905, + "learning_rate": 7.65280325993715e-05, + "loss": 0.4757, + "num_input_tokens_seen": 12702432, + "step": 1210 + }, + { + "epoch": 0.31290239505536954, + "grad_norm": 6.822793875901239, + "learning_rate": 7.63216081438678e-05, + "loss": 0.451, + "num_input_tokens_seen": 12755128, + "step": 1215 + }, + { + "epoch": 0.31419005923255217, + "grad_norm": 8.804840574778536, + "learning_rate": 7.611456117844934e-05, + "loss": 0.4155, + "num_input_tokens_seen": 12808152, + "step": 1220 + }, + { + "epoch": 0.31547772340973473, + "grad_norm": 12.832933509895003, + "learning_rate": 7.59068965998074e-05, + "loss": 0.4094, + "num_input_tokens_seen": 12861592, + "step": 1225 + }, + { + "epoch": 0.31676538758691736, + "grad_norm": 3.769639586972444, + "learning_rate": 7.569861931923989e-05, + "loss": 0.4663, + "num_input_tokens_seen": 12914240, + "step": 1230 + }, + { + "epoch": 0.3180530517640999, + "grad_norm": 5.011688667303979, + "learning_rate": 7.548973426253521e-05, + "loss": 0.468, + "num_input_tokens_seen": 12967472, + "step": 1235 + }, + { + "epoch": 0.3193407159412825, + "grad_norm": 5.925703481508644, + "learning_rate": 7.528024636985575e-05, + "loss": 0.4744, + "num_input_tokens_seen": 13020232, + "step": 1240 + }, + { + "epoch": 0.3206283801184651, + "grad_norm": 3.511846132089351, + "learning_rate": 7.507016059562107e-05, + "loss": 0.4269, + "num_input_tokens_seen": 13073032, + "step": 1245 + }, + { + "epoch": 0.3219160442956477, + "grad_norm": 6.878508053492975, + "learning_rate": 7.485948190839077e-05, + "loss": 0.4725, + "num_input_tokens_seen": 13125624, + "step": 1250 + }, + { + "epoch": 0.3219160442956477, + "eval_loss": 0.4339977502822876, + "eval_runtime": 39.1132, + "eval_samples_per_second": 3.068, + "eval_steps_per_second": 0.767, + "num_input_tokens_seen": 13125624, + "step": 1250 + }, + { + "epoch": 0.3232037084728303, + "grad_norm": 3.2225418900054184, + "learning_rate": 7.464821529074679e-05, + "loss": 0.4196, + "num_input_tokens_seen": 13178656, + "step": 1255 + }, + { + "epoch": 0.3244913726500129, + "grad_norm": 5.7056125199065475, + "learning_rate": 7.443636573917585e-05, + "loss": 0.4349, + "num_input_tokens_seen": 13231224, + "step": 1260 + }, + { + "epoch": 0.32577903682719545, + "grad_norm": 3.1679429520474587, + "learning_rate": 7.422393826395108e-05, + "loss": 0.4726, + "num_input_tokens_seen": 13283208, + "step": 1265 + }, + { + "epoch": 0.32706670100437807, + "grad_norm": 5.409673500894723, + "learning_rate": 7.40109378890136e-05, + "loss": 0.4604, + "num_input_tokens_seen": 13335808, + "step": 1270 + }, + { + "epoch": 0.32835436518156064, + "grad_norm": 6.011303613930208, + "learning_rate": 7.379736965185368e-05, + 
"loss": 0.4606, + "num_input_tokens_seen": 13389112, + "step": 1275 + }, + { + "epoch": 0.32964202935874326, + "grad_norm": 11.490498301960598, + "learning_rate": 7.358323860339165e-05, + "loss": 0.4487, + "num_input_tokens_seen": 13441816, + "step": 1280 + }, + { + "epoch": 0.33092969353592583, + "grad_norm": 8.761206465870922, + "learning_rate": 7.336854980785839e-05, + "loss": 0.422, + "num_input_tokens_seen": 13493592, + "step": 1285 + }, + { + "epoch": 0.3322173577131084, + "grad_norm": 8.457687965106274, + "learning_rate": 7.315330834267553e-05, + "loss": 0.5397, + "num_input_tokens_seen": 13545696, + "step": 1290 + }, + { + "epoch": 0.333505021890291, + "grad_norm": 6.1852361009354295, + "learning_rate": 7.293751929833553e-05, + "loss": 0.5022, + "num_input_tokens_seen": 13597560, + "step": 1295 + }, + { + "epoch": 0.3347926860674736, + "grad_norm": 3.157280649859201, + "learning_rate": 7.272118777828108e-05, + "loss": 0.4794, + "num_input_tokens_seen": 13650264, + "step": 1300 + }, + { + "epoch": 0.3347926860674736, + "eval_loss": 0.4991846978664398, + "eval_runtime": 38.2504, + "eval_samples_per_second": 3.137, + "eval_steps_per_second": 0.784, + "num_input_tokens_seen": 13650264, + "step": 1300 + }, + { + "epoch": 0.3360803502446562, + "grad_norm": 6.386835645613503, + "learning_rate": 7.250431889878455e-05, + "loss": 0.4971, + "num_input_tokens_seen": 13702584, + "step": 1305 + }, + { + "epoch": 0.3373680144218388, + "grad_norm": 4.797592029689297, + "learning_rate": 7.228691778882693e-05, + "loss": 0.4574, + "num_input_tokens_seen": 13755024, + "step": 1310 + }, + { + "epoch": 0.33865567859902135, + "grad_norm": 3.659831343491765, + "learning_rate": 7.20689895899765e-05, + "loss": 0.4463, + "num_input_tokens_seen": 13807528, + "step": 1315 + }, + { + "epoch": 0.33994334277620397, + "grad_norm": 8.104230440489859, + "learning_rate": 7.185053945626733e-05, + "loss": 0.4549, + "num_input_tokens_seen": 13859760, + "step": 1320 + }, + { + "epoch": 0.34123100695338654, + "grad_norm": 4.000749012853666, + "learning_rate": 7.163157255407732e-05, + "loss": 0.4073, + "num_input_tokens_seen": 13911656, + "step": 1325 + }, + { + "epoch": 0.34251867113056916, + "grad_norm": 4.431361614574065, + "learning_rate": 7.141209406200599e-05, + "loss": 0.433, + "num_input_tokens_seen": 13963816, + "step": 1330 + }, + { + "epoch": 0.34380633530775173, + "grad_norm": 3.9352317738395635, + "learning_rate": 7.1192109170752e-05, + "loss": 0.4244, + "num_input_tokens_seen": 14016256, + "step": 1335 + }, + { + "epoch": 0.34509399948493436, + "grad_norm": 4.571632866024196, + "learning_rate": 7.097162308299054e-05, + "loss": 0.4448, + "num_input_tokens_seen": 14068768, + "step": 1340 + }, + { + "epoch": 0.3463816636621169, + "grad_norm": 4.2711556426666375, + "learning_rate": 7.07506410132501e-05, + "loss": 0.4608, + "num_input_tokens_seen": 14121272, + "step": 1345 + }, + { + "epoch": 0.3476693278392995, + "grad_norm": 4.49067434213006, + "learning_rate": 7.052916818778918e-05, + "loss": 0.3994, + "num_input_tokens_seen": 14173240, + "step": 1350 + }, + { + "epoch": 0.3476693278392995, + "eval_loss": 0.460835725069046, + "eval_runtime": 38.3552, + "eval_samples_per_second": 3.129, + "eval_steps_per_second": 0.782, + "num_input_tokens_seen": 14173240, + "step": 1350 + }, + { + "epoch": 0.3489569920164821, + "grad_norm": 6.100571377010892, + "learning_rate": 7.030720984447279e-05, + "loss": 0.41, + "num_input_tokens_seen": 14226032, + "step": 1355 + }, + { + "epoch": 0.3502446561936647, + "grad_norm": 
3.531812694789996, + "learning_rate": 7.008477123264848e-05, + "loss": 0.3751, + "num_input_tokens_seen": 14278128, + "step": 1360 + }, + { + "epoch": 0.3515323203708473, + "grad_norm": 13.528736327050117, + "learning_rate": 6.986185761302224e-05, + "loss": 0.4814, + "num_input_tokens_seen": 14330624, + "step": 1365 + }, + { + "epoch": 0.3528199845480299, + "grad_norm": 6.2453361475565305, + "learning_rate": 6.963847425753403e-05, + "loss": 0.5007, + "num_input_tokens_seen": 14382416, + "step": 1370 + }, + { + "epoch": 0.35410764872521244, + "grad_norm": 3.5868157849734925, + "learning_rate": 6.941462644923318e-05, + "loss": 0.4335, + "num_input_tokens_seen": 14434896, + "step": 1375 + }, + { + "epoch": 0.35539531290239507, + "grad_norm": 7.0930284762784925, + "learning_rate": 6.919031948215335e-05, + "loss": 0.4427, + "num_input_tokens_seen": 14487152, + "step": 1380 + }, + { + "epoch": 0.35668297707957763, + "grad_norm": 1.8673746248959853, + "learning_rate": 6.896555866118741e-05, + "loss": 0.42, + "num_input_tokens_seen": 14539608, + "step": 1385 + }, + { + "epoch": 0.35797064125676026, + "grad_norm": 3.29378340171418, + "learning_rate": 6.87403493019619e-05, + "loss": 0.4573, + "num_input_tokens_seen": 14592168, + "step": 1390 + }, + { + "epoch": 0.3592583054339428, + "grad_norm": 4.710051493913417, + "learning_rate": 6.851469673071143e-05, + "loss": 0.4341, + "num_input_tokens_seen": 14643920, + "step": 1395 + }, + { + "epoch": 0.3605459696111254, + "grad_norm": 5.46737560287727, + "learning_rate": 6.828860628415253e-05, + "loss": 0.437, + "num_input_tokens_seen": 14697136, + "step": 1400 + }, + { + "epoch": 0.3605459696111254, + "eval_loss": 0.46620962023735046, + "eval_runtime": 38.4197, + "eval_samples_per_second": 3.123, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 14697136, + "step": 1400 + }, + { + "epoch": 0.361833633788308, + "grad_norm": 5.6011715346425355, + "learning_rate": 6.806208330935766e-05, + "loss": 0.4377, + "num_input_tokens_seen": 14749168, + "step": 1405 + }, + { + "epoch": 0.3631212979654906, + "grad_norm": 8.725023519965001, + "learning_rate": 6.783513316362855e-05, + "loss": 0.412, + "num_input_tokens_seen": 14801568, + "step": 1410 + }, + { + "epoch": 0.3644089621426732, + "grad_norm": 8.12664534705471, + "learning_rate": 6.760776121436962e-05, + "loss": 0.4441, + "num_input_tokens_seen": 14853384, + "step": 1415 + }, + { + "epoch": 0.3656966263198558, + "grad_norm": 3.5568354734329244, + "learning_rate": 6.737997283896103e-05, + "loss": 0.4576, + "num_input_tokens_seen": 14906632, + "step": 1420 + }, + { + "epoch": 0.36698429049703835, + "grad_norm": 2.9816566580274007, + "learning_rate": 6.715177342463145e-05, + "loss": 0.3853, + "num_input_tokens_seen": 14959240, + "step": 1425 + }, + { + "epoch": 0.36827195467422097, + "grad_norm": 9.270651786172323, + "learning_rate": 6.692316836833065e-05, + "loss": 0.3755, + "num_input_tokens_seen": 15012256, + "step": 1430 + }, + { + "epoch": 0.36955961885140354, + "grad_norm": 7.022055493979997, + "learning_rate": 6.6694163076602e-05, + "loss": 0.5384, + "num_input_tokens_seen": 15064664, + "step": 1435 + }, + { + "epoch": 0.37084728302858616, + "grad_norm": 3.764454647275643, + "learning_rate": 6.646476296545434e-05, + "loss": 0.4377, + "num_input_tokens_seen": 15117384, + "step": 1440 + }, + { + "epoch": 0.37213494720576873, + "grad_norm": 5.3073636057406794, + "learning_rate": 6.623497346023418e-05, + "loss": 0.3876, + "num_input_tokens_seen": 15169880, + "step": 1445 + }, + { + "epoch": 
0.37342261138295135, + "grad_norm": 3.8443265684988392, + "learning_rate": 6.60047999954972e-05, + "loss": 0.4065, + "num_input_tokens_seen": 15222568, + "step": 1450 + }, + { + "epoch": 0.37342261138295135, + "eval_loss": 0.4395444095134735, + "eval_runtime": 38.336, + "eval_samples_per_second": 3.13, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 15222568, + "step": 1450 + }, + { + "epoch": 0.3747102755601339, + "grad_norm": 8.614661225033187, + "learning_rate": 6.57742480148798e-05, + "loss": 0.4231, + "num_input_tokens_seen": 15275288, + "step": 1455 + }, + { + "epoch": 0.3759979397373165, + "grad_norm": 3.107561516867378, + "learning_rate": 6.554332297097031e-05, + "loss": 0.4301, + "num_input_tokens_seen": 15328072, + "step": 1460 + }, + { + "epoch": 0.3772856039144991, + "grad_norm": 2.9024892391048867, + "learning_rate": 6.53120303251801e-05, + "loss": 0.446, + "num_input_tokens_seen": 15379120, + "step": 1465 + }, + { + "epoch": 0.3785732680916817, + "grad_norm": 2.7506997409330105, + "learning_rate": 6.508037554761432e-05, + "loss": 0.3764, + "num_input_tokens_seen": 15431104, + "step": 1470 + }, + { + "epoch": 0.3798609322688643, + "grad_norm": 5.7118625908326734, + "learning_rate": 6.484836411694267e-05, + "loss": 0.4423, + "num_input_tokens_seen": 15482816, + "step": 1475 + }, + { + "epoch": 0.3811485964460469, + "grad_norm": 4.701095405963631, + "learning_rate": 6.461600152026965e-05, + "loss": 0.4439, + "num_input_tokens_seen": 15534896, + "step": 1480 + }, + { + "epoch": 0.38243626062322944, + "grad_norm": 5.574717716204205, + "learning_rate": 6.438329325300499e-05, + "loss": 0.4408, + "num_input_tokens_seen": 15587496, + "step": 1485 + }, + { + "epoch": 0.38372392480041206, + "grad_norm": 4.6497322752918, + "learning_rate": 6.415024481873352e-05, + "loss": 0.4086, + "num_input_tokens_seen": 15639672, + "step": 1490 + }, + { + "epoch": 0.38501158897759463, + "grad_norm": 5.427307211472868, + "learning_rate": 6.391686172908506e-05, + "loss": 0.4489, + "num_input_tokens_seen": 15693120, + "step": 1495 + }, + { + "epoch": 0.38629925315477726, + "grad_norm": 5.005547973733715, + "learning_rate": 6.368314950360415e-05, + "loss": 0.4338, + "num_input_tokens_seen": 15744848, + "step": 1500 + }, + { + "epoch": 0.38629925315477726, + "eval_loss": 0.45475366711616516, + "eval_runtime": 38.3957, + "eval_samples_per_second": 3.125, + "eval_steps_per_second": 0.781, + "num_input_tokens_seen": 15744848, + "step": 1500 + }, + { + "epoch": 0.3875869173319598, + "grad_norm": 5.097132399629058, + "learning_rate": 6.344911366961934e-05, + "loss": 0.4558, + "num_input_tokens_seen": 15797632, + "step": 1505 + }, + { + "epoch": 0.3888745815091424, + "grad_norm": 4.502325593575991, + "learning_rate": 6.321475976211266e-05, + "loss": 0.4518, + "num_input_tokens_seen": 15850040, + "step": 1510 + }, + { + "epoch": 0.390162245686325, + "grad_norm": 6.425152572566654, + "learning_rate": 6.298009332358856e-05, + "loss": 0.4092, + "num_input_tokens_seen": 15902496, + "step": 1515 + }, + { + "epoch": 0.3914499098635076, + "grad_norm": 3.968135032555422, + "learning_rate": 6.274511990394294e-05, + "loss": 0.478, + "num_input_tokens_seen": 15954936, + "step": 1520 + }, + { + "epoch": 0.3927375740406902, + "grad_norm": 4.636757769906518, + "learning_rate": 6.250984506033183e-05, + "loss": 0.4294, + "num_input_tokens_seen": 16007624, + "step": 1525 + }, + { + "epoch": 0.3940252382178728, + "grad_norm": 2.7967900169696347, + "learning_rate": 6.227427435703997e-05, + "loss": 0.3846, + 
"num_input_tokens_seen": 16059440, + "step": 1530 + }, + { + "epoch": 0.39531290239505534, + "grad_norm": 2.983520749639549, + "learning_rate": 6.203841336534924e-05, + "loss": 0.4372, + "num_input_tokens_seen": 16111136, + "step": 1535 + }, + { + "epoch": 0.39660056657223797, + "grad_norm": 8.364510466670477, + "learning_rate": 6.180226766340688e-05, + "loss": 0.484, + "num_input_tokens_seen": 16163976, + "step": 1540 + }, + { + "epoch": 0.39788823074942054, + "grad_norm": 4.45878743373729, + "learning_rate": 6.156584283609359e-05, + "loss": 0.3965, + "num_input_tokens_seen": 16217192, + "step": 1545 + }, + { + "epoch": 0.39917589492660316, + "grad_norm": 2.6831990995391717, + "learning_rate": 6.132914447489137e-05, + "loss": 0.3872, + "num_input_tokens_seen": 16269896, + "step": 1550 + }, + { + "epoch": 0.39917589492660316, + "eval_loss": 0.4416767656803131, + "eval_runtime": 38.4671, + "eval_samples_per_second": 3.12, + "eval_steps_per_second": 0.78, + "num_input_tokens_seen": 16269896, + "step": 1550 + }, + { + "epoch": 0.4004635591037857, + "grad_norm": 4.920079251827062, + "learning_rate": 6.109217817775139e-05, + "loss": 0.4593, + "num_input_tokens_seen": 16322496, + "step": 1555 + }, + { + "epoch": 0.40175122328096835, + "grad_norm": 9.068094163618136, + "learning_rate": 6.085494954896156e-05, + "loss": 0.4865, + "num_input_tokens_seen": 16375320, + "step": 1560 + }, + { + "epoch": 0.4030388874581509, + "grad_norm": 9.316944070527988, + "learning_rate": 6.061746419901388e-05, + "loss": 0.4422, + "num_input_tokens_seen": 16428096, + "step": 1565 + }, + { + "epoch": 0.4043265516353335, + "grad_norm": 2.4617418860122213, + "learning_rate": 6.0379727744471936e-05, + "loss": 0.3538, + "num_input_tokens_seen": 16480832, + "step": 1570 + }, + { + "epoch": 0.4056142158125161, + "grad_norm": 5.028400110331736, + "learning_rate": 6.014174580783794e-05, + "loss": 0.3923, + "num_input_tokens_seen": 16534016, + "step": 1575 + }, + { + "epoch": 0.4069018799896987, + "grad_norm": 6.638266454273257, + "learning_rate": 5.990352401741981e-05, + "loss": 0.3967, + "num_input_tokens_seen": 16586216, + "step": 1580 + }, + { + "epoch": 0.4081895441668813, + "grad_norm": 6.928848680437489, + "learning_rate": 5.9665068007197976e-05, + "loss": 0.4212, + "num_input_tokens_seen": 16639312, + "step": 1585 + }, + { + "epoch": 0.40947720834406387, + "grad_norm": 4.2324092477507005, + "learning_rate": 5.94263834166923e-05, + "loss": 0.3489, + "num_input_tokens_seen": 16692328, + "step": 1590 + }, + { + "epoch": 0.41076487252124644, + "grad_norm": 5.607976113391715, + "learning_rate": 5.918747589082853e-05, + "loss": 0.4105, + "num_input_tokens_seen": 16745088, + "step": 1595 + }, + { + "epoch": 0.41205253669842906, + "grad_norm": 5.155332109104381, + "learning_rate": 5.8948351079804875e-05, + "loss": 0.3914, + "num_input_tokens_seen": 16798768, + "step": 1600 + }, + { + "epoch": 0.41205253669842906, + "eval_loss": 0.4657597243785858, + "eval_runtime": 38.2951, + "eval_samples_per_second": 3.134, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 16798768, + "step": 1600 + }, + { + "epoch": 0.41334020087561163, + "grad_norm": 9.705842143603624, + "learning_rate": 5.8709014638958404e-05, + "loss": 0.3731, + "num_input_tokens_seen": 16851408, + "step": 1605 + }, + { + "epoch": 0.41462786505279425, + "grad_norm": 3.9787044551608752, + "learning_rate": 5.846947222863123e-05, + "loss": 0.4099, + "num_input_tokens_seen": 16903136, + "step": 1610 + }, + { + "epoch": 0.4159155292299768, + "grad_norm": 
4.14166731803799, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.4136, + "num_input_tokens_seen": 16955528, + "step": 1615 + }, + { + "epoch": 0.4172031934071594, + "grad_norm": 5.571493786006333, + "learning_rate": 5.7989792165125356e-05, + "loss": 0.3818, + "num_input_tokens_seen": 17008032, + "step": 1620 + }, + { + "epoch": 0.418490857584342, + "grad_norm": 6.599331542411203, + "learning_rate": 5.774966585645092e-05, + "loss": 0.4303, + "num_input_tokens_seen": 17060488, + "step": 1625 + }, + { + "epoch": 0.4197785217615246, + "grad_norm": 3.5274772999039072, + "learning_rate": 5.7509356267035975e-05, + "loss": 0.3673, + "num_input_tokens_seen": 17112408, + "step": 1630 + }, + { + "epoch": 0.4210661859387072, + "grad_norm": 3.483018179241301, + "learning_rate": 5.726886908023776e-05, + "loss": 0.4149, + "num_input_tokens_seen": 17164664, + "step": 1635 + }, + { + "epoch": 0.4223538501158898, + "grad_norm": 5.924637144990831, + "learning_rate": 5.702820998361373e-05, + "loss": 0.4613, + "num_input_tokens_seen": 17217232, + "step": 1640 + }, + { + "epoch": 0.42364151429307234, + "grad_norm": 6.036127770884555, + "learning_rate": 5.6787384668786994e-05, + "loss": 0.372, + "num_input_tokens_seen": 17269344, + "step": 1645 + }, + { + "epoch": 0.42492917847025496, + "grad_norm": 5.185881680374458, + "learning_rate": 5.654639883131178e-05, + "loss": 0.3755, + "num_input_tokens_seen": 17323232, + "step": 1650 + }, + { + "epoch": 0.42492917847025496, + "eval_loss": 0.4726848006248474, + "eval_runtime": 38.644, + "eval_samples_per_second": 3.105, + "eval_steps_per_second": 0.776, + "num_input_tokens_seen": 17323232, + "step": 1650 + }, + { + "epoch": 0.42621684264743753, + "grad_norm": 3.857123876804029, + "learning_rate": 5.6305258170538676e-05, + "loss": 0.3972, + "num_input_tokens_seen": 17375432, + "step": 1655 + }, + { + "epoch": 0.42750450682462016, + "grad_norm": 5.654580193869214, + "learning_rate": 5.606396838947988e-05, + "loss": 0.3988, + "num_input_tokens_seen": 17427832, + "step": 1660 + }, + { + "epoch": 0.4287921710018027, + "grad_norm": 3.947785396211361, + "learning_rate": 5.582253519467432e-05, + "loss": 0.4247, + "num_input_tokens_seen": 17480056, + "step": 1665 + }, + { + "epoch": 0.43007983517898535, + "grad_norm": 7.275477625247532, + "learning_rate": 5.558096429605263e-05, + "loss": 0.386, + "num_input_tokens_seen": 17533192, + "step": 1670 + }, + { + "epoch": 0.4313674993561679, + "grad_norm": 6.488968692662172, + "learning_rate": 5.533926140680221e-05, + "loss": 0.4487, + "num_input_tokens_seen": 17585000, + "step": 1675 + }, + { + "epoch": 0.4326551635333505, + "grad_norm": 2.852361322266725, + "learning_rate": 5.509743224323203e-05, + "loss": 0.3878, + "num_input_tokens_seen": 17638152, + "step": 1680 + }, + { + "epoch": 0.4339428277105331, + "grad_norm": 7.981827756404923, + "learning_rate": 5.485548252463749e-05, + "loss": 0.3333, + "num_input_tokens_seen": 17690656, + "step": 1685 + }, + { + "epoch": 0.4352304918877157, + "grad_norm": 2.573091816072651, + "learning_rate": 5.4613417973165106e-05, + "loss": 0.464, + "num_input_tokens_seen": 17742112, + "step": 1690 + }, + { + "epoch": 0.4365181560648983, + "grad_norm": 5.873606814461249, + "learning_rate": 5.4371244313677225e-05, + "loss": 0.4374, + "num_input_tokens_seen": 17793968, + "step": 1695 + }, + { + "epoch": 0.43780582024208087, + "grad_norm": 2.7838359754450956, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.3796, + "num_input_tokens_seen": 17845600, + "step": 1700 + }, + { + "epoch": 
0.43780582024208087, + "eval_loss": 0.4555380642414093, + "eval_runtime": 38.1895, + "eval_samples_per_second": 3.142, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 17845600, + "step": 1700 + }, + { + "epoch": 0.43909348441926344, + "grad_norm": 6.805905061208508, + "learning_rate": 5.388659258287102e-05, + "loss": 0.4066, + "num_input_tokens_seen": 17897920, + "step": 1705 + }, + { + "epoch": 0.44038114859644606, + "grad_norm": 6.964220035587859, + "learning_rate": 5.364412597363759e-05, + "loss": 0.3599, + "num_input_tokens_seen": 17950920, + "step": 1710 + }, + { + "epoch": 0.4416688127736286, + "grad_norm": 6.2080800188283956, + "learning_rate": 5.3401573180287426e-05, + "loss": 0.3681, + "num_input_tokens_seen": 18003280, + "step": 1715 + }, + { + "epoch": 0.44295647695081125, + "grad_norm": 6.3073464125282195, + "learning_rate": 5.315893993922986e-05, + "loss": 0.4005, + "num_input_tokens_seen": 18056296, + "step": 1720 + }, + { + "epoch": 0.4442441411279938, + "grad_norm": 4.274996116585604, + "learning_rate": 5.29162319887768e-05, + "loss": 0.3513, + "num_input_tokens_seen": 18108904, + "step": 1725 + }, + { + "epoch": 0.4455318053051764, + "grad_norm": 2.3776496789610224, + "learning_rate": 5.26734550690071e-05, + "loss": 0.373, + "num_input_tokens_seen": 18160696, + "step": 1730 + }, + { + "epoch": 0.446819469482359, + "grad_norm": 5.321325318314331, + "learning_rate": 5.243061492163073e-05, + "loss": 0.4246, + "num_input_tokens_seen": 18213760, + "step": 1735 + }, + { + "epoch": 0.4481071336595416, + "grad_norm": 4.658819445189394, + "learning_rate": 5.2187717289852955e-05, + "loss": 0.3703, + "num_input_tokens_seen": 18266424, + "step": 1740 + }, + { + "epoch": 0.4493947978367242, + "grad_norm": 4.197790185757161, + "learning_rate": 5.1944767918238624e-05, + "loss": 0.3763, + "num_input_tokens_seen": 18318984, + "step": 1745 + }, + { + "epoch": 0.45068246201390677, + "grad_norm": 8.378009104226413, + "learning_rate": 5.170177255257618e-05, + "loss": 0.3767, + "num_input_tokens_seen": 18371928, + "step": 1750 + }, + { + "epoch": 0.45068246201390677, + "eval_loss": 0.4234265685081482, + "eval_runtime": 38.3269, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 18371928, + "step": 1750 + }, + { + "epoch": 0.45197012619108934, + "grad_norm": 14.32713228151988, + "learning_rate": 5.145873693974188e-05, + "loss": 0.4059, + "num_input_tokens_seen": 18424432, + "step": 1755 + }, + { + "epoch": 0.45325779036827196, + "grad_norm": 2.0707361276884013, + "learning_rate": 5.12156668275638e-05, + "loss": 0.3709, + "num_input_tokens_seen": 18476736, + "step": 1760 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 4.619909130275672, + "learning_rate": 5.097256796468598e-05, + "loss": 0.4075, + "num_input_tokens_seen": 18529552, + "step": 1765 + }, + { + "epoch": 0.45583311872263715, + "grad_norm": 3.094020664423217, + "learning_rate": 5.072944610043232e-05, + "loss": 0.3101, + "num_input_tokens_seen": 18583232, + "step": 1770 + }, + { + "epoch": 0.4571207828998197, + "grad_norm": 5.699455809009083, + "learning_rate": 5.048630698467081e-05, + "loss": 0.33, + "num_input_tokens_seen": 18636296, + "step": 1775 + }, + { + "epoch": 0.4584084470770023, + "grad_norm": 7.4549216170752395, + "learning_rate": 5.024315636767738e-05, + "loss": 0.4204, + "num_input_tokens_seen": 18688376, + "step": 1780 + }, + { + "epoch": 0.4596961112541849, + "grad_norm": 4.888497930569714, + "learning_rate": 5e-05, + "loss": 0.4855, + 
"num_input_tokens_seen": 18741192, + "step": 1785 + }, + { + "epoch": 0.4609837754313675, + "grad_norm": 5.677261933254248, + "learning_rate": 4.9756843632322626e-05, + "loss": 0.3344, + "num_input_tokens_seen": 18794320, + "step": 1790 + }, + { + "epoch": 0.4622714396085501, + "grad_norm": 6.052977008969569, + "learning_rate": 4.9513693015329197e-05, + "loss": 0.3836, + "num_input_tokens_seen": 18846368, + "step": 1795 + }, + { + "epoch": 0.4635591037857327, + "grad_norm": 9.403234122276736, + "learning_rate": 4.9270553899567686e-05, + "loss": 0.4484, + "num_input_tokens_seen": 18898888, + "step": 1800 + }, + { + "epoch": 0.4635591037857327, + "eval_loss": 0.4194311797618866, + "eval_runtime": 38.1985, + "eval_samples_per_second": 3.141, + "eval_steps_per_second": 0.785, + "num_input_tokens_seen": 18898888, + "step": 1800 + }, + { + "epoch": 0.4648467679629153, + "grad_norm": 7.731678512107114, + "learning_rate": 4.902743203531405e-05, + "loss": 0.3301, + "num_input_tokens_seen": 18951672, + "step": 1805 + }, + { + "epoch": 0.46613443214009787, + "grad_norm": 10.15063272505325, + "learning_rate": 4.8784333172436206e-05, + "loss": 0.3861, + "num_input_tokens_seen": 19005008, + "step": 1810 + }, + { + "epoch": 0.46742209631728043, + "grad_norm": 5.469167499287458, + "learning_rate": 4.854126306025812e-05, + "loss": 0.459, + "num_input_tokens_seen": 19057856, + "step": 1815 + }, + { + "epoch": 0.46870976049446306, + "grad_norm": 4.484770144747688, + "learning_rate": 4.829822744742383e-05, + "loss": 0.3944, + "num_input_tokens_seen": 19110992, + "step": 1820 + }, + { + "epoch": 0.4699974246716456, + "grad_norm": 2.93519255562483, + "learning_rate": 4.8055232081761395e-05, + "loss": 0.3447, + "num_input_tokens_seen": 19162816, + "step": 1825 + }, + { + "epoch": 0.47128508884882825, + "grad_norm": 4.537664765353073, + "learning_rate": 4.781228271014704e-05, + "loss": 0.3954, + "num_input_tokens_seen": 19215752, + "step": 1830 + }, + { + "epoch": 0.4725727530260108, + "grad_norm": 5.207010560179338, + "learning_rate": 4.756938507836929e-05, + "loss": 0.415, + "num_input_tokens_seen": 19268392, + "step": 1835 + }, + { + "epoch": 0.4738604172031934, + "grad_norm": 2.8028260441863044, + "learning_rate": 4.732654493099291e-05, + "loss": 0.2794, + "num_input_tokens_seen": 19321696, + "step": 1840 + }, + { + "epoch": 0.475148081380376, + "grad_norm": 3.4995694598409406, + "learning_rate": 4.708376801122321e-05, + "loss": 0.3699, + "num_input_tokens_seen": 19373584, + "step": 1845 + }, + { + "epoch": 0.4764357455575586, + "grad_norm": 3.1019810158916212, + "learning_rate": 4.6841060060770154e-05, + "loss": 0.3941, + "num_input_tokens_seen": 19424688, + "step": 1850 + }, + { + "epoch": 0.4764357455575586, + "eval_loss": 0.45103010535240173, + "eval_runtime": 38.1071, + "eval_samples_per_second": 3.149, + "eval_steps_per_second": 0.787, + "num_input_tokens_seen": 19424688, + "step": 1850 + }, + { + "epoch": 0.4777234097347412, + "grad_norm": 17.624135718164755, + "learning_rate": 4.659842681971258e-05, + "loss": 0.4362, + "num_input_tokens_seen": 19477320, + "step": 1855 + }, + { + "epoch": 0.47901107391192377, + "grad_norm": 3.653356682833457, + "learning_rate": 4.635587402636241e-05, + "loss": 0.4027, + "num_input_tokens_seen": 19529000, + "step": 1860 + }, + { + "epoch": 0.48029873808910634, + "grad_norm": 7.862514036245574, + "learning_rate": 4.611340741712901e-05, + "loss": 0.3981, + "num_input_tokens_seen": 19581736, + "step": 1865 + }, + { + "epoch": 0.48158640226628896, + "grad_norm": 
6.303826242506772, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.3545, + "num_input_tokens_seen": 19634744, + "step": 1870 + }, + { + "epoch": 0.48287406644347153, + "grad_norm": 1.3395331237019557, + "learning_rate": 4.562875568632278e-05, + "loss": 0.34, + "num_input_tokens_seen": 19686792, + "step": 1875 + }, + { + "epoch": 0.48416173062065415, + "grad_norm": 9.317753726798626, + "learning_rate": 4.5386582026834906e-05, + "loss": 0.3041, + "num_input_tokens_seen": 19739784, + "step": 1880 + }, + { + "epoch": 0.4854493947978367, + "grad_norm": 10.151546411219384, + "learning_rate": 4.5144517475362514e-05, + "loss": 0.445, + "num_input_tokens_seen": 19792024, + "step": 1885 + }, + { + "epoch": 0.4867370589750193, + "grad_norm": 5.018653041136549, + "learning_rate": 4.490256775676797e-05, + "loss": 0.3532, + "num_input_tokens_seen": 19844568, + "step": 1890 + }, + { + "epoch": 0.4880247231522019, + "grad_norm": 7.538802514963857, + "learning_rate": 4.466073859319781e-05, + "loss": 0.4356, + "num_input_tokens_seen": 19897464, + "step": 1895 + }, + { + "epoch": 0.4893123873293845, + "grad_norm": 6.295489433923776, + "learning_rate": 4.441903570394739e-05, + "loss": 0.2877, + "num_input_tokens_seen": 19950480, + "step": 1900 + }, + { + "epoch": 0.4893123873293845, + "eval_loss": 0.4511750042438507, + "eval_runtime": 38.0275, + "eval_samples_per_second": 3.156, + "eval_steps_per_second": 0.789, + "num_input_tokens_seen": 19950480, + "step": 1900 + }, + { + "epoch": 0.4906000515065671, + "grad_norm": 5.366705146420215, + "learning_rate": 4.41774648053257e-05, + "loss": 0.3542, + "num_input_tokens_seen": 20002968, + "step": 1905 + }, + { + "epoch": 0.49188771568374967, + "grad_norm": 16.028429879963088, + "learning_rate": 4.3936031610520124e-05, + "loss": 0.4095, + "num_input_tokens_seen": 20055560, + "step": 1910 + }, + { + "epoch": 0.4931753798609323, + "grad_norm": 5.057782615493361, + "learning_rate": 4.3694741829461336e-05, + "loss": 0.3887, + "num_input_tokens_seen": 20108016, + "step": 1915 + }, + { + "epoch": 0.49446304403811486, + "grad_norm": 5.383554110802039, + "learning_rate": 4.345360116868823e-05, + "loss": 0.3485, + "num_input_tokens_seen": 20160480, + "step": 1920 + }, + { + "epoch": 0.49575070821529743, + "grad_norm": 5.9195432883109484, + "learning_rate": 4.321261533121303e-05, + "loss": 0.4348, + "num_input_tokens_seen": 20213312, + "step": 1925 + }, + { + "epoch": 0.49703837239248005, + "grad_norm": 4.857089792199489, + "learning_rate": 4.2971790016386286e-05, + "loss": 0.4073, + "num_input_tokens_seen": 20266288, + "step": 1930 + }, + { + "epoch": 0.4983260365696626, + "grad_norm": 3.5886814648964824, + "learning_rate": 4.273113091976225e-05, + "loss": 0.3297, + "num_input_tokens_seen": 20318576, + "step": 1935 + }, + { + "epoch": 0.49961370074684525, + "grad_norm": 3.543407544361586, + "learning_rate": 4.249064373296403e-05, + "loss": 0.3352, + "num_input_tokens_seen": 20370696, + "step": 1940 + }, + { + "epoch": 0.5009013649240278, + "grad_norm": 9.841855265781527, + "learning_rate": 4.225033414354908e-05, + "loss": 0.3195, + "num_input_tokens_seen": 20423480, + "step": 1945 + }, + { + "epoch": 0.5021890291012104, + "grad_norm": 4.324269958230436, + "learning_rate": 4.201020783487464e-05, + "loss": 0.365, + "num_input_tokens_seen": 20476176, + "step": 1950 + }, + { + "epoch": 0.5021890291012104, + "eval_loss": 0.4763557016849518, + "eval_runtime": 38.0861, + "eval_samples_per_second": 3.151, + "eval_steps_per_second": 0.788, + "num_input_tokens_seen": 20476176, + 
"step": 1950 + }, + { + "epoch": 0.503476693278393, + "grad_norm": 3.649701013250087, + "learning_rate": 4.17702704859633e-05, + "loss": 0.3723, + "num_input_tokens_seen": 20529160, + "step": 1955 + }, + { + "epoch": 0.5047643574555756, + "grad_norm": 3.531932033124302, + "learning_rate": 4.153052777136879e-05, + "loss": 0.3637, + "num_input_tokens_seen": 20580864, + "step": 1960 + }, + { + "epoch": 0.5060520216327582, + "grad_norm": 4.047584724343972, + "learning_rate": 4.1290985361041614e-05, + "loss": 0.3513, + "num_input_tokens_seen": 20633720, + "step": 1965 + }, + { + "epoch": 0.5073396858099408, + "grad_norm": 3.217808286049099, + "learning_rate": 4.105164892019514e-05, + "loss": 0.3569, + "num_input_tokens_seen": 20685832, + "step": 1970 + }, + { + "epoch": 0.5086273499871233, + "grad_norm": 9.648291719407668, + "learning_rate": 4.0812524109171476e-05, + "loss": 0.3072, + "num_input_tokens_seen": 20737832, + "step": 1975 + }, + { + "epoch": 0.509915014164306, + "grad_norm": 3.324786506149817, + "learning_rate": 4.0573616583307705e-05, + "loss": 0.3884, + "num_input_tokens_seen": 20791184, + "step": 1980 + }, + { + "epoch": 0.5112026783414886, + "grad_norm": 1.6012419002883989, + "learning_rate": 4.033493199280202e-05, + "loss": 0.3414, + "num_input_tokens_seen": 20843672, + "step": 1985 + }, + { + "epoch": 0.5124903425186711, + "grad_norm": 6.104847001850503, + "learning_rate": 4.009647598258022e-05, + "loss": 0.3545, + "num_input_tokens_seen": 20895760, + "step": 1990 + }, + { + "epoch": 0.5137780066958537, + "grad_norm": 8.908240180870123, + "learning_rate": 3.985825419216207e-05, + "loss": 0.3406, + "num_input_tokens_seen": 20948448, + "step": 1995 + }, + { + "epoch": 0.5150656708730363, + "grad_norm": 3.002560544048349, + "learning_rate": 3.962027225552807e-05, + "loss": 0.3814, + "num_input_tokens_seen": 21002032, + "step": 2000 + }, + { + "epoch": 0.5150656708730363, + "eval_loss": 0.5097677707672119, + "eval_runtime": 38.0631, + "eval_samples_per_second": 3.153, + "eval_steps_per_second": 0.788, + "num_input_tokens_seen": 21002032, + "step": 2000 + }, + { + "epoch": 0.5163533350502189, + "grad_norm": 6.685054288102831, + "learning_rate": 3.938253580098613e-05, + "loss": 0.3312, + "num_input_tokens_seen": 21054264, + "step": 2005 + }, + { + "epoch": 0.5176409992274015, + "grad_norm": 4.2707442476569035, + "learning_rate": 3.914505045103845e-05, + "loss": 0.2914, + "num_input_tokens_seen": 21106872, + "step": 2010 + }, + { + "epoch": 0.5189286634045841, + "grad_norm": 3.417443771833172, + "learning_rate": 3.8907821822248605e-05, + "loss": 0.3414, + "num_input_tokens_seen": 21159976, + "step": 2015 + }, + { + "epoch": 0.5202163275817667, + "grad_norm": 2.150501014687572, + "learning_rate": 3.867085552510864e-05, + "loss": 0.3701, + "num_input_tokens_seen": 21211920, + "step": 2020 + }, + { + "epoch": 0.5215039917589492, + "grad_norm": 6.342590085133259, + "learning_rate": 3.843415716390644e-05, + "loss": 0.3867, + "num_input_tokens_seen": 21265128, + "step": 2025 + }, + { + "epoch": 0.5227916559361319, + "grad_norm": 2.264113173351574, + "learning_rate": 3.819773233659314e-05, + "loss": 0.3515, + "num_input_tokens_seen": 21317592, + "step": 2030 + }, + { + "epoch": 0.5240793201133145, + "grad_norm": 10.341962425227234, + "learning_rate": 3.7961586634650767e-05, + "loss": 0.3359, + "num_input_tokens_seen": 21370976, + "step": 2035 + }, + { + "epoch": 0.525366984290497, + "grad_norm": 2.2141855095716103, + "learning_rate": 3.772572564296005e-05, + "loss": 0.3265, + 
"num_input_tokens_seen": 21424056, + "step": 2040 + }, + { + "epoch": 0.5266546484676796, + "grad_norm": 14.141024644600748, + "learning_rate": 3.749015493966817e-05, + "loss": 0.3738, + "num_input_tokens_seen": 21476248, + "step": 2045 + }, + { + "epoch": 0.5279423126448622, + "grad_norm": 4.242661385103271, + "learning_rate": 3.7254880096057073e-05, + "loss": 0.3389, + "num_input_tokens_seen": 21527496, + "step": 2050 + }, + { + "epoch": 0.5279423126448622, + "eval_loss": 0.4327767789363861, + "eval_runtime": 38.1845, + "eval_samples_per_second": 3.143, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 21527496, + "step": 2050 + }, + { + "epoch": 0.5292299768220448, + "grad_norm": 2.354802898537749, + "learning_rate": 3.7019906676411446e-05, + "loss": 0.3937, + "num_input_tokens_seen": 21579816, + "step": 2055 + }, + { + "epoch": 0.5305176409992274, + "grad_norm": 6.103564629763687, + "learning_rate": 3.678524023788735e-05, + "loss": 0.4039, + "num_input_tokens_seen": 21631776, + "step": 2060 + }, + { + "epoch": 0.53180530517641, + "grad_norm": 3.5285927755997655, + "learning_rate": 3.6550886330380665e-05, + "loss": 0.3501, + "num_input_tokens_seen": 21683608, + "step": 2065 + }, + { + "epoch": 0.5330929693535926, + "grad_norm": 4.520457201010945, + "learning_rate": 3.631685049639586e-05, + "loss": 0.3334, + "num_input_tokens_seen": 21735672, + "step": 2070 + }, + { + "epoch": 0.5343806335307751, + "grad_norm": 2.4877611413408554, + "learning_rate": 3.608313827091493e-05, + "loss": 0.3292, + "num_input_tokens_seen": 21787592, + "step": 2075 + }, + { + "epoch": 0.5356682977079578, + "grad_norm": 1.8075536605690385, + "learning_rate": 3.5849755181266474e-05, + "loss": 0.3616, + "num_input_tokens_seen": 21840448, + "step": 2080 + }, + { + "epoch": 0.5369559618851404, + "grad_norm": 10.058001239765861, + "learning_rate": 3.5616706746995026e-05, + "loss": 0.3082, + "num_input_tokens_seen": 21893096, + "step": 2085 + }, + { + "epoch": 0.5382436260623229, + "grad_norm": 5.955966804633529, + "learning_rate": 3.538399847973036e-05, + "loss": 0.293, + "num_input_tokens_seen": 21945184, + "step": 2090 + }, + { + "epoch": 0.5395312902395055, + "grad_norm": 5.3172870269416554, + "learning_rate": 3.515163588305735e-05, + "loss": 0.3835, + "num_input_tokens_seen": 21998016, + "step": 2095 + }, + { + "epoch": 0.5408189544166881, + "grad_norm": 10.736456867600818, + "learning_rate": 3.491962445238569e-05, + "loss": 0.3983, + "num_input_tokens_seen": 22050376, + "step": 2100 + }, + { + "epoch": 0.5408189544166881, + "eval_loss": 0.481829971075058, + "eval_runtime": 37.9835, + "eval_samples_per_second": 3.159, + "eval_steps_per_second": 0.79, + "num_input_tokens_seen": 22050376, + "step": 2100 + }, + { + "epoch": 0.5421066185938708, + "grad_norm": 4.2067016944481965, + "learning_rate": 3.4687969674819906e-05, + "loss": 0.4067, + "num_input_tokens_seen": 22102848, + "step": 2105 + }, + { + "epoch": 0.5433942827710533, + "grad_norm": 3.273955419211119, + "learning_rate": 3.445667702902969e-05, + "loss": 0.36, + "num_input_tokens_seen": 22155432, + "step": 2110 + }, + { + "epoch": 0.5446819469482359, + "grad_norm": 3.815876908682745, + "learning_rate": 3.4225751985120215e-05, + "loss": 0.3569, + "num_input_tokens_seen": 22207528, + "step": 2115 + }, + { + "epoch": 0.5459696111254185, + "grad_norm": 3.5864972836865845, + "learning_rate": 3.3995200004502816e-05, + "loss": 0.3503, + "num_input_tokens_seen": 22260016, + "step": 2120 + }, + { + "epoch": 0.547257275302601, + "grad_norm": 
10.259154585756033, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.342, + "num_input_tokens_seen": 22312616, + "step": 2125 + }, + { + "epoch": 0.5485449394797837, + "grad_norm": 10.27121418750564, + "learning_rate": 3.3535237034545675e-05, + "loss": 0.4113, + "num_input_tokens_seen": 22364776, + "step": 2130 + }, + { + "epoch": 0.5498326036569663, + "grad_norm": 7.798195914668443, + "learning_rate": 3.330583692339802e-05, + "loss": 0.311, + "num_input_tokens_seen": 22416944, + "step": 2135 + }, + { + "epoch": 0.5511202678341488, + "grad_norm": 4.484699705421769, + "learning_rate": 3.307683163166934e-05, + "loss": 0.358, + "num_input_tokens_seen": 22468960, + "step": 2140 + }, + { + "epoch": 0.5524079320113314, + "grad_norm": 4.1926914172544665, + "learning_rate": 3.284822657536856e-05, + "loss": 0.378, + "num_input_tokens_seen": 22521624, + "step": 2145 + }, + { + "epoch": 0.553695596188514, + "grad_norm": 4.378365479925035, + "learning_rate": 3.262002716103897e-05, + "loss": 0.3687, + "num_input_tokens_seen": 22574104, + "step": 2150 + }, + { + "epoch": 0.553695596188514, + "eval_loss": 0.4504742920398712, + "eval_runtime": 38.0971, + "eval_samples_per_second": 3.15, + "eval_steps_per_second": 0.787, + "num_input_tokens_seen": 22574104, + "step": 2150 + }, + { + "epoch": 0.5549832603656967, + "grad_norm": 3.1041247759199204, + "learning_rate": 3.2392238785630386e-05, + "loss": 0.3188, + "num_input_tokens_seen": 22626752, + "step": 2155 + }, + { + "epoch": 0.5562709245428792, + "grad_norm": 3.1148146131655734, + "learning_rate": 3.216486683637146e-05, + "loss": 0.3724, + "num_input_tokens_seen": 22679152, + "step": 2160 + }, + { + "epoch": 0.5575585887200618, + "grad_norm": 5.700401460677582, + "learning_rate": 3.1937916690642356e-05, + "loss": 0.3341, + "num_input_tokens_seen": 22732576, + "step": 2165 + }, + { + "epoch": 0.5588462528972444, + "grad_norm": 9.470111774820635, + "learning_rate": 3.1711393715847476e-05, + "loss": 0.3741, + "num_input_tokens_seen": 22785536, + "step": 2170 + }, + { + "epoch": 0.560133917074427, + "grad_norm": 4.526396843690337, + "learning_rate": 3.14853032692886e-05, + "loss": 0.4109, + "num_input_tokens_seen": 22838448, + "step": 2175 + }, + { + "epoch": 0.5614215812516096, + "grad_norm": 4.247095737713632, + "learning_rate": 3.125965069803811e-05, + "loss": 0.3548, + "num_input_tokens_seen": 22891176, + "step": 2180 + }, + { + "epoch": 0.5627092454287922, + "grad_norm": 2.9758376576171446, + "learning_rate": 3.103444133881261e-05, + "loss": 0.339, + "num_input_tokens_seen": 22942832, + "step": 2185 + }, + { + "epoch": 0.5639969096059748, + "grad_norm": 7.772924225732904, + "learning_rate": 3.080968051784666e-05, + "loss": 0.4406, + "num_input_tokens_seen": 22995928, + "step": 2190 + }, + { + "epoch": 0.5652845737831573, + "grad_norm": 3.2135384815306667, + "learning_rate": 3.058537355076683e-05, + "loss": 0.3615, + "num_input_tokens_seen": 23048848, + "step": 2195 + }, + { + "epoch": 0.56657223796034, + "grad_norm": 3.856652958172482, + "learning_rate": 3.0361525742465973e-05, + "loss": 0.3232, + "num_input_tokens_seen": 23101488, + "step": 2200 + }, + { + "epoch": 0.56657223796034, + "eval_loss": 0.45173853635787964, + "eval_runtime": 38.0698, + "eval_samples_per_second": 3.152, + "eval_steps_per_second": 0.788, + "num_input_tokens_seen": 23101488, + "step": 2200 + }, + { + "epoch": 0.5678599021375226, + "grad_norm": 5.085963200845529, + "learning_rate": 3.0138142386977787e-05, + "loss": 0.3453, + "num_input_tokens_seen": 23154320, + "step": 2205 
+ }, + { + "epoch": 0.5691475663147051, + "grad_norm": 4.403681680794225, + "learning_rate": 2.991522876735154e-05, + "loss": 0.3277, + "num_input_tokens_seen": 23206640, + "step": 2210 + }, + { + "epoch": 0.5704352304918877, + "grad_norm": 1.8098384673150376, + "learning_rate": 2.9692790155527227e-05, + "loss": 0.3368, + "num_input_tokens_seen": 23258992, + "step": 2215 + }, + { + "epoch": 0.5717228946690703, + "grad_norm": 9.043551101712044, + "learning_rate": 2.9470831812210837e-05, + "loss": 0.3518, + "num_input_tokens_seen": 23311640, + "step": 2220 + }, + { + "epoch": 0.5730105588462528, + "grad_norm": 5.292334580485412, + "learning_rate": 2.924935898674992e-05, + "loss": 0.3142, + "num_input_tokens_seen": 23364048, + "step": 2225 + }, + { + "epoch": 0.5742982230234355, + "grad_norm": 8.550510313993195, + "learning_rate": 2.902837691700945e-05, + "loss": 0.347, + "num_input_tokens_seen": 23416632, + "step": 2230 + }, + { + "epoch": 0.5755858872006181, + "grad_norm": 5.025171334634832, + "learning_rate": 2.880789082924798e-05, + "loss": 0.3807, + "num_input_tokens_seen": 23468608, + "step": 2235 + }, + { + "epoch": 0.5768735513778007, + "grad_norm": 3.166864828635947, + "learning_rate": 2.858790593799405e-05, + "loss": 0.3271, + "num_input_tokens_seen": 23521312, + "step": 2240 + }, + { + "epoch": 0.5781612155549832, + "grad_norm": 2.5337353386720167, + "learning_rate": 2.8368427445922696e-05, + "loss": 0.296, + "num_input_tokens_seen": 23574104, + "step": 2245 + }, + { + "epoch": 0.5794488797321659, + "grad_norm": 3.178685740597424, + "learning_rate": 2.8149460543732664e-05, + "loss": 0.325, + "num_input_tokens_seen": 23626952, + "step": 2250 + }, + { + "epoch": 0.5794488797321659, + "eval_loss": 0.4990580379962921, + "eval_runtime": 38.0999, + "eval_samples_per_second": 3.15, + "eval_steps_per_second": 0.787, + "num_input_tokens_seen": 23626952, + "step": 2250 + }, + { + "epoch": 0.5807365439093485, + "grad_norm": 7.013960747751967, + "learning_rate": 2.7931010410023518e-05, + "loss": 0.3544, + "num_input_tokens_seen": 23680112, + "step": 2255 + }, + { + "epoch": 0.582024208086531, + "grad_norm": 4.605744038285181, + "learning_rate": 2.771308221117309e-05, + "loss": 0.3467, + "num_input_tokens_seen": 23731896, + "step": 2260 + }, + { + "epoch": 0.5833118722637136, + "grad_norm": 4.9921005761457895, + "learning_rate": 2.749568110121545e-05, + "loss": 0.3374, + "num_input_tokens_seen": 23784616, + "step": 2265 + }, + { + "epoch": 0.5845995364408962, + "grad_norm": 4.366796972583381, + "learning_rate": 2.7278812221718924e-05, + "loss": 0.3509, + "num_input_tokens_seen": 23836920, + "step": 2270 + }, + { + "epoch": 0.5858872006180788, + "grad_norm": 3.5862787852890876, + "learning_rate": 2.7062480701664488e-05, + "loss": 0.3014, + "num_input_tokens_seen": 23890792, + "step": 2275 + }, + { + "epoch": 0.5871748647952614, + "grad_norm": 7.030699453643037, + "learning_rate": 2.6846691657324473e-05, + "loss": 0.4672, + "num_input_tokens_seen": 23943264, + "step": 2280 + }, + { + "epoch": 0.588462528972444, + "grad_norm": 3.981559816955943, + "learning_rate": 2.663145019214163e-05, + "loss": 0.2817, + "num_input_tokens_seen": 23995760, + "step": 2285 + }, + { + "epoch": 0.5897501931496266, + "grad_norm": 4.699964630974459, + "learning_rate": 2.6416761396608362e-05, + "loss": 0.3538, + "num_input_tokens_seen": 24048696, + "step": 2290 + }, + { + "epoch": 0.5910378573268091, + "grad_norm": 8.241455514660794, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.3018, + "num_input_tokens_seen": 
24102248, + "step": 2295 + }, + { + "epoch": 0.5923255215039918, + "grad_norm": 3.2050310147654604, + "learning_rate": 2.598906211098643e-05, + "loss": 0.3322, + "num_input_tokens_seen": 24154624, + "step": 2300 + }, + { + "epoch": 0.5923255215039918, + "eval_loss": 0.4960116744041443, + "eval_runtime": 38.142, + "eval_samples_per_second": 3.146, + "eval_steps_per_second": 0.787, + "num_input_tokens_seen": 24154624, + "step": 2300 + }, + { + "epoch": 0.5936131856811744, + "grad_norm": 8.572037740621793, + "learning_rate": 2.577606173604894e-05, + "loss": 0.3806, + "num_input_tokens_seen": 24206536, + "step": 2305 + }, + { + "epoch": 0.5949008498583569, + "grad_norm": 11.549064999265804, + "learning_rate": 2.5563634260824175e-05, + "loss": 0.4, + "num_input_tokens_seen": 24259448, + "step": 2310 + }, + { + "epoch": 0.5961885140355395, + "grad_norm": 5.644540109257779, + "learning_rate": 2.535178470925323e-05, + "loss": 0.3565, + "num_input_tokens_seen": 24312520, + "step": 2315 + }, + { + "epoch": 0.5974761782127221, + "grad_norm": 4.753875124060683, + "learning_rate": 2.5140518091609256e-05, + "loss": 0.2725, + "num_input_tokens_seen": 24364600, + "step": 2320 + }, + { + "epoch": 0.5987638423899048, + "grad_norm": 3.6385124306884635, + "learning_rate": 2.4929839404378936e-05, + "loss": 0.3154, + "num_input_tokens_seen": 24417624, + "step": 2325 + }, + { + "epoch": 0.6000515065670873, + "grad_norm": 2.660932582778999, + "learning_rate": 2.471975363014428e-05, + "loss": 0.3016, + "num_input_tokens_seen": 24469680, + "step": 2330 + }, + { + "epoch": 0.6013391707442699, + "grad_norm": 6.2187107955592085, + "learning_rate": 2.451026573746482e-05, + "loss": 0.3363, + "num_input_tokens_seen": 24521784, + "step": 2335 + }, + { + "epoch": 0.6026268349214525, + "grad_norm": 4.194683671487059, + "learning_rate": 2.430138068076013e-05, + "loss": 0.3393, + "num_input_tokens_seen": 24573824, + "step": 2340 + }, + { + "epoch": 0.603914499098635, + "grad_norm": 2.8460025008801613, + "learning_rate": 2.4093103400192625e-05, + "loss": 0.3243, + "num_input_tokens_seen": 24625824, + "step": 2345 + }, + { + "epoch": 0.6052021632758177, + "grad_norm": 6.045327035682761, + "learning_rate": 2.388543882155067e-05, + "loss": 0.3651, + "num_input_tokens_seen": 24678768, + "step": 2350 + }, + { + "epoch": 0.6052021632758177, + "eval_loss": 0.4145541489124298, + "eval_runtime": 38.1728, + "eval_samples_per_second": 3.144, + "eval_steps_per_second": 0.786, + "num_input_tokens_seen": 24678768, + "step": 2350 + }, + { + "epoch": 0.6064898274530003, + "grad_norm": 2.7740413001573057, + "learning_rate": 2.3678391856132204e-05, + "loss": 0.3229, + "num_input_tokens_seen": 24730528, + "step": 2355 + }, + { + "epoch": 0.6077774916301828, + "grad_norm": 6.766132750655314, + "learning_rate": 2.3471967400628513e-05, + "loss": 0.3308, + "num_input_tokens_seen": 24784472, + "step": 2360 + }, + { + "epoch": 0.6090651558073654, + "grad_norm": 6.8653733639649515, + "learning_rate": 2.3266170337008398e-05, + "loss": 0.356, + "num_input_tokens_seen": 24838168, + "step": 2365 + }, + { + "epoch": 0.610352819984548, + "grad_norm": 2.3990294356922615, + "learning_rate": 2.306100553240274e-05, + "loss": 0.2784, + "num_input_tokens_seen": 24890552, + "step": 2370 + }, + { + "epoch": 0.6116404841617307, + "grad_norm": 2.929256632803373, + "learning_rate": 2.2856477838989456e-05, + "loss": 0.2859, + "num_input_tokens_seen": 24942904, + "step": 2375 + }, + { + "epoch": 0.6129281483389132, + "grad_norm": 3.932586185965905, + "learning_rate": 
2.2652592093878666e-05, + "loss": 0.3107, + "num_input_tokens_seen": 24995776, + "step": 2380 + }, + { + "epoch": 0.6142158125160958, + "grad_norm": 3.999914127947348, + "learning_rate": 2.244935311899829e-05, + "loss": 0.3131, + "num_input_tokens_seen": 25047848, + "step": 2385 + }, + { + "epoch": 0.6155034766932784, + "grad_norm": 3.803358403729212, + "learning_rate": 2.224676572098007e-05, + "loss": 0.3175, + "num_input_tokens_seen": 25100896, + "step": 2390 + }, + { + "epoch": 0.6167911408704609, + "grad_norm": 10.600664919848047, + "learning_rate": 2.2044834691045873e-05, + "loss": 0.3482, + "num_input_tokens_seen": 25153912, + "step": 2395 + }, + { + "epoch": 0.6180788050476436, + "grad_norm": 5.122783317200166, + "learning_rate": 2.184356480489432e-05, + "loss": 0.3445, + "num_input_tokens_seen": 25206168, + "step": 2400 + }, + { + "epoch": 0.6180788050476436, + "eval_loss": 0.42807063460350037, + "eval_runtime": 38.2326, + "eval_samples_per_second": 3.139, + "eval_steps_per_second": 0.785, + "num_input_tokens_seen": 25206168, + "step": 2400 + }, + { + "epoch": 0.6193664692248262, + "grad_norm": 9.751299586219911, + "learning_rate": 2.1642960822587878e-05, + "loss": 0.3147, + "num_input_tokens_seen": 25258880, + "step": 2405 + }, + { + "epoch": 0.6206541334020087, + "grad_norm": 1.3518511785939038, + "learning_rate": 2.1443027488440338e-05, + "loss": 0.3467, + "num_input_tokens_seen": 25310976, + "step": 2410 + }, + { + "epoch": 0.6219417975791913, + "grad_norm": 4.735151631850019, + "learning_rate": 2.124376953090456e-05, + "loss": 0.3085, + "num_input_tokens_seen": 25363520, + "step": 2415 + }, + { + "epoch": 0.623229461756374, + "grad_norm": 6.109775613794598, + "learning_rate": 2.104519166246059e-05, + "loss": 0.3376, + "num_input_tokens_seen": 25415400, + "step": 2420 + }, + { + "epoch": 0.6245171259335566, + "grad_norm": 1.896185856776787, + "learning_rate": 2.0847298579504344e-05, + "loss": 0.3312, + "num_input_tokens_seen": 25468296, + "step": 2425 + }, + { + "epoch": 0.6258047901107391, + "grad_norm": 3.9069124431889932, + "learning_rate": 2.065009496223638e-05, + "loss": 0.3282, + "num_input_tokens_seen": 25520816, + "step": 2430 + }, + { + "epoch": 0.6270924542879217, + "grad_norm": 4.998056844440976, + "learning_rate": 2.045358547455138e-05, + "loss": 0.321, + "num_input_tokens_seen": 25573416, + "step": 2435 + }, + { + "epoch": 0.6283801184651043, + "grad_norm": 4.15352407282998, + "learning_rate": 2.0257774763927655e-05, + "loss": 0.33, + "num_input_tokens_seen": 25626536, + "step": 2440 + }, + { + "epoch": 0.6296677826422868, + "grad_norm": 7.962840580433044, + "learning_rate": 2.0062667461317426e-05, + "loss": 0.2833, + "num_input_tokens_seen": 25679208, + "step": 2445 + }, + { + "epoch": 0.6309554468194695, + "grad_norm": 3.645397800601146, + "learning_rate": 1.9868268181037185e-05, + "loss": 0.3413, + "num_input_tokens_seen": 25730432, + "step": 2450 + }, + { + "epoch": 0.6309554468194695, + "eval_loss": 0.46914541721343994, + "eval_runtime": 38.1221, + "eval_samples_per_second": 3.148, + "eval_steps_per_second": 0.787, + "num_input_tokens_seen": 25730432, + "step": 2450 + }, + { + "epoch": 0.6322431109966521, + "grad_norm": 7.709254974398925, + "learning_rate": 1.967458152065857e-05, + "loss": 0.3132, + "num_input_tokens_seen": 25782992, + "step": 2455 + }, + { + "epoch": 0.6335307751738347, + "grad_norm": 2.0752736771377287, + "learning_rate": 1.9481612060899646e-05, + "loss": 0.2995, + "num_input_tokens_seen": 25835576, + "step": 2460 + }, + { + "epoch": 
0.6348184393510172, + "grad_norm": 10.170909003408605, + "learning_rate": 1.928936436551661e-05, + "loss": 0.3104, + "num_input_tokens_seen": 25886784, + "step": 2465 + }, + { + "epoch": 0.6361061035281999, + "grad_norm": 1.8795408100126576, + "learning_rate": 1.9097842981195834e-05, + "loss": 0.2866, + "num_input_tokens_seen": 25939408, + "step": 2470 + }, + { + "epoch": 0.6373937677053825, + "grad_norm": 3.0210608988396617, + "learning_rate": 1.8907052437446272e-05, + "loss": 0.2886, + "num_input_tokens_seen": 25992048, + "step": 2475 + }, + { + "epoch": 0.638681431882565, + "grad_norm": 5.74395959342657, + "learning_rate": 1.871699724649244e-05, + "loss": 0.3752, + "num_input_tokens_seen": 26045216, + "step": 2480 + }, + { + "epoch": 0.6399690960597476, + "grad_norm": 2.165308828868311, + "learning_rate": 1.8527681903167644e-05, + "loss": 0.3039, + "num_input_tokens_seen": 26097424, + "step": 2485 + }, + { + "epoch": 0.6412567602369302, + "grad_norm": 2.3585515677006534, + "learning_rate": 1.833911088480767e-05, + "loss": 0.3142, + "num_input_tokens_seen": 26149616, + "step": 2490 + }, + { + "epoch": 0.6425444244141127, + "grad_norm": 3.733186871212028, + "learning_rate": 1.8151288651144893e-05, + "loss": 0.3576, + "num_input_tokens_seen": 26200744, + "step": 2495 + }, + { + "epoch": 0.6438320885912954, + "grad_norm": 3.1608590396449423, + "learning_rate": 1.796421964420285e-05, + "loss": 0.363, + "num_input_tokens_seen": 26252584, + "step": 2500 + }, + { + "epoch": 0.6438320885912954, + "eval_loss": 0.44705262780189514, + "eval_runtime": 38.2407, + "eval_samples_per_second": 3.138, + "eval_steps_per_second": 0.785, + "num_input_tokens_seen": 26252584, + "step": 2500 + }, + { + "epoch": 0.645119752768478, + "grad_norm": 12.715005532803087, + "learning_rate": 1.7777908288191176e-05, + "loss": 0.3113, + "num_input_tokens_seen": 26304800, + "step": 2505 + }, + { + "epoch": 0.6464074169456606, + "grad_norm": 4.389623559119695, + "learning_rate": 1.7592358989400883e-05, + "loss": 0.3581, + "num_input_tokens_seen": 26357680, + "step": 2510 + }, + { + "epoch": 0.6476950811228431, + "grad_norm": 4.708341940810254, + "learning_rate": 1.740757613610028e-05, + "loss": 0.3353, + "num_input_tokens_seen": 26410432, + "step": 2515 + }, + { + "epoch": 0.6489827453000258, + "grad_norm": 2.698266437964572, + "learning_rate": 1.7223564098431067e-05, + "loss": 0.2796, + "num_input_tokens_seen": 26463016, + "step": 2520 + }, + { + "epoch": 0.6502704094772084, + "grad_norm": 2.4430847474817843, + "learning_rate": 1.704032722830512e-05, + "loss": 0.3197, + "num_input_tokens_seen": 26515408, + "step": 2525 + }, + { + "epoch": 0.6515580736543909, + "grad_norm": 2.729151807047382, + "learning_rate": 1.68578698593014e-05, + "loss": 0.3182, + "num_input_tokens_seen": 26567024, + "step": 2530 + }, + { + "epoch": 0.6528457378315735, + "grad_norm": 12.016926866019531, + "learning_rate": 1.6676196306563613e-05, + "loss": 0.3822, + "num_input_tokens_seen": 26619744, + "step": 2535 + }, + { + "epoch": 0.6541334020087561, + "grad_norm": 3.7284612790252294, + "learning_rate": 1.6495310866698093e-05, + "loss": 0.2853, + "num_input_tokens_seen": 26672408, + "step": 2540 + }, + { + "epoch": 0.6554210661859388, + "grad_norm": 4.562253048250174, + "learning_rate": 1.631521781767214e-05, + "loss": 0.3622, + "num_input_tokens_seen": 26724488, + "step": 2545 + }, + { + "epoch": 0.6567087303631213, + "grad_norm": 9.803435725573266, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.3195, + "num_input_tokens_seen": 26776816, + 
"step": 2550 + }, + { + "epoch": 0.6567087303631213, + "eval_loss": 0.43731561303138733, + "eval_runtime": 38.3205, + "eval_samples_per_second": 3.131, + "eval_steps_per_second": 0.783, + "num_input_tokens_seen": 26776816, + "step": 2550 + }, + { + "epoch": 0.6579963945403039, + "grad_norm": 4.114015193521732, + "learning_rate": 1.5957425910206785e-05, + "loss": 0.3547, + "num_input_tokens_seen": 26829304, + "step": 2555 + }, + { + "epoch": 0.6592840587174865, + "grad_norm": 3.0843395555920994, + "learning_rate": 1.577973551359877e-05, + "loss": 0.281, + "num_input_tokens_seen": 26881272, + "step": 2560 + }, + { + "epoch": 0.660571722894669, + "grad_norm": 1.2091864940073456, + "learning_rate": 1.560285443129296e-05, + "loss": 0.2814, + "num_input_tokens_seen": 26934104, + "step": 2565 + }, + { + "epoch": 0.6618593870718517, + "grad_norm": 3.2274724027367263, + "learning_rate": 1.542678684655306e-05, + "loss": 0.2602, + "num_input_tokens_seen": 26986248, + "step": 2570 + }, + { + "epoch": 0.6631470512490343, + "grad_norm": 4.701885261412978, + "learning_rate": 1.5251536923403426e-05, + "loss": 0.336, + "num_input_tokens_seen": 27038528, + "step": 2575 + }, + { + "epoch": 0.6644347154262168, + "grad_norm": 8.345266437633946, + "learning_rate": 1.5077108806530581e-05, + "loss": 0.2867, + "num_input_tokens_seen": 27090792, + "step": 2580 + }, + { + "epoch": 0.6657223796033994, + "grad_norm": 1.737622406149806, + "learning_rate": 1.4903506621185192e-05, + "loss": 0.2898, + "num_input_tokens_seen": 27143544, + "step": 2585 + }, + { + "epoch": 0.667010043780582, + "grad_norm": 3.3292538558539, + "learning_rate": 1.4730734473084568e-05, + "loss": 0.2955, + "num_input_tokens_seen": 27195632, + "step": 2590 + }, + { + "epoch": 0.6682977079577647, + "grad_norm": 3.0195263199381492, + "learning_rate": 1.4558796448315504e-05, + "loss": 0.281, + "num_input_tokens_seen": 27248472, + "step": 2595 + }, + { + "epoch": 0.6695853721349472, + "grad_norm": 2.0014947688908067, + "learning_rate": 1.4387696613237612e-05, + "loss": 0.3075, + "num_input_tokens_seen": 27301776, + "step": 2600 + }, + { + "epoch": 0.6695853721349472, + "eval_loss": 0.4504788815975189, + "eval_runtime": 65.1834, + "eval_samples_per_second": 1.841, + "eval_steps_per_second": 0.46, + "num_input_tokens_seen": 27301776, + "step": 2600 + }, + { + "epoch": 0.6708730363121298, + "grad_norm": 11.33277292417631, + "learning_rate": 1.4217439014387251e-05, + "loss": 0.3403, + "num_input_tokens_seen": 27354136, + "step": 2605 + }, + { + "epoch": 0.6721607004893124, + "grad_norm": 5.510244242476496, + "learning_rate": 1.404802767838176e-05, + "loss": 0.286, + "num_input_tokens_seen": 27405792, + "step": 2610 + }, + { + "epoch": 0.6734483646664949, + "grad_norm": 2.193500058552259, + "learning_rate": 1.3879466611824199e-05, + "loss": 0.2851, + "num_input_tokens_seen": 27457864, + "step": 2615 + }, + { + "epoch": 0.6747360288436776, + "grad_norm": 4.514329899988941, + "learning_rate": 1.371175980120864e-05, + "loss": 0.3605, + "num_input_tokens_seen": 27511520, + "step": 2620 + }, + { + "epoch": 0.6760236930208602, + "grad_norm": 5.425713946109257, + "learning_rate": 1.3544911212825906e-05, + "loss": 0.2961, + "num_input_tokens_seen": 27564200, + "step": 2625 + }, + { + "epoch": 0.6773113571980427, + "grad_norm": 4.175489265713165, + "learning_rate": 1.337892479266974e-05, + "loss": 0.3004, + "num_input_tokens_seen": 27616704, + "step": 2630 + }, + { + "epoch": 0.6785990213752253, + "grad_norm": 10.44575522954796, + "learning_rate": 
1.3213804466343421e-05, + "loss": 0.2878, + "num_input_tokens_seen": 27668944, + "step": 2635 + }, + { + "epoch": 0.6798866855524079, + "grad_norm": 3.7122542259834996, + "learning_rate": 1.3049554138967051e-05, + "loss": 0.2863, + "num_input_tokens_seen": 27722304, + "step": 2640 + }, + { + "epoch": 0.6811743497295906, + "grad_norm": 3.3427153391507605, + "learning_rate": 1.2886177695085078e-05, + "loss": 0.3084, + "num_input_tokens_seen": 27775400, + "step": 2645 + }, + { + "epoch": 0.6824620139067731, + "grad_norm": 4.866560148374017, + "learning_rate": 1.2723678998574512e-05, + "loss": 0.324, + "num_input_tokens_seen": 27827480, + "step": 2650 + }, + { + "epoch": 0.6824620139067731, + "eval_loss": 0.5079630613327026, + "eval_runtime": 37.7274, + "eval_samples_per_second": 3.181, + "eval_steps_per_second": 0.795, + "num_input_tokens_seen": 27827480, + "step": 2650 + }, + { + "epoch": 0.6837496780839557, + "grad_norm": 4.053406132978294, + "learning_rate": 1.2562061892553473e-05, + "loss": 0.3189, + "num_input_tokens_seen": 27879064, + "step": 2655 + }, + { + "epoch": 0.6850373422611383, + "grad_norm": 3.0207067473597937, + "learning_rate": 1.2401330199290367e-05, + "loss": 0.2458, + "num_input_tokens_seen": 27931864, + "step": 2660 + }, + { + "epoch": 0.6863250064383208, + "grad_norm": 4.476781511295854, + "learning_rate": 1.224148772011346e-05, + "loss": 0.3055, + "num_input_tokens_seen": 27984408, + "step": 2665 + }, + { + "epoch": 0.6876126706155035, + "grad_norm": 10.00871121504839, + "learning_rate": 1.2082538235320929e-05, + "loss": 0.2993, + "num_input_tokens_seen": 28037368, + "step": 2670 + }, + { + "epoch": 0.6889003347926861, + "grad_norm": 8.5963867396194, + "learning_rate": 1.1924485504091565e-05, + "loss": 0.3572, + "num_input_tokens_seen": 28090768, + "step": 2675 + }, + { + "epoch": 0.6901879989698687, + "grad_norm": 2.8349545388422857, + "learning_rate": 1.1767333264395736e-05, + "loss": 0.4043, + "num_input_tokens_seen": 28142432, + "step": 2680 + }, + { + "epoch": 0.6914756631470512, + "grad_norm": 3.857351389318571, + "learning_rate": 1.1611085232907132e-05, + "loss": 0.3288, + "num_input_tokens_seen": 28194896, + "step": 2685 + }, + { + "epoch": 0.6927633273242338, + "grad_norm": 2.9121673846993943, + "learning_rate": 1.14557451049147e-05, + "loss": 0.3491, + "num_input_tokens_seen": 28247264, + "step": 2690 + }, + { + "epoch": 0.6940509915014165, + "grad_norm": 5.691957290096305, + "learning_rate": 1.1301316554235397e-05, + "loss": 0.2881, + "num_input_tokens_seen": 28299864, + "step": 2695 + }, + { + "epoch": 0.695338655678599, + "grad_norm": 3.366628982199851, + "learning_rate": 1.114780323312724e-05, + "loss": 0.3076, + "num_input_tokens_seen": 28352368, + "step": 2700 + }, + { + "epoch": 0.695338655678599, + "eval_loss": 0.4338160753250122, + "eval_runtime": 38.6118, + "eval_samples_per_second": 3.108, + "eval_steps_per_second": 0.777, + "num_input_tokens_seen": 28352368, + "step": 2700 + }, + { + "epoch": 0.6966263198557816, + "grad_norm": 4.846751637664359, + "learning_rate": 1.0995208772202897e-05, + "loss": 0.3024, + "num_input_tokens_seen": 28404360, + "step": 2705 + }, + { + "epoch": 0.6979139840329642, + "grad_norm": 3.389210038033102, + "learning_rate": 1.0843536780343865e-05, + "loss": 0.2668, + "num_input_tokens_seen": 28456960, + "step": 2710 + }, + { + "epoch": 0.6992016482101467, + "grad_norm": 3.86068493876399, + "learning_rate": 1.069279084461513e-05, + "loss": 0.3344, + "num_input_tokens_seen": 28509448, + "step": 2715 + }, + { + "epoch": 
0.7004893123873294, + "grad_norm": 4.640152880083763, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.2942, + "num_input_tokens_seen": 28561496, + "step": 2720 + }, + { + "epoch": 0.701776976564512, + "grad_norm": 4.8249731253866965, + "learning_rate": 1.0394091380217352e-05, + "loss": 0.3209, + "num_input_tokens_seen": 28613224, + "step": 2725 + }, + { + "epoch": 0.7030646407416946, + "grad_norm": 3.5230741478299064, + "learning_rate": 1.0246144915834683e-05, + "loss": 0.3021, + "num_input_tokens_seen": 28665360, + "step": 2730 + }, + { + "epoch": 0.7043523049188771, + "grad_norm": 3.128735151687698, + "learning_rate": 1.0099138635988026e-05, + "loss": 0.211, + "num_input_tokens_seen": 28719488, + "step": 2735 + }, + { + "epoch": 0.7056399690960597, + "grad_norm": 1.3121990500145593, + "learning_rate": 9.953076017397578e-06, + "loss": 0.3017, + "num_input_tokens_seen": 28771880, + "step": 2740 + }, + { + "epoch": 0.7069276332732424, + "grad_norm": 8.516269925603744, + "learning_rate": 9.807960514465792e-06, + "loss": 0.3022, + "num_input_tokens_seen": 28825096, + "step": 2745 + }, + { + "epoch": 0.7082152974504249, + "grad_norm": 2.554842562479388, + "learning_rate": 9.663795559195733e-06, + "loss": 0.2817, + "num_input_tokens_seen": 28877960, + "step": 2750 + }, + { + "epoch": 0.7082152974504249, + "eval_loss": 0.4439634680747986, + "eval_runtime": 38.5485, + "eval_samples_per_second": 3.113, + "eval_steps_per_second": 0.778, + "num_input_tokens_seen": 28877960, + "step": 2750 + }, + { + "epoch": 0.7095029616276075, + "grad_norm": 5.663465836449411, + "learning_rate": 9.520584561109864e-06, + "loss": 0.2854, + "num_input_tokens_seen": 28930512, + "step": 2755 + }, + { + "epoch": 0.7107906258047901, + "grad_norm": 7.439380473916582, + "learning_rate": 9.378330907169386e-06, + "loss": 0.3635, + "num_input_tokens_seen": 28984048, + "step": 2760 + }, + { + "epoch": 0.7120782899819728, + "grad_norm": 4.5771879776359405, + "learning_rate": 9.237037961694223e-06, + "loss": 0.3276, + "num_input_tokens_seen": 29034368, + "step": 2765 + }, + { + "epoch": 0.7133659541591553, + "grad_norm": 7.8677378416300465, + "learning_rate": 9.096709066283354e-06, + "loss": 0.2939, + "num_input_tokens_seen": 29086720, + "step": 2770 + }, + { + "epoch": 0.7146536183363379, + "grad_norm": 1.5291169053199134, + "learning_rate": 8.957347539735872e-06, + "loss": 0.2814, + "num_input_tokens_seen": 29139744, + "step": 2775 + }, + { + "epoch": 0.7159412825135205, + "grad_norm": 4.891124821027818, + "learning_rate": 8.818956677972406e-06, + "loss": 0.3773, + "num_input_tokens_seen": 29192168, + "step": 2780 + }, + { + "epoch": 0.717228946690703, + "grad_norm": 9.680575721553687, + "learning_rate": 8.681539753957269e-06, + "loss": 0.3126, + "num_input_tokens_seen": 29244896, + "step": 2785 + }, + { + "epoch": 0.7185166108678857, + "grad_norm": 1.859678529995154, + "learning_rate": 8.545100017620988e-06, + "loss": 0.3038, + "num_input_tokens_seen": 29297424, + "step": 2790 + }, + { + "epoch": 0.7198042750450683, + "grad_norm": 3.4590192982362518, + "learning_rate": 8.409640695783443e-06, + "loss": 0.308, + "num_input_tokens_seen": 29349664, + "step": 2795 + }, + { + "epoch": 0.7210919392222508, + "grad_norm": 2.2011746218628416, + "learning_rate": 8.275164992077556e-06, + "loss": 0.3567, + "num_input_tokens_seen": 29402040, + "step": 2800 + }, + { + "epoch": 0.7210919392222508, + "eval_loss": 0.4282406270503998, + "eval_runtime": 38.0305, + "eval_samples_per_second": 3.155, + "eval_steps_per_second": 0.789, + 
"num_input_tokens_seen": 29402040, + "step": 2800 + }, + { + "epoch": 0.7223796033994334, + "grad_norm": 3.561696884073749, + "learning_rate": 8.141676086873572e-06, + "loss": 0.2538, + "num_input_tokens_seen": 29455456, + "step": 2805 + }, + { + "epoch": 0.723667267576616, + "grad_norm": 2.941091025150912, + "learning_rate": 8.009177137203794e-06, + "loss": 0.3374, + "num_input_tokens_seen": 29507136, + "step": 2810 + }, + { + "epoch": 0.7249549317537987, + "grad_norm": 1.7626408698187983, + "learning_rate": 7.877671276687898e-06, + "loss": 0.3303, + "num_input_tokens_seen": 29558760, + "step": 2815 + }, + { + "epoch": 0.7262425959309812, + "grad_norm": 2.788131053787238, + "learning_rate": 7.747161615458902e-06, + "loss": 0.2834, + "num_input_tokens_seen": 29612000, + "step": 2820 + }, + { + "epoch": 0.7275302601081638, + "grad_norm": 3.8899073323572444, + "learning_rate": 7.617651240089546e-06, + "loss": 0.2746, + "num_input_tokens_seen": 29664472, + "step": 2825 + }, + { + "epoch": 0.7288179242853464, + "grad_norm": 1.3611659955678468, + "learning_rate": 7.489143213519301e-06, + "loss": 0.315, + "num_input_tokens_seen": 29716440, + "step": 2830 + }, + { + "epoch": 0.7301055884625289, + "grad_norm": 5.373259186650034, + "learning_rate": 7.361640574981937e-06, + "loss": 0.2877, + "num_input_tokens_seen": 29769248, + "step": 2835 + }, + { + "epoch": 0.7313932526397116, + "grad_norm": 2.796854999712465, + "learning_rate": 7.2351463399336735e-06, + "loss": 0.2953, + "num_input_tokens_seen": 29821968, + "step": 2840 + }, + { + "epoch": 0.7326809168168942, + "grad_norm": 6.039730307609144, + "learning_rate": 7.109663499981834e-06, + "loss": 0.2709, + "num_input_tokens_seen": 29875104, + "step": 2845 + }, + { + "epoch": 0.7339685809940767, + "grad_norm": 4.572150266141393, + "learning_rate": 6.985195022814067e-06, + "loss": 0.3024, + "num_input_tokens_seen": 29928032, + "step": 2850 + }, + { + "epoch": 0.7339685809940767, + "eval_loss": 0.47043517231941223, + "eval_runtime": 37.451, + "eval_samples_per_second": 3.204, + "eval_steps_per_second": 0.801, + "num_input_tokens_seen": 29928032, + "step": 2850 + }, + { + "epoch": 0.7352562451712593, + "grad_norm": 9.02733401851065, + "learning_rate": 6.861743852128233e-06, + "loss": 0.3425, + "num_input_tokens_seen": 29980608, + "step": 2855 + }, + { + "epoch": 0.7365439093484419, + "grad_norm": 3.7033826107513947, + "learning_rate": 6.7393129075627335e-06, + "loss": 0.3095, + "num_input_tokens_seen": 30033680, + "step": 2860 + }, + { + "epoch": 0.7378315735256246, + "grad_norm": 6.765425363789277, + "learning_rate": 6.6179050846274515e-06, + "loss": 0.2894, + "num_input_tokens_seen": 30086016, + "step": 2865 + }, + { + "epoch": 0.7391192377028071, + "grad_norm": 3.14927032436602, + "learning_rate": 6.497523254635296e-06, + "loss": 0.3044, + "num_input_tokens_seen": 30139216, + "step": 2870 + }, + { + "epoch": 0.7404069018799897, + "grad_norm": 4.393134612270862, + "learning_rate": 6.37817026463432e-06, + "loss": 0.3116, + "num_input_tokens_seen": 30191240, + "step": 2875 + }, + { + "epoch": 0.7416945660571723, + "grad_norm": 7.359516623863216, + "learning_rate": 6.25984893734034e-06, + "loss": 0.3229, + "num_input_tokens_seen": 30243680, + "step": 2880 + }, + { + "epoch": 0.7429822302343548, + "grad_norm": 3.1847287957730313, + "learning_rate": 6.142562071070179e-06, + "loss": 0.2495, + "num_input_tokens_seen": 30296376, + "step": 2885 + }, + { + "epoch": 0.7442698944115375, + "grad_norm": 4.477442751590225, + "learning_rate": 6.026312439675552e-06, 
+ "loss": 0.3083, + "num_input_tokens_seen": 30349864, + "step": 2890 + }, + { + "epoch": 0.7455575585887201, + "grad_norm": 4.438478594240989, + "learning_rate": 5.911102792477357e-06, + "loss": 0.3252, + "num_input_tokens_seen": 30402248, + "step": 2895 + }, + { + "epoch": 0.7468452227659027, + "grad_norm": 2.1585574607675873, + "learning_rate": 5.796935854200763e-06, + "loss": 0.3167, + "num_input_tokens_seen": 30455480, + "step": 2900 + }, + { + "epoch": 0.7468452227659027, + "eval_loss": 0.46323254704475403, + "eval_runtime": 38.4671, + "eval_samples_per_second": 3.12, + "eval_steps_per_second": 0.78, + "num_input_tokens_seen": 30455480, + "step": 2900 + }, + { + "epoch": 0.7481328869430852, + "grad_norm": 2.3992253819845457, + "learning_rate": 5.683814324910685e-06, + "loss": 0.3063, + "num_input_tokens_seen": 30507096, + "step": 2905 + }, + { + "epoch": 0.7494205511202678, + "grad_norm": 1.9374646045639132, + "learning_rate": 5.571740879947979e-06, + "loss": 0.2694, + "num_input_tokens_seen": 30558760, + "step": 2910 + }, + { + "epoch": 0.7507082152974505, + "grad_norm": 2.935921090645564, + "learning_rate": 5.4607181698661634e-06, + "loss": 0.2578, + "num_input_tokens_seen": 30612024, + "step": 2915 + }, + { + "epoch": 0.751995879474633, + "grad_norm": 3.630419333089186, + "learning_rate": 5.35074882036869e-06, + "loss": 0.3526, + "num_input_tokens_seen": 30665272, + "step": 2920 + }, + { + "epoch": 0.7532835436518156, + "grad_norm": 5.477070657190314, + "learning_rate": 5.241835432246889e-06, + "loss": 0.2965, + "num_input_tokens_seen": 30717104, + "step": 2925 + }, + { + "epoch": 0.7545712078289982, + "grad_norm": 4.515862685899356, + "learning_rate": 5.133980581318459e-06, + "loss": 0.3122, + "num_input_tokens_seen": 30769656, + "step": 2930 + }, + { + "epoch": 0.7558588720061807, + "grad_norm": 2.2550437769263585, + "learning_rate": 5.027186818366542e-06, + "loss": 0.2968, + "num_input_tokens_seen": 30822016, + "step": 2935 + }, + { + "epoch": 0.7571465361833634, + "grad_norm": 5.093463419929814, + "learning_rate": 4.921456669079366e-06, + "loss": 0.3536, + "num_input_tokens_seen": 30873336, + "step": 2940 + }, + { + "epoch": 0.758434200360546, + "grad_norm": 3.53245369843195, + "learning_rate": 4.816792633990569e-06, + "loss": 0.2721, + "num_input_tokens_seen": 30926104, + "step": 2945 + }, + { + "epoch": 0.7597218645377286, + "grad_norm": 6.403963575911819, + "learning_rate": 4.713197188420026e-06, + "loss": 0.2899, + "num_input_tokens_seen": 30979312, + "step": 2950 + }, + { + "epoch": 0.7597218645377286, + "eval_loss": 0.4720001518726349, + "eval_runtime": 38.5709, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 0.778, + "num_input_tokens_seen": 30979312, + "step": 2950 + }, + { + "epoch": 0.7610095287149111, + "grad_norm": 2.8702187527192606, + "learning_rate": 4.610672782415276e-06, + "loss": 0.262, + "num_input_tokens_seen": 31032752, + "step": 2955 + }, + { + "epoch": 0.7622971928920937, + "grad_norm": 5.0777277551984685, + "learning_rate": 4.509221840693656e-06, + "loss": 0.3094, + "num_input_tokens_seen": 31085208, + "step": 2960 + }, + { + "epoch": 0.7635848570692764, + "grad_norm": 4.075901162722221, + "learning_rate": 4.408846762584901e-06, + "loss": 0.2995, + "num_input_tokens_seen": 31137584, + "step": 2965 + }, + { + "epoch": 0.7648725212464589, + "grad_norm": 2.7403373602794407, + "learning_rate": 4.309549921974421e-06, + "loss": 0.312, + "num_input_tokens_seen": 31190160, + "step": 2970 + }, + { + "epoch": 0.7661601854236415, + "grad_norm": 
10.928832756124109, + "learning_rate": 4.2113336672471245e-06, + "loss": 0.2961, + "num_input_tokens_seen": 31242024, + "step": 2975 + }, + { + "epoch": 0.7674478496008241, + "grad_norm": 3.207870206407515, + "learning_rate": 4.114200321231937e-06, + "loss": 0.3188, + "num_input_tokens_seen": 31294272, + "step": 2980 + }, + { + "epoch": 0.7687355137780066, + "grad_norm": 10.104667188926854, + "learning_rate": 4.018152181146823e-06, + "loss": 0.2721, + "num_input_tokens_seen": 31347128, + "step": 2985 + }, + { + "epoch": 0.7700231779551893, + "grad_norm": 3.011784186718581, + "learning_rate": 3.923191518544434e-06, + "loss": 0.2993, + "num_input_tokens_seen": 31399576, + "step": 2990 + }, + { + "epoch": 0.7713108421323719, + "grad_norm": 5.814155800149598, + "learning_rate": 3.829320579258466e-06, + "loss": 0.3327, + "num_input_tokens_seen": 31451704, + "step": 2995 + }, + { + "epoch": 0.7725985063095545, + "grad_norm": 2.292303703876774, + "learning_rate": 3.7365415833504725e-06, + "loss": 0.3522, + "num_input_tokens_seen": 31503344, + "step": 3000 + }, + { + "epoch": 0.7725985063095545, + "eval_loss": 0.4726044833660126, + "eval_runtime": 37.8146, + "eval_samples_per_second": 3.173, + "eval_steps_per_second": 0.793, + "num_input_tokens_seen": 31503344, + "step": 3000 + }, + { + "epoch": 0.773886170486737, + "grad_norm": 1.6974033383197926, + "learning_rate": 3.644856725057405e-06, + "loss": 0.3076, + "num_input_tokens_seen": 31555896, + "step": 3005 + }, + { + "epoch": 0.7751738346639196, + "grad_norm": 2.9199863233877923, + "learning_rate": 3.554268172739661e-06, + "loss": 0.2934, + "num_input_tokens_seen": 31608208, + "step": 3010 + }, + { + "epoch": 0.7764614988411023, + "grad_norm": 3.5836227929171787, + "learning_rate": 3.4647780688298826e-06, + "loss": 0.3333, + "num_input_tokens_seen": 31659576, + "step": 3015 + }, + { + "epoch": 0.7777491630182848, + "grad_norm": 5.740247853887878, + "learning_rate": 3.376388529782215e-06, + "loss": 0.2666, + "num_input_tokens_seen": 31712176, + "step": 3020 + }, + { + "epoch": 0.7790368271954674, + "grad_norm": 1.6722725422207578, + "learning_rate": 3.2891016460222967e-06, + "loss": 0.2454, + "num_input_tokens_seen": 31765672, + "step": 3025 + }, + { + "epoch": 0.78032449137265, + "grad_norm": 1.3887229307567428, + "learning_rate": 3.2029194818977983e-06, + "loss": 0.3242, + "num_input_tokens_seen": 31818456, + "step": 3030 + }, + { + "epoch": 0.7816121555498327, + "grad_norm": 6.645653838568147, + "learning_rate": 3.117844075629617e-06, + "loss": 0.3378, + "num_input_tokens_seen": 31871648, + "step": 3035 + }, + { + "epoch": 0.7828998197270152, + "grad_norm": 13.953403962515383, + "learning_rate": 3.033877439263666e-06, + "loss": 0.2981, + "num_input_tokens_seen": 31924688, + "step": 3040 + }, + { + "epoch": 0.7841874839041978, + "grad_norm": 8.96486812016057, + "learning_rate": 2.951021558623274e-06, + "loss": 0.2909, + "num_input_tokens_seen": 31977752, + "step": 3045 + }, + { + "epoch": 0.7854751480813804, + "grad_norm": 5.486803825217382, + "learning_rate": 2.869278393262226e-06, + "loss": 0.3137, + "num_input_tokens_seen": 32030016, + "step": 3050 + }, + { + "epoch": 0.7854751480813804, + "eval_loss": 0.4746885299682617, + "eval_runtime": 38.0464, + "eval_samples_per_second": 3.154, + "eval_steps_per_second": 0.789, + "num_input_tokens_seen": 32030016, + "step": 3050 + }, + { + "epoch": 0.7867628122585629, + "grad_norm": 1.6401540308745686, + "learning_rate": 2.7886498764184588e-06, + "loss": 0.3247, + "num_input_tokens_seen": 32082256, + 
"step": 3055 + }, + { + "epoch": 0.7880504764357456, + "grad_norm": 2.305310160123287, + "learning_rate": 2.7091379149682685e-06, + "loss": 0.2895, + "num_input_tokens_seen": 32134592, + "step": 3060 + }, + { + "epoch": 0.7893381406129282, + "grad_norm": 7.79411199672483, + "learning_rate": 2.6307443893812843e-06, + "loss": 0.294, + "num_input_tokens_seen": 32187064, + "step": 3065 + }, + { + "epoch": 0.7906258047901107, + "grad_norm": 7.8803119684251355, + "learning_rate": 2.5534711536759404e-06, + "loss": 0.3205, + "num_input_tokens_seen": 32238944, + "step": 3070 + }, + { + "epoch": 0.7919134689672933, + "grad_norm": 1.651743530845747, + "learning_rate": 2.4773200353756798e-06, + "loss": 0.2726, + "num_input_tokens_seen": 32291528, + "step": 3075 + }, + { + "epoch": 0.7932011331444759, + "grad_norm": 5.642476416103777, + "learning_rate": 2.4022928354656473e-06, + "loss": 0.3012, + "num_input_tokens_seen": 32343600, + "step": 3080 + }, + { + "epoch": 0.7944887973216586, + "grad_norm": 3.639368693424175, + "learning_rate": 2.3283913283502044e-06, + "loss": 0.2712, + "num_input_tokens_seen": 32396128, + "step": 3085 + }, + { + "epoch": 0.7957764614988411, + "grad_norm": 6.532937861900995, + "learning_rate": 2.2556172618108997e-06, + "loss": 0.3342, + "num_input_tokens_seen": 32448624, + "step": 3090 + }, + { + "epoch": 0.7970641256760237, + "grad_norm": 1.4088613654984938, + "learning_rate": 2.183972356965125e-06, + "loss": 0.3132, + "num_input_tokens_seen": 32500664, + "step": 3095 + }, + { + "epoch": 0.7983517898532063, + "grad_norm": 3.205147557757995, + "learning_rate": 2.113458308225458e-06, + "loss": 0.2856, + "num_input_tokens_seen": 32553288, + "step": 3100 + }, + { + "epoch": 0.7983517898532063, + "eval_loss": 0.4740166962146759, + "eval_runtime": 37.5378, + "eval_samples_per_second": 3.197, + "eval_steps_per_second": 0.799, + "num_input_tokens_seen": 32553288, + "step": 3100 + }, + { + "epoch": 0.7996394540303888, + "grad_norm": 8.05961476917595, + "learning_rate": 2.0440767832595574e-06, + "loss": 0.3052, + "num_input_tokens_seen": 32606096, + "step": 3105 + }, + { + "epoch": 0.8009271182075715, + "grad_norm": 3.1428977500375326, + "learning_rate": 1.975829422950709e-06, + "loss": 0.2125, + "num_input_tokens_seen": 32659376, + "step": 3110 + }, + { + "epoch": 0.8022147823847541, + "grad_norm": 2.8855336999591295, + "learning_rate": 1.908717841359048e-06, + "loss": 0.3122, + "num_input_tokens_seen": 32712168, + "step": 3115 + }, + { + "epoch": 0.8035024465619367, + "grad_norm": 12.674047700213576, + "learning_rate": 1.8427436256833852e-06, + "loss": 0.3006, + "num_input_tokens_seen": 32764296, + "step": 3120 + }, + { + "epoch": 0.8047901107391192, + "grad_norm": 1.5292819995856641, + "learning_rate": 1.7779083362236547e-06, + "loss": 0.3077, + "num_input_tokens_seen": 32815296, + "step": 3125 + }, + { + "epoch": 0.8060777749163018, + "grad_norm": 12.068461011216378, + "learning_rate": 1.7142135063440035e-06, + "loss": 0.29, + "num_input_tokens_seen": 32867288, + "step": 3130 + }, + { + "epoch": 0.8073654390934845, + "grad_norm": 9.37062799812982, + "learning_rate": 1.6516606424365643e-06, + "loss": 0.3574, + "num_input_tokens_seen": 32919584, + "step": 3135 + }, + { + "epoch": 0.808653103270667, + "grad_norm": 5.777474878278418, + "learning_rate": 1.5902512238857858e-06, + "loss": 0.2414, + "num_input_tokens_seen": 32972736, + "step": 3140 + }, + { + "epoch": 0.8099407674478496, + "grad_norm": 3.0096174763729864, + "learning_rate": 1.5299867030334814e-06, + "loss": 0.2521, + 
"num_input_tokens_seen": 33026320, + "step": 3145 + }, + { + "epoch": 0.8112284316250322, + "grad_norm": 5.573236169553209, + "learning_rate": 1.4708685051444515e-06, + "loss": 0.2669, + "num_input_tokens_seen": 33078960, + "step": 3150 + }, + { + "epoch": 0.8112284316250322, + "eval_loss": 0.4687062203884125, + "eval_runtime": 37.4391, + "eval_samples_per_second": 3.205, + "eval_steps_per_second": 0.801, + "num_input_tokens_seen": 33078960, + "step": 3150 + }, + { + "epoch": 0.8125160958022147, + "grad_norm": 9.248479787863037, + "learning_rate": 1.4128980283727943e-06, + "loss": 0.2622, + "num_input_tokens_seen": 33131352, + "step": 3155 + }, + { + "epoch": 0.8138037599793974, + "grad_norm": 2.5549795824819377, + "learning_rate": 1.356076643728843e-06, + "loss": 0.2776, + "num_input_tokens_seen": 33183032, + "step": 3160 + }, + { + "epoch": 0.81509142415658, + "grad_norm": 10.008731313823478, + "learning_rate": 1.3004056950467135e-06, + "loss": 0.247, + "num_input_tokens_seen": 33235992, + "step": 3165 + }, + { + "epoch": 0.8163790883337626, + "grad_norm": 4.169049205827014, + "learning_rate": 1.2458864989525698e-06, + "loss": 0.2917, + "num_input_tokens_seen": 33288696, + "step": 3170 + }, + { + "epoch": 0.8176667525109451, + "grad_norm": 7.528979068812372, + "learning_rate": 1.19252034483342e-06, + "loss": 0.257, + "num_input_tokens_seen": 33341472, + "step": 3175 + }, + { + "epoch": 0.8189544166881277, + "grad_norm": 2.293449526780795, + "learning_rate": 1.1403084948067021e-06, + "loss": 0.2836, + "num_input_tokens_seen": 33394856, + "step": 3180 + }, + { + "epoch": 0.8202420808653104, + "grad_norm": 3.9562566364327987, + "learning_rate": 1.089252183690348e-06, + "loss": 0.3201, + "num_input_tokens_seen": 33447208, + "step": 3185 + }, + { + "epoch": 0.8215297450424929, + "grad_norm": 3.5877536742177227, + "learning_rate": 1.0393526189736602e-06, + "loss": 0.2751, + "num_input_tokens_seen": 33500288, + "step": 3190 + }, + { + "epoch": 0.8228174092196755, + "grad_norm": 3.3758478518136252, + "learning_rate": 9.906109807887032e-07, + "loss": 0.3231, + "num_input_tokens_seen": 33552400, + "step": 3195 + }, + { + "epoch": 0.8241050733968581, + "grad_norm": 5.857495732246955, + "learning_rate": 9.430284218824026e-07, + "loss": 0.3322, + "num_input_tokens_seen": 33604328, + "step": 3200 + }, + { + "epoch": 0.8241050733968581, + "eval_loss": 0.47025421261787415, + "eval_runtime": 37.5509, + "eval_samples_per_second": 3.196, + "eval_steps_per_second": 0.799, + "num_input_tokens_seen": 33604328, + "step": 3200 + }, + { + "epoch": 0.8253927375740406, + "grad_norm": 3.137517906900648, + "learning_rate": 8.966060675892951e-07, + "loss": 0.2841, + "num_input_tokens_seen": 33656768, + "step": 3205 + }, + { + "epoch": 0.8266804017512233, + "grad_norm": 6.220162233685774, + "learning_rate": 8.513450158049108e-07, + "loss": 0.3064, + "num_input_tokens_seen": 33709960, + "step": 3210 + }, + { + "epoch": 0.8279680659284059, + "grad_norm": 2.9217052129873453, + "learning_rate": 8.072463369597993e-07, + "loss": 0.3126, + "num_input_tokens_seen": 33762336, + "step": 3215 + }, + { + "epoch": 0.8292557301055885, + "grad_norm": 4.188551959084003, + "learning_rate": 7.643110739942172e-07, + "loss": 0.2758, + "num_input_tokens_seen": 33814544, + "step": 3220 + }, + { + "epoch": 0.830543394282771, + "grad_norm": 7.312341995663223, + "learning_rate": 7.225402423334693e-07, + "loss": 0.3154, + "num_input_tokens_seen": 33867184, + "step": 3225 + }, + { + "epoch": 0.8318310584599536, + "grad_norm": 9.247395364718018, 
+ "learning_rate": 6.819348298638839e-07, + "loss": 0.2894, + "num_input_tokens_seen": 33920120, + "step": 3230 + }, + { + "epoch": 0.8331187226371363, + "grad_norm": 4.011556835563422, + "learning_rate": 6.424957969094536e-07, + "loss": 0.2521, + "num_input_tokens_seen": 33971928, + "step": 3235 + }, + { + "epoch": 0.8344063868143188, + "grad_norm": 6.487526690091496, + "learning_rate": 6.0422407620912e-07, + "loss": 0.3532, + "num_input_tokens_seen": 34024272, + "step": 3240 + }, + { + "epoch": 0.8356940509915014, + "grad_norm": 3.7460726144210414, + "learning_rate": 5.671205728947305e-07, + "loss": 0.2519, + "num_input_tokens_seen": 34077920, + "step": 3245 + }, + { + "epoch": 0.836981715168684, + "grad_norm": 2.055442328676098, + "learning_rate": 5.311861644696048e-07, + "loss": 0.2836, + "num_input_tokens_seen": 34129832, + "step": 3250 + }, + { + "epoch": 0.836981715168684, + "eval_loss": 0.46573224663734436, + "eval_runtime": 37.4469, + "eval_samples_per_second": 3.205, + "eval_steps_per_second": 0.801, + "num_input_tokens_seen": 34129832, + "step": 3250 + }, + { + "epoch": 0.8382693793458666, + "grad_norm": 9.810236827636258, + "learning_rate": 4.964217007878081e-07, + "loss": 0.3243, + "num_input_tokens_seen": 34182360, + "step": 3255 + }, + { + "epoch": 0.8395570435230492, + "grad_norm": 3.5923057623468972, + "learning_rate": 4.6282800403402715e-07, + "loss": 0.295, + "num_input_tokens_seen": 34234176, + "step": 3260 + }, + { + "epoch": 0.8408447077002318, + "grad_norm": 4.895595503237554, + "learning_rate": 4.3040586870415346e-07, + "loss": 0.3189, + "num_input_tokens_seen": 34287472, + "step": 3265 + }, + { + "epoch": 0.8421323718774144, + "grad_norm": 4.790419220901576, + "learning_rate": 3.991560615864587e-07, + "loss": 0.2927, + "num_input_tokens_seen": 34339496, + "step": 3270 + }, + { + "epoch": 0.8434200360545969, + "grad_norm": 3.222572566824057, + "learning_rate": 3.6907932174349846e-07, + "loss": 0.299, + "num_input_tokens_seen": 34391688, + "step": 3275 + }, + { + "epoch": 0.8447077002317795, + "grad_norm": 4.29141108538951, + "learning_rate": 3.40176360494604e-07, + "loss": 0.3218, + "num_input_tokens_seen": 34443720, + "step": 3280 + }, + { + "epoch": 0.8459953644089622, + "grad_norm": 8.665138339120979, + "learning_rate": 3.124478613990733e-07, + "loss": 0.3295, + "num_input_tokens_seen": 34495512, + "step": 3285 + }, + { + "epoch": 0.8472830285861447, + "grad_norm": 3.6750059216889857, + "learning_rate": 2.8589448023998987e-07, + "loss": 0.2889, + "num_input_tokens_seen": 34547936, + "step": 3290 + }, + { + "epoch": 0.8485706927633273, + "grad_norm": 3.80128536969918, + "learning_rate": 2.605168450087514e-07, + "loss": 0.2697, + "num_input_tokens_seen": 34601320, + "step": 3295 + }, + { + "epoch": 0.8498583569405099, + "grad_norm": 4.353943864411861, + "learning_rate": 2.363155558901542e-07, + "loss": 0.3135, + "num_input_tokens_seen": 34654480, + "step": 3300 + }, + { + "epoch": 0.8498583569405099, + "eval_loss": 0.4714098274707794, + "eval_runtime": 37.5939, + "eval_samples_per_second": 3.192, + "eval_steps_per_second": 0.798, + "num_input_tokens_seen": 34654480, + "step": 3300 + }, + { + "epoch": 0.8511460211176926, + "grad_norm": 1.3186556509713656, + "learning_rate": 2.1329118524827662e-07, + "loss": 0.3195, + "num_input_tokens_seen": 34706600, + "step": 3305 + }, + { + "epoch": 0.8524336852948751, + "grad_norm": 5.6829477329970475, + "learning_rate": 1.9144427761286222e-07, + "loss": 0.2817, + "num_input_tokens_seen": 34759528, + "step": 3310 + }, + { + 
"epoch": 0.8537213494720577, + "grad_norm": 10.648820125115563, + "learning_rate": 1.7077534966650766e-07, + "loss": 0.3131, + "num_input_tokens_seen": 34811832, + "step": 3315 + }, + { + "epoch": 0.8550090136492403, + "grad_norm": 1.412998871252427, + "learning_rate": 1.51284890232406e-07, + "loss": 0.2926, + "num_input_tokens_seen": 34864696, + "step": 3320 + }, + { + "epoch": 0.8562966778264228, + "grad_norm": 2.3084307837646425, + "learning_rate": 1.3297336026280027e-07, + "loss": 0.2606, + "num_input_tokens_seen": 34917584, + "step": 3325 + }, + { + "epoch": 0.8575843420036054, + "grad_norm": 1.9582259889928806, + "learning_rate": 1.158411928280645e-07, + "loss": 0.3203, + "num_input_tokens_seen": 34969720, + "step": 3330 + }, + { + "epoch": 0.8588720061807881, + "grad_norm": 4.1936933517667825, + "learning_rate": 9.988879310649513e-08, + "loss": 0.3211, + "num_input_tokens_seen": 35021296, + "step": 3335 + }, + { + "epoch": 0.8601596703579707, + "grad_norm": 5.882661534247897, + "learning_rate": 8.511653837470212e-08, + "loss": 0.2923, + "num_input_tokens_seen": 35073120, + "step": 3340 + }, + { + "epoch": 0.8614473345351532, + "grad_norm": 6.708718647923996, + "learning_rate": 7.152477799867719e-08, + "loss": 0.289, + "num_input_tokens_seen": 35126296, + "step": 3345 + }, + { + "epoch": 0.8627349987123358, + "grad_norm": 2.5743366606144185, + "learning_rate": 5.911383342556143e-08, + "loss": 0.3253, + "num_input_tokens_seen": 35179104, + "step": 3350 + }, + { + "epoch": 0.8627349987123358, + "eval_loss": 0.4714648127555847, + "eval_runtime": 37.5951, + "eval_samples_per_second": 3.192, + "eval_steps_per_second": 0.798, + "num_input_tokens_seen": 35179104, + "step": 3350 + }, + { + "epoch": 0.8640226628895185, + "grad_norm": 5.553235621230406, + "learning_rate": 4.788399817602929e-08, + "loss": 0.3179, + "num_input_tokens_seen": 35231608, + "step": 3355 + }, + { + "epoch": 0.865310327066701, + "grad_norm": 4.202242288396885, + "learning_rate": 3.7835537837338506e-08, + "loss": 0.2829, + "num_input_tokens_seen": 35284448, + "step": 3360 + }, + { + "epoch": 0.8665979912438836, + "grad_norm": 3.6414795103608255, + "learning_rate": 2.8968690057051828e-08, + "loss": 0.2579, + "num_input_tokens_seen": 35336520, + "step": 3365 + }, + { + "epoch": 0.8678856554210662, + "grad_norm": 4.481511308866539, + "learning_rate": 2.128366453743591e-08, + "loss": 0.2862, + "num_input_tokens_seen": 35388728, + "step": 3370 + }, + { + "epoch": 0.8691733195982487, + "grad_norm": 4.3032070874799, + "learning_rate": 1.4780643030476438e-08, + "loss": 0.2812, + "num_input_tokens_seen": 35441824, + "step": 3375 + }, + { + "epoch": 0.8704609837754314, + "grad_norm": 5.8821140210764336, + "learning_rate": 9.459779333587104e-09, + "loss": 0.3174, + "num_input_tokens_seen": 35495128, + "step": 3380 + }, + { + "epoch": 0.871748647952614, + "grad_norm": 7.380544386822247, + "learning_rate": 5.3211992859791835e-09, + "loss": 0.3049, + "num_input_tokens_seen": 35548144, + "step": 3385 + }, + { + "epoch": 0.8730363121297966, + "grad_norm": 1.5375610441333851, + "learning_rate": 2.3650007656805806e-09, + "loss": 0.2882, + "num_input_tokens_seen": 35600936, + "step": 3390 + }, + { + "epoch": 0.8743239763069791, + "grad_norm": 6.170480848656164, + "learning_rate": 5.912536872321184e-10, + "loss": 0.2789, + "num_input_tokens_seen": 35653896, + "step": 3395 + }, + { + "epoch": 0.8756116404841617, + "grad_norm": 5.211578123351505, + "learning_rate": 0.0, + "loss": 0.3187, + "num_input_tokens_seen": 35706848, + "step": 3400 + 
}, + { + "epoch": 0.8756116404841617, + "eval_loss": 0.4701705873012543, + "eval_runtime": 37.5324, + "eval_samples_per_second": 3.197, + "eval_steps_per_second": 0.799, + "num_input_tokens_seen": 35706848, + "step": 3400 + }, + { + "epoch": 0.8756116404841617, + "num_input_tokens_seen": 35706848, + "step": 3400, + "total_flos": 2355853440057344.0, + "train_loss": 0.07519739676924313, + "train_runtime": 15513.1083, + "train_samples_per_second": 5.26, + "train_steps_per_second": 0.219 + } + ], + "logging_steps": 5, + "max_steps": 3400, + "num_input_tokens_seen": 35706848, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2355853440057344.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}
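
For reference only: the evaluation points logged in this file can be summarized with a few lines of standard-library Python. The sketch below is illustrative and not part of the original training run or its tooling; it assumes the file is saved locally as trainer_state.json and follows the usual Hugging Face Trainer schema shown above (a log_history list whose evaluation entries carry eval_loss and step, plus a top-level best_metric).

```python
# Illustrative sketch (not part of the original training artifacts): summarize the
# eval_loss trajectory recorded in a trainer_state.json like the one above.
# Assumes the standard Hugging Face Trainer schema: a "log_history" list whose
# evaluation entries carry "eval_loss" and "step", plus a top-level "best_metric".
import json

with open("trainer_state.json") as f:  # local path is an assumption
    state = json.load(f)

# Keep only the entries that record an evaluation pass.
evals = [(entry["step"], entry["eval_loss"])
         for entry in state.get("log_history", [])
         if "eval_loss" in entry]

best_step, best_loss = min(evals, key=lambda pair: pair[1])
print(f"{len(evals)} eval points; lowest eval_loss {best_loss:.4f} at step {best_step}")
print("reported best_metric:", state.get("best_metric"))
```

If the lowest eval_loss found this way differs from the reported best_metric, that simply reflects which checkpoint the trainer had saved when the metric was recorded; the sketch only reads the log and does not touch any checkpoints.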