{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.997215777262181, "eval_steps": 500, "global_step": 538, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014849187935034803, "grad_norm": 895.9867553710938, "learning_rate": 2e-05, "loss": 41.8575, "step": 4 }, { "epoch": 0.029698375870069606, "grad_norm": 164.2000732421875, "learning_rate": 2e-05, "loss": 19.2704, "step": 8 }, { "epoch": 0.044547563805104405, "grad_norm": 130.14195251464844, "learning_rate": 2e-05, "loss": 16.7431, "step": 12 }, { "epoch": 0.05939675174013921, "grad_norm": 119.65748596191406, "learning_rate": 2e-05, "loss": 17.2431, "step": 16 }, { "epoch": 0.07424593967517401, "grad_norm": 123.41026306152344, "learning_rate": 2e-05, "loss": 17.5812, "step": 20 }, { "epoch": 0.08909512761020881, "grad_norm": 143.79872131347656, "learning_rate": 2e-05, "loss": 16.1039, "step": 24 }, { "epoch": 0.10394431554524362, "grad_norm": 191.55752563476562, "learning_rate": 2e-05, "loss": 15.5393, "step": 28 }, { "epoch": 0.11879350348027842, "grad_norm": 125.146728515625, "learning_rate": 2e-05, "loss": 15.1988, "step": 32 }, { "epoch": 0.13364269141531324, "grad_norm": 122.55828857421875, "learning_rate": 2e-05, "loss": 15.456, "step": 36 }, { "epoch": 0.14849187935034802, "grad_norm": 126.60418701171875, "learning_rate": 2e-05, "loss": 16.9079, "step": 40 }, { "epoch": 0.16334106728538283, "grad_norm": 116.0846176147461, "learning_rate": 2e-05, "loss": 15.9405, "step": 44 }, { "epoch": 0.17819025522041762, "grad_norm": 135.65383911132812, "learning_rate": 2e-05, "loss": 13.6821, "step": 48 }, { "epoch": 0.19303944315545243, "grad_norm": 115.77993774414062, "learning_rate": 2e-05, "loss": 15.6503, "step": 52 }, { "epoch": 0.20788863109048725, "grad_norm": 131.34146118164062, "learning_rate": 2e-05, "loss": 15.7174, "step": 56 }, { "epoch": 0.22273781902552203, "grad_norm": 150.83935546875, "learning_rate": 2e-05, "loss": 16.6436, "step": 60 }, { "epoch": 0.23758700696055685, "grad_norm": 152.6024169921875, "learning_rate": 2e-05, "loss": 16.1857, "step": 64 }, { "epoch": 0.25243619489559166, "grad_norm": 165.27406311035156, "learning_rate": 2e-05, "loss": 15.5328, "step": 68 }, { "epoch": 0.2672853828306265, "grad_norm": 119.0411376953125, "learning_rate": 2e-05, "loss": 14.212, "step": 72 }, { "epoch": 0.28213457076566123, "grad_norm": 130.3306884765625, "learning_rate": 2e-05, "loss": 16.7866, "step": 76 }, { "epoch": 0.29698375870069604, "grad_norm": 115.24845123291016, "learning_rate": 2e-05, "loss": 15.0373, "step": 80 }, { "epoch": 0.31183294663573086, "grad_norm": 174.6798858642578, "learning_rate": 2e-05, "loss": 15.3437, "step": 84 }, { "epoch": 0.32668213457076567, "grad_norm": 145.3719482421875, "learning_rate": 2e-05, "loss": 14.4015, "step": 88 }, { "epoch": 0.3415313225058005, "grad_norm": 117.09785461425781, "learning_rate": 2e-05, "loss": 13.7134, "step": 92 }, { "epoch": 0.35638051044083524, "grad_norm": 120.23141479492188, "learning_rate": 2e-05, "loss": 14.641, "step": 96 }, { "epoch": 0.37122969837587005, "grad_norm": 107.27012634277344, "learning_rate": 2e-05, "loss": 14.7094, "step": 100 }, { "epoch": 0.38607888631090487, "grad_norm": 136.1507568359375, "learning_rate": 2e-05, "loss": 14.8711, "step": 104 }, { "epoch": 0.4009280742459397, "grad_norm": 136.19911193847656, "learning_rate": 2e-05, "loss": 14.7636, "step": 108 }, { "epoch": 0.4157772621809745, "grad_norm": 120.15601348876953, "learning_rate": 2e-05, "loss": 
16.0424, "step": 112 }, { "epoch": 0.4306264501160093, "grad_norm": 104.66596221923828, "learning_rate": 2e-05, "loss": 14.2951, "step": 116 }, { "epoch": 0.44547563805104406, "grad_norm": 102.8609619140625, "learning_rate": 2e-05, "loss": 13.2711, "step": 120 }, { "epoch": 0.4603248259860789, "grad_norm": 108.99791717529297, "learning_rate": 2e-05, "loss": 14.4603, "step": 124 }, { "epoch": 0.4751740139211137, "grad_norm": 100.2767333984375, "learning_rate": 2e-05, "loss": 14.5153, "step": 128 }, { "epoch": 0.4900232018561485, "grad_norm": 108.51724243164062, "learning_rate": 2e-05, "loss": 14.3767, "step": 132 }, { "epoch": 0.5048723897911833, "grad_norm": 139.0511932373047, "learning_rate": 2e-05, "loss": 15.0579, "step": 136 }, { "epoch": 0.5197215777262181, "grad_norm": 131.45651245117188, "learning_rate": 2e-05, "loss": 16.0837, "step": 140 }, { "epoch": 0.534570765661253, "grad_norm": 128.41012573242188, "learning_rate": 2e-05, "loss": 13.679, "step": 144 }, { "epoch": 0.5494199535962877, "grad_norm": 138.88658142089844, "learning_rate": 2e-05, "loss": 13.4384, "step": 148 }, { "epoch": 0.5642691415313225, "grad_norm": 119.11845397949219, "learning_rate": 2e-05, "loss": 13.9317, "step": 152 }, { "epoch": 0.5791183294663573, "grad_norm": 119.57584381103516, "learning_rate": 2e-05, "loss": 14.371, "step": 156 }, { "epoch": 0.5939675174013921, "grad_norm": 96.74629211425781, "learning_rate": 2e-05, "loss": 15.2401, "step": 160 }, { "epoch": 0.608816705336427, "grad_norm": 111.12255096435547, "learning_rate": 2e-05, "loss": 15.1936, "step": 164 }, { "epoch": 0.6236658932714617, "grad_norm": 148.77015686035156, "learning_rate": 2e-05, "loss": 14.4655, "step": 168 }, { "epoch": 0.6385150812064965, "grad_norm": 107.04643249511719, "learning_rate": 2e-05, "loss": 12.6344, "step": 172 }, { "epoch": 0.6533642691415313, "grad_norm": 104.93022918701172, "learning_rate": 2e-05, "loss": 13.9102, "step": 176 }, { "epoch": 0.6682134570765661, "grad_norm": 104.616943359375, "learning_rate": 2e-05, "loss": 14.9522, "step": 180 }, { "epoch": 0.683062645011601, "grad_norm": 139.63406372070312, "learning_rate": 2e-05, "loss": 15.4642, "step": 184 }, { "epoch": 0.6979118329466357, "grad_norm": 106.42848205566406, "learning_rate": 2e-05, "loss": 14.1578, "step": 188 }, { "epoch": 0.7127610208816705, "grad_norm": 95.40778350830078, "learning_rate": 2e-05, "loss": 15.5809, "step": 192 }, { "epoch": 0.7276102088167054, "grad_norm": 106.99407958984375, "learning_rate": 2e-05, "loss": 12.3565, "step": 196 }, { "epoch": 0.7424593967517401, "grad_norm": 116.07793426513672, "learning_rate": 2e-05, "loss": 13.6122, "step": 200 }, { "epoch": 0.757308584686775, "grad_norm": 117.84542846679688, "learning_rate": 2e-05, "loss": 14.2531, "step": 204 }, { "epoch": 0.7721577726218097, "grad_norm": 90.03235626220703, "learning_rate": 2e-05, "loss": 14.2915, "step": 208 }, { "epoch": 0.7870069605568445, "grad_norm": 99.91178894042969, "learning_rate": 2e-05, "loss": 13.7193, "step": 212 }, { "epoch": 0.8018561484918794, "grad_norm": 127.37728881835938, "learning_rate": 2e-05, "loss": 14.4029, "step": 216 }, { "epoch": 0.8167053364269141, "grad_norm": 106.17198181152344, "learning_rate": 2e-05, "loss": 14.152, "step": 220 }, { "epoch": 0.831554524361949, "grad_norm": 109.1567611694336, "learning_rate": 2e-05, "loss": 14.6705, "step": 224 }, { "epoch": 0.8464037122969837, "grad_norm": 101.11131286621094, "learning_rate": 2e-05, "loss": 13.956, "step": 228 }, { "epoch": 0.8612529002320186, "grad_norm": 113.48827362060547, 
"learning_rate": 2e-05, "loss": 14.138, "step": 232 }, { "epoch": 0.8761020881670534, "grad_norm": 112.26351165771484, "learning_rate": 2e-05, "loss": 12.2284, "step": 236 }, { "epoch": 0.8909512761020881, "grad_norm": 100.76663970947266, "learning_rate": 2e-05, "loss": 13.7275, "step": 240 }, { "epoch": 0.905800464037123, "grad_norm": 104.24567413330078, "learning_rate": 2e-05, "loss": 12.7694, "step": 244 }, { "epoch": 0.9206496519721578, "grad_norm": 106.16858673095703, "learning_rate": 2e-05, "loss": 14.139, "step": 248 }, { "epoch": 0.9354988399071926, "grad_norm": 112.65348815917969, "learning_rate": 2e-05, "loss": 13.8694, "step": 252 }, { "epoch": 0.9503480278422274, "grad_norm": 91.72236633300781, "learning_rate": 2e-05, "loss": 15.5933, "step": 256 }, { "epoch": 0.9651972157772621, "grad_norm": 90.93212127685547, "learning_rate": 2e-05, "loss": 14.2187, "step": 260 }, { "epoch": 0.980046403712297, "grad_norm": 100.89374542236328, "learning_rate": 2e-05, "loss": 13.7716, "step": 264 }, { "epoch": 0.9948955916473318, "grad_norm": 92.8128662109375, "learning_rate": 2e-05, "loss": 12.5682, "step": 268 }, { "epoch": 1.0097447795823666, "grad_norm": 95.66116333007812, "learning_rate": 2e-05, "loss": 14.4997, "step": 272 }, { "epoch": 1.0245939675174014, "grad_norm": 104.52428436279297, "learning_rate": 2e-05, "loss": 11.8475, "step": 276 }, { "epoch": 1.0394431554524362, "grad_norm": 104.34024810791016, "learning_rate": 2e-05, "loss": 10.1835, "step": 280 }, { "epoch": 1.054292343387471, "grad_norm": 98.30239868164062, "learning_rate": 2e-05, "loss": 10.2298, "step": 284 }, { "epoch": 1.069141531322506, "grad_norm": 109.97785949707031, "learning_rate": 2e-05, "loss": 10.6023, "step": 288 }, { "epoch": 1.0839907192575406, "grad_norm": 122.24370574951172, "learning_rate": 2e-05, "loss": 10.0427, "step": 292 }, { "epoch": 1.0988399071925754, "grad_norm": 109.37757873535156, "learning_rate": 2e-05, "loss": 10.0441, "step": 296 }, { "epoch": 1.1136890951276102, "grad_norm": 127.94110107421875, "learning_rate": 2e-05, "loss": 9.7277, "step": 300 }, { "epoch": 1.128538283062645, "grad_norm": 124.07524108886719, "learning_rate": 2e-05, "loss": 9.7969, "step": 304 }, { "epoch": 1.14338747099768, "grad_norm": 126.29171752929688, "learning_rate": 2e-05, "loss": 9.5134, "step": 308 }, { "epoch": 1.1582366589327147, "grad_norm": 104.21505737304688, "learning_rate": 2e-05, "loss": 10.8362, "step": 312 }, { "epoch": 1.1730858468677494, "grad_norm": 121.6202392578125, "learning_rate": 2e-05, "loss": 8.8389, "step": 316 }, { "epoch": 1.1879350348027842, "grad_norm": 110.58162689208984, "learning_rate": 2e-05, "loss": 9.0145, "step": 320 }, { "epoch": 1.202784222737819, "grad_norm": 127.4255599975586, "learning_rate": 2e-05, "loss": 9.6973, "step": 324 }, { "epoch": 1.217633410672854, "grad_norm": 108.92906951904297, "learning_rate": 2e-05, "loss": 9.6894, "step": 328 }, { "epoch": 1.2324825986078887, "grad_norm": 131.8388214111328, "learning_rate": 2e-05, "loss": 11.288, "step": 332 }, { "epoch": 1.2473317865429234, "grad_norm": 106.78469848632812, "learning_rate": 2e-05, "loss": 9.656, "step": 336 }, { "epoch": 1.2621809744779582, "grad_norm": 120.8875503540039, "learning_rate": 2e-05, "loss": 9.6884, "step": 340 }, { "epoch": 1.2770301624129932, "grad_norm": 112.69973754882812, "learning_rate": 2e-05, "loss": 8.8555, "step": 344 }, { "epoch": 1.291879350348028, "grad_norm": 122.43771362304688, "learning_rate": 2e-05, "loss": 9.6718, "step": 348 }, { "epoch": 1.3067285382830627, "grad_norm": 
116.25230407714844, "learning_rate": 2e-05, "loss": 8.7905, "step": 352 }, { "epoch": 1.3215777262180974, "grad_norm": 114.96141815185547, "learning_rate": 2e-05, "loss": 9.7848, "step": 356 }, { "epoch": 1.3364269141531322, "grad_norm": 119.10284423828125, "learning_rate": 2e-05, "loss": 7.9737, "step": 360 }, { "epoch": 1.3512761020881672, "grad_norm": 109.69094848632812, "learning_rate": 2e-05, "loss": 8.7001, "step": 364 }, { "epoch": 1.366125290023202, "grad_norm": 109.21603393554688, "learning_rate": 2e-05, "loss": 8.0757, "step": 368 }, { "epoch": 1.3809744779582367, "grad_norm": 128.07073974609375, "learning_rate": 2e-05, "loss": 10.1842, "step": 372 }, { "epoch": 1.3958236658932714, "grad_norm": 105.088623046875, "learning_rate": 2e-05, "loss": 8.1361, "step": 376 }, { "epoch": 1.4106728538283062, "grad_norm": 117.58355712890625, "learning_rate": 2e-05, "loss": 10.6169, "step": 380 }, { "epoch": 1.4255220417633412, "grad_norm": 102.73584747314453, "learning_rate": 2e-05, "loss": 8.8225, "step": 384 }, { "epoch": 1.440371229698376, "grad_norm": 104.41094207763672, "learning_rate": 2e-05, "loss": 8.593, "step": 388 }, { "epoch": 1.4552204176334107, "grad_norm": 104.82015228271484, "learning_rate": 2e-05, "loss": 8.4753, "step": 392 }, { "epoch": 1.4700696055684455, "grad_norm": 113.64494323730469, "learning_rate": 2e-05, "loss": 7.9889, "step": 396 }, { "epoch": 1.4849187935034802, "grad_norm": 109.5793685913086, "learning_rate": 2e-05, "loss": 8.2657, "step": 400 }, { "epoch": 1.4997679814385152, "grad_norm": 107.78541564941406, "learning_rate": 2e-05, "loss": 8.5209, "step": 404 }, { "epoch": 1.5146171693735497, "grad_norm": 125.47006225585938, "learning_rate": 2e-05, "loss": 9.4815, "step": 408 }, { "epoch": 1.5294663573085847, "grad_norm": 108.86872863769531, "learning_rate": 2e-05, "loss": 7.989, "step": 412 }, { "epoch": 1.5443155452436195, "grad_norm": 102.67842864990234, "learning_rate": 2e-05, "loss": 7.6957, "step": 416 }, { "epoch": 1.5591647331786542, "grad_norm": 109.05705261230469, "learning_rate": 2e-05, "loss": 8.5562, "step": 420 }, { "epoch": 1.5740139211136892, "grad_norm": 104.20409393310547, "learning_rate": 2e-05, "loss": 8.4751, "step": 424 }, { "epoch": 1.5888631090487237, "grad_norm": 126.31594848632812, "learning_rate": 2e-05, "loss": 8.3718, "step": 428 }, { "epoch": 1.6037122969837587, "grad_norm": 120.48487091064453, "learning_rate": 2e-05, "loss": 8.8551, "step": 432 }, { "epoch": 1.6185614849187935, "grad_norm": 105.4981689453125, "learning_rate": 2e-05, "loss": 8.2207, "step": 436 }, { "epoch": 1.6334106728538282, "grad_norm": 112.6336441040039, "learning_rate": 2e-05, "loss": 8.4217, "step": 440 }, { "epoch": 1.6482598607888632, "grad_norm": 132.0428009033203, "learning_rate": 2e-05, "loss": 7.7686, "step": 444 }, { "epoch": 1.6631090487238978, "grad_norm": 125.45011901855469, "learning_rate": 2e-05, "loss": 9.2927, "step": 448 }, { "epoch": 1.6779582366589327, "grad_norm": 136.8842315673828, "learning_rate": 2e-05, "loss": 8.7879, "step": 452 }, { "epoch": 1.6928074245939675, "grad_norm": 128.8678741455078, "learning_rate": 2e-05, "loss": 9.6716, "step": 456 }, { "epoch": 1.7076566125290022, "grad_norm": 111.33040618896484, "learning_rate": 2e-05, "loss": 8.5814, "step": 460 }, { "epoch": 1.7225058004640372, "grad_norm": 123.63487243652344, "learning_rate": 2e-05, "loss": 8.4478, "step": 464 }, { "epoch": 1.7373549883990718, "grad_norm": 113.80644989013672, "learning_rate": 2e-05, "loss": 7.6935, "step": 468 }, { "epoch": 1.7522041763341067, 
"grad_norm": 107.1911392211914, "learning_rate": 2e-05, "loss": 8.6608, "step": 472 }, { "epoch": 1.7670533642691415, "grad_norm": 102.86659240722656, "learning_rate": 2e-05, "loss": 8.3795, "step": 476 }, { "epoch": 1.7819025522041763, "grad_norm": 110.92539978027344, "learning_rate": 2e-05, "loss": 7.6919, "step": 480 }, { "epoch": 1.7967517401392112, "grad_norm": 104.6399917602539, "learning_rate": 2e-05, "loss": 8.2716, "step": 484 }, { "epoch": 1.8116009280742458, "grad_norm": 115.54898071289062, "learning_rate": 2e-05, "loss": 8.5838, "step": 488 }, { "epoch": 1.8264501160092808, "grad_norm": 105.62113952636719, "learning_rate": 2e-05, "loss": 8.113, "step": 492 }, { "epoch": 1.8412993039443155, "grad_norm": 100.64768981933594, "learning_rate": 2e-05, "loss": 7.9681, "step": 496 }, { "epoch": 1.8561484918793503, "grad_norm": 113.74981689453125, "learning_rate": 2e-05, "loss": 8.3435, "step": 500 }, { "epoch": 1.8709976798143853, "grad_norm": 111.69252014160156, "learning_rate": 2e-05, "loss": 7.9597, "step": 504 }, { "epoch": 1.88584686774942, "grad_norm": 127.168212890625, "learning_rate": 2e-05, "loss": 7.4817, "step": 508 }, { "epoch": 1.9006960556844548, "grad_norm": 112.72080993652344, "learning_rate": 2e-05, "loss": 8.3011, "step": 512 }, { "epoch": 1.9155452436194895, "grad_norm": 96.97032928466797, "learning_rate": 2e-05, "loss": 7.5826, "step": 516 }, { "epoch": 1.9303944315545243, "grad_norm": 90.76924896240234, "learning_rate": 2e-05, "loss": 7.6501, "step": 520 }, { "epoch": 1.9452436194895593, "grad_norm": 110.57941436767578, "learning_rate": 2e-05, "loss": 8.3152, "step": 524 }, { "epoch": 1.960092807424594, "grad_norm": 97.4187240600586, "learning_rate": 2e-05, "loss": 7.9809, "step": 528 }, { "epoch": 1.9749419953596288, "grad_norm": 107.45658111572266, "learning_rate": 2e-05, "loss": 8.1966, "step": 532 }, { "epoch": 1.9897911832946635, "grad_norm": 125.85009002685547, "learning_rate": 2e-05, "loss": 7.8091, "step": 536 } ], "logging_steps": 4, "max_steps": 538, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 446690230272000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }