{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.997215777262181,
  "eval_steps": 500,
  "global_step": 538,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014849187935034803,
      "grad_norm": 895.9867553710938,
      "learning_rate": 2e-05,
      "loss": 41.8575,
      "step": 4
    },
    {
      "epoch": 0.029698375870069606,
      "grad_norm": 164.2000732421875,
      "learning_rate": 2e-05,
      "loss": 19.2704,
      "step": 8
    },
    {
      "epoch": 0.044547563805104405,
      "grad_norm": 130.14195251464844,
      "learning_rate": 2e-05,
      "loss": 16.7431,
      "step": 12
    },
    {
      "epoch": 0.05939675174013921,
      "grad_norm": 119.65748596191406,
      "learning_rate": 2e-05,
      "loss": 17.2431,
      "step": 16
    },
    {
      "epoch": 0.07424593967517401,
      "grad_norm": 123.41026306152344,
      "learning_rate": 2e-05,
      "loss": 17.5812,
      "step": 20
    },
    {
      "epoch": 0.08909512761020881,
      "grad_norm": 143.79872131347656,
      "learning_rate": 2e-05,
      "loss": 16.1039,
      "step": 24
    },
    {
      "epoch": 0.10394431554524362,
      "grad_norm": 191.55752563476562,
      "learning_rate": 2e-05,
      "loss": 15.5393,
      "step": 28
    },
    {
      "epoch": 0.11879350348027842,
      "grad_norm": 125.146728515625,
      "learning_rate": 2e-05,
      "loss": 15.1988,
      "step": 32
    },
    {
      "epoch": 0.13364269141531324,
      "grad_norm": 122.55828857421875,
      "learning_rate": 2e-05,
      "loss": 15.456,
      "step": 36
    },
    {
      "epoch": 0.14849187935034802,
      "grad_norm": 126.60418701171875,
      "learning_rate": 2e-05,
      "loss": 16.9079,
      "step": 40
    },
    {
      "epoch": 0.16334106728538283,
      "grad_norm": 116.0846176147461,
      "learning_rate": 2e-05,
      "loss": 15.9405,
      "step": 44
    },
    {
      "epoch": 0.17819025522041762,
      "grad_norm": 135.65383911132812,
      "learning_rate": 2e-05,
      "loss": 13.6821,
      "step": 48
    },
    {
      "epoch": 0.19303944315545243,
      "grad_norm": 115.77993774414062,
      "learning_rate": 2e-05,
      "loss": 15.6503,
      "step": 52
    },
    {
      "epoch": 0.20788863109048725,
      "grad_norm": 131.34146118164062,
      "learning_rate": 2e-05,
      "loss": 15.7174,
      "step": 56
    },
    {
      "epoch": 0.22273781902552203,
      "grad_norm": 150.83935546875,
      "learning_rate": 2e-05,
      "loss": 16.6436,
      "step": 60
    },
    {
      "epoch": 0.23758700696055685,
      "grad_norm": 152.6024169921875,
      "learning_rate": 2e-05,
      "loss": 16.1857,
      "step": 64
    },
    {
      "epoch": 0.25243619489559166,
      "grad_norm": 165.27406311035156,
      "learning_rate": 2e-05,
      "loss": 15.5328,
      "step": 68
    },
    {
      "epoch": 0.2672853828306265,
      "grad_norm": 119.0411376953125,
      "learning_rate": 2e-05,
      "loss": 14.212,
      "step": 72
    },
    {
      "epoch": 0.28213457076566123,
      "grad_norm": 130.3306884765625,
      "learning_rate": 2e-05,
      "loss": 16.7866,
      "step": 76
    },
    {
      "epoch": 0.29698375870069604,
      "grad_norm": 115.24845123291016,
      "learning_rate": 2e-05,
      "loss": 15.0373,
      "step": 80
    },
    {
      "epoch": 0.31183294663573086,
      "grad_norm": 174.6798858642578,
      "learning_rate": 2e-05,
      "loss": 15.3437,
      "step": 84
    },
    {
      "epoch": 0.32668213457076567,
      "grad_norm": 145.3719482421875,
      "learning_rate": 2e-05,
      "loss": 14.4015,
      "step": 88
    },
    {
      "epoch": 0.3415313225058005,
      "grad_norm": 117.09785461425781,
      "learning_rate": 2e-05,
      "loss": 13.7134,
      "step": 92
    },
    {
      "epoch": 0.35638051044083524,
      "grad_norm": 120.23141479492188,
      "learning_rate": 2e-05,
      "loss": 14.641,
      "step": 96
    },
    {
      "epoch": 0.37122969837587005,
      "grad_norm": 107.27012634277344,
      "learning_rate": 2e-05,
      "loss": 14.7094,
      "step": 100
    },
    {
      "epoch": 0.38607888631090487,
      "grad_norm": 136.1507568359375,
      "learning_rate": 2e-05,
      "loss": 14.8711,
      "step": 104
    },
    {
      "epoch": 0.4009280742459397,
      "grad_norm": 136.19911193847656,
      "learning_rate": 2e-05,
      "loss": 14.7636,
      "step": 108
    },
    {
      "epoch": 0.4157772621809745,
      "grad_norm": 120.15601348876953,
      "learning_rate": 2e-05,
      "loss": 16.0424,
      "step": 112
    },
    {
      "epoch": 0.4306264501160093,
      "grad_norm": 104.66596221923828,
      "learning_rate": 2e-05,
      "loss": 14.2951,
      "step": 116
    },
    {
      "epoch": 0.44547563805104406,
      "grad_norm": 102.8609619140625,
      "learning_rate": 2e-05,
      "loss": 13.2711,
      "step": 120
    },
    {
      "epoch": 0.4603248259860789,
      "grad_norm": 108.99791717529297,
      "learning_rate": 2e-05,
      "loss": 14.4603,
      "step": 124
    },
    {
      "epoch": 0.4751740139211137,
      "grad_norm": 100.2767333984375,
      "learning_rate": 2e-05,
      "loss": 14.5153,
      "step": 128
    },
    {
      "epoch": 0.4900232018561485,
      "grad_norm": 108.51724243164062,
      "learning_rate": 2e-05,
      "loss": 14.3767,
      "step": 132
    },
    {
      "epoch": 0.5048723897911833,
      "grad_norm": 139.0511932373047,
      "learning_rate": 2e-05,
      "loss": 15.0579,
      "step": 136
    },
    {
      "epoch": 0.5197215777262181,
      "grad_norm": 131.45651245117188,
      "learning_rate": 2e-05,
      "loss": 16.0837,
      "step": 140
    },
    {
      "epoch": 0.534570765661253,
      "grad_norm": 128.41012573242188,
      "learning_rate": 2e-05,
      "loss": 13.679,
      "step": 144
    },
    {
      "epoch": 0.5494199535962877,
      "grad_norm": 138.88658142089844,
      "learning_rate": 2e-05,
      "loss": 13.4384,
      "step": 148
    },
    {
      "epoch": 0.5642691415313225,
      "grad_norm": 119.11845397949219,
      "learning_rate": 2e-05,
      "loss": 13.9317,
      "step": 152
    },
    {
      "epoch": 0.5791183294663573,
      "grad_norm": 119.57584381103516,
      "learning_rate": 2e-05,
      "loss": 14.371,
      "step": 156
    },
    {
      "epoch": 0.5939675174013921,
      "grad_norm": 96.74629211425781,
      "learning_rate": 2e-05,
      "loss": 15.2401,
      "step": 160
    },
    {
      "epoch": 0.608816705336427,
      "grad_norm": 111.12255096435547,
      "learning_rate": 2e-05,
      "loss": 15.1936,
      "step": 164
    },
    {
      "epoch": 0.6236658932714617,
      "grad_norm": 148.77015686035156,
      "learning_rate": 2e-05,
      "loss": 14.4655,
      "step": 168
    },
    {
      "epoch": 0.6385150812064965,
      "grad_norm": 107.04643249511719,
      "learning_rate": 2e-05,
      "loss": 12.6344,
      "step": 172
    },
    {
      "epoch": 0.6533642691415313,
      "grad_norm": 104.93022918701172,
      "learning_rate": 2e-05,
      "loss": 13.9102,
      "step": 176
    },
    {
      "epoch": 0.6682134570765661,
      "grad_norm": 104.616943359375,
      "learning_rate": 2e-05,
      "loss": 14.9522,
      "step": 180
    },
    {
      "epoch": 0.683062645011601,
      "grad_norm": 139.63406372070312,
      "learning_rate": 2e-05,
      "loss": 15.4642,
      "step": 184
    },
    {
      "epoch": 0.6979118329466357,
      "grad_norm": 106.42848205566406,
      "learning_rate": 2e-05,
      "loss": 14.1578,
      "step": 188
    },
    {
      "epoch": 0.7127610208816705,
      "grad_norm": 95.40778350830078,
      "learning_rate": 2e-05,
      "loss": 15.5809,
      "step": 192
    },
    {
      "epoch": 0.7276102088167054,
      "grad_norm": 106.99407958984375,
      "learning_rate": 2e-05,
      "loss": 12.3565,
      "step": 196
    },
    {
      "epoch": 0.7424593967517401,
      "grad_norm": 116.07793426513672,
      "learning_rate": 2e-05,
      "loss": 13.6122,
      "step": 200
    },
    {
      "epoch": 0.757308584686775,
      "grad_norm": 117.84542846679688,
      "learning_rate": 2e-05,
      "loss": 14.2531,
      "step": 204
    },
    {
      "epoch": 0.7721577726218097,
      "grad_norm": 90.03235626220703,
      "learning_rate": 2e-05,
      "loss": 14.2915,
      "step": 208
    },
    {
      "epoch": 0.7870069605568445,
      "grad_norm": 99.91178894042969,
      "learning_rate": 2e-05,
      "loss": 13.7193,
      "step": 212
    },
    {
      "epoch": 0.8018561484918794,
      "grad_norm": 127.37728881835938,
      "learning_rate": 2e-05,
      "loss": 14.4029,
      "step": 216
    },
    {
      "epoch": 0.8167053364269141,
      "grad_norm": 106.17198181152344,
      "learning_rate": 2e-05,
      "loss": 14.152,
      "step": 220
    },
    {
      "epoch": 0.831554524361949,
      "grad_norm": 109.1567611694336,
      "learning_rate": 2e-05,
      "loss": 14.6705,
      "step": 224
    },
    {
      "epoch": 0.8464037122969837,
      "grad_norm": 101.11131286621094,
      "learning_rate": 2e-05,
      "loss": 13.956,
      "step": 228
    },
    {
      "epoch": 0.8612529002320186,
      "grad_norm": 113.48827362060547,
      "learning_rate": 2e-05,
      "loss": 14.138,
      "step": 232
    },
    {
      "epoch": 0.8761020881670534,
      "grad_norm": 112.26351165771484,
      "learning_rate": 2e-05,
      "loss": 12.2284,
      "step": 236
    },
    {
      "epoch": 0.8909512761020881,
      "grad_norm": 100.76663970947266,
      "learning_rate": 2e-05,
      "loss": 13.7275,
      "step": 240
    },
    {
      "epoch": 0.905800464037123,
      "grad_norm": 104.24567413330078,
      "learning_rate": 2e-05,
      "loss": 12.7694,
      "step": 244
    },
    {
      "epoch": 0.9206496519721578,
      "grad_norm": 106.16858673095703,
      "learning_rate": 2e-05,
      "loss": 14.139,
      "step": 248
    },
    {
      "epoch": 0.9354988399071926,
      "grad_norm": 112.65348815917969,
      "learning_rate": 2e-05,
      "loss": 13.8694,
      "step": 252
    },
    {
      "epoch": 0.9503480278422274,
      "grad_norm": 91.72236633300781,
      "learning_rate": 2e-05,
      "loss": 15.5933,
      "step": 256
    },
    {
      "epoch": 0.9651972157772621,
      "grad_norm": 90.93212127685547,
      "learning_rate": 2e-05,
      "loss": 14.2187,
      "step": 260
    },
    {
      "epoch": 0.980046403712297,
      "grad_norm": 100.89374542236328,
      "learning_rate": 2e-05,
      "loss": 13.7716,
      "step": 264
    },
    {
      "epoch": 0.9948955916473318,
      "grad_norm": 92.8128662109375,
      "learning_rate": 2e-05,
      "loss": 12.5682,
      "step": 268
    },
    {
      "epoch": 1.0097447795823666,
      "grad_norm": 95.66116333007812,
      "learning_rate": 2e-05,
      "loss": 14.4997,
      "step": 272
    },
    {
      "epoch": 1.0245939675174014,
      "grad_norm": 104.52428436279297,
      "learning_rate": 2e-05,
      "loss": 11.8475,
      "step": 276
    },
    {
      "epoch": 1.0394431554524362,
      "grad_norm": 104.34024810791016,
      "learning_rate": 2e-05,
      "loss": 10.1835,
      "step": 280
    },
    {
      "epoch": 1.054292343387471,
      "grad_norm": 98.30239868164062,
      "learning_rate": 2e-05,
      "loss": 10.2298,
      "step": 284
    },
    {
      "epoch": 1.069141531322506,
      "grad_norm": 109.97785949707031,
      "learning_rate": 2e-05,
      "loss": 10.6023,
      "step": 288
    },
    {
      "epoch": 1.0839907192575406,
      "grad_norm": 122.24370574951172,
      "learning_rate": 2e-05,
      "loss": 10.0427,
      "step": 292
    },
    {
      "epoch": 1.0988399071925754,
      "grad_norm": 109.37757873535156,
      "learning_rate": 2e-05,
      "loss": 10.0441,
      "step": 296
    },
    {
      "epoch": 1.1136890951276102,
      "grad_norm": 127.94110107421875,
      "learning_rate": 2e-05,
      "loss": 9.7277,
      "step": 300
    },
    {
      "epoch": 1.128538283062645,
      "grad_norm": 124.07524108886719,
      "learning_rate": 2e-05,
      "loss": 9.7969,
      "step": 304
    },
    {
      "epoch": 1.14338747099768,
      "grad_norm": 126.29171752929688,
      "learning_rate": 2e-05,
      "loss": 9.5134,
      "step": 308
    },
    {
      "epoch": 1.1582366589327147,
      "grad_norm": 104.21505737304688,
      "learning_rate": 2e-05,
      "loss": 10.8362,
      "step": 312
    },
    {
      "epoch": 1.1730858468677494,
      "grad_norm": 121.6202392578125,
      "learning_rate": 2e-05,
      "loss": 8.8389,
      "step": 316
    },
    {
      "epoch": 1.1879350348027842,
      "grad_norm": 110.58162689208984,
      "learning_rate": 2e-05,
      "loss": 9.0145,
      "step": 320
    },
    {
      "epoch": 1.202784222737819,
      "grad_norm": 127.4255599975586,
      "learning_rate": 2e-05,
      "loss": 9.6973,
      "step": 324
    },
    {
      "epoch": 1.217633410672854,
      "grad_norm": 108.92906951904297,
      "learning_rate": 2e-05,
      "loss": 9.6894,
      "step": 328
    },
    {
      "epoch": 1.2324825986078887,
      "grad_norm": 131.8388214111328,
      "learning_rate": 2e-05,
      "loss": 11.288,
      "step": 332
    },
    {
      "epoch": 1.2473317865429234,
      "grad_norm": 106.78469848632812,
      "learning_rate": 2e-05,
      "loss": 9.656,
      "step": 336
    },
    {
      "epoch": 1.2621809744779582,
      "grad_norm": 120.8875503540039,
      "learning_rate": 2e-05,
      "loss": 9.6884,
      "step": 340
    },
    {
      "epoch": 1.2770301624129932,
      "grad_norm": 112.69973754882812,
      "learning_rate": 2e-05,
      "loss": 8.8555,
      "step": 344
    },
    {
      "epoch": 1.291879350348028,
      "grad_norm": 122.43771362304688,
      "learning_rate": 2e-05,
      "loss": 9.6718,
      "step": 348
    },
    {
      "epoch": 1.3067285382830627,
      "grad_norm": 116.25230407714844,
      "learning_rate": 2e-05,
      "loss": 8.7905,
      "step": 352
    },
    {
      "epoch": 1.3215777262180974,
      "grad_norm": 114.96141815185547,
      "learning_rate": 2e-05,
      "loss": 9.7848,
      "step": 356
    },
    {
      "epoch": 1.3364269141531322,
      "grad_norm": 119.10284423828125,
      "learning_rate": 2e-05,
      "loss": 7.9737,
      "step": 360
    },
    {
      "epoch": 1.3512761020881672,
      "grad_norm": 109.69094848632812,
      "learning_rate": 2e-05,
      "loss": 8.7001,
      "step": 364
    },
    {
      "epoch": 1.366125290023202,
      "grad_norm": 109.21603393554688,
      "learning_rate": 2e-05,
      "loss": 8.0757,
      "step": 368
    },
    {
      "epoch": 1.3809744779582367,
      "grad_norm": 128.07073974609375,
      "learning_rate": 2e-05,
      "loss": 10.1842,
      "step": 372
    },
    {
      "epoch": 1.3958236658932714,
      "grad_norm": 105.088623046875,
      "learning_rate": 2e-05,
      "loss": 8.1361,
      "step": 376
    },
    {
      "epoch": 1.4106728538283062,
      "grad_norm": 117.58355712890625,
      "learning_rate": 2e-05,
      "loss": 10.6169,
      "step": 380
    },
    {
      "epoch": 1.4255220417633412,
      "grad_norm": 102.73584747314453,
      "learning_rate": 2e-05,
      "loss": 8.8225,
      "step": 384
    },
    {
      "epoch": 1.440371229698376,
      "grad_norm": 104.41094207763672,
      "learning_rate": 2e-05,
      "loss": 8.593,
      "step": 388
    },
    {
      "epoch": 1.4552204176334107,
      "grad_norm": 104.82015228271484,
      "learning_rate": 2e-05,
      "loss": 8.4753,
      "step": 392
    },
    {
      "epoch": 1.4700696055684455,
      "grad_norm": 113.64494323730469,
      "learning_rate": 2e-05,
      "loss": 7.9889,
      "step": 396
    },
    {
      "epoch": 1.4849187935034802,
      "grad_norm": 109.5793685913086,
      "learning_rate": 2e-05,
      "loss": 8.2657,
      "step": 400
    },
    {
      "epoch": 1.4997679814385152,
      "grad_norm": 107.78541564941406,
      "learning_rate": 2e-05,
      "loss": 8.5209,
      "step": 404
    },
    {
      "epoch": 1.5146171693735497,
      "grad_norm": 125.47006225585938,
      "learning_rate": 2e-05,
      "loss": 9.4815,
      "step": 408
    },
    {
      "epoch": 1.5294663573085847,
      "grad_norm": 108.86872863769531,
      "learning_rate": 2e-05,
      "loss": 7.989,
      "step": 412
    },
    {
      "epoch": 1.5443155452436195,
      "grad_norm": 102.67842864990234,
      "learning_rate": 2e-05,
      "loss": 7.6957,
      "step": 416
    },
    {
      "epoch": 1.5591647331786542,
      "grad_norm": 109.05705261230469,
      "learning_rate": 2e-05,
      "loss": 8.5562,
      "step": 420
    },
    {
      "epoch": 1.5740139211136892,
      "grad_norm": 104.20409393310547,
      "learning_rate": 2e-05,
      "loss": 8.4751,
      "step": 424
    },
    {
      "epoch": 1.5888631090487237,
      "grad_norm": 126.31594848632812,
      "learning_rate": 2e-05,
      "loss": 8.3718,
      "step": 428
    },
    {
      "epoch": 1.6037122969837587,
      "grad_norm": 120.48487091064453,
      "learning_rate": 2e-05,
      "loss": 8.8551,
      "step": 432
    },
    {
      "epoch": 1.6185614849187935,
      "grad_norm": 105.4981689453125,
      "learning_rate": 2e-05,
      "loss": 8.2207,
      "step": 436
    },
    {
      "epoch": 1.6334106728538282,
      "grad_norm": 112.6336441040039,
      "learning_rate": 2e-05,
      "loss": 8.4217,
      "step": 440
    },
    {
      "epoch": 1.6482598607888632,
      "grad_norm": 132.0428009033203,
      "learning_rate": 2e-05,
      "loss": 7.7686,
      "step": 444
    },
    {
      "epoch": 1.6631090487238978,
      "grad_norm": 125.45011901855469,
      "learning_rate": 2e-05,
      "loss": 9.2927,
      "step": 448
    },
    {
      "epoch": 1.6779582366589327,
      "grad_norm": 136.8842315673828,
      "learning_rate": 2e-05,
      "loss": 8.7879,
      "step": 452
    },
    {
      "epoch": 1.6928074245939675,
      "grad_norm": 128.8678741455078,
      "learning_rate": 2e-05,
      "loss": 9.6716,
      "step": 456
    },
    {
      "epoch": 1.7076566125290022,
      "grad_norm": 111.33040618896484,
      "learning_rate": 2e-05,
      "loss": 8.5814,
      "step": 460
    },
    {
      "epoch": 1.7225058004640372,
      "grad_norm": 123.63487243652344,
      "learning_rate": 2e-05,
      "loss": 8.4478,
      "step": 464
    },
    {
      "epoch": 1.7373549883990718,
      "grad_norm": 113.80644989013672,
      "learning_rate": 2e-05,
      "loss": 7.6935,
      "step": 468
    },
    {
      "epoch": 1.7522041763341067,
      "grad_norm": 107.1911392211914,
      "learning_rate": 2e-05,
      "loss": 8.6608,
      "step": 472
    },
    {
      "epoch": 1.7670533642691415,
      "grad_norm": 102.86659240722656,
      "learning_rate": 2e-05,
      "loss": 8.3795,
      "step": 476
    },
    {
      "epoch": 1.7819025522041763,
      "grad_norm": 110.92539978027344,
      "learning_rate": 2e-05,
      "loss": 7.6919,
      "step": 480
    },
    {
      "epoch": 1.7967517401392112,
      "grad_norm": 104.6399917602539,
      "learning_rate": 2e-05,
      "loss": 8.2716,
      "step": 484
    },
    {
      "epoch": 1.8116009280742458,
      "grad_norm": 115.54898071289062,
      "learning_rate": 2e-05,
      "loss": 8.5838,
      "step": 488
    },
    {
      "epoch": 1.8264501160092808,
      "grad_norm": 105.62113952636719,
      "learning_rate": 2e-05,
      "loss": 8.113,
      "step": 492
    },
    {
      "epoch": 1.8412993039443155,
      "grad_norm": 100.64768981933594,
      "learning_rate": 2e-05,
      "loss": 7.9681,
      "step": 496
    },
    {
      "epoch": 1.8561484918793503,
      "grad_norm": 113.74981689453125,
      "learning_rate": 2e-05,
      "loss": 8.3435,
      "step": 500
    },
    {
      "epoch": 1.8709976798143853,
      "grad_norm": 111.69252014160156,
      "learning_rate": 2e-05,
      "loss": 7.9597,
      "step": 504
    },
    {
      "epoch": 1.88584686774942,
      "grad_norm": 127.168212890625,
      "learning_rate": 2e-05,
      "loss": 7.4817,
      "step": 508
    },
    {
      "epoch": 1.9006960556844548,
      "grad_norm": 112.72080993652344,
      "learning_rate": 2e-05,
      "loss": 8.3011,
      "step": 512
    },
    {
      "epoch": 1.9155452436194895,
      "grad_norm": 96.97032928466797,
      "learning_rate": 2e-05,
      "loss": 7.5826,
      "step": 516
    },
    {
      "epoch": 1.9303944315545243,
      "grad_norm": 90.76924896240234,
      "learning_rate": 2e-05,
      "loss": 7.6501,
      "step": 520
    },
    {
      "epoch": 1.9452436194895593,
      "grad_norm": 110.57941436767578,
      "learning_rate": 2e-05,
      "loss": 8.3152,
      "step": 524
    },
    {
      "epoch": 1.960092807424594,
      "grad_norm": 97.4187240600586,
      "learning_rate": 2e-05,
      "loss": 7.9809,
      "step": 528
    },
    {
      "epoch": 1.9749419953596288,
      "grad_norm": 107.45658111572266,
      "learning_rate": 2e-05,
      "loss": 8.1966,
      "step": 532
    },
    {
      "epoch": 1.9897911832946635,
      "grad_norm": 125.85009002685547,
      "learning_rate": 2e-05,
      "loss": 7.8091,
      "step": 536
    }
  ],
  "logging_steps": 4,
  "max_steps": 538,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 446690230272000.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}