|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 100.0, |
|
"eval_steps": 500, |
|
"global_step": 224800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 17.773962020874023, |
|
"learning_rate": 9.900311387900357e-06, |
|
"loss": 1.2276, |
|
"step": 2248 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5729537606239319, |
|
"eval_loss": 1.3142218589782715, |
|
"eval_runtime": 43.9858, |
|
"eval_samples_per_second": 12.777, |
|
"eval_steps_per_second": 12.777, |
|
"step": 2248 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 33.84526824951172, |
|
"learning_rate": 9.800355871886121e-06, |
|
"loss": 1.4493, |
|
"step": 4496 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5907473564147949, |
|
"eval_loss": 1.5339363813400269, |
|
"eval_runtime": 44.2068, |
|
"eval_samples_per_second": 12.713, |
|
"eval_steps_per_second": 12.713, |
|
"step": 4496 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.590007483959198, |
|
"learning_rate": 9.700400355871887e-06, |
|
"loss": 1.4328, |
|
"step": 6744 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.6583629846572876, |
|
"eval_loss": 1.5228670835494995, |
|
"eval_runtime": 44.2648, |
|
"eval_samples_per_second": 12.696, |
|
"eval_steps_per_second": 12.696, |
|
"step": 6744 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 27.285722732543945, |
|
"learning_rate": 9.600533807829182e-06, |
|
"loss": 1.3839, |
|
"step": 8992 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.6921707987785339, |
|
"eval_loss": 1.333614706993103, |
|
"eval_runtime": 44.2465, |
|
"eval_samples_per_second": 12.702, |
|
"eval_steps_per_second": 12.702, |
|
"step": 8992 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 28.629926681518555, |
|
"learning_rate": 9.500533807829183e-06, |
|
"loss": 1.4591, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.6761565804481506, |
|
"eval_loss": 1.4407259225845337, |
|
"eval_runtime": 44.1453, |
|
"eval_samples_per_second": 12.731, |
|
"eval_steps_per_second": 12.731, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 14.747943878173828, |
|
"learning_rate": 9.400533807829182e-06, |
|
"loss": 1.3609, |
|
"step": 13488 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.7117437720298767, |
|
"eval_loss": 1.1341328620910645, |
|
"eval_runtime": 44.9034, |
|
"eval_samples_per_second": 12.516, |
|
"eval_steps_per_second": 12.516, |
|
"step": 13488 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 44.90867233276367, |
|
"learning_rate": 9.300578291814948e-06, |
|
"loss": 1.222, |
|
"step": 15736 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7028470039367676, |
|
"eval_loss": 1.5203638076782227, |
|
"eval_runtime": 44.0645, |
|
"eval_samples_per_second": 12.754, |
|
"eval_steps_per_second": 12.754, |
|
"step": 15736 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.5433468818664551, |
|
"learning_rate": 9.200622775800714e-06, |
|
"loss": 1.1713, |
|
"step": 17984 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7473309636116028, |
|
"eval_loss": 1.3221626281738281, |
|
"eval_runtime": 44.4952, |
|
"eval_samples_per_second": 12.631, |
|
"eval_steps_per_second": 12.631, |
|
"step": 17984 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.15495184063911438, |
|
"learning_rate": 9.100622775800713e-06, |
|
"loss": 1.1414, |
|
"step": 20232 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.7562277317047119, |
|
"eval_loss": 1.3233251571655273, |
|
"eval_runtime": 44.534, |
|
"eval_samples_per_second": 12.62, |
|
"eval_steps_per_second": 12.62, |
|
"step": 20232 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.034351445734500885, |
|
"learning_rate": 9.000711743772242e-06, |
|
"loss": 1.0977, |
|
"step": 22480 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.7384341359138489, |
|
"eval_loss": 1.3587311506271362, |
|
"eval_runtime": 44.5493, |
|
"eval_samples_per_second": 12.615, |
|
"eval_steps_per_second": 12.615, |
|
"step": 22480 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.17834441363811493, |
|
"learning_rate": 8.900711743772243e-06, |
|
"loss": 0.9768, |
|
"step": 24728 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.7633451819419861, |
|
"eval_loss": 1.348164677619934, |
|
"eval_runtime": 44.2727, |
|
"eval_samples_per_second": 12.694, |
|
"eval_steps_per_second": 12.694, |
|
"step": 24728 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.0854184553027153, |
|
"learning_rate": 8.800800711743773e-06, |
|
"loss": 0.9219, |
|
"step": 26976 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7580071091651917, |
|
"eval_loss": 1.3742923736572266, |
|
"eval_runtime": 44.1709, |
|
"eval_samples_per_second": 12.723, |
|
"eval_steps_per_second": 12.723, |
|
"step": 26976 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 1.15831458568573, |
|
"learning_rate": 8.700800711743772e-06, |
|
"loss": 0.8636, |
|
"step": 29224 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.8167259693145752, |
|
"eval_loss": 1.1195218563079834, |
|
"eval_runtime": 44.3771, |
|
"eval_samples_per_second": 12.664, |
|
"eval_steps_per_second": 12.664, |
|
"step": 29224 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 658.5977172851562, |
|
"learning_rate": 8.600845195729538e-06, |
|
"loss": 0.8543, |
|
"step": 31472 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.8042704463005066, |
|
"eval_loss": 1.171613097190857, |
|
"eval_runtime": 43.8385, |
|
"eval_samples_per_second": 12.82, |
|
"eval_steps_per_second": 12.82, |
|
"step": 31472 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 1.0300912857055664, |
|
"learning_rate": 8.500934163701069e-06, |
|
"loss": 0.8053, |
|
"step": 33720 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.8113878965377808, |
|
"eval_loss": 1.2032980918884277, |
|
"eval_runtime": 44.439, |
|
"eval_samples_per_second": 12.647, |
|
"eval_steps_per_second": 12.647, |
|
"step": 33720 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.1946033537387848, |
|
"learning_rate": 8.400978647686834e-06, |
|
"loss": 0.7718, |
|
"step": 35968 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.7989323735237122, |
|
"eval_loss": 1.249149203300476, |
|
"eval_runtime": 46.8737, |
|
"eval_samples_per_second": 11.99, |
|
"eval_steps_per_second": 11.99, |
|
"step": 35968 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.08109813928604126, |
|
"learning_rate": 8.300978647686834e-06, |
|
"loss": 0.6904, |
|
"step": 38216 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.836298942565918, |
|
"eval_loss": 1.0850577354431152, |
|
"eval_runtime": 46.9201, |
|
"eval_samples_per_second": 11.978, |
|
"eval_steps_per_second": 11.978, |
|
"step": 38216 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.05036694556474686, |
|
"learning_rate": 8.2010231316726e-06, |
|
"loss": 0.6545, |
|
"step": 40464 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.8007117509841919, |
|
"eval_loss": 1.2962548732757568, |
|
"eval_runtime": 44.6234, |
|
"eval_samples_per_second": 12.594, |
|
"eval_steps_per_second": 12.594, |
|
"step": 40464 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.5516142249107361, |
|
"learning_rate": 8.101023131672599e-06, |
|
"loss": 0.6858, |
|
"step": 42712 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.8078292012214661, |
|
"eval_loss": 1.3231106996536255, |
|
"eval_runtime": 45.2364, |
|
"eval_samples_per_second": 12.424, |
|
"eval_steps_per_second": 12.424, |
|
"step": 42712 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 21.766986846923828, |
|
"learning_rate": 8.001067615658363e-06, |
|
"loss": 0.6444, |
|
"step": 44960 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.8238434195518494, |
|
"eval_loss": 1.1918330192565918, |
|
"eval_runtime": 45.4969, |
|
"eval_samples_per_second": 12.352, |
|
"eval_steps_per_second": 12.352, |
|
"step": 44960 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 0.047945111989974976, |
|
"learning_rate": 7.90111209964413e-06, |
|
"loss": 0.6166, |
|
"step": 47208 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.8345195651054382, |
|
"eval_loss": 1.1358013153076172, |
|
"eval_runtime": 44.2578, |
|
"eval_samples_per_second": 12.698, |
|
"eval_steps_per_second": 12.698, |
|
"step": 47208 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.016365328803658485, |
|
"learning_rate": 7.801156583629894e-06, |
|
"loss": 0.5437, |
|
"step": 49456 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.8256227970123291, |
|
"eval_loss": 1.2446305751800537, |
|
"eval_runtime": 44.445, |
|
"eval_samples_per_second": 12.645, |
|
"eval_steps_per_second": 12.645, |
|
"step": 49456 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.040828485041856766, |
|
"learning_rate": 7.701156583629893e-06, |
|
"loss": 0.4719, |
|
"step": 51704 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.8149465918540955, |
|
"eval_loss": 1.4120286703109741, |
|
"eval_runtime": 44.091, |
|
"eval_samples_per_second": 12.746, |
|
"eval_steps_per_second": 12.746, |
|
"step": 51704 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 1.33746337890625, |
|
"learning_rate": 7.601201067615659e-06, |
|
"loss": 0.4802, |
|
"step": 53952 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.8202847242355347, |
|
"eval_loss": 1.2611161470413208, |
|
"eval_runtime": 44.1951, |
|
"eval_samples_per_second": 12.716, |
|
"eval_steps_per_second": 12.716, |
|
"step": 53952 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 95.64888763427734, |
|
"learning_rate": 7.501290035587189e-06, |
|
"loss": 0.484, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.836298942565918, |
|
"eval_loss": 1.2840217351913452, |
|
"eval_runtime": 44.2468, |
|
"eval_samples_per_second": 12.701, |
|
"eval_steps_per_second": 12.701, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 0.019676001742482185, |
|
"learning_rate": 7.40129003558719e-06, |
|
"loss": 0.3649, |
|
"step": 58448 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.8434163928031921, |
|
"eval_loss": 1.2421414852142334, |
|
"eval_runtime": 44.4397, |
|
"eval_samples_per_second": 12.646, |
|
"eval_steps_per_second": 12.646, |
|
"step": 58448 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 0.01975702866911888, |
|
"learning_rate": 7.301379003558719e-06, |
|
"loss": 0.4146, |
|
"step": 60696 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.8291814923286438, |
|
"eval_loss": 1.3464977741241455, |
|
"eval_runtime": 43.9971, |
|
"eval_samples_per_second": 12.774, |
|
"eval_steps_per_second": 12.774, |
|
"step": 60696 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.0131510766223073, |
|
"learning_rate": 7.201423487544484e-06, |
|
"loss": 0.3998, |
|
"step": 62944 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.8505337834358215, |
|
"eval_loss": 1.2309142351150513, |
|
"eval_runtime": 44.0844, |
|
"eval_samples_per_second": 12.748, |
|
"eval_steps_per_second": 12.748, |
|
"step": 62944 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"grad_norm": 0.043094441294670105, |
|
"learning_rate": 7.101423487544484e-06, |
|
"loss": 0.4113, |
|
"step": 65192 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.8523131608963013, |
|
"eval_loss": 1.166311264038086, |
|
"eval_runtime": 45.2787, |
|
"eval_samples_per_second": 12.412, |
|
"eval_steps_per_second": 12.412, |
|
"step": 65192 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.04462951049208641, |
|
"learning_rate": 7.001467971530249e-06, |
|
"loss": 0.3385, |
|
"step": 67440 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.8469750881195068, |
|
"eval_loss": 1.2566747665405273, |
|
"eval_runtime": 43.6729, |
|
"eval_samples_per_second": 12.868, |
|
"eval_steps_per_second": 12.868, |
|
"step": 67440 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"grad_norm": 0.003071392187848687, |
|
"learning_rate": 6.901512455516015e-06, |
|
"loss": 0.3188, |
|
"step": 69688 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.8434163928031921, |
|
"eval_loss": 1.2580705881118774, |
|
"eval_runtime": 45.193, |
|
"eval_samples_per_second": 12.436, |
|
"eval_steps_per_second": 12.436, |
|
"step": 69688 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 0.051271870732307434, |
|
"learning_rate": 6.80155693950178e-06, |
|
"loss": 0.3203, |
|
"step": 71936 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.854092538356781, |
|
"eval_loss": 1.2454197406768799, |
|
"eval_runtime": 43.9824, |
|
"eval_samples_per_second": 12.778, |
|
"eval_steps_per_second": 12.778, |
|
"step": 71936 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"grad_norm": 0.003354266518726945, |
|
"learning_rate": 6.701645907473309e-06, |
|
"loss": 0.2766, |
|
"step": 74184 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.8523131608963013, |
|
"eval_loss": 1.2542338371276855, |
|
"eval_runtime": 44.114, |
|
"eval_samples_per_second": 12.74, |
|
"eval_steps_per_second": 12.74, |
|
"step": 74184 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"grad_norm": 0.0016816665884107351, |
|
"learning_rate": 6.6016459074733095e-06, |
|
"loss": 0.2505, |
|
"step": 76432 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.8149465918540955, |
|
"eval_loss": 1.5897480249404907, |
|
"eval_runtime": 44.5247, |
|
"eval_samples_per_second": 12.622, |
|
"eval_steps_per_second": 12.622, |
|
"step": 76432 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"grad_norm": 0.0018826358718797565, |
|
"learning_rate": 6.501690391459075e-06, |
|
"loss": 0.2777, |
|
"step": 78680 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 0.836298942565918, |
|
"eval_loss": 1.348265528678894, |
|
"eval_runtime": 44.5684, |
|
"eval_samples_per_second": 12.61, |
|
"eval_steps_per_second": 12.61, |
|
"step": 78680 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 0.0017322949133813381, |
|
"learning_rate": 6.40173487544484e-06, |
|
"loss": 0.2816, |
|
"step": 80928 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.8523131608963013, |
|
"eval_loss": 1.2510393857955933, |
|
"eval_runtime": 44.2735, |
|
"eval_samples_per_second": 12.694, |
|
"eval_steps_per_second": 12.694, |
|
"step": 80928 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"grad_norm": 0.002890912117436528, |
|
"learning_rate": 6.301779359430605e-06, |
|
"loss": 0.2728, |
|
"step": 83176 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.8327401876449585, |
|
"eval_loss": 1.4422318935394287, |
|
"eval_runtime": 45.8456, |
|
"eval_samples_per_second": 12.259, |
|
"eval_steps_per_second": 12.259, |
|
"step": 83176 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"grad_norm": 0.021676644682884216, |
|
"learning_rate": 6.20182384341637e-06, |
|
"loss": 0.255, |
|
"step": 85424 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.8487544655799866, |
|
"eval_loss": 1.2928466796875, |
|
"eval_runtime": 44.6873, |
|
"eval_samples_per_second": 12.576, |
|
"eval_steps_per_second": 12.576, |
|
"step": 85424 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"grad_norm": 0.06743878126144409, |
|
"learning_rate": 6.10182384341637e-06, |
|
"loss": 0.2172, |
|
"step": 87672 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_accuracy": 0.8451957106590271, |
|
"eval_loss": 1.4022176265716553, |
|
"eval_runtime": 44.9479, |
|
"eval_samples_per_second": 12.503, |
|
"eval_steps_per_second": 12.503, |
|
"step": 87672 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 0.0027778081130236387, |
|
"learning_rate": 6.001823843416371e-06, |
|
"loss": 0.2204, |
|
"step": 89920 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.8380783200263977, |
|
"eval_loss": 1.4114270210266113, |
|
"eval_runtime": 44.0198, |
|
"eval_samples_per_second": 12.767, |
|
"eval_steps_per_second": 12.767, |
|
"step": 89920 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"grad_norm": 0.007720357272773981, |
|
"learning_rate": 5.901912811387901e-06, |
|
"loss": 0.2232, |
|
"step": 92168 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_accuracy": 0.8416370153427124, |
|
"eval_loss": 1.4324493408203125, |
|
"eval_runtime": 43.6963, |
|
"eval_samples_per_second": 12.861, |
|
"eval_steps_per_second": 12.861, |
|
"step": 92168 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"grad_norm": 0.007710450328886509, |
|
"learning_rate": 5.8019572953736655e-06, |
|
"loss": 0.2301, |
|
"step": 94416 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.8487544655799866, |
|
"eval_loss": 1.3528090715408325, |
|
"eval_runtime": 45.0443, |
|
"eval_samples_per_second": 12.477, |
|
"eval_steps_per_second": 12.477, |
|
"step": 94416 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"grad_norm": 0.00038957124343141913, |
|
"learning_rate": 5.70200177935943e-06, |
|
"loss": 0.1751, |
|
"step": 96664 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_accuracy": 0.8434163928031921, |
|
"eval_loss": 1.4648776054382324, |
|
"eval_runtime": 44.1086, |
|
"eval_samples_per_second": 12.741, |
|
"eval_steps_per_second": 12.741, |
|
"step": 96664 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 0.0042132362723350525, |
|
"learning_rate": 5.602046263345197e-06, |
|
"loss": 0.1982, |
|
"step": 98912 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.8754448294639587, |
|
"eval_loss": 1.221611499786377, |
|
"eval_runtime": 44.0087, |
|
"eval_samples_per_second": 12.77, |
|
"eval_steps_per_second": 12.77, |
|
"step": 98912 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"grad_norm": 0.011830995790660381, |
|
"learning_rate": 5.502046263345196e-06, |
|
"loss": 0.1803, |
|
"step": 101160 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_accuracy": 0.8451957106590271, |
|
"eval_loss": 1.4569214582443237, |
|
"eval_runtime": 44.6868, |
|
"eval_samples_per_second": 12.576, |
|
"eval_steps_per_second": 12.576, |
|
"step": 101160 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"grad_norm": 0.0046292925253510475, |
|
"learning_rate": 5.402090747330961e-06, |
|
"loss": 0.1582, |
|
"step": 103408 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.8665480613708496, |
|
"eval_loss": 1.365021824836731, |
|
"eval_runtime": 43.9986, |
|
"eval_samples_per_second": 12.773, |
|
"eval_steps_per_second": 12.773, |
|
"step": 103408 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"grad_norm": 0.041185297071933746, |
|
"learning_rate": 5.302135231316726e-06, |
|
"loss": 0.1837, |
|
"step": 105656 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_accuracy": 0.854092538356781, |
|
"eval_loss": 1.2877007722854614, |
|
"eval_runtime": 44.0731, |
|
"eval_samples_per_second": 12.752, |
|
"eval_steps_per_second": 12.752, |
|
"step": 105656 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 0.04080544412136078, |
|
"learning_rate": 5.202179715302491e-06, |
|
"loss": 0.1458, |
|
"step": 107904 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.8309608697891235, |
|
"eval_loss": 1.7388625144958496, |
|
"eval_runtime": 45.0509, |
|
"eval_samples_per_second": 12.475, |
|
"eval_steps_per_second": 12.475, |
|
"step": 107904 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"grad_norm": 0.003914756700396538, |
|
"learning_rate": 5.1022241992882574e-06, |
|
"loss": 0.1664, |
|
"step": 110152 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_accuracy": 0.854092538356781, |
|
"eval_loss": 1.4000823497772217, |
|
"eval_runtime": 44.2826, |
|
"eval_samples_per_second": 12.691, |
|
"eval_steps_per_second": 12.691, |
|
"step": 110152 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 0.0011822431115433574, |
|
"learning_rate": 5.002268683274022e-06, |
|
"loss": 0.1473, |
|
"step": 112400 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 0.8701067566871643, |
|
"eval_loss": 1.2979094982147217, |
|
"eval_runtime": 44.3693, |
|
"eval_samples_per_second": 12.666, |
|
"eval_steps_per_second": 12.666, |
|
"step": 112400 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"grad_norm": 0.0008859778754413128, |
|
"learning_rate": 4.9022686832740216e-06, |
|
"loss": 0.1341, |
|
"step": 114648 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"eval_accuracy": 0.8469750881195068, |
|
"eval_loss": 1.570462942123413, |
|
"eval_runtime": 44.0624, |
|
"eval_samples_per_second": 12.755, |
|
"eval_steps_per_second": 12.755, |
|
"step": 114648 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"grad_norm": 0.0008422972168773413, |
|
"learning_rate": 4.8023131672597865e-06, |
|
"loss": 0.1603, |
|
"step": 116896 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_accuracy": 0.8380783200263977, |
|
"eval_loss": 1.6043403148651123, |
|
"eval_runtime": 44.715, |
|
"eval_samples_per_second": 12.568, |
|
"eval_steps_per_second": 12.568, |
|
"step": 116896 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"grad_norm": 0.1126694455742836, |
|
"learning_rate": 4.702357651245552e-06, |
|
"loss": 0.1133, |
|
"step": 119144 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"eval_accuracy": 0.8451957106590271, |
|
"eval_loss": 1.6194249391555786, |
|
"eval_runtime": 44.3847, |
|
"eval_samples_per_second": 12.662, |
|
"eval_steps_per_second": 12.662, |
|
"step": 119144 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"grad_norm": 0.0018404704751446843, |
|
"learning_rate": 4.602402135231317e-06, |
|
"loss": 0.107, |
|
"step": 121392 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_accuracy": 0.8629893064498901, |
|
"eval_loss": 1.4172945022583008, |
|
"eval_runtime": 45.9409, |
|
"eval_samples_per_second": 12.233, |
|
"eval_steps_per_second": 12.233, |
|
"step": 121392 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"grad_norm": 0.0005189430085010827, |
|
"learning_rate": 4.502402135231317e-06, |
|
"loss": 0.116, |
|
"step": 123640 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"eval_accuracy": 0.854092538356781, |
|
"eval_loss": 1.526777982711792, |
|
"eval_runtime": 44.5269, |
|
"eval_samples_per_second": 12.622, |
|
"eval_steps_per_second": 12.622, |
|
"step": 123640 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"grad_norm": 0.000902254250831902, |
|
"learning_rate": 4.402446619217082e-06, |
|
"loss": 0.0988, |
|
"step": 125888 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_accuracy": 0.8523131608963013, |
|
"eval_loss": 1.6091721057891846, |
|
"eval_runtime": 44.4807, |
|
"eval_samples_per_second": 12.635, |
|
"eval_steps_per_second": 12.635, |
|
"step": 125888 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"grad_norm": 0.04167209565639496, |
|
"learning_rate": 4.302491103202847e-06, |
|
"loss": 0.139, |
|
"step": 128136 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"eval_accuracy": 0.8647686839103699, |
|
"eval_loss": 1.4311786890029907, |
|
"eval_runtime": 44.6921, |
|
"eval_samples_per_second": 12.575, |
|
"eval_steps_per_second": 12.575, |
|
"step": 128136 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"grad_norm": 0.0009059112053364515, |
|
"learning_rate": 4.202535587188613e-06, |
|
"loss": 0.0798, |
|
"step": 130384 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_accuracy": 0.8327401876449585, |
|
"eval_loss": 1.7888070344924927, |
|
"eval_runtime": 46.1517, |
|
"eval_samples_per_second": 12.177, |
|
"eval_steps_per_second": 12.177, |
|
"step": 130384 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"grad_norm": 0.00021046701294835657, |
|
"learning_rate": 4.102535587188613e-06, |
|
"loss": 0.0776, |
|
"step": 132632 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"eval_accuracy": 0.8665480613708496, |
|
"eval_loss": 1.5457098484039307, |
|
"eval_runtime": 44.2018, |
|
"eval_samples_per_second": 12.714, |
|
"eval_steps_per_second": 12.714, |
|
"step": 132632 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 0.0024244808591902256, |
|
"learning_rate": 4.002580071174378e-06, |
|
"loss": 0.1288, |
|
"step": 134880 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_accuracy": 0.8629893064498901, |
|
"eval_loss": 1.4553505182266235, |
|
"eval_runtime": 43.9122, |
|
"eval_samples_per_second": 12.798, |
|
"eval_steps_per_second": 12.798, |
|
"step": 134880 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"grad_norm": 0.0004949413705617189, |
|
"learning_rate": 3.9026245551601425e-06, |
|
"loss": 0.0828, |
|
"step": 137128 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"eval_accuracy": 0.8558719158172607, |
|
"eval_loss": 1.7077866792678833, |
|
"eval_runtime": 44.3471, |
|
"eval_samples_per_second": 12.673, |
|
"eval_steps_per_second": 12.673, |
|
"step": 137128 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"grad_norm": 0.00021063751773908734, |
|
"learning_rate": 3.8026690391459074e-06, |
|
"loss": 0.0823, |
|
"step": 139376 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_accuracy": 0.8754448294639587, |
|
"eval_loss": 1.4733619689941406, |
|
"eval_runtime": 44.2713, |
|
"eval_samples_per_second": 12.694, |
|
"eval_steps_per_second": 12.694, |
|
"step": 139376 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"grad_norm": 0.0006752557819709182, |
|
"learning_rate": 3.702713523131673e-06, |
|
"loss": 0.0803, |
|
"step": 141624 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"eval_accuracy": 0.8594306111335754, |
|
"eval_loss": 1.6007416248321533, |
|
"eval_runtime": 44.5589, |
|
"eval_samples_per_second": 12.613, |
|
"eval_steps_per_second": 12.613, |
|
"step": 141624 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"grad_norm": 0.0015100333839654922, |
|
"learning_rate": 3.602758007117438e-06, |
|
"loss": 0.0947, |
|
"step": 143872 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_accuracy": 0.8701067566871643, |
|
"eval_loss": 1.4466949701309204, |
|
"eval_runtime": 44.6568, |
|
"eval_samples_per_second": 12.585, |
|
"eval_steps_per_second": 12.585, |
|
"step": 143872 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"grad_norm": 0.0003055017732549459, |
|
"learning_rate": 3.5028024911032033e-06, |
|
"loss": 0.0916, |
|
"step": 146120 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"eval_accuracy": 0.873665452003479, |
|
"eval_loss": 1.4410454034805298, |
|
"eval_runtime": 44.5129, |
|
"eval_samples_per_second": 12.626, |
|
"eval_steps_per_second": 12.626, |
|
"step": 146120 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"grad_norm": 0.0007535702898167074, |
|
"learning_rate": 3.402846975088968e-06, |
|
"loss": 0.0814, |
|
"step": 148368 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"eval_accuracy": 0.8469750881195068, |
|
"eval_loss": 1.711586356163025, |
|
"eval_runtime": 44.7221, |
|
"eval_samples_per_second": 12.566, |
|
"eval_steps_per_second": 12.566, |
|
"step": 148368 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.3028914590747335e-06, |
|
"loss": 0.0938, |
|
"step": 150616 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"eval_accuracy": 0.8629893064498901, |
|
"eval_loss": 1.5838223695755005, |
|
"eval_runtime": 44.5452, |
|
"eval_samples_per_second": 12.616, |
|
"eval_steps_per_second": 12.616, |
|
"step": 150616 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"grad_norm": 0.00016811703972052783, |
|
"learning_rate": 3.2028914590747336e-06, |
|
"loss": 0.066, |
|
"step": 152864 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_accuracy": 0.8558719158172607, |
|
"eval_loss": 1.645817518234253, |
|
"eval_runtime": 44.4578, |
|
"eval_samples_per_second": 12.641, |
|
"eval_steps_per_second": 12.641, |
|
"step": 152864 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"grad_norm": 0.0004475938912946731, |
|
"learning_rate": 3.102935943060498e-06, |
|
"loss": 0.096, |
|
"step": 155112 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"eval_accuracy": 0.8558719158172607, |
|
"eval_loss": 1.6925562620162964, |
|
"eval_runtime": 44.7853, |
|
"eval_samples_per_second": 12.549, |
|
"eval_steps_per_second": 12.549, |
|
"step": 155112 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"grad_norm": 0.005046023055911064, |
|
"learning_rate": 3.0030249110320286e-06, |
|
"loss": 0.0638, |
|
"step": 157360 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_accuracy": 0.8629893064498901, |
|
"eval_loss": 1.5232763290405273, |
|
"eval_runtime": 44.8809, |
|
"eval_samples_per_second": 12.522, |
|
"eval_steps_per_second": 12.522, |
|
"step": 157360 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"grad_norm": 0.00024611467961221933, |
|
"learning_rate": 2.9030249110320287e-06, |
|
"loss": 0.063, |
|
"step": 159608 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"eval_accuracy": 0.8594306111335754, |
|
"eval_loss": 1.5641191005706787, |
|
"eval_runtime": 44.3769, |
|
"eval_samples_per_second": 12.664, |
|
"eval_steps_per_second": 12.664, |
|
"step": 159608 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"grad_norm": 0.001299203373491764, |
|
"learning_rate": 2.8030249110320284e-06, |
|
"loss": 0.0758, |
|
"step": 161856 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_accuracy": 0.8505337834358215, |
|
"eval_loss": 1.6766804456710815, |
|
"eval_runtime": 44.9521, |
|
"eval_samples_per_second": 12.502, |
|
"eval_steps_per_second": 12.502, |
|
"step": 161856 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"grad_norm": 0.00021305486734490842, |
|
"learning_rate": 2.7030693950177937e-06, |
|
"loss": 0.0579, |
|
"step": 164104 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"eval_accuracy": 0.8629893064498901, |
|
"eval_loss": 1.5338364839553833, |
|
"eval_runtime": 46.2439, |
|
"eval_samples_per_second": 12.153, |
|
"eval_steps_per_second": 12.153, |
|
"step": 164104 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"grad_norm": 0.0005646486533805728, |
|
"learning_rate": 2.6031138790035586e-06, |
|
"loss": 0.0379, |
|
"step": 166352 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"eval_accuracy": 0.8629893064498901, |
|
"eval_loss": 1.6348403692245483, |
|
"eval_runtime": 44.783, |
|
"eval_samples_per_second": 12.549, |
|
"eval_steps_per_second": 12.549, |
|
"step": 166352 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"grad_norm": 0.00010759654833236709, |
|
"learning_rate": 2.5031583629893243e-06, |
|
"loss": 0.0351, |
|
"step": 168600 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"eval_accuracy": 0.8558719158172607, |
|
"eval_loss": 1.70370352268219, |
|
"eval_runtime": 47.2555, |
|
"eval_samples_per_second": 11.893, |
|
"eval_steps_per_second": 11.893, |
|
"step": 168600 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"grad_norm": 0.000321146595524624, |
|
"learning_rate": 2.403202846975089e-06, |
|
"loss": 0.0472, |
|
"step": 170848 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_accuracy": 0.8754448294639587, |
|
"eval_loss": 1.5681896209716797, |
|
"eval_runtime": 44.4779, |
|
"eval_samples_per_second": 12.635, |
|
"eval_steps_per_second": 12.635, |
|
"step": 170848 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"grad_norm": 0.00014101610577199608, |
|
"learning_rate": 2.3032028469750893e-06, |
|
"loss": 0.0253, |
|
"step": 173096 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"eval_accuracy": 0.8558719158172607, |
|
"eval_loss": 1.7067368030548096, |
|
"eval_runtime": 44.3009, |
|
"eval_samples_per_second": 12.686, |
|
"eval_steps_per_second": 12.686, |
|
"step": 173096 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"grad_norm": 0.0002290535339852795, |
|
"learning_rate": 2.203247330960854e-06, |
|
"loss": 0.073, |
|
"step": 175344 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"eval_accuracy": 0.8754448294639587, |
|
"eval_loss": 1.4460408687591553, |
|
"eval_runtime": 44.9156, |
|
"eval_samples_per_second": 12.512, |
|
"eval_steps_per_second": 12.512, |
|
"step": 175344 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"grad_norm": 0.0008211834938265383, |
|
"learning_rate": 2.1032918149466195e-06, |
|
"loss": 0.049, |
|
"step": 177592 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"eval_accuracy": 0.8594306111335754, |
|
"eval_loss": 1.5897014141082764, |
|
"eval_runtime": 44.349, |
|
"eval_samples_per_second": 12.672, |
|
"eval_steps_per_second": 12.672, |
|
"step": 177592 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 0.0002687973901629448, |
|
"learning_rate": 2.003291814946619e-06, |
|
"loss": 0.0503, |
|
"step": 179840 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_accuracy": 0.8647686839103699, |
|
"eval_loss": 1.601679801940918, |
|
"eval_runtime": 44.9732, |
|
"eval_samples_per_second": 12.496, |
|
"eval_steps_per_second": 12.496, |
|
"step": 179840 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"grad_norm": 0.00014994025696069002, |
|
"learning_rate": 1.9033807829181496e-06, |
|
"loss": 0.0497, |
|
"step": 182088 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"eval_accuracy": 0.8683273792266846, |
|
"eval_loss": 1.5318866968154907, |
|
"eval_runtime": 44.556, |
|
"eval_samples_per_second": 12.613, |
|
"eval_steps_per_second": 12.613, |
|
"step": 182088 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"grad_norm": 0.00014429475413635373, |
|
"learning_rate": 1.8034697508896797e-06, |
|
"loss": 0.0553, |
|
"step": 184336 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"eval_accuracy": 0.8612099885940552, |
|
"eval_loss": 1.5479316711425781, |
|
"eval_runtime": 44.3184, |
|
"eval_samples_per_second": 12.681, |
|
"eval_steps_per_second": 12.681, |
|
"step": 184336 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"grad_norm": 0.0028621815145015717, |
|
"learning_rate": 1.70346975088968e-06, |
|
"loss": 0.0416, |
|
"step": 186584 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"eval_accuracy": 0.8576512336730957, |
|
"eval_loss": 1.5556381940841675, |
|
"eval_runtime": 44.9266, |
|
"eval_samples_per_second": 12.509, |
|
"eval_steps_per_second": 12.509, |
|
"step": 186584 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"grad_norm": 0.0002136415132554248, |
|
"learning_rate": 1.60346975088968e-06, |
|
"loss": 0.0641, |
|
"step": 188832 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_accuracy": 0.8594306111335754, |
|
"eval_loss": 1.5674761533737183, |
|
"eval_runtime": 44.2532, |
|
"eval_samples_per_second": 12.7, |
|
"eval_steps_per_second": 12.7, |
|
"step": 188832 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"grad_norm": 0.00014172360533848405, |
|
"learning_rate": 1.503514234875445e-06, |
|
"loss": 0.0425, |
|
"step": 191080 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"eval_accuracy": 0.8558719158172607, |
|
"eval_loss": 1.6853959560394287, |
|
"eval_runtime": 44.7377, |
|
"eval_samples_per_second": 12.562, |
|
"eval_steps_per_second": 12.562, |
|
"step": 191080 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"grad_norm": 0.0003552982525434345, |
|
"learning_rate": 1.403514234875445e-06, |
|
"loss": 0.0311, |
|
"step": 193328 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"eval_accuracy": 0.873665452003479, |
|
"eval_loss": 1.4628252983093262, |
|
"eval_runtime": 44.4922, |
|
"eval_samples_per_second": 12.631, |
|
"eval_steps_per_second": 12.631, |
|
"step": 193328 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"grad_norm": 0.00020055480126757175, |
|
"learning_rate": 1.3036032028469752e-06, |
|
"loss": 0.0456, |
|
"step": 195576 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"eval_accuracy": 0.8701067566871643, |
|
"eval_loss": 1.5069490671157837, |
|
"eval_runtime": 44.3942, |
|
"eval_samples_per_second": 12.659, |
|
"eval_steps_per_second": 12.659, |
|
"step": 195576 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"grad_norm": 0.0007724438328295946, |
|
"learning_rate": 1.2036032028469751e-06, |
|
"loss": 0.0224, |
|
"step": 197824 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_accuracy": 0.8665480613708496, |
|
"eval_loss": 1.612963080406189, |
|
"eval_runtime": 44.8107, |
|
"eval_samples_per_second": 12.542, |
|
"eval_steps_per_second": 12.542, |
|
"step": 197824 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"grad_norm": 0.00014038340304978192, |
|
"learning_rate": 1.1036476868327404e-06, |
|
"loss": 0.0345, |
|
"step": 200072 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"eval_accuracy": 0.8701067566871643, |
|
"eval_loss": 1.5750340223312378, |
|
"eval_runtime": 44.6873, |
|
"eval_samples_per_second": 12.576, |
|
"eval_steps_per_second": 12.576, |
|
"step": 200072 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"grad_norm": 6.56926931696944e-05, |
|
"learning_rate": 1.0036476868327403e-06, |
|
"loss": 0.041, |
|
"step": 202320 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"eval_accuracy": 0.871886134147644, |
|
"eval_loss": 1.5229840278625488, |
|
"eval_runtime": 44.4416, |
|
"eval_samples_per_second": 12.646, |
|
"eval_steps_per_second": 12.646, |
|
"step": 202320 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"grad_norm": 0.00015356726362369955, |
|
"learning_rate": 9.036921708185054e-07, |
|
"loss": 0.0165, |
|
"step": 204568 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"eval_accuracy": 0.8594306111335754, |
|
"eval_loss": 1.6564196348190308, |
|
"eval_runtime": 44.2201, |
|
"eval_samples_per_second": 12.709, |
|
"eval_steps_per_second": 12.709, |
|
"step": 204568 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"grad_norm": 8.38369523989968e-05, |
|
"learning_rate": 8.037811387900357e-07, |
|
"loss": 0.0478, |
|
"step": 206816 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_accuracy": 0.8629893064498901, |
|
"eval_loss": 1.5940439701080322, |
|
"eval_runtime": 44.433, |
|
"eval_samples_per_second": 12.648, |
|
"eval_steps_per_second": 12.648, |
|
"step": 206816 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"grad_norm": 0.0002615041739773005, |
|
"learning_rate": 7.037811387900356e-07, |
|
"loss": 0.032, |
|
"step": 209064 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"eval_accuracy": 0.8807829022407532, |
|
"eval_loss": 1.4740957021713257, |
|
"eval_runtime": 44.2628, |
|
"eval_samples_per_second": 12.697, |
|
"eval_steps_per_second": 12.697, |
|
"step": 209064 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"grad_norm": 6.115555286407471, |
|
"learning_rate": 6.037811387900357e-07, |
|
"loss": 0.0433, |
|
"step": 211312 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"eval_accuracy": 0.871886134147644, |
|
"eval_loss": 1.5333490371704102, |
|
"eval_runtime": 44.3613, |
|
"eval_samples_per_second": 12.669, |
|
"eval_steps_per_second": 12.669, |
|
"step": 211312 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"grad_norm": 0.003299184376373887, |
|
"learning_rate": 5.038256227758007e-07, |
|
"loss": 0.0243, |
|
"step": 213560 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"eval_accuracy": 0.871886134147644, |
|
"eval_loss": 1.5164633989334106, |
|
"eval_runtime": 44.7905, |
|
"eval_samples_per_second": 12.547, |
|
"eval_steps_per_second": 12.547, |
|
"step": 213560 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"grad_norm": 5.206004061619751e-05, |
|
"learning_rate": 4.038701067615659e-07, |
|
"loss": 0.0165, |
|
"step": 215808 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_accuracy": 0.8683273792266846, |
|
"eval_loss": 1.5774868726730347, |
|
"eval_runtime": 44.4019, |
|
"eval_samples_per_second": 12.657, |
|
"eval_steps_per_second": 12.657, |
|
"step": 215808 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"grad_norm": 0.0002649713715072721, |
|
"learning_rate": 3.03914590747331e-07, |
|
"loss": 0.0177, |
|
"step": 218056 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"eval_accuracy": 0.8772242069244385, |
|
"eval_loss": 1.5302330255508423, |
|
"eval_runtime": 44.581, |
|
"eval_samples_per_second": 12.606, |
|
"eval_steps_per_second": 12.606, |
|
"step": 218056 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"grad_norm": 5.933016291237436e-05, |
|
"learning_rate": 2.039590747330961e-07, |
|
"loss": 0.0253, |
|
"step": 220304 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"eval_accuracy": 0.8754448294639587, |
|
"eval_loss": 1.5424113273620605, |
|
"eval_runtime": 44.8846, |
|
"eval_samples_per_second": 12.521, |
|
"eval_steps_per_second": 12.521, |
|
"step": 220304 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"grad_norm": 7.96098611317575e-05, |
|
"learning_rate": 1.0400355871886121e-07, |
|
"loss": 0.0224, |
|
"step": 222552 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"eval_accuracy": 0.871886134147644, |
|
"eval_loss": 1.5461581945419312, |
|
"eval_runtime": 44.4094, |
|
"eval_samples_per_second": 12.655, |
|
"eval_steps_per_second": 12.655, |
|
"step": 222552 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 0.00019997352501377463, |
|
"learning_rate": 4.0925266903914596e-09, |
|
"loss": 0.0213, |
|
"step": 224800 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_accuracy": 0.873665452003479, |
|
"eval_loss": 1.5357913970947266, |
|
"eval_runtime": 44.4377, |
|
"eval_samples_per_second": 12.647, |
|
"eval_steps_per_second": 12.647, |
|
"step": 224800 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"step": 224800, |
|
"total_flos": 2.049428872704e+19, |
|
"train_loss": 0.33036238875677576, |
|
"train_runtime": 38665.4857, |
|
"train_samples_per_second": 5.814, |
|
"train_steps_per_second": 5.814 |
|
} |
|
], |
|
"logging_steps": 35, |
|
"max_steps": 224800, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.049428872704e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|