|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 50.0, |
|
"eval_steps": 500, |
|
"global_step": 112400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 39.0721435546875, |
|
"learning_rate": 9.800622775800713e-06, |
|
"loss": 1.2025, |
|
"step": 2248 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5195729732513428, |
|
"eval_loss": 1.3446124792099, |
|
"eval_runtime": 43.8683, |
|
"eval_samples_per_second": 12.811, |
|
"eval_steps_per_second": 12.811, |
|
"step": 2248 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.9895684123039246, |
|
"learning_rate": 9.600800711743772e-06, |
|
"loss": 1.415, |
|
"step": 4496 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.6032028198242188, |
|
"eval_loss": 1.6350069046020508, |
|
"eval_runtime": 43.9062, |
|
"eval_samples_per_second": 12.8, |
|
"eval_steps_per_second": 12.8, |
|
"step": 4496 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.6042933464050293, |
|
"learning_rate": 9.400800711743772e-06, |
|
"loss": 1.4176, |
|
"step": 6744 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.6583629846572876, |
|
"eval_loss": 1.6250243186950684, |
|
"eval_runtime": 44.433, |
|
"eval_samples_per_second": 12.648, |
|
"eval_steps_per_second": 12.648, |
|
"step": 6744 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.39769357442855835, |
|
"learning_rate": 9.200889679715304e-06, |
|
"loss": 1.384, |
|
"step": 8992 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7241992950439453, |
|
"eval_loss": 1.3693625926971436, |
|
"eval_runtime": 44.1124, |
|
"eval_samples_per_second": 12.74, |
|
"eval_steps_per_second": 12.74, |
|
"step": 8992 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 18.08792495727539, |
|
"learning_rate": 9.000978647686834e-06, |
|
"loss": 1.3658, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.709964394569397, |
|
"eval_loss": 1.4330979585647583, |
|
"eval_runtime": 44.4448, |
|
"eval_samples_per_second": 12.645, |
|
"eval_steps_per_second": 12.645, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.2233850061893463, |
|
"learning_rate": 8.800978647686834e-06, |
|
"loss": 1.2763, |
|
"step": 13488 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.7437722682952881, |
|
"eval_loss": 1.3310519456863403, |
|
"eval_runtime": 45.1045, |
|
"eval_samples_per_second": 12.46, |
|
"eval_steps_per_second": 12.46, |
|
"step": 13488 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 27.293367385864258, |
|
"learning_rate": 8.601156583629893e-06, |
|
"loss": 1.2175, |
|
"step": 15736 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7580071091651917, |
|
"eval_loss": 1.2726552486419678, |
|
"eval_runtime": 44.7638, |
|
"eval_samples_per_second": 12.555, |
|
"eval_steps_per_second": 12.555, |
|
"step": 15736 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 4.855543613433838, |
|
"learning_rate": 8.401156583629893e-06, |
|
"loss": 1.1276, |
|
"step": 17984 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7330960631370544, |
|
"eval_loss": 1.4520480632781982, |
|
"eval_runtime": 44.2783, |
|
"eval_samples_per_second": 12.692, |
|
"eval_steps_per_second": 12.692, |
|
"step": 17984 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.08200906962156296, |
|
"learning_rate": 8.201245551601425e-06, |
|
"loss": 1.1053, |
|
"step": 20232 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.77224200963974, |
|
"eval_loss": 1.2134090662002563, |
|
"eval_runtime": 44.2632, |
|
"eval_samples_per_second": 12.697, |
|
"eval_steps_per_second": 12.697, |
|
"step": 20232 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.09874199330806732, |
|
"learning_rate": 8.001334519572955e-06, |
|
"loss": 1.0314, |
|
"step": 22480 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.7829181551933289, |
|
"eval_loss": 1.2143168449401855, |
|
"eval_runtime": 44.2853, |
|
"eval_samples_per_second": 12.69, |
|
"eval_steps_per_second": 12.69, |
|
"step": 22480 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.23497702181339264, |
|
"learning_rate": 7.801423487544484e-06, |
|
"loss": 1.0029, |
|
"step": 24728 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.7811387777328491, |
|
"eval_loss": 1.3311693668365479, |
|
"eval_runtime": 44.0086, |
|
"eval_samples_per_second": 12.77, |
|
"eval_steps_per_second": 12.77, |
|
"step": 24728 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.08876941353082657, |
|
"learning_rate": 7.6014234875444846e-06, |
|
"loss": 0.9108, |
|
"step": 26976 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.8024911284446716, |
|
"eval_loss": 1.2227764129638672, |
|
"eval_runtime": 44.4307, |
|
"eval_samples_per_second": 12.649, |
|
"eval_steps_per_second": 12.649, |
|
"step": 26976 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 0.13910023868083954, |
|
"learning_rate": 7.401512455516014e-06, |
|
"loss": 0.8335, |
|
"step": 29224 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.8078292012214661, |
|
"eval_loss": 1.1526196002960205, |
|
"eval_runtime": 44.5737, |
|
"eval_samples_per_second": 12.608, |
|
"eval_steps_per_second": 12.608, |
|
"step": 29224 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 159.14675903320312, |
|
"learning_rate": 7.201601423487545e-06, |
|
"loss": 0.8514, |
|
"step": 31472 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.8202847242355347, |
|
"eval_loss": 0.9904452562332153, |
|
"eval_runtime": 44.2371, |
|
"eval_samples_per_second": 12.704, |
|
"eval_steps_per_second": 12.704, |
|
"step": 31472 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.09662589430809021, |
|
"learning_rate": 7.001601423487545e-06, |
|
"loss": 0.7389, |
|
"step": 33720 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.8024911284446716, |
|
"eval_loss": 1.2999956607818604, |
|
"eval_runtime": 43.9303, |
|
"eval_samples_per_second": 12.793, |
|
"eval_steps_per_second": 12.793, |
|
"step": 33720 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.1123761385679245, |
|
"learning_rate": 6.801690391459075e-06, |
|
"loss": 0.6993, |
|
"step": 35968 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.8202847242355347, |
|
"eval_loss": 1.0872814655303955, |
|
"eval_runtime": 44.4779, |
|
"eval_samples_per_second": 12.636, |
|
"eval_steps_per_second": 12.636, |
|
"step": 35968 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.5431187748908997, |
|
"learning_rate": 6.601690391459076e-06, |
|
"loss": 0.6177, |
|
"step": 38216 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.8327401876449585, |
|
"eval_loss": 1.0856190919876099, |
|
"eval_runtime": 44.047, |
|
"eval_samples_per_second": 12.759, |
|
"eval_steps_per_second": 12.759, |
|
"step": 38216 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.034104421734809875, |
|
"learning_rate": 6.401868327402135e-06, |
|
"loss": 0.641, |
|
"step": 40464 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.7971529960632324, |
|
"eval_loss": 1.3224000930786133, |
|
"eval_runtime": 43.8821, |
|
"eval_samples_per_second": 12.807, |
|
"eval_steps_per_second": 12.807, |
|
"step": 40464 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.05449577793478966, |
|
"learning_rate": 6.201957295373666e-06, |
|
"loss": 0.611, |
|
"step": 42712 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.8291814923286438, |
|
"eval_loss": 1.1800107955932617, |
|
"eval_runtime": 43.8606, |
|
"eval_samples_per_second": 12.813, |
|
"eval_steps_per_second": 12.813, |
|
"step": 42712 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 24.56753158569336, |
|
"learning_rate": 6.001957295373666e-06, |
|
"loss": 0.5744, |
|
"step": 44960 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.809608519077301, |
|
"eval_loss": 1.2937426567077637, |
|
"eval_runtime": 44.1614, |
|
"eval_samples_per_second": 12.726, |
|
"eval_steps_per_second": 12.726, |
|
"step": 44960 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 0.14437897503376007, |
|
"learning_rate": 5.802046263345196e-06, |
|
"loss": 0.5008, |
|
"step": 47208 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.8416370153427124, |
|
"eval_loss": 1.1565003395080566, |
|
"eval_runtime": 44.2577, |
|
"eval_samples_per_second": 12.698, |
|
"eval_steps_per_second": 12.698, |
|
"step": 47208 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.044370945543050766, |
|
"learning_rate": 5.6021352313167265e-06, |
|
"loss": 0.4396, |
|
"step": 49456 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.8149465918540955, |
|
"eval_loss": 1.366288661956787, |
|
"eval_runtime": 44.4773, |
|
"eval_samples_per_second": 12.636, |
|
"eval_steps_per_second": 12.636, |
|
"step": 49456 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.21999333798885345, |
|
"learning_rate": 5.402224199288256e-06, |
|
"loss": 0.4313, |
|
"step": 51704 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.8220640420913696, |
|
"eval_loss": 1.3266714811325073, |
|
"eval_runtime": 43.9122, |
|
"eval_samples_per_second": 12.798, |
|
"eval_steps_per_second": 12.798, |
|
"step": 51704 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 0.037919577211141586, |
|
"learning_rate": 5.202313167259787e-06, |
|
"loss": 0.3954, |
|
"step": 53952 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.8469750881195068, |
|
"eval_loss": 1.1824334859848022, |
|
"eval_runtime": 44.2008, |
|
"eval_samples_per_second": 12.715, |
|
"eval_steps_per_second": 12.715, |
|
"step": 53952 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 106.5145263671875, |
|
"learning_rate": 5.002491103202848e-06, |
|
"loss": 0.4217, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.8042704463005066, |
|
"eval_loss": 1.5585525035858154, |
|
"eval_runtime": 43.9488, |
|
"eval_samples_per_second": 12.788, |
|
"eval_steps_per_second": 12.788, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 0.015882058069109917, |
|
"learning_rate": 4.802491103202847e-06, |
|
"loss": 0.3797, |
|
"step": 58448 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.8523131608963013, |
|
"eval_loss": 1.1745574474334717, |
|
"eval_runtime": 43.9074, |
|
"eval_samples_per_second": 12.8, |
|
"eval_steps_per_second": 12.8, |
|
"step": 58448 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 0.03550243377685547, |
|
"learning_rate": 4.602580071174377e-06, |
|
"loss": 0.358, |
|
"step": 60696 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.8451957106590271, |
|
"eval_loss": 1.1937276124954224, |
|
"eval_runtime": 43.8357, |
|
"eval_samples_per_second": 12.821, |
|
"eval_steps_per_second": 12.821, |
|
"step": 60696 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.009829429909586906, |
|
"learning_rate": 4.402669039145908e-06, |
|
"loss": 0.2963, |
|
"step": 62944 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.8309608697891235, |
|
"eval_loss": 1.4036489725112915, |
|
"eval_runtime": 44.7776, |
|
"eval_samples_per_second": 12.551, |
|
"eval_steps_per_second": 12.551, |
|
"step": 62944 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"grad_norm": null,
|
"learning_rate": 4.202758007117438e-06, |
|
"loss": 0.3338, |
|
"step": 65192 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.8505337834358215, |
|
"eval_loss": 1.3133819103240967, |
|
"eval_runtime": 43.9603, |
|
"eval_samples_per_second": 12.784, |
|
"eval_steps_per_second": 12.784, |
|
"step": 65192 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.012281532399356365, |
|
"learning_rate": 4.0027580071174384e-06, |
|
"loss": 0.2565, |
|
"step": 67440 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.8345195651054382, |
|
"eval_loss": 1.480637788772583, |
|
"eval_runtime": 44.0574, |
|
"eval_samples_per_second": 12.756, |
|
"eval_steps_per_second": 12.756, |
|
"step": 67440 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"grad_norm": 0.0023416499607264996, |
|
"learning_rate": 3.8028469750889686e-06, |
|
"loss": 0.2798, |
|
"step": 69688 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.8309608697891235, |
|
"eval_loss": 1.5173381567001343, |
|
"eval_runtime": 44.0999, |
|
"eval_samples_per_second": 12.744, |
|
"eval_steps_per_second": 12.744, |
|
"step": 69688 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 5.919681072235107, |
|
"learning_rate": 3.602935943060499e-06, |
|
"loss": 0.2674, |
|
"step": 71936 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.8131672739982605, |
|
"eval_loss": 1.5758373737335205, |
|
"eval_runtime": 44.4876, |
|
"eval_samples_per_second": 12.633, |
|
"eval_steps_per_second": 12.633, |
|
"step": 71936 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"grad_norm": 0.0024983042385429144, |
|
"learning_rate": 3.4029359430604986e-06, |
|
"loss": 0.2334, |
|
"step": 74184 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.8558719158172607, |
|
"eval_loss": 1.3400838375091553, |
|
"eval_runtime": 44.5806, |
|
"eval_samples_per_second": 12.606, |
|
"eval_steps_per_second": 12.606, |
|
"step": 74184 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"grad_norm": 0.0015680283540859818, |
|
"learning_rate": 3.2030249110320288e-06, |
|
"loss": 0.2352, |
|
"step": 76432 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.8469750881195068, |
|
"eval_loss": 1.2716737985610962, |
|
"eval_runtime": 44.3768, |
|
"eval_samples_per_second": 12.664, |
|
"eval_steps_per_second": 12.664, |
|
"step": 76432 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"grad_norm": 0.0035891232546418905, |
|
"learning_rate": 3.003113879003559e-06, |
|
"loss": 0.2406, |
|
"step": 78680 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 0.8256227970123291, |
|
"eval_loss": 1.6162846088409424, |
|
"eval_runtime": 44.1614, |
|
"eval_samples_per_second": 12.726, |
|
"eval_steps_per_second": 12.726, |
|
"step": 78680 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 0.003527622204273939, |
|
"learning_rate": 2.8032028469750896e-06, |
|
"loss": 0.2208, |
|
"step": 80928 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.8505337834358215, |
|
"eval_loss": 1.3815460205078125, |
|
"eval_runtime": 44.6638, |
|
"eval_samples_per_second": 12.583, |
|
"eval_steps_per_second": 12.583, |
|
"step": 80928 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"grad_norm": 0.0015290452865883708, |
|
"learning_rate": 2.6033807829181497e-06, |
|
"loss": 0.1796, |
|
"step": 83176 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.8576512336730957, |
|
"eval_loss": 1.392934799194336, |
|
"eval_runtime": 44.8201, |
|
"eval_samples_per_second": 12.539, |
|
"eval_steps_per_second": 12.539, |
|
"step": 83176 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"grad_norm": 0.00960911251604557, |
|
"learning_rate": 2.4033807829181495e-06, |
|
"loss": 0.2127, |
|
"step": 85424 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.8274021148681641, |
|
"eval_loss": 1.5271013975143433, |
|
"eval_runtime": 44.255, |
|
"eval_samples_per_second": 12.699, |
|
"eval_steps_per_second": 12.699, |
|
"step": 85424 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"grad_norm": 0.004349031951278448, |
|
"learning_rate": 2.20355871886121e-06, |
|
"loss": 0.1748, |
|
"step": 87672 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_accuracy": 0.8416370153427124, |
|
"eval_loss": 1.50688636302948, |
|
"eval_runtime": 44.0422, |
|
"eval_samples_per_second": 12.76, |
|
"eval_steps_per_second": 12.76, |
|
"step": 87672 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 0.0012758744414895773, |
|
"learning_rate": 2.0035587188612103e-06, |
|
"loss": 0.1612, |
|
"step": 89920 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.8469750881195068, |
|
"eval_loss": 1.396645426750183, |
|
"eval_runtime": 44.1524, |
|
"eval_samples_per_second": 12.729, |
|
"eval_steps_per_second": 12.729, |
|
"step": 89920 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"grad_norm": 0.0022624109406024218, |
|
"learning_rate": 1.8036476868327405e-06, |
|
"loss": 0.1757, |
|
"step": 92168 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_accuracy": 0.8469750881195068, |
|
"eval_loss": 1.4628039598464966, |
|
"eval_runtime": 44.3029, |
|
"eval_samples_per_second": 12.685, |
|
"eval_steps_per_second": 12.685, |
|
"step": 92168 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"grad_norm": 0.007528578396886587, |
|
"learning_rate": 1.6037366548042707e-06, |
|
"loss": 0.1664, |
|
"step": 94416 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.8523131608963013, |
|
"eval_loss": 1.3362900018692017, |
|
"eval_runtime": 43.78, |
|
"eval_samples_per_second": 12.837, |
|
"eval_steps_per_second": 12.837, |
|
"step": 94416 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"grad_norm": 0.0013060200726613402, |
|
"learning_rate": 1.4038256227758006e-06, |
|
"loss": 0.1313, |
|
"step": 96664 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_accuracy": 0.8434163928031921, |
|
"eval_loss": 1.438825011253357, |
|
"eval_runtime": 43.9878, |
|
"eval_samples_per_second": 12.776, |
|
"eval_steps_per_second": 12.776, |
|
"step": 96664 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 0.042194243520498276, |
|
"learning_rate": 1.203914590747331e-06, |
|
"loss": 0.1272, |
|
"step": 98912 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.8629893064498901, |
|
"eval_loss": 1.3669886589050293, |
|
"eval_runtime": 43.6968, |
|
"eval_samples_per_second": 12.861, |
|
"eval_steps_per_second": 12.861, |
|
"step": 98912 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"grad_norm": 0.00658341683447361, |
|
"learning_rate": 1.0040035587188612e-06, |
|
"loss": 0.1127, |
|
"step": 101160 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_accuracy": 0.854092538356781, |
|
"eval_loss": 1.4244239330291748, |
|
"eval_runtime": 43.9697, |
|
"eval_samples_per_second": 12.782, |
|
"eval_steps_per_second": 12.782, |
|
"step": 101160 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"grad_norm": 0.0009845913155004382, |
|
"learning_rate": 8.040035587188613e-07, |
|
"loss": 0.1062, |
|
"step": 103408 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.854092538356781, |
|
"eval_loss": 1.3811708688735962, |
|
"eval_runtime": 44.3254, |
|
"eval_samples_per_second": 12.679, |
|
"eval_steps_per_second": 12.679, |
|
"step": 103408 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"grad_norm": 0.0018777468940243125, |
|
"learning_rate": 6.040925266903915e-07, |
|
"loss": 0.0924, |
|
"step": 105656 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_accuracy": 0.854092538356781, |
|
"eval_loss": 1.4448031187057495, |
|
"eval_runtime": 44.1197, |
|
"eval_samples_per_second": 12.738, |
|
"eval_steps_per_second": 12.738, |
|
"step": 105656 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 0.0018540391465649009, |
|
"learning_rate": 4.040925266903915e-07, |
|
"loss": 0.0998, |
|
"step": 107904 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.8683273792266846, |
|
"eval_loss": 1.305090069770813, |
|
"eval_runtime": 43.7364, |
|
"eval_samples_per_second": 12.85, |
|
"eval_steps_per_second": 12.85, |
|
"step": 107904 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"grad_norm": 0.0024543912149965763, |
|
"learning_rate": 2.0418149466192174e-07, |
|
"loss": 0.1055, |
|
"step": 110152 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_accuracy": 0.8701067566871643, |
|
"eval_loss": 1.262986660003662, |
|
"eval_runtime": 44.2858, |
|
"eval_samples_per_second": 12.69, |
|
"eval_steps_per_second": 12.69, |
|
"step": 110152 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 0.00034937995951622725, |
|
"learning_rate": 4.181494661921708e-09, |
|
"loss": 0.1073, |
|
"step": 112400 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 0.8629893064498901, |
|
"eval_loss": 1.268770456314087, |
|
"eval_runtime": 44.0645, |
|
"eval_samples_per_second": 12.754, |
|
"eval_steps_per_second": 12.754, |
|
"step": 112400 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"step": 112400, |
|
"total_flos": 1.024714436352e+19, |
|
"train_loss": 0.5453376764589357, |
|
"train_runtime": 19116.6154, |
|
"train_samples_per_second": 5.88, |
|
"train_steps_per_second": 5.88 |
|
} |
|
], |
|
"logging_steps": 35, |
|
"max_steps": 112400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.024714436352e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|