|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 30.0, |
|
"eval_steps": 500, |
|
"global_step": 67440, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 20.024471282958984, |
|
"learning_rate": 9.66755634638197e-06, |
|
"loss": 1.2057, |
|
"step": 2248 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.450177937746048, |
|
"eval_loss": 1.4521995782852173, |
|
"eval_runtime": 46.9769, |
|
"eval_samples_per_second": 11.963, |
|
"eval_steps_per_second": 11.963, |
|
"step": 2248 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 40.918148040771484, |
|
"learning_rate": 9.334371293001186e-06, |
|
"loss": 1.3873, |
|
"step": 4496 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.6423487663269043, |
|
"eval_loss": 1.4503390789031982, |
|
"eval_runtime": 47.9141, |
|
"eval_samples_per_second": 11.729, |
|
"eval_steps_per_second": 11.729, |
|
"step": 4496 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 14.9369478225708, |
|
"learning_rate": 9.001334519572953e-06, |
|
"loss": 1.4246, |
|
"step": 6744 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.6672598123550415, |
|
"eval_loss": 1.6165008544921875, |
|
"eval_runtime": 47.7427, |
|
"eval_samples_per_second": 11.771, |
|
"eval_steps_per_second": 11.771, |
|
"step": 6744 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.12427664548158646, |
|
"learning_rate": 8.668149466192172e-06, |
|
"loss": 1.3335, |
|
"step": 8992 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7206405401229858, |
|
"eval_loss": 1.478636384010315, |
|
"eval_runtime": 47.5592, |
|
"eval_samples_per_second": 11.817, |
|
"eval_steps_per_second": 11.817, |
|
"step": 8992 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.1769200563430786, |
|
"learning_rate": 8.334964412811389e-06, |
|
"loss": 1.251, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.6886121034622192, |
|
"eval_loss": 1.6414002180099487, |
|
"eval_runtime": 47.4426, |
|
"eval_samples_per_second": 11.846, |
|
"eval_steps_per_second": 11.846, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.14434462785720825, |
|
"learning_rate": 8.001779359430606e-06, |
|
"loss": 1.1859, |
|
"step": 13488 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.754448413848877, |
|
"eval_loss": 1.329976201057434, |
|
"eval_runtime": 47.6847, |
|
"eval_samples_per_second": 11.786, |
|
"eval_steps_per_second": 11.786, |
|
"step": 13488 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 24.07697105407715, |
|
"learning_rate": 7.668446026097273e-06, |
|
"loss": 1.1132, |
|
"step": 15736 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7508896589279175, |
|
"eval_loss": 1.3665430545806885, |
|
"eval_runtime": 47.6469, |
|
"eval_samples_per_second": 11.795, |
|
"eval_steps_per_second": 11.795, |
|
"step": 15736 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 3.64510178565979, |
|
"learning_rate": 7.335112692763939e-06, |
|
"loss": 1.0189, |
|
"step": 17984 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7153024673461914, |
|
"eval_loss": 1.6664679050445557, |
|
"eval_runtime": 47.663, |
|
"eval_samples_per_second": 11.791, |
|
"eval_steps_per_second": 11.791, |
|
"step": 17984 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.2391408234834671, |
|
"learning_rate": 7.001927639383156e-06, |
|
"loss": 0.9807, |
|
"step": 20232 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.7793594598770142, |
|
"eval_loss": 1.1174825429916382, |
|
"eval_runtime": 47.784, |
|
"eval_samples_per_second": 11.761, |
|
"eval_steps_per_second": 11.761, |
|
"step": 20232 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.037992771714925766, |
|
"learning_rate": 6.668890865954924e-06, |
|
"loss": 0.8786, |
|
"step": 22480 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.7882562279701233, |
|
"eval_loss": 1.1786267757415771, |
|
"eval_runtime": 47.6281, |
|
"eval_samples_per_second": 11.8, |
|
"eval_steps_per_second": 11.8, |
|
"step": 22480 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.07834123820066452, |
|
"learning_rate": 6.3355575326215905e-06, |
|
"loss": 0.8677, |
|
"step": 24728 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.7811387777328491, |
|
"eval_loss": 1.1294955015182495, |
|
"eval_runtime": 47.9525, |
|
"eval_samples_per_second": 11.72, |
|
"eval_steps_per_second": 11.72, |
|
"step": 24728 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.0959361270070076, |
|
"learning_rate": 6.0025207591933574e-06, |
|
"loss": 0.7554, |
|
"step": 26976 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.8185053467750549, |
|
"eval_loss": 1.1184855699539185, |
|
"eval_runtime": 47.4663, |
|
"eval_samples_per_second": 11.84, |
|
"eval_steps_per_second": 11.84, |
|
"step": 26976 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 0.06532555818557739, |
|
"learning_rate": 5.669335705812574e-06, |
|
"loss": 0.7196, |
|
"step": 29224 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.7846975326538086, |
|
"eval_loss": 1.4066756963729858, |
|
"eval_runtime": 48.1233, |
|
"eval_samples_per_second": 11.678, |
|
"eval_steps_per_second": 11.678, |
|
"step": 29224 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 177.36297607421875, |
|
"learning_rate": 5.336150652431792e-06, |
|
"loss": 0.692, |
|
"step": 31472 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.8202847242355347, |
|
"eval_loss": 1.1174949407577515, |
|
"eval_runtime": 47.9299, |
|
"eval_samples_per_second": 11.725, |
|
"eval_steps_per_second": 11.725, |
|
"step": 31472 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.7878779768943787, |
|
"learning_rate": 5.002817319098459e-06, |
|
"loss": 0.6276, |
|
"step": 33720 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.7882562279701233, |
|
"eval_loss": 1.4490171670913696, |
|
"eval_runtime": 47.6011, |
|
"eval_samples_per_second": 11.806, |
|
"eval_steps_per_second": 11.806, |
|
"step": 33720 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.04724971204996109, |
|
"learning_rate": 4.6696322657176755e-06, |
|
"loss": 0.6083, |
|
"step": 35968 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.8345195651054382, |
|
"eval_loss": 1.0982587337493896, |
|
"eval_runtime": 47.8014, |
|
"eval_samples_per_second": 11.757, |
|
"eval_steps_per_second": 11.757, |
|
"step": 35968 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.3455263674259186, |
|
"learning_rate": 4.336447212336892e-06, |
|
"loss": 0.5204, |
|
"step": 38216 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.8256227970123291, |
|
"eval_loss": 1.181368350982666, |
|
"eval_runtime": 47.9669, |
|
"eval_samples_per_second": 11.716, |
|
"eval_steps_per_second": 11.716, |
|
"step": 38216 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.03767940774559975, |
|
"learning_rate": 4.003262158956109e-06, |
|
"loss": 0.5197, |
|
"step": 40464 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.8167259693145752, |
|
"eval_loss": 1.2945315837860107, |
|
"eval_runtime": 47.911, |
|
"eval_samples_per_second": 11.73, |
|
"eval_steps_per_second": 11.73, |
|
"step": 40464 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 365.2795104980469, |
|
"learning_rate": 3.6700771055753265e-06, |
|
"loss": 0.488, |
|
"step": 42712 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.8024911284446716, |
|
"eval_loss": 1.4494409561157227, |
|
"eval_runtime": 44.9229, |
|
"eval_samples_per_second": 12.51, |
|
"eval_steps_per_second": 12.51, |
|
"step": 42712 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 31.586769104003906, |
|
"learning_rate": 3.3368920521945437e-06, |
|
"loss": 0.4714, |
|
"step": 44960 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.8113878965377808, |
|
"eval_loss": 1.3498995304107666, |
|
"eval_runtime": 44.7889, |
|
"eval_samples_per_second": 12.548, |
|
"eval_steps_per_second": 12.548, |
|
"step": 44960 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 4.5781145095825195, |
|
"learning_rate": 3.0035587188612105e-06, |
|
"loss": 0.3641, |
|
"step": 47208 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.8380783200263977, |
|
"eval_loss": 1.252496600151062, |
|
"eval_runtime": 44.8551, |
|
"eval_samples_per_second": 12.529, |
|
"eval_steps_per_second": 12.529, |
|
"step": 47208 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.028242342174053192, |
|
"learning_rate": 2.6703736654804273e-06, |
|
"loss": 0.3877, |
|
"step": 49456 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.8380783200263977, |
|
"eval_loss": 1.2610100507736206, |
|
"eval_runtime": 45.2543, |
|
"eval_samples_per_second": 12.419, |
|
"eval_steps_per_second": 12.419, |
|
"step": 49456 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.02240828238427639, |
|
"learning_rate": 2.337188612099644e-06, |
|
"loss": 0.3253, |
|
"step": 51704 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.8274021148681641, |
|
"eval_loss": 1.3913415670394897, |
|
"eval_runtime": 45.0743, |
|
"eval_samples_per_second": 12.468, |
|
"eval_steps_per_second": 12.468, |
|
"step": 51704 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 776.1561279296875, |
|
"learning_rate": 2.0040035587188614e-06, |
|
"loss": 0.2978, |
|
"step": 53952 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.8416370153427124, |
|
"eval_loss": 1.2989881038665771, |
|
"eval_runtime": 44.7173, |
|
"eval_samples_per_second": 12.568, |
|
"eval_steps_per_second": 12.568, |
|
"step": 53952 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 118.77944946289062, |
|
"learning_rate": 1.6706702253855281e-06, |
|
"loss": 0.3238, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.8274021148681641, |
|
"eval_loss": 1.4328011274337769, |
|
"eval_runtime": 45.0042, |
|
"eval_samples_per_second": 12.488, |
|
"eval_steps_per_second": 12.488, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 0.006892771925777197, |
|
"learning_rate": 1.3373368920521945e-06, |
|
"loss": 0.2669, |
|
"step": 58448 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.8327401876449585, |
|
"eval_loss": 1.3079112768173218, |
|
"eval_runtime": 45.1126, |
|
"eval_samples_per_second": 12.458, |
|
"eval_steps_per_second": 12.458, |
|
"step": 58448 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 0.0030675730668008327, |
|
"learning_rate": 1.0041518386714117e-06, |
|
"loss": 0.2521, |
|
"step": 60696 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.8398576378822327, |
|
"eval_loss": 1.3250336647033691, |
|
"eval_runtime": 45.3922, |
|
"eval_samples_per_second": 12.381, |
|
"eval_steps_per_second": 12.381, |
|
"step": 60696 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.06381255388259888, |
|
"learning_rate": 6.709667852906288e-07, |
|
"loss": 0.2632, |
|
"step": 62944 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.8416370153427124, |
|
"eval_loss": 1.3356966972351074, |
|
"eval_runtime": 44.8744, |
|
"eval_samples_per_second": 12.524, |
|
"eval_steps_per_second": 12.524, |
|
"step": 62944 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"grad_norm": 0.0834624245762825, |
|
"learning_rate": 3.377817319098458e-07, |
|
"loss": 0.2655, |
|
"step": 65192 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.8434163928031921, |
|
"eval_loss": 1.2957476377487183, |
|
"eval_runtime": 44.9511, |
|
"eval_samples_per_second": 12.502, |
|
"eval_steps_per_second": 12.502, |
|
"step": 65192 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.026273438706994057, |
|
"learning_rate": 4.4483985765124555e-09, |
|
"loss": 0.2379, |
|
"step": 67440 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.8487544655799866, |
|
"eval_loss": 1.2928475141525269, |
|
"eval_runtime": 44.8607, |
|
"eval_samples_per_second": 12.528, |
|
"eval_steps_per_second": 12.528, |
|
"step": 67440 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"step": 67440, |
|
"total_flos": 6.148286618112e+18, |
|
"train_loss": 0.7011247130334024, |
|
"train_runtime": 11844.845, |
|
"train_samples_per_second": 5.694, |
|
"train_steps_per_second": 5.694 |
|
} |
|
], |
|
"logging_steps": 35, |
|
"max_steps": 67440, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.148286618112e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|