kurosekurose's picture
End of training
9c57240 verified
raw
history blame contribute delete
No virus
13.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 30.0,
"eval_steps": 500,
"global_step": 67440,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"grad_norm": 20.024471282958984,
"learning_rate": 9.66755634638197e-06,
"loss": 1.2057,
"step": 2248
},
{
"epoch": 1.0,
"eval_accuracy": 0.450177937746048,
"eval_loss": 1.4521995782852173,
"eval_runtime": 46.9769,
"eval_samples_per_second": 11.963,
"eval_steps_per_second": 11.963,
"step": 2248
},
{
"epoch": 2.0,
"grad_norm": 40.918148040771484,
"learning_rate": 9.334371293001186e-06,
"loss": 1.3873,
"step": 4496
},
{
"epoch": 2.0,
"eval_accuracy": 0.6423487663269043,
"eval_loss": 1.4503390789031982,
"eval_runtime": 47.9141,
"eval_samples_per_second": 11.729,
"eval_steps_per_second": 11.729,
"step": 4496
},
{
"epoch": 3.0,
"grad_norm": 14.9369478225708,
"learning_rate": 9.001334519572953e-06,
"loss": 1.4246,
"step": 6744
},
{
"epoch": 3.0,
"eval_accuracy": 0.6672598123550415,
"eval_loss": 1.6165008544921875,
"eval_runtime": 47.7427,
"eval_samples_per_second": 11.771,
"eval_steps_per_second": 11.771,
"step": 6744
},
{
"epoch": 4.0,
"grad_norm": 0.12427664548158646,
"learning_rate": 8.668149466192172e-06,
"loss": 1.3335,
"step": 8992
},
{
"epoch": 4.0,
"eval_accuracy": 0.7206405401229858,
"eval_loss": 1.478636384010315,
"eval_runtime": 47.5592,
"eval_samples_per_second": 11.817,
"eval_steps_per_second": 11.817,
"step": 8992
},
{
"epoch": 5.0,
"grad_norm": 0.1769200563430786,
"learning_rate": 8.334964412811389e-06,
"loss": 1.251,
"step": 11240
},
{
"epoch": 5.0,
"eval_accuracy": 0.6886121034622192,
"eval_loss": 1.6414002180099487,
"eval_runtime": 47.4426,
"eval_samples_per_second": 11.846,
"eval_steps_per_second": 11.846,
"step": 11240
},
{
"epoch": 6.0,
"grad_norm": 0.14434462785720825,
"learning_rate": 8.001779359430606e-06,
"loss": 1.1859,
"step": 13488
},
{
"epoch": 6.0,
"eval_accuracy": 0.754448413848877,
"eval_loss": 1.329976201057434,
"eval_runtime": 47.6847,
"eval_samples_per_second": 11.786,
"eval_steps_per_second": 11.786,
"step": 13488
},
{
"epoch": 7.0,
"grad_norm": 24.07697105407715,
"learning_rate": 7.668446026097273e-06,
"loss": 1.1132,
"step": 15736
},
{
"epoch": 7.0,
"eval_accuracy": 0.7508896589279175,
"eval_loss": 1.3665430545806885,
"eval_runtime": 47.6469,
"eval_samples_per_second": 11.795,
"eval_steps_per_second": 11.795,
"step": 15736
},
{
"epoch": 8.0,
"grad_norm": 3.64510178565979,
"learning_rate": 7.335112692763939e-06,
"loss": 1.0189,
"step": 17984
},
{
"epoch": 8.0,
"eval_accuracy": 0.7153024673461914,
"eval_loss": 1.6664679050445557,
"eval_runtime": 47.663,
"eval_samples_per_second": 11.791,
"eval_steps_per_second": 11.791,
"step": 17984
},
{
"epoch": 9.0,
"grad_norm": 0.2391408234834671,
"learning_rate": 7.001927639383156e-06,
"loss": 0.9807,
"step": 20232
},
{
"epoch": 9.0,
"eval_accuracy": 0.7793594598770142,
"eval_loss": 1.1174825429916382,
"eval_runtime": 47.784,
"eval_samples_per_second": 11.761,
"eval_steps_per_second": 11.761,
"step": 20232
},
{
"epoch": 10.0,
"grad_norm": 0.037992771714925766,
"learning_rate": 6.668890865954924e-06,
"loss": 0.8786,
"step": 22480
},
{
"epoch": 10.0,
"eval_accuracy": 0.7882562279701233,
"eval_loss": 1.1786267757415771,
"eval_runtime": 47.6281,
"eval_samples_per_second": 11.8,
"eval_steps_per_second": 11.8,
"step": 22480
},
{
"epoch": 11.0,
"grad_norm": 0.07834123820066452,
"learning_rate": 6.3355575326215905e-06,
"loss": 0.8677,
"step": 24728
},
{
"epoch": 11.0,
"eval_accuracy": 0.7811387777328491,
"eval_loss": 1.1294955015182495,
"eval_runtime": 47.9525,
"eval_samples_per_second": 11.72,
"eval_steps_per_second": 11.72,
"step": 24728
},
{
"epoch": 12.0,
"grad_norm": 0.0959361270070076,
"learning_rate": 6.0025207591933574e-06,
"loss": 0.7554,
"step": 26976
},
{
"epoch": 12.0,
"eval_accuracy": 0.8185053467750549,
"eval_loss": 1.1184855699539185,
"eval_runtime": 47.4663,
"eval_samples_per_second": 11.84,
"eval_steps_per_second": 11.84,
"step": 26976
},
{
"epoch": 13.0,
"grad_norm": 0.06532555818557739,
"learning_rate": 5.669335705812574e-06,
"loss": 0.7196,
"step": 29224
},
{
"epoch": 13.0,
"eval_accuracy": 0.7846975326538086,
"eval_loss": 1.4066756963729858,
"eval_runtime": 48.1233,
"eval_samples_per_second": 11.678,
"eval_steps_per_second": 11.678,
"step": 29224
},
{
"epoch": 14.0,
"grad_norm": 177.36297607421875,
"learning_rate": 5.336150652431792e-06,
"loss": 0.692,
"step": 31472
},
{
"epoch": 14.0,
"eval_accuracy": 0.8202847242355347,
"eval_loss": 1.1174949407577515,
"eval_runtime": 47.9299,
"eval_samples_per_second": 11.725,
"eval_steps_per_second": 11.725,
"step": 31472
},
{
"epoch": 15.0,
"grad_norm": 0.7878779768943787,
"learning_rate": 5.002817319098459e-06,
"loss": 0.6276,
"step": 33720
},
{
"epoch": 15.0,
"eval_accuracy": 0.7882562279701233,
"eval_loss": 1.4490171670913696,
"eval_runtime": 47.6011,
"eval_samples_per_second": 11.806,
"eval_steps_per_second": 11.806,
"step": 33720
},
{
"epoch": 16.0,
"grad_norm": 0.04724971204996109,
"learning_rate": 4.6696322657176755e-06,
"loss": 0.6083,
"step": 35968
},
{
"epoch": 16.0,
"eval_accuracy": 0.8345195651054382,
"eval_loss": 1.0982587337493896,
"eval_runtime": 47.8014,
"eval_samples_per_second": 11.757,
"eval_steps_per_second": 11.757,
"step": 35968
},
{
"epoch": 17.0,
"grad_norm": 0.3455263674259186,
"learning_rate": 4.336447212336892e-06,
"loss": 0.5204,
"step": 38216
},
{
"epoch": 17.0,
"eval_accuracy": 0.8256227970123291,
"eval_loss": 1.181368350982666,
"eval_runtime": 47.9669,
"eval_samples_per_second": 11.716,
"eval_steps_per_second": 11.716,
"step": 38216
},
{
"epoch": 18.0,
"grad_norm": 0.03767940774559975,
"learning_rate": 4.003262158956109e-06,
"loss": 0.5197,
"step": 40464
},
{
"epoch": 18.0,
"eval_accuracy": 0.8167259693145752,
"eval_loss": 1.2945315837860107,
"eval_runtime": 47.911,
"eval_samples_per_second": 11.73,
"eval_steps_per_second": 11.73,
"step": 40464
},
{
"epoch": 19.0,
"grad_norm": 365.2795104980469,
"learning_rate": 3.6700771055753265e-06,
"loss": 0.488,
"step": 42712
},
{
"epoch": 19.0,
"eval_accuracy": 0.8024911284446716,
"eval_loss": 1.4494409561157227,
"eval_runtime": 44.9229,
"eval_samples_per_second": 12.51,
"eval_steps_per_second": 12.51,
"step": 42712
},
{
"epoch": 20.0,
"grad_norm": 31.586769104003906,
"learning_rate": 3.3368920521945437e-06,
"loss": 0.4714,
"step": 44960
},
{
"epoch": 20.0,
"eval_accuracy": 0.8113878965377808,
"eval_loss": 1.3498995304107666,
"eval_runtime": 44.7889,
"eval_samples_per_second": 12.548,
"eval_steps_per_second": 12.548,
"step": 44960
},
{
"epoch": 21.0,
"grad_norm": 4.5781145095825195,
"learning_rate": 3.0035587188612105e-06,
"loss": 0.3641,
"step": 47208
},
{
"epoch": 21.0,
"eval_accuracy": 0.8380783200263977,
"eval_loss": 1.252496600151062,
"eval_runtime": 44.8551,
"eval_samples_per_second": 12.529,
"eval_steps_per_second": 12.529,
"step": 47208
},
{
"epoch": 22.0,
"grad_norm": 0.028242342174053192,
"learning_rate": 2.6703736654804273e-06,
"loss": 0.3877,
"step": 49456
},
{
"epoch": 22.0,
"eval_accuracy": 0.8380783200263977,
"eval_loss": 1.2610100507736206,
"eval_runtime": 45.2543,
"eval_samples_per_second": 12.419,
"eval_steps_per_second": 12.419,
"step": 49456
},
{
"epoch": 23.0,
"grad_norm": 0.02240828238427639,
"learning_rate": 2.337188612099644e-06,
"loss": 0.3253,
"step": 51704
},
{
"epoch": 23.0,
"eval_accuracy": 0.8274021148681641,
"eval_loss": 1.3913415670394897,
"eval_runtime": 45.0743,
"eval_samples_per_second": 12.468,
"eval_steps_per_second": 12.468,
"step": 51704
},
{
"epoch": 24.0,
"grad_norm": 776.1561279296875,
"learning_rate": 2.0040035587188614e-06,
"loss": 0.2978,
"step": 53952
},
{
"epoch": 24.0,
"eval_accuracy": 0.8416370153427124,
"eval_loss": 1.2989881038665771,
"eval_runtime": 44.7173,
"eval_samples_per_second": 12.568,
"eval_steps_per_second": 12.568,
"step": 53952
},
{
"epoch": 25.0,
"grad_norm": 118.77944946289062,
"learning_rate": 1.6706702253855281e-06,
"loss": 0.3238,
"step": 56200
},
{
"epoch": 25.0,
"eval_accuracy": 0.8274021148681641,
"eval_loss": 1.4328011274337769,
"eval_runtime": 45.0042,
"eval_samples_per_second": 12.488,
"eval_steps_per_second": 12.488,
"step": 56200
},
{
"epoch": 26.0,
"grad_norm": 0.006892771925777197,
"learning_rate": 1.3373368920521945e-06,
"loss": 0.2669,
"step": 58448
},
{
"epoch": 26.0,
"eval_accuracy": 0.8327401876449585,
"eval_loss": 1.3079112768173218,
"eval_runtime": 45.1126,
"eval_samples_per_second": 12.458,
"eval_steps_per_second": 12.458,
"step": 58448
},
{
"epoch": 27.0,
"grad_norm": 0.0030675730668008327,
"learning_rate": 1.0041518386714117e-06,
"loss": 0.2521,
"step": 60696
},
{
"epoch": 27.0,
"eval_accuracy": 0.8398576378822327,
"eval_loss": 1.3250336647033691,
"eval_runtime": 45.3922,
"eval_samples_per_second": 12.381,
"eval_steps_per_second": 12.381,
"step": 60696
},
{
"epoch": 28.0,
"grad_norm": 0.06381255388259888,
"learning_rate": 6.709667852906288e-07,
"loss": 0.2632,
"step": 62944
},
{
"epoch": 28.0,
"eval_accuracy": 0.8416370153427124,
"eval_loss": 1.3356966972351074,
"eval_runtime": 44.8744,
"eval_samples_per_second": 12.524,
"eval_steps_per_second": 12.524,
"step": 62944
},
{
"epoch": 29.0,
"grad_norm": 0.0834624245762825,
"learning_rate": 3.377817319098458e-07,
"loss": 0.2655,
"step": 65192
},
{
"epoch": 29.0,
"eval_accuracy": 0.8434163928031921,
"eval_loss": 1.2957476377487183,
"eval_runtime": 44.9511,
"eval_samples_per_second": 12.502,
"eval_steps_per_second": 12.502,
"step": 65192
},
{
"epoch": 30.0,
"grad_norm": 0.026273438706994057,
"learning_rate": 4.4483985765124555e-09,
"loss": 0.2379,
"step": 67440
},
{
"epoch": 30.0,
"eval_accuracy": 0.8487544655799866,
"eval_loss": 1.2928475141525269,
"eval_runtime": 44.8607,
"eval_samples_per_second": 12.528,
"eval_steps_per_second": 12.528,
"step": 67440
},
{
"epoch": 30.0,
"step": 67440,
"total_flos": 6.148286618112e+18,
"train_loss": 0.7011247130334024,
"train_runtime": 11844.845,
"train_samples_per_second": 5.694,
"train_steps_per_second": 5.694
}
],
"logging_steps": 35,
"max_steps": 67440,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.148286618112e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}