{ "best_metric": null, "best_model_checkpoint": null, "epoch": 30.0, "eval_steps": 500, "global_step": 67440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 20.024471282958984, "learning_rate": 9.66755634638197e-06, "loss": 1.2057, "step": 2248 }, { "epoch": 1.0, "eval_accuracy": 0.450177937746048, "eval_loss": 1.4521995782852173, "eval_runtime": 46.9769, "eval_samples_per_second": 11.963, "eval_steps_per_second": 11.963, "step": 2248 }, { "epoch": 2.0, "grad_norm": 40.918148040771484, "learning_rate": 9.334371293001186e-06, "loss": 1.3873, "step": 4496 }, { "epoch": 2.0, "eval_accuracy": 0.6423487663269043, "eval_loss": 1.4503390789031982, "eval_runtime": 47.9141, "eval_samples_per_second": 11.729, "eval_steps_per_second": 11.729, "step": 4496 }, { "epoch": 3.0, "grad_norm": 14.9369478225708, "learning_rate": 9.001334519572953e-06, "loss": 1.4246, "step": 6744 }, { "epoch": 3.0, "eval_accuracy": 0.6672598123550415, "eval_loss": 1.6165008544921875, "eval_runtime": 47.7427, "eval_samples_per_second": 11.771, "eval_steps_per_second": 11.771, "step": 6744 }, { "epoch": 4.0, "grad_norm": 0.12427664548158646, "learning_rate": 8.668149466192172e-06, "loss": 1.3335, "step": 8992 }, { "epoch": 4.0, "eval_accuracy": 0.7206405401229858, "eval_loss": 1.478636384010315, "eval_runtime": 47.5592, "eval_samples_per_second": 11.817, "eval_steps_per_second": 11.817, "step": 8992 }, { "epoch": 5.0, "grad_norm": 0.1769200563430786, "learning_rate": 8.334964412811389e-06, "loss": 1.251, "step": 11240 }, { "epoch": 5.0, "eval_accuracy": 0.6886121034622192, "eval_loss": 1.6414002180099487, "eval_runtime": 47.4426, "eval_samples_per_second": 11.846, "eval_steps_per_second": 11.846, "step": 11240 }, { "epoch": 6.0, "grad_norm": 0.14434462785720825, "learning_rate": 8.001779359430606e-06, "loss": 1.1859, "step": 13488 }, { "epoch": 6.0, "eval_accuracy": 0.754448413848877, "eval_loss": 1.329976201057434, "eval_runtime": 47.6847, "eval_samples_per_second": 11.786, "eval_steps_per_second": 11.786, "step": 13488 }, { "epoch": 7.0, "grad_norm": 24.07697105407715, "learning_rate": 7.668446026097273e-06, "loss": 1.1132, "step": 15736 }, { "epoch": 7.0, "eval_accuracy": 0.7508896589279175, "eval_loss": 1.3665430545806885, "eval_runtime": 47.6469, "eval_samples_per_second": 11.795, "eval_steps_per_second": 11.795, "step": 15736 }, { "epoch": 8.0, "grad_norm": 3.64510178565979, "learning_rate": 7.335112692763939e-06, "loss": 1.0189, "step": 17984 }, { "epoch": 8.0, "eval_accuracy": 0.7153024673461914, "eval_loss": 1.6664679050445557, "eval_runtime": 47.663, "eval_samples_per_second": 11.791, "eval_steps_per_second": 11.791, "step": 17984 }, { "epoch": 9.0, "grad_norm": 0.2391408234834671, "learning_rate": 7.001927639383156e-06, "loss": 0.9807, "step": 20232 }, { "epoch": 9.0, "eval_accuracy": 0.7793594598770142, "eval_loss": 1.1174825429916382, "eval_runtime": 47.784, "eval_samples_per_second": 11.761, "eval_steps_per_second": 11.761, "step": 20232 }, { "epoch": 10.0, "grad_norm": 0.037992771714925766, "learning_rate": 6.668890865954924e-06, "loss": 0.8786, "step": 22480 }, { "epoch": 10.0, "eval_accuracy": 0.7882562279701233, "eval_loss": 1.1786267757415771, "eval_runtime": 47.6281, "eval_samples_per_second": 11.8, "eval_steps_per_second": 11.8, "step": 22480 }, { "epoch": 11.0, "grad_norm": 0.07834123820066452, "learning_rate": 6.3355575326215905e-06, "loss": 0.8677, "step": 24728 }, { "epoch": 11.0, "eval_accuracy": 0.7811387777328491, "eval_loss": 1.1294955015182495, "eval_runtime": 47.9525, "eval_samples_per_second": 11.72, "eval_steps_per_second": 11.72, "step": 24728 }, { "epoch": 12.0, "grad_norm": 0.0959361270070076, "learning_rate": 6.0025207591933574e-06, "loss": 0.7554, "step": 26976 }, { "epoch": 12.0, "eval_accuracy": 0.8185053467750549, "eval_loss": 1.1184855699539185, "eval_runtime": 47.4663, "eval_samples_per_second": 11.84, "eval_steps_per_second": 11.84, "step": 26976 }, { "epoch": 13.0, "grad_norm": 0.06532555818557739, "learning_rate": 5.669335705812574e-06, "loss": 0.7196, "step": 29224 }, { "epoch": 13.0, "eval_accuracy": 0.7846975326538086, "eval_loss": 1.4066756963729858, "eval_runtime": 48.1233, "eval_samples_per_second": 11.678, "eval_steps_per_second": 11.678, "step": 29224 }, { "epoch": 14.0, "grad_norm": 177.36297607421875, "learning_rate": 5.336150652431792e-06, "loss": 0.692, "step": 31472 }, { "epoch": 14.0, "eval_accuracy": 0.8202847242355347, "eval_loss": 1.1174949407577515, "eval_runtime": 47.9299, "eval_samples_per_second": 11.725, "eval_steps_per_second": 11.725, "step": 31472 }, { "epoch": 15.0, "grad_norm": 0.7878779768943787, "learning_rate": 5.002817319098459e-06, "loss": 0.6276, "step": 33720 }, { "epoch": 15.0, "eval_accuracy": 0.7882562279701233, "eval_loss": 1.4490171670913696, "eval_runtime": 47.6011, "eval_samples_per_second": 11.806, "eval_steps_per_second": 11.806, "step": 33720 }, { "epoch": 16.0, "grad_norm": 0.04724971204996109, "learning_rate": 4.6696322657176755e-06, "loss": 0.6083, "step": 35968 }, { "epoch": 16.0, "eval_accuracy": 0.8345195651054382, "eval_loss": 1.0982587337493896, "eval_runtime": 47.8014, "eval_samples_per_second": 11.757, "eval_steps_per_second": 11.757, "step": 35968 }, { "epoch": 17.0, "grad_norm": 0.3455263674259186, "learning_rate": 4.336447212336892e-06, "loss": 0.5204, "step": 38216 }, { "epoch": 17.0, "eval_accuracy": 0.8256227970123291, "eval_loss": 1.181368350982666, "eval_runtime": 47.9669, "eval_samples_per_second": 11.716, "eval_steps_per_second": 11.716, "step": 38216 }, { "epoch": 18.0, "grad_norm": 0.03767940774559975, "learning_rate": 4.003262158956109e-06, "loss": 0.5197, "step": 40464 }, { "epoch": 18.0, "eval_accuracy": 0.8167259693145752, "eval_loss": 1.2945315837860107, "eval_runtime": 47.911, "eval_samples_per_second": 11.73, "eval_steps_per_second": 11.73, "step": 40464 }, { "epoch": 19.0, "grad_norm": 365.2795104980469, "learning_rate": 3.6700771055753265e-06, "loss": 0.488, "step": 42712 }, { "epoch": 19.0, "eval_accuracy": 0.8024911284446716, "eval_loss": 1.4494409561157227, "eval_runtime": 44.9229, "eval_samples_per_second": 12.51, "eval_steps_per_second": 12.51, "step": 42712 }, { "epoch": 20.0, "grad_norm": 31.586769104003906, "learning_rate": 3.3368920521945437e-06, "loss": 0.4714, "step": 44960 }, { "epoch": 20.0, "eval_accuracy": 0.8113878965377808, "eval_loss": 1.3498995304107666, "eval_runtime": 44.7889, "eval_samples_per_second": 12.548, "eval_steps_per_second": 12.548, "step": 44960 }, { "epoch": 21.0, "grad_norm": 4.5781145095825195, "learning_rate": 3.0035587188612105e-06, "loss": 0.3641, "step": 47208 }, { "epoch": 21.0, "eval_accuracy": 0.8380783200263977, "eval_loss": 1.252496600151062, "eval_runtime": 44.8551, "eval_samples_per_second": 12.529, "eval_steps_per_second": 12.529, "step": 47208 }, { "epoch": 22.0, "grad_norm": 0.028242342174053192, "learning_rate": 2.6703736654804273e-06, "loss": 0.3877, "step": 49456 }, { "epoch": 22.0, "eval_accuracy": 0.8380783200263977, "eval_loss": 1.2610100507736206, "eval_runtime": 45.2543, "eval_samples_per_second": 12.419, "eval_steps_per_second": 12.419, "step": 49456 }, { "epoch": 23.0, "grad_norm": 0.02240828238427639, "learning_rate": 2.337188612099644e-06, "loss": 0.3253, "step": 51704 }, { "epoch": 23.0, "eval_accuracy": 0.8274021148681641, "eval_loss": 1.3913415670394897, "eval_runtime": 45.0743, "eval_samples_per_second": 12.468, "eval_steps_per_second": 12.468, "step": 51704 }, { "epoch": 24.0, "grad_norm": 776.1561279296875, "learning_rate": 2.0040035587188614e-06, "loss": 0.2978, "step": 53952 }, { "epoch": 24.0, "eval_accuracy": 0.8416370153427124, "eval_loss": 1.2989881038665771, "eval_runtime": 44.7173, "eval_samples_per_second": 12.568, "eval_steps_per_second": 12.568, "step": 53952 }, { "epoch": 25.0, "grad_norm": 118.77944946289062, "learning_rate": 1.6706702253855281e-06, "loss": 0.3238, "step": 56200 }, { "epoch": 25.0, "eval_accuracy": 0.8274021148681641, "eval_loss": 1.4328011274337769, "eval_runtime": 45.0042, "eval_samples_per_second": 12.488, "eval_steps_per_second": 12.488, "step": 56200 }, { "epoch": 26.0, "grad_norm": 0.006892771925777197, "learning_rate": 1.3373368920521945e-06, "loss": 0.2669, "step": 58448 }, { "epoch": 26.0, "eval_accuracy": 0.8327401876449585, "eval_loss": 1.3079112768173218, "eval_runtime": 45.1126, "eval_samples_per_second": 12.458, "eval_steps_per_second": 12.458, "step": 58448 }, { "epoch": 27.0, "grad_norm": 0.0030675730668008327, "learning_rate": 1.0041518386714117e-06, "loss": 0.2521, "step": 60696 }, { "epoch": 27.0, "eval_accuracy": 0.8398576378822327, "eval_loss": 1.3250336647033691, "eval_runtime": 45.3922, "eval_samples_per_second": 12.381, "eval_steps_per_second": 12.381, "step": 60696 }, { "epoch": 28.0, "grad_norm": 0.06381255388259888, "learning_rate": 6.709667852906288e-07, "loss": 0.2632, "step": 62944 }, { "epoch": 28.0, "eval_accuracy": 0.8416370153427124, "eval_loss": 1.3356966972351074, "eval_runtime": 44.8744, "eval_samples_per_second": 12.524, "eval_steps_per_second": 12.524, "step": 62944 }, { "epoch": 29.0, "grad_norm": 0.0834624245762825, "learning_rate": 3.377817319098458e-07, "loss": 0.2655, "step": 65192 }, { "epoch": 29.0, "eval_accuracy": 0.8434163928031921, "eval_loss": 1.2957476377487183, "eval_runtime": 44.9511, "eval_samples_per_second": 12.502, "eval_steps_per_second": 12.502, "step": 65192 }, { "epoch": 30.0, "grad_norm": 0.026273438706994057, "learning_rate": 4.4483985765124555e-09, "loss": 0.2379, "step": 67440 }, { "epoch": 30.0, "eval_accuracy": 0.8487544655799866, "eval_loss": 1.2928475141525269, "eval_runtime": 44.8607, "eval_samples_per_second": 12.528, "eval_steps_per_second": 12.528, "step": 67440 }, { "epoch": 30.0, "step": 67440, "total_flos": 6.148286618112e+18, "train_loss": 0.7011247130334024, "train_runtime": 11844.845, "train_samples_per_second": 5.694, "train_steps_per_second": 5.694 } ], "logging_steps": 35, "max_steps": 67440, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.148286618112e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }