{ "best_metric": 1.6614694595336914, "best_model_checkpoint": "outputs/checkpoint-348", "epoch": 9.978494623655914, "eval_steps": 500, "global_step": 348, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5734767025089605, "grad_norm": 1.2473632097244263, "learning_rate": 2.0000000000000003e-06, "loss": 2.9249, "step": 20 }, { "epoch": 0.974910394265233, "eval_loss": 2.817430019378662, "eval_runtime": 11.5884, "eval_samples_per_second": 32.101, "eval_steps_per_second": 4.056, "step": 34 }, { "epoch": 1.146953405017921, "grad_norm": 1.330991268157959, "learning_rate": 4.000000000000001e-06, "loss": 2.8932, "step": 40 }, { "epoch": 1.7204301075268817, "grad_norm": 1.5549895763397217, "learning_rate": 6e-06, "loss": 2.8287, "step": 60 }, { "epoch": 1.978494623655914, "eval_loss": 2.5992982387542725, "eval_runtime": 11.5924, "eval_samples_per_second": 32.09, "eval_steps_per_second": 4.054, "step": 69 }, { "epoch": 2.293906810035842, "grad_norm": 1.9673638343811035, "learning_rate": 8.000000000000001e-06, "loss": 2.6659, "step": 80 }, { "epoch": 2.867383512544803, "grad_norm": 1.759926199913025, "learning_rate": 1e-05, "loss": 2.3145, "step": 100 }, { "epoch": 2.982078853046595, "eval_loss": 2.0121684074401855, "eval_runtime": 11.5907, "eval_samples_per_second": 32.095, "eval_steps_per_second": 4.055, "step": 104 }, { "epoch": 3.4408602150537635, "grad_norm": 0.7806220650672913, "learning_rate": 9.896320793787106e-06, "loss": 2.0158, "step": 120 }, { "epoch": 3.985663082437276, "eval_loss": 1.8575435876846313, "eval_runtime": 11.5927, "eval_samples_per_second": 32.089, "eval_steps_per_second": 4.054, "step": 139 }, { "epoch": 4.014336917562724, "grad_norm": 0.6744422316551208, "learning_rate": 9.589582926268798e-06, "loss": 1.9297, "step": 140 }, { "epoch": 4.587813620071684, "grad_norm": 0.636353075504303, "learning_rate": 9.092507332892968e-06, "loss": 1.8752, "step": 160 }, { "epoch": 4.989247311827957, "eval_loss": 1.7877445220947266, "eval_runtime": 11.5902, "eval_samples_per_second": 32.096, "eval_steps_per_second": 4.055, "step": 174 }, { "epoch": 5.161290322580645, "grad_norm": 0.6430570483207703, "learning_rate": 8.425708574839221e-06, "loss": 1.813, "step": 180 }, { "epoch": 5.734767025089606, "grad_norm": 0.6457915306091309, "learning_rate": 7.616839918483061e-06, "loss": 1.7878, "step": 200 }, { "epoch": 5.992831541218638, "eval_loss": 1.7403738498687744, "eval_runtime": 11.5876, "eval_samples_per_second": 32.103, "eval_steps_per_second": 4.056, "step": 209 }, { "epoch": 6.308243727598566, "grad_norm": 0.6910040974617004, "learning_rate": 6.699446507913083e-06, "loss": 1.7346, "step": 220 }, { "epoch": 6.881720430107527, "grad_norm": 0.673575758934021, "learning_rate": 5.711574191366427e-06, "loss": 1.7293, "step": 240 }, { "epoch": 6.996415770609319, "eval_loss": 1.705881118774414, "eval_runtime": 11.588, "eval_samples_per_second": 32.102, "eval_steps_per_second": 4.056, "step": 244 }, { "epoch": 7.455197132616488, "grad_norm": 0.6435667276382446, "learning_rate": 4.694191695890788e-06, "loss": 1.7172, "step": 260 }, { "epoch": 8.0, "eval_loss": 1.6822781562805176, "eval_runtime": 11.5832, "eval_samples_per_second": 32.115, "eval_steps_per_second": 4.058, "step": 279 }, { "epoch": 8.028673835125447, "grad_norm": 0.5970215201377869, "learning_rate": 3.689491585304491e-06, "loss": 1.6713, "step": 280 }, { "epoch": 8.602150537634408, "grad_norm": 0.734722375869751, "learning_rate": 2.7391404635865725e-06, "loss": 1.6777, "step": 300 }, { "epoch": 8.974910394265233, "eval_loss": 1.6685205698013306, "eval_runtime": 11.5892, "eval_samples_per_second": 32.099, "eval_steps_per_second": 4.056, "step": 313 }, { "epoch": 9.175627240143369, "grad_norm": 0.6750782132148743, "learning_rate": 1.8825509907063328e-06, "loss": 1.6782, "step": 320 }, { "epoch": 9.74910394265233, "grad_norm": 0.8320801854133606, "learning_rate": 1.1552473733031893e-06, "loss": 1.6629, "step": 340 }, { "epoch": 9.978494623655914, "eval_loss": 1.6614694595336914, "eval_runtime": 11.5856, "eval_samples_per_second": 32.109, "eval_steps_per_second": 4.057, "step": 348 } ], "logging_steps": 20, "max_steps": 408, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "total_flos": 1.835118238875648e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }