{ "best_metric": 1.6345170736312866, "best_model_checkpoint": "outputs/checkpoint-418", "epoch": 11.985663082437275, "eval_steps": 500, "global_step": 418, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5734767025089605, "grad_norm": 1.2666417360305786, "learning_rate": 2.0000000000000003e-06, "loss": 2.9248, "step": 20 }, { "epoch": 0.974910394265233, "eval_loss": 2.8157026767730713, "eval_runtime": 11.7687, "eval_samples_per_second": 31.609, "eval_steps_per_second": 3.994, "step": 34 }, { "epoch": 1.146953405017921, "grad_norm": 1.3574857711791992, "learning_rate": 4.000000000000001e-06, "loss": 2.8919, "step": 40 }, { "epoch": 1.7204301075268817, "grad_norm": 1.58558189868927, "learning_rate": 6e-06, "loss": 2.8246, "step": 60 }, { "epoch": 1.978494623655914, "eval_loss": 2.589261293411255, "eval_runtime": 11.7715, "eval_samples_per_second": 31.602, "eval_steps_per_second": 3.993, "step": 69 }, { "epoch": 2.293906810035842, "grad_norm": 2.0228383541107178, "learning_rate": 8.000000000000001e-06, "loss": 2.6556, "step": 80 }, { "epoch": 2.867383512544803, "grad_norm": 1.7711297273635864, "learning_rate": 1e-05, "loss": 2.2948, "step": 100 }, { "epoch": 2.982078853046595, "eval_loss": 1.9966150522232056, "eval_runtime": 11.7703, "eval_samples_per_second": 31.605, "eval_steps_per_second": 3.993, "step": 104 }, { "epoch": 3.4408602150537635, "grad_norm": 0.7906980514526367, "learning_rate": 9.915855517973776e-06, "loss": 2.0075, "step": 120 }, { "epoch": 3.985663082437276, "eval_loss": 1.8547258377075195, "eval_runtime": 11.7654, "eval_samples_per_second": 31.618, "eval_steps_per_second": 3.995, "step": 139 }, { "epoch": 4.014336917562724, "grad_norm": 0.6804280281066895, "learning_rate": 9.666254189437286e-06, "loss": 1.9264, "step": 140 }, { "epoch": 4.587813620071684, "grad_norm": 0.6464373469352722, "learning_rate": 9.259597044191635e-06, "loss": 1.8719, "step": 160 }, { "epoch": 4.989247311827957, "eval_loss": 1.7840723991394043, "eval_runtime": 11.7636, "eval_samples_per_second": 31.623, "eval_steps_per_second": 3.995, "step": 174 }, { "epoch": 5.161290322580645, "grad_norm": 0.6501721739768982, "learning_rate": 8.709571264176408e-06, "loss": 1.8091, "step": 180 }, { "epoch": 5.734767025089606, "grad_norm": 0.6568534970283508, "learning_rate": 8.034689503135785e-06, "loss": 1.7829, "step": 200 }, { "epoch": 5.992831541218638, "eval_loss": 1.7349460124969482, "eval_runtime": 11.7693, "eval_samples_per_second": 31.608, "eval_steps_per_second": 3.993, "step": 209 }, { "epoch": 6.308243727598566, "grad_norm": 0.701804518699646, "learning_rate": 7.257666791554448e-06, "loss": 1.7286, "step": 220 }, { "epoch": 6.881720430107527, "grad_norm": 0.6902477741241455, "learning_rate": 6.4046559988678485e-06, "loss": 1.7219, "step": 240 }, { "epoch": 6.996415770609319, "eval_loss": 1.697705626487732, "eval_runtime": 11.7645, "eval_samples_per_second": 31.621, "eval_steps_per_second": 3.995, "step": 244 }, { "epoch": 7.455197132616488, "grad_norm": 0.6586928963661194, "learning_rate": 5.504367585601342e-06, "loss": 1.7072, "step": 260 }, { "epoch": 8.0, "eval_loss": 1.670423984527588, "eval_runtime": 11.7671, "eval_samples_per_second": 31.614, "eval_steps_per_second": 3.994, "step": 279 }, { "epoch": 8.028673835125447, "grad_norm": 0.6181725263595581, "learning_rate": 4.587103272638339e-06, "loss": 1.6589, "step": 280 }, { "epoch": 8.602150537634408, "grad_norm": 0.7612842321395874, "learning_rate": 3.6837361521770056e-06, "loss": 1.6622, "step": 300 }, { "epoch": 8.974910394265233, "eval_loss": 1.6525731086730957, "eval_runtime": 11.7681, "eval_samples_per_second": 31.611, "eval_steps_per_second": 3.994, "step": 313 }, { "epoch": 9.175627240143369, "grad_norm": 0.7089374661445618, "learning_rate": 2.8246715675896354e-06, "loss": 1.6601, "step": 320 }, { "epoch": 9.74910394265233, "grad_norm": 0.8760582804679871, "learning_rate": 2.0388237366751005e-06, "loss": 1.6416, "step": 340 }, { "epoch": 9.978494623655914, "eval_loss": 1.6415046453475952, "eval_runtime": 11.7688, "eval_samples_per_second": 31.609, "eval_steps_per_second": 3.994, "step": 348 }, { "epoch": 10.32258064516129, "grad_norm": 0.7347344160079956, "learning_rate": 1.3526425629068968e-06, "loss": 1.6421, "step": 360 }, { "epoch": 10.89605734767025, "grad_norm": 0.7853028774261475, "learning_rate": 7.89223390062172e-07, "loss": 1.6276, "step": 380 }, { "epoch": 10.982078853046595, "eval_loss": 1.6362229585647583, "eval_runtime": 11.826, "eval_samples_per_second": 31.456, "eval_steps_per_second": 3.974, "step": 383 }, { "epoch": 11.469534050179211, "grad_norm": 0.7689109444618225, "learning_rate": 3.675296639259912e-07, "loss": 1.6334, "step": 400 }, { "epoch": 11.985663082437275, "eval_loss": 1.6345170736312866, "eval_runtime": 11.8501, "eval_samples_per_second": 31.392, "eval_steps_per_second": 3.966, "step": 418 } ], "logging_steps": 20, "max_steps": 442, "num_input_tokens_seen": 0, "num_train_epochs": 13, "save_steps": 500, "total_flos": 2.202789493191475e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }