{
  "best_metric": 1.634413242340088,
  "best_model_checkpoint": "outputs/checkpoint-442",
  "epoch": 12.67383512544803,
  "eval_steps": 500,
  "global_step": 442,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.5734767025089605,
      "grad_norm": 1.2666417360305786,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 2.9248,
      "step": 20
    },
    {
      "epoch": 0.974910394265233,
      "eval_loss": 2.8157026767730713,
      "eval_runtime": 11.7687,
      "eval_samples_per_second": 31.609,
      "eval_steps_per_second": 3.994,
      "step": 34
    },
    {
      "epoch": 1.146953405017921,
      "grad_norm": 1.3574857711791992,
      "learning_rate": 4.000000000000001e-06,
      "loss": 2.8919,
      "step": 40
    },
    {
      "epoch": 1.7204301075268817,
      "grad_norm": 1.58558189868927,
      "learning_rate": 6e-06,
      "loss": 2.8246,
      "step": 60
    },
    {
      "epoch": 1.978494623655914,
      "eval_loss": 2.589261293411255,
      "eval_runtime": 11.7715,
      "eval_samples_per_second": 31.602,
      "eval_steps_per_second": 3.993,
      "step": 69
    },
    {
      "epoch": 2.293906810035842,
      "grad_norm": 2.0228383541107178,
      "learning_rate": 8.000000000000001e-06,
      "loss": 2.6556,
      "step": 80
    },
    {
      "epoch": 2.867383512544803,
      "grad_norm": 1.7711297273635864,
      "learning_rate": 1e-05,
      "loss": 2.2948,
      "step": 100
    },
    {
      "epoch": 2.982078853046595,
      "eval_loss": 1.9966150522232056,
      "eval_runtime": 11.7703,
      "eval_samples_per_second": 31.605,
      "eval_steps_per_second": 3.993,
      "step": 104
    },
    {
      "epoch": 3.4408602150537635,
      "grad_norm": 0.7906980514526367,
      "learning_rate": 9.915855517973776e-06,
      "loss": 2.0075,
      "step": 120
    },
    {
      "epoch": 3.985663082437276,
      "eval_loss": 1.8547258377075195,
      "eval_runtime": 11.7654,
      "eval_samples_per_second": 31.618,
      "eval_steps_per_second": 3.995,
      "step": 139
    },
    {
      "epoch": 4.014336917562724,
      "grad_norm": 0.6804280281066895,
      "learning_rate": 9.666254189437286e-06,
      "loss": 1.9264,
      "step": 140
    },
    {
      "epoch": 4.587813620071684,
      "grad_norm": 0.6464373469352722,
      "learning_rate": 9.259597044191635e-06,
      "loss": 1.8719,
      "step": 160
    },
    {
      "epoch": 4.989247311827957,
      "eval_loss": 1.7840723991394043,
      "eval_runtime": 11.7636,
      "eval_samples_per_second": 31.623,
      "eval_steps_per_second": 3.995,
      "step": 174
    },
    {
      "epoch": 5.161290322580645,
      "grad_norm": 0.6501721739768982,
      "learning_rate": 8.709571264176408e-06,
      "loss": 1.8091,
      "step": 180
    },
    {
      "epoch": 5.734767025089606,
      "grad_norm": 0.6568534970283508,
      "learning_rate": 8.034689503135785e-06,
      "loss": 1.7829,
      "step": 200
    },
    {
      "epoch": 5.992831541218638,
      "eval_loss": 1.7349460124969482,
      "eval_runtime": 11.7693,
      "eval_samples_per_second": 31.608,
      "eval_steps_per_second": 3.993,
      "step": 209
    },
    {
      "epoch": 6.308243727598566,
      "grad_norm": 0.701804518699646,
      "learning_rate": 7.257666791554448e-06,
      "loss": 1.7286,
      "step": 220
    },
    {
      "epoch": 6.881720430107527,
      "grad_norm": 0.6902477741241455,
      "learning_rate": 6.4046559988678485e-06,
      "loss": 1.7219,
      "step": 240
    },
    {
      "epoch": 6.996415770609319,
      "eval_loss": 1.697705626487732,
      "eval_runtime": 11.7645,
      "eval_samples_per_second": 31.621,
      "eval_steps_per_second": 3.995,
      "step": 244
    },
    {
      "epoch": 7.455197132616488,
      "grad_norm": 0.6586928963661194,
      "learning_rate": 5.504367585601342e-06,
      "loss": 1.7072,
      "step": 260
    },
    {
      "epoch": 8.0,
      "eval_loss": 1.670423984527588,
      "eval_runtime": 11.7671,
      "eval_samples_per_second": 31.614,
      "eval_steps_per_second": 3.994,
      "step": 279
    },
    {
      "epoch": 8.028673835125447,
      "grad_norm": 0.6181725263595581,
      "learning_rate": 4.587103272638339e-06,
      "loss": 1.6589,
      "step": 280
    },
    {
      "epoch": 8.602150537634408,
      "grad_norm": 0.7612842321395874,
      "learning_rate": 3.6837361521770056e-06,
      "loss": 1.6622,
      "step": 300
    },
    {
      "epoch": 8.974910394265233,
      "eval_loss": 1.6525731086730957,
      "eval_runtime": 11.7681,
      "eval_samples_per_second": 31.611,
      "eval_steps_per_second": 3.994,
      "step": 313
    },
    {
      "epoch": 9.175627240143369,
      "grad_norm": 0.7089374661445618,
      "learning_rate": 2.8246715675896354e-06,
      "loss": 1.6601,
      "step": 320
    },
    {
      "epoch": 9.74910394265233,
      "grad_norm": 0.8760582804679871,
      "learning_rate": 2.0388237366751005e-06,
      "loss": 1.6416,
      "step": 340
    },
    {
      "epoch": 9.978494623655914,
      "eval_loss": 1.6415046453475952,
      "eval_runtime": 11.7688,
      "eval_samples_per_second": 31.609,
      "eval_steps_per_second": 3.994,
      "step": 348
    },
    {
      "epoch": 10.32258064516129,
      "grad_norm": 0.7347344160079956,
      "learning_rate": 1.3526425629068968e-06,
      "loss": 1.6421,
      "step": 360
    },
    {
      "epoch": 10.89605734767025,
      "grad_norm": 0.7853028774261475,
      "learning_rate": 7.89223390062172e-07,
      "loss": 1.6276,
      "step": 380
    },
    {
      "epoch": 10.982078853046595,
      "eval_loss": 1.6362229585647583,
      "eval_runtime": 11.826,
      "eval_samples_per_second": 31.456,
      "eval_steps_per_second": 3.974,
      "step": 383
    },
    {
      "epoch": 11.469534050179211,
      "grad_norm": 0.7689109444618225,
      "learning_rate": 3.675296639259912e-07,
      "loss": 1.6334,
      "step": 400
    },
    {
      "epoch": 11.985663082437275,
      "eval_loss": 1.6345170736312866,
      "eval_runtime": 11.8501,
      "eval_samples_per_second": 31.392,
      "eval_steps_per_second": 3.966,
      "step": 418
    },
    {
      "epoch": 12.043010752688172,
      "grad_norm": 0.7731339335441589,
      "learning_rate": 1.0175466456213034e-07,
      "loss": 1.6217,
      "step": 420
    },
    {
      "epoch": 12.616487455197133,
      "grad_norm": 0.7765325307846069,
      "learning_rate": 8.437918333864537e-10,
      "loss": 1.6348,
      "step": 440
    },
    {
      "epoch": 12.67383512544803,
      "eval_loss": 1.634413242340088,
      "eval_runtime": 11.9143,
      "eval_samples_per_second": 31.223,
      "eval_steps_per_second": 3.945,
      "step": 442
    }
  ],
  "logging_steps": 20,
  "max_steps": 442,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 13,
  "save_steps": 500,
  "total_flos": 2.327705803685069e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}