{
  "best_metric": 0.9539479613304138,
  "best_model_checkpoint": "outputs/checkpoint-231",
  "epoch": 5.98110661268556,
  "eval_steps": 500,
  "global_step": 277,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.4318488529014845,
      "grad_norm": 0.7265238165855408,
      "learning_rate": 4e-05,
      "loss": 2.2901,
      "step": 20
    },
    {
      "epoch": 0.863697705802969,
      "grad_norm": 0.4902186989784241,
      "learning_rate": 8e-05,
      "loss": 2.0553,
      "step": 40
    },
    {
      "epoch": 0.9932523616734144,
      "eval_loss": 1.7082113027572632,
      "eval_runtime": 11.6752,
      "eval_samples_per_second": 31.862,
      "eval_steps_per_second": 4.026,
      "step": 46
    },
    {
      "epoch": 1.2955465587044535,
      "grad_norm": 0.5778974294662476,
      "learning_rate": 0.00012,
      "loss": 1.9034,
      "step": 60
    },
    {
      "epoch": 1.7273954116059378,
      "grad_norm": 0.818723201751709,
      "learning_rate": 0.00016,
      "loss": 1.6895,
      "step": 80
    },
    {
      "epoch": 1.9865047233468287,
      "eval_loss": 1.3229522705078125,
      "eval_runtime": 11.7014,
      "eval_samples_per_second": 31.791,
      "eval_steps_per_second": 4.017,
      "step": 92
    },
    {
      "epoch": 2.1592442645074224,
      "grad_norm": 1.2037190198898315,
      "learning_rate": 0.0002,
      "loss": 1.5763,
      "step": 100
    },
    {
      "epoch": 2.591093117408907,
      "grad_norm": 1.2296253442764282,
      "learning_rate": 0.00019978499773373596,
      "loss": 1.3997,
      "step": 120
    },
    {
      "epoch": 2.979757085020243,
      "eval_loss": 1.0936287641525269,
      "eval_runtime": 11.7036,
      "eval_samples_per_second": 31.785,
      "eval_steps_per_second": 4.016,
      "step": 138
    },
    {
      "epoch": 3.0229419703103915,
      "grad_norm": 1.3699569702148438,
      "learning_rate": 0.0001991409154544338,
      "loss": 1.3491,
      "step": 140
    },
    {
      "epoch": 3.454790823211876,
      "grad_norm": 1.451266884803772,
      "learning_rate": 0.00019807052274508773,
      "loss": 1.1626,
      "step": 160
    },
    {
      "epoch": 3.8866396761133606,
      "grad_norm": 1.3704801797866821,
      "learning_rate": 0.0001965784223428638,
      "loss": 1.1558,
      "step": 180
    },
    {
      "epoch": 3.9946018893387314,
      "eval_loss": 0.9916501641273499,
      "eval_runtime": 11.7248,
      "eval_samples_per_second": 31.728,
      "eval_steps_per_second": 4.009,
      "step": 185
    },
    {
      "epoch": 4.318488529014845,
      "grad_norm": 1.6158828735351562,
      "learning_rate": 0.0001946710303471214,
      "loss": 1.0048,
      "step": 200
    },
    {
      "epoch": 4.75033738191633,
      "grad_norm": 1.7279417514801025,
      "learning_rate": 0.00019235654862989537,
      "loss": 1.0293,
      "step": 220
    },
    {
      "epoch": 4.987854251012146,
      "eval_loss": 0.9539479613304138,
      "eval_runtime": 11.7041,
      "eval_samples_per_second": 31.784,
      "eval_steps_per_second": 4.016,
      "step": 231
    },
    {
      "epoch": 5.182186234817814,
      "grad_norm": 1.5809475183486938,
      "learning_rate": 0.00018964492956747425,
      "loss": 0.966,
      "step": 240
    },
    {
      "epoch": 5.614035087719298,
      "grad_norm": 1.6080421209335327,
      "learning_rate": 0.00018654783324473137,
      "loss": 0.8655,
      "step": 260
    },
    {
      "epoch": 5.98110661268556,
      "eval_loss": 0.979802131652832,
      "eval_runtime": 11.7244,
      "eval_samples_per_second": 31.729,
      "eval_steps_per_second": 4.009,
      "step": 277
    }
  ],
  "logging_steps": 20,
  "max_steps": 1058,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 23,
  "save_steps": 500,
  "total_flos": 2.462211106480128e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}