|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.986666666666667, |
|
"eval_steps": 500, |
|
"global_step": 168, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 7.053971290588379, |
|
"learning_rate": 4.97273712672844e-06, |
|
"loss": 0.5704, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 5.397775650024414, |
|
"learning_rate": 4.861084470200228e-06, |
|
"loss": 0.2394, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 2.6659767627716064, |
|
"learning_rate": 4.667009949002349e-06, |
|
"loss": 0.1046, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 3.866196870803833, |
|
"learning_rate": 4.397288409237892e-06, |
|
"loss": 0.0639, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 3.2750980854034424, |
|
"learning_rate": 4.061335419273658e-06, |
|
"loss": 0.0841, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 2.892080783843994, |
|
"learning_rate": 3.6708785865814186e-06, |
|
"loss": 0.1054, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.2444444444444445, |
|
"grad_norm": 3.0646233558654785, |
|
"learning_rate": 3.239548164813544e-06, |
|
"loss": 0.0709, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.4222222222222223, |
|
"grad_norm": 3.744271755218506, |
|
"learning_rate": 2.782401242396799e-06, |
|
"loss": 0.0684, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 3.9203922748565674, |
|
"learning_rate": 2.3153961224961665e-06, |
|
"loss": 0.082, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 2.8430514335632324, |
|
"learning_rate": 1.854835242944048e-06, |
|
"loss": 0.0505, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.9555555555555557, |
|
"grad_norm": 1.6261467933654785, |
|
"learning_rate": 1.4167960829520933e-06, |
|
"loss": 0.0843, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.1333333333333333, |
|
"grad_norm": 2.9942877292633057, |
|
"learning_rate": 1.0165699227860215e-06, |
|
"loss": 0.0642, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.311111111111111, |
|
"grad_norm": 1.675881266593933, |
|
"learning_rate": 6.681280484488576e-07, |
|
"loss": 0.0684, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.488888888888889, |
|
"grad_norm": 5.115616321563721, |
|
"learning_rate": 3.8363403535449846e-07, |
|
"loss": 0.0837, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 2.429882287979126, |
|
"learning_rate": 1.7301913642614382e-07, |
|
"loss": 0.0603, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.8444444444444446, |
|
"grad_norm": 3.372321128845215, |
|
"learning_rate": 4.3635597174694347e-08, |
|
"loss": 0.06, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.986666666666667, |
|
"step": 168, |
|
"total_flos": 1.5795369631678464e+16, |
|
"train_loss": 0.11316087664592833, |
|
"train_runtime": 355.6972, |
|
"train_samples_per_second": 7.591, |
|
"train_steps_per_second": 0.472 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 168, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 1.5795369631678464e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|