{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.806032970887408,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11,
      "grad_norm": 0.5909375548362732,
      "learning_rate": 1.9932584269662923e-05,
      "loss": 2.0237,
      "step": 20
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.5826025009155273,
      "learning_rate": 1.9857677902621722e-05,
      "loss": 1.9306,
      "step": 40
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.5491089820861816,
      "learning_rate": 1.9782771535580525e-05,
      "loss": 1.7959,
      "step": 60
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.362810730934143,
      "learning_rate": 1.970786516853933e-05,
      "loss": 1.6599,
      "step": 80
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.4427486658096313,
      "learning_rate": 1.963295880149813e-05,
      "loss": 1.5685,
      "step": 100
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.9993659257888794,
      "learning_rate": 1.956179775280899e-05,
      "loss": 1.4621,
      "step": 120
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.614562749862671,
      "learning_rate": 1.9486891385767793e-05,
      "loss": 1.31,
      "step": 140
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.1975798606872559,
      "learning_rate": 1.9411985018726593e-05,
      "loss": 1.2322,
      "step": 160
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.7684128880500793,
      "learning_rate": 1.9337078651685396e-05,
      "loss": 1.1361,
      "step": 180
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.9336960911750793,
      "learning_rate": 1.9262172284644195e-05,
      "loss": 1.0797,
      "step": 200
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.8471770882606506,
      "learning_rate": 1.9187265917603e-05,
      "loss": 1.0368,
      "step": 220
    },
    {
      "epoch": 1.35,
      "grad_norm": 1.111340045928955,
      "learning_rate": 1.9112359550561798e-05,
      "loss": 0.9738,
      "step": 240
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.8093781471252441,
      "learning_rate": 1.90374531835206e-05,
      "loss": 0.9494,
      "step": 260
    },
    {
      "epoch": 1.57,
      "grad_norm": 0.8438062071800232,
      "learning_rate": 1.89625468164794e-05,
      "loss": 0.9276,
      "step": 280
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.9896701574325562,
      "learning_rate": 1.8887640449438204e-05,
      "loss": 0.8656,
      "step": 300
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.8278244137763977,
      "learning_rate": 1.8812734082397007e-05,
      "loss": 0.8431,
      "step": 320
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.931291937828064,
      "learning_rate": 1.8737827715355807e-05,
      "loss": 0.7945,
      "step": 340
    },
    {
      "epoch": 2.02,
      "grad_norm": 1.21769380569458,
      "learning_rate": 1.866292134831461e-05,
      "loss": 0.7647,
      "step": 360
    },
    {
      "epoch": 2.13,
      "grad_norm": 3.5183286666870117,
      "learning_rate": 1.858801498127341e-05,
      "loss": 0.7497,
      "step": 380
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.1153030395507812,
      "learning_rate": 1.8513108614232212e-05,
      "loss": 0.7507,
      "step": 400
    },
    {
      "epoch": 2.36,
      "grad_norm": 1.0140526294708252,
      "learning_rate": 1.8438202247191012e-05,
      "loss": 0.7415,
      "step": 420
    },
    {
      "epoch": 2.47,
      "grad_norm": 1.4395232200622559,
      "learning_rate": 1.8363295880149815e-05,
      "loss": 0.6947,
      "step": 440
    },
    {
      "epoch": 2.58,
      "grad_norm": 1.4253089427947998,
      "learning_rate": 1.8288389513108615e-05,
      "loss": 0.7429,
      "step": 460
    },
    {
      "epoch": 2.69,
      "grad_norm": 1.3152351379394531,
      "learning_rate": 1.8213483146067418e-05,
      "loss": 0.7363,
      "step": 480
    },
    {
      "epoch": 2.81,
      "grad_norm": 2.5935957431793213,
      "learning_rate": 1.8138576779026217e-05,
      "loss": 0.6486,
      "step": 500
    }
  ],
  "logging_steps": 20,
  "max_steps": 5340,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 30,
  "save_steps": 500,
  "total_flos": 1.2995638935552e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}