|
{ |
|
"best_metric": 1.0420600175857544, |
|
"best_model_checkpoint": "outputs/checkpoint-555", |
|
"epoch": 13.991902834008098, |
|
"eval_steps": 500, |
|
"global_step": 648, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.4318488529014845, |
|
"grad_norm": 0.9230825901031494, |
|
"learning_rate": 6e-06, |
|
"loss": 2.3403, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.863697705802969, |
|
"grad_norm": 0.656217098236084, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.2959, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.9932523616734144, |
|
"eval_loss": 2.0449142456054688, |
|
"eval_runtime": 11.6816, |
|
"eval_samples_per_second": 31.845, |
|
"eval_steps_per_second": 4.023, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.2955465587044535, |
|
"grad_norm": 0.5638908743858337, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.1979, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.7273954116059378, |
|
"grad_norm": 0.51393723487854, |
|
"learning_rate": 2.4e-05, |
|
"loss": 2.0118, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.9865047233468287, |
|
"eval_loss": 1.7561986446380615, |
|
"eval_runtime": 11.7086, |
|
"eval_samples_per_second": 31.772, |
|
"eval_steps_per_second": 4.014, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.1592442645074224, |
|
"grad_norm": 0.6104076504707336, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9541, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.591093117408907, |
|
"grad_norm": 0.5912227034568787, |
|
"learning_rate": 2.9981931843077588e-05, |
|
"loss": 1.849, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.979757085020243, |
|
"eval_loss": 1.5828752517700195, |
|
"eval_runtime": 11.7012, |
|
"eval_samples_per_second": 31.792, |
|
"eval_steps_per_second": 4.017, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 3.0229419703103915, |
|
"grad_norm": 0.9186816215515137, |
|
"learning_rate": 2.9927770900082956e-05, |
|
"loss": 1.8383, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.454790823211876, |
|
"grad_norm": 0.9306802153587341, |
|
"learning_rate": 2.9837647649471717e-05, |
|
"loss": 1.7307, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.8866396761133606, |
|
"grad_norm": 0.9665244817733765, |
|
"learning_rate": 2.9711779206048457e-05, |
|
"loss": 1.695, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.9946018893387314, |
|
"eval_loss": 1.4405076503753662, |
|
"eval_runtime": 11.7039, |
|
"eval_samples_per_second": 31.784, |
|
"eval_steps_per_second": 4.016, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 4.318488529014845, |
|
"grad_norm": 1.23567795753479, |
|
"learning_rate": 2.9550468797918162e-05, |
|
"loss": 1.6044, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.75033738191633, |
|
"grad_norm": 1.4878095388412476, |
|
"learning_rate": 2.9354105035983133e-05, |
|
"loss": 1.5957, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.987854251012146, |
|
"eval_loss": 1.32224702835083, |
|
"eval_runtime": 11.7081, |
|
"eval_samples_per_second": 31.773, |
|
"eval_steps_per_second": 4.014, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 5.182186234817814, |
|
"grad_norm": 1.3139092922210693, |
|
"learning_rate": 2.912316097774531e-05, |
|
"loss": 1.5493, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.614035087719298, |
|
"grad_norm": 1.5993897914886475, |
|
"learning_rate": 2.8858192987669303e-05, |
|
"loss": 1.4703, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.98110661268556, |
|
"eval_loss": 1.231151819229126, |
|
"eval_runtime": 11.7193, |
|
"eval_samples_per_second": 31.743, |
|
"eval_steps_per_second": 4.01, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 6.045883940620783, |
|
"grad_norm": 1.660462737083435, |
|
"learning_rate": 2.8559839396851653e-05, |
|
"loss": 1.4259, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 6.477732793522267, |
|
"grad_norm": 1.586065411567688, |
|
"learning_rate": 2.8228818965225325e-05, |
|
"loss": 1.334, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.909581646423752, |
|
"grad_norm": 1.869524359703064, |
|
"learning_rate": 2.7865929150004083e-05, |
|
"loss": 1.3783, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.995951417004049, |
|
"eval_loss": 1.1627566814422607, |
|
"eval_runtime": 11.7057, |
|
"eval_samples_per_second": 31.779, |
|
"eval_steps_per_second": 4.015, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 7.341430499325236, |
|
"grad_norm": 2.316075325012207, |
|
"learning_rate": 2.747204418453818e-05, |
|
"loss": 1.2803, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.77327935222672, |
|
"grad_norm": 2.061739921569824, |
|
"learning_rate": 2.7048112972209674e-05, |
|
"loss": 1.2944, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.989203778677463, |
|
"eval_loss": 1.1163532733917236, |
|
"eval_runtime": 11.7029, |
|
"eval_samples_per_second": 31.787, |
|
"eval_steps_per_second": 4.016, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"grad_norm": 2.3766236305236816, |
|
"learning_rate": 2.6595156800441055e-05, |
|
"loss": 1.2365, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 8.63697705802969, |
|
"grad_norm": 2.696465015411377, |
|
"learning_rate": 2.611426688032439e-05, |
|
"loss": 1.2014, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 8.982456140350877, |
|
"eval_loss": 1.0828282833099365, |
|
"eval_runtime": 11.7054, |
|
"eval_samples_per_second": 31.78, |
|
"eval_steps_per_second": 4.015, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 9.068825910931174, |
|
"grad_norm": 2.374746322631836, |
|
"learning_rate": 2.5606601717798212e-05, |
|
"loss": 1.1982, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 9.50067476383266, |
|
"grad_norm": 2.3256847858428955, |
|
"learning_rate": 2.5073384322705278e-05, |
|
"loss": 1.1262, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 9.932523616734143, |
|
"grad_norm": 2.200650930404663, |
|
"learning_rate": 2.4515899262454684e-05, |
|
"loss": 1.1541, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 9.997300944669366, |
|
"eval_loss": 1.0607314109802246, |
|
"eval_runtime": 11.7013, |
|
"eval_samples_per_second": 31.791, |
|
"eval_steps_per_second": 4.017, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 10.364372469635628, |
|
"grad_norm": 2.9810495376586914, |
|
"learning_rate": 2.3935489567386502e-05, |
|
"loss": 1.0843, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 10.796221322537113, |
|
"grad_norm": 2.9131200313568115, |
|
"learning_rate": 2.3333553495294033e-05, |
|
"loss": 1.0568, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 10.99055330634278, |
|
"eval_loss": 1.0553909540176392, |
|
"eval_runtime": 11.6966, |
|
"eval_samples_per_second": 31.804, |
|
"eval_steps_per_second": 4.018, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 11.228070175438596, |
|
"grad_norm": 2.6710944175720215, |
|
"learning_rate": 2.2711541162898324e-05, |
|
"loss": 1.0991, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 11.65991902834008, |
|
"grad_norm": 3.4023935794830322, |
|
"learning_rate": 2.207095105238997e-05, |
|
"loss": 1.0467, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 11.983805668016194, |
|
"eval_loss": 1.0420600175857544, |
|
"eval_runtime": 11.7007, |
|
"eval_samples_per_second": 31.793, |
|
"eval_steps_per_second": 4.017, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 12.091767881241566, |
|
"grad_norm": 2.8432018756866455, |
|
"learning_rate": 2.141332640145423e-05, |
|
"loss": 1.0121, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 12.523616734143049, |
|
"grad_norm": 2.7599430084228516, |
|
"learning_rate": 2.074025148547635e-05, |
|
"loss": 0.9882, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 12.955465587044534, |
|
"grad_norm": 3.7460949420928955, |
|
"learning_rate": 2.00533478008833e-05, |
|
"loss": 0.9781, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 12.998650472334683, |
|
"eval_loss": 1.0568532943725586, |
|
"eval_runtime": 11.6959, |
|
"eval_samples_per_second": 31.806, |
|
"eval_steps_per_second": 4.018, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 13.387314439946019, |
|
"grad_norm": 2.7911863327026367, |
|
"learning_rate": 1.9354270158816936e-05, |
|
"loss": 0.9352, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 13.819163292847504, |
|
"grad_norm": 3.0418496131896973, |
|
"learning_rate": 1.8644702698548963e-05, |
|
"loss": 0.9929, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 13.991902834008098, |
|
"eval_loss": 1.0488306283950806, |
|
"eval_runtime": 11.7173, |
|
"eval_samples_per_second": 31.748, |
|
"eval_steps_per_second": 4.011, |
|
"step": 648 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 1380, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"total_flos": 5.760316675783066e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|