|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9957446808510637,
  "eval_steps": 500,
  "global_step": 264,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11347517730496454,
      "grad_norm": 2.358112635127941,
      "learning_rate": 5e-06,
      "loss": 1.045,
      "step": 10
    },
    {
      "epoch": 0.22695035460992907,
      "grad_norm": 2.2025871190711044,
      "learning_rate": 5e-06,
      "loss": 0.939,
      "step": 20
    },
    {
      "epoch": 0.3404255319148936,
      "grad_norm": 1.183824809408691,
      "learning_rate": 5e-06,
      "loss": 0.8991,
      "step": 30
    },
    {
      "epoch": 0.45390070921985815,
      "grad_norm": 1.2551021476147783,
      "learning_rate": 5e-06,
      "loss": 0.8706,
      "step": 40
    },
    {
      "epoch": 0.5673758865248227,
      "grad_norm": 1.6978646589888085,
      "learning_rate": 5e-06,
      "loss": 0.8565,
      "step": 50
    },
    {
      "epoch": 0.6808510638297872,
      "grad_norm": 2.13778699873673,
      "learning_rate": 5e-06,
      "loss": 0.8386,
      "step": 60
    },
    {
      "epoch": 0.7943262411347518,
      "grad_norm": 0.993295806091264,
      "learning_rate": 5e-06,
      "loss": 0.8309,
      "step": 70
    },
    {
      "epoch": 0.9078014184397163,
      "grad_norm": 1.1398762974593635,
      "learning_rate": 5e-06,
      "loss": 0.823,
      "step": 80
    },
    {
      "epoch": 0.9985815602836879,
      "eval_loss": 0.8090236783027649,
      "eval_runtime": 63.3465,
      "eval_samples_per_second": 37.445,
      "eval_steps_per_second": 0.6,
      "step": 88
    },
    {
      "epoch": 1.0212765957446808,
      "grad_norm": 1.013500095467207,
      "learning_rate": 5e-06,
      "loss": 0.8742,
      "step": 90
    },
    {
      "epoch": 1.1347517730496455,
      "grad_norm": 1.056225323349834,
      "learning_rate": 5e-06,
      "loss": 0.7667,
      "step": 100
    },
    {
      "epoch": 1.24822695035461,
      "grad_norm": 0.7290196034792423,
      "learning_rate": 5e-06,
      "loss": 0.755,
      "step": 110
    },
    {
      "epoch": 1.3617021276595744,
      "grad_norm": 0.8838498260846974,
      "learning_rate": 5e-06,
      "loss": 0.7554,
      "step": 120
    },
    {
      "epoch": 1.475177304964539,
      "grad_norm": 0.821991213787815,
      "learning_rate": 5e-06,
      "loss": 0.7556,
      "step": 130
    },
    {
      "epoch": 1.5886524822695036,
      "grad_norm": 0.9855152726966359,
      "learning_rate": 5e-06,
      "loss": 0.7493,
      "step": 140
    },
    {
      "epoch": 1.702127659574468,
      "grad_norm": 0.6490086567527167,
      "learning_rate": 5e-06,
      "loss": 0.7477,
      "step": 150
    },
    {
      "epoch": 1.8156028368794326,
      "grad_norm": 1.0694149262660388,
      "learning_rate": 5e-06,
      "loss": 0.7414,
      "step": 160
    },
    {
      "epoch": 1.9290780141843973,
      "grad_norm": 0.9645011140855406,
      "learning_rate": 5e-06,
      "loss": 0.7481,
      "step": 170
    },
    {
      "epoch": 1.9971631205673759,
      "eval_loss": 0.7898643016815186,
      "eval_runtime": 62.2492,
      "eval_samples_per_second": 38.105,
      "eval_steps_per_second": 0.61,
      "step": 176
    },
    {
      "epoch": 2.0425531914893615,
      "grad_norm": 1.581859190270789,
      "learning_rate": 5e-06,
      "loss": 0.7818,
      "step": 180
    },
    {
      "epoch": 2.1560283687943262,
      "grad_norm": 1.0466470957786433,
      "learning_rate": 5e-06,
      "loss": 0.6863,
      "step": 190
    },
    {
      "epoch": 2.269503546099291,
      "grad_norm": 0.9663026123669691,
      "learning_rate": 5e-06,
      "loss": 0.6798,
      "step": 200
    },
    {
      "epoch": 2.382978723404255,
      "grad_norm": 0.8243226264574698,
      "learning_rate": 5e-06,
      "loss": 0.6826,
      "step": 210
    },
    {
      "epoch": 2.49645390070922,
      "grad_norm": 1.0907354136557872,
      "learning_rate": 5e-06,
      "loss": 0.6839,
      "step": 220
    },
    {
      "epoch": 2.6099290780141846,
      "grad_norm": 0.7996806357479502,
      "learning_rate": 5e-06,
      "loss": 0.687,
      "step": 230
    },
    {
      "epoch": 2.723404255319149,
      "grad_norm": 0.9108831837931511,
      "learning_rate": 5e-06,
      "loss": 0.6902,
      "step": 240
    },
    {
      "epoch": 2.8368794326241136,
      "grad_norm": 0.8473372600949097,
      "learning_rate": 5e-06,
      "loss": 0.6873,
      "step": 250
    },
    {
      "epoch": 2.950354609929078,
      "grad_norm": 0.8244777156304377,
      "learning_rate": 5e-06,
      "loss": 0.6866,
      "step": 260
    },
    {
      "epoch": 2.9957446808510637,
      "eval_loss": 0.7845782041549683,
      "eval_runtime": 58.8988,
      "eval_samples_per_second": 40.272,
      "eval_steps_per_second": 0.645,
      "step": 264
    },
    {
      "epoch": 2.9957446808510637,
      "step": 264,
      "total_flos": 442000453140480.0,
      "train_loss": 0.7778021304896383,
      "train_runtime": 8882.1716,
      "train_samples_per_second": 15.22,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 10,
  "max_steps": 264,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 442000453140480.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|