|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.932862190812721, |
|
"eval_steps": 500, |
|
"global_step": 105, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1413427561837456, |
|
"grad_norm": 10.981925964355469, |
|
"learning_rate": 4.972077065562821e-05, |
|
"loss": 4.3219, |
|
"num_input_tokens_seen": 89360, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.2826855123674912, |
|
"grad_norm": 19.007112503051758, |
|
"learning_rate": 4.888932014465352e-05, |
|
"loss": 2.7795, |
|
"num_input_tokens_seen": 173680, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.42402826855123676, |
|
"grad_norm": 48.207359313964844, |
|
"learning_rate": 4.783863644106502e-05, |
|
"loss": 3.1788, |
|
"num_input_tokens_seen": 256320, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5653710247349824, |
|
"grad_norm": 21.095312118530273, |
|
"learning_rate": 4.606802396635098e-05, |
|
"loss": 2.8887, |
|
"num_input_tokens_seen": 354080, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.7067137809187279, |
|
"grad_norm": 28.128597259521484, |
|
"learning_rate": 4.382678665009028e-05, |
|
"loss": 2.5454, |
|
"num_input_tokens_seen": 445120, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8480565371024735, |
|
"grad_norm": 20.008413314819336, |
|
"learning_rate": 4.116499003039499e-05, |
|
"loss": 2.201, |
|
"num_input_tokens_seen": 536160, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9893992932862191, |
|
"grad_norm": 14.743931770324707, |
|
"learning_rate": 3.814209424526262e-05, |
|
"loss": 1.3345, |
|
"num_input_tokens_seen": 622160, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.1130742049469964, |
|
"grad_norm": 8.293540000915527, |
|
"learning_rate": 3.4825625791348096e-05, |
|
"loss": 0.9507, |
|
"num_input_tokens_seen": 702992, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.254416961130742, |
|
"grad_norm": 17.665626525878906, |
|
"learning_rate": 3.1289669093612714e-05, |
|
"loss": 0.6257, |
|
"num_input_tokens_seen": 795712, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.3957597173144876, |
|
"grad_norm": 33.63065719604492, |
|
"learning_rate": 2.761321158169134e-05, |
|
"loss": 1.9766, |
|
"num_input_tokens_seen": 880032, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.5371024734982333, |
|
"grad_norm": 23.377574920654297, |
|
"learning_rate": 2.3878379241237136e-05, |
|
"loss": 1.0377, |
|
"num_input_tokens_seen": 969392, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.6784452296819787, |
|
"grad_norm": 55.45833969116211, |
|
"learning_rate": 2.0168602055111173e-05, |
|
"loss": 1.5486, |
|
"num_input_tokens_seen": 1058752, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.8197879858657244, |
|
"grad_norm": 48.33625411987305, |
|
"learning_rate": 1.6566750315429254e-05, |
|
"loss": 0.7542, |
|
"num_input_tokens_seen": 1143072, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.96113074204947, |
|
"grad_norm": 33.805084228515625, |
|
"learning_rate": 1.3153283438175034e-05, |
|
"loss": 0.7561, |
|
"num_input_tokens_seen": 1234112, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.0848056537102475, |
|
"grad_norm": 40.988277435302734, |
|
"learning_rate": 1.0004452632802158e-05, |
|
"loss": 0.73, |
|
"num_input_tokens_seen": 1303184, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.2261484098939928, |
|
"grad_norm": 8.889869689941406, |
|
"learning_rate": 7.190597576216385e-06, |
|
"loss": 0.5124, |
|
"num_input_tokens_seen": 1385824, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.3674911660777385, |
|
"grad_norm": 17.493833541870117, |
|
"learning_rate": 4.7745751406263165e-06, |
|
"loss": 0.7801, |
|
"num_input_tokens_seen": 1476864, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.508833922261484, |
|
"grad_norm": 16.65656280517578, |
|
"learning_rate": 2.8103552748861476e-06, |
|
"loss": 0.3849, |
|
"num_input_tokens_seen": 1569584, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.65017667844523, |
|
"grad_norm": 21.752721786499023, |
|
"learning_rate": 1.3418154050208936e-06, |
|
"loss": 0.4539, |
|
"num_input_tokens_seen": 1657264, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.791519434628975, |
|
"grad_norm": 18.232982635498047, |
|
"learning_rate": 4.0176028503425835e-07, |
|
"loss": 1.1474, |
|
"num_input_tokens_seen": 1746624, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.932862190812721, |
|
"grad_norm": 17.093202590942383, |
|
"learning_rate": 1.1189192912416934e-08, |
|
"loss": 0.7139, |
|
"num_input_tokens_seen": 1841024, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.932862190812721, |
|
"num_input_tokens_seen": 1841024, |
|
"step": 105, |
|
"total_flos": 7451631889809408.0, |
|
"train_loss": 1.5058091197695052, |
|
"train_runtime": 2933.1034, |
|
"train_samples_per_second": 0.578, |
|
"train_steps_per_second": 0.036 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 105, |
|
"num_input_tokens_seen": 1841024, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7451631889809408.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|