{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 76,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013157894736842105,
      "grad_norm": 2.890625,
      "learning_rate": 2.5e-06,
      "loss": 3.1602,
      "step": 1
    },
    {
      "epoch": 0.06578947368421052,
      "grad_norm": 2.859375,
      "learning_rate": 1.25e-05,
      "loss": 3.2031,
      "step": 5
    },
    {
      "epoch": 0.13157894736842105,
      "grad_norm": 2.953125,
      "learning_rate": 1.9957341762950346e-05,
      "loss": 3.2461,
      "step": 10
    },
    {
      "epoch": 0.19736842105263158,
      "grad_norm": 2.46875,
      "learning_rate": 1.948160647590966e-05,
      "loss": 3.2055,
      "step": 15
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 2.4375,
      "learning_rate": 1.8502171357296144e-05,
      "loss": 3.168,
      "step": 20
    },
    {
      "epoch": 0.32894736842105265,
      "grad_norm": 2.234375,
      "learning_rate": 1.7071067811865477e-05,
      "loss": 3.15,
      "step": 25
    },
    {
      "epoch": 0.39473684210526316,
      "grad_norm": 2.140625,
      "learning_rate": 1.526432162877356e-05,
      "loss": 3.1352,
      "step": 30
    },
    {
      "epoch": 0.4605263157894737,
      "grad_norm": 2.046875,
      "learning_rate": 1.3177914195819018e-05,
      "loss": 3.1156,
      "step": 35
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 1.9296875,
      "learning_rate": 1.092268359463302e-05,
      "loss": 3.1227,
      "step": 40
    },
    {
      "epoch": 0.5921052631578947,
      "grad_norm": 1.890625,
      "learning_rate": 8.618436450481182e-06,
      "loss": 3.1273,
      "step": 45
    },
    {
      "epoch": 0.6578947368421053,
      "grad_norm": 2.015625,
      "learning_rate": 6.387583338128471e-06,
      "loss": 3.1133,
      "step": 50
    },
    {
      "epoch": 0.7236842105263158,
      "grad_norm": 1.9765625,
      "learning_rate": 4.348635855774082e-06,
      "loss": 3.125,
      "step": 55
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 2.03125,
      "learning_rate": 2.6099108277934105e-06,
      "loss": 3.1,
      "step": 60
    },
    {
      "epoch": 0.8552631578947368,
      "grad_norm": 2.09375,
      "learning_rate": 1.2637760935363053e-06,
      "loss": 3.1195,
      "step": 65
    },
    {
      "epoch": 0.9210526315789473,
      "grad_norm": 1.953125,
      "learning_rate": 3.817435682718096e-07,
      "loss": 3.1187,
      "step": 70
    },
    {
      "epoch": 0.9868421052631579,
      "grad_norm": 1.9609375,
      "learning_rate": 1.0670251976275803e-08,
      "loss": 3.1156,
      "step": 75
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.950244188308716,
      "eval_runtime": 862.1559,
      "eval_samples_per_second": 30.726,
      "eval_steps_per_second": 0.96,
      "step": 76
    },
    {
      "epoch": 1.0,
      "step": 76,
      "total_flos": 317731110912000.0,
      "train_loss": 3.1433490953947367,
      "train_runtime": 1149.309,
      "train_samples_per_second": 2.11,
      "train_steps_per_second": 0.066
    }
  ],
  "logging_steps": 5,
  "max_steps": 76,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 317731110912000.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}