|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 500, |
|
"global_step": 320, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 71.75464630126953, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 3.61, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 45.94756317138672, |
|
"learning_rate": 1.4000000000000001e-06, |
|
"loss": 3.3547, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 29.31285858154297, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 2.7402, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 19.44002914428711, |
|
"learning_rate": 3.4000000000000005e-06, |
|
"loss": 2.12, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 13.159948348999023, |
|
"learning_rate": 4.4e-06, |
|
"loss": 1.5923, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 8.778341293334961, |
|
"learning_rate": 5.400000000000001e-06, |
|
"loss": 1.2181, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 5.181447982788086, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.9039, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 5.2995686531066895, |
|
"learning_rate": 7.4e-06, |
|
"loss": 0.6798, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_cer": 0.1788136402712771, |
|
"eval_loss": 0.5512658953666687, |
|
"eval_runtime": 167.9142, |
|
"eval_samples_per_second": 7.522, |
|
"eval_steps_per_second": 0.06, |
|
"eval_wer": 0.6854535695115406, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 3.869173288345337, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 0.5485, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 3.1130523681640625, |
|
"learning_rate": 9.4e-06, |
|
"loss": 0.4481, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 2.2851545810699463, |
|
"learning_rate": 9.81818181818182e-06, |
|
"loss": 0.4103, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.8455662727355957, |
|
"learning_rate": 9.363636363636365e-06, |
|
"loss": 0.37, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 2.575657367706299, |
|
"learning_rate": 8.90909090909091e-06, |
|
"loss": 0.3513, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.5928118228912354, |
|
"learning_rate": 8.454545454545455e-06, |
|
"loss": 0.3504, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 2.5617828369140625, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.324, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 3.5175580978393555, |
|
"learning_rate": 7.545454545454546e-06, |
|
"loss": 0.3095, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_cer": 0.09716305282261917, |
|
"eval_loss": 0.2984148859977722, |
|
"eval_runtime": 164.9586, |
|
"eval_samples_per_second": 7.656, |
|
"eval_steps_per_second": 0.061, |
|
"eval_wer": 0.4486312399355878, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 2.2015202045440674, |
|
"learning_rate": 7.0909090909090916e-06, |
|
"loss": 0.2926, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 3.0814766883850098, |
|
"learning_rate": 6.6363636363636375e-06, |
|
"loss": 0.2836, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 1.8548959493637085, |
|
"learning_rate": 6.181818181818182e-06, |
|
"loss": 0.2743, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.3138978481292725, |
|
"learning_rate": 5.727272727272728e-06, |
|
"loss": 0.2707, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 2.2470438480377197, |
|
"learning_rate": 5.272727272727273e-06, |
|
"loss": 0.2799, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 2.5589115619659424, |
|
"learning_rate": 4.818181818181819e-06, |
|
"loss": 0.2669, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 2.1574714183807373, |
|
"learning_rate": 4.363636363636364e-06, |
|
"loss": 0.2615, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 3.022969961166382, |
|
"learning_rate": 3.90909090909091e-06, |
|
"loss": 0.2673, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_cer": 0.0882032667876588, |
|
"eval_loss": 0.2675623595714569, |
|
"eval_runtime": 162.9283, |
|
"eval_samples_per_second": 7.752, |
|
"eval_steps_per_second": 0.061, |
|
"eval_wer": 0.4142780461621041, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 1.893800973892212, |
|
"learning_rate": 3.454545454545455e-06, |
|
"loss": 0.2601, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 2.2726192474365234, |
|
"learning_rate": 3e-06, |
|
"loss": 0.2575, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 1.9179538488388062, |
|
"learning_rate": 2.5454545454545456e-06, |
|
"loss": 0.2375, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.734628677368164, |
|
"learning_rate": 2.090909090909091e-06, |
|
"loss": 0.238, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 1.9225579500198364, |
|
"learning_rate": 1.6363636363636365e-06, |
|
"loss": 0.237, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 1.7918205261230469, |
|
"learning_rate": 1.181818181818182e-06, |
|
"loss": 0.2364, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 2.021205186843872, |
|
"learning_rate": 7.272727272727273e-07, |
|
"loss": 0.2422, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.901258707046509, |
|
"learning_rate": 2.7272727272727274e-07, |
|
"loss": 0.2428, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_cer": 0.08564332792052727, |
|
"eval_loss": 0.2611912190914154, |
|
"eval_runtime": 163.7635, |
|
"eval_samples_per_second": 7.712, |
|
"eval_steps_per_second": 0.061, |
|
"eval_wer": 0.40128824476650565, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 320, |
|
"total_flos": 2.6372074438656e+18, |
|
"train_loss": 0.7337436556816102, |
|
"train_runtime": 5396.4455, |
|
"train_samples_per_second": 7.535, |
|
"train_steps_per_second": 0.059 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 320, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.6372074438656e+18, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|