|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 30.0, |
|
"eval_steps": 500, |
|
"global_step": 24210, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 6.501698017120361, |
|
"learning_rate": 9.66914498141264e-06, |
|
"loss": 1.2014, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.3661971688270569, |
|
"eval_loss": 1.183009147644043, |
|
"eval_runtime": 5.7015, |
|
"eval_samples_per_second": 12.453, |
|
"eval_steps_per_second": 12.453, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 11.675583839416504, |
|
"learning_rate": 9.335811648079307e-06, |
|
"loss": 1.0915, |
|
"step": 1614 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.3239436745643616, |
|
"eval_loss": 1.5119783878326416, |
|
"eval_runtime": 5.7636, |
|
"eval_samples_per_second": 12.319, |
|
"eval_steps_per_second": 12.319, |
|
"step": 1614 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 16.97098731994629, |
|
"learning_rate": 9.002478314745973e-06, |
|
"loss": 1.1433, |
|
"step": 2421 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.4084506928920746, |
|
"eval_loss": 1.5698989629745483, |
|
"eval_runtime": 5.8045, |
|
"eval_samples_per_second": 12.232, |
|
"eval_steps_per_second": 12.232, |
|
"step": 2421 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 127.35404205322266, |
|
"learning_rate": 8.669558033870302e-06, |
|
"loss": 1.2819, |
|
"step": 3228 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.47887325286865234, |
|
"eval_loss": 1.737151861190796, |
|
"eval_runtime": 5.8167, |
|
"eval_samples_per_second": 12.206, |
|
"eval_steps_per_second": 12.206, |
|
"step": 3228 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.40140867233276367, |
|
"learning_rate": 8.336224700536968e-06, |
|
"loss": 1.2718, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.4647887349128723, |
|
"eval_loss": 2.216925859451294, |
|
"eval_runtime": 5.8112, |
|
"eval_samples_per_second": 12.218, |
|
"eval_steps_per_second": 12.218, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 30.236343383789062, |
|
"learning_rate": 8.003304419661297e-06, |
|
"loss": 1.4535, |
|
"step": 4842 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.577464759349823, |
|
"eval_loss": 1.7295811176300049, |
|
"eval_runtime": 5.8523, |
|
"eval_samples_per_second": 12.132, |
|
"eval_steps_per_second": 12.132, |
|
"step": 4842 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.2374914586544037, |
|
"learning_rate": 7.670384138785627e-06, |
|
"loss": 1.3433, |
|
"step": 5649 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.5492957830429077, |
|
"eval_loss": 2.268446683883667, |
|
"eval_runtime": 5.8095, |
|
"eval_samples_per_second": 12.221, |
|
"eval_steps_per_second": 12.221, |
|
"step": 5649 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 32.27313232421875, |
|
"learning_rate": 7.3370508054522925e-06, |
|
"loss": 1.4086, |
|
"step": 6456 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.6478873491287231, |
|
"eval_loss": 1.8598684072494507, |
|
"eval_runtime": 5.7903, |
|
"eval_samples_per_second": 12.262, |
|
"eval_steps_per_second": 12.262, |
|
"step": 6456 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 42.521121978759766, |
|
"learning_rate": 7.003717472118959e-06, |
|
"loss": 1.3923, |
|
"step": 7263 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.6197183132171631, |
|
"eval_loss": 1.9419935941696167, |
|
"eval_runtime": 5.8464, |
|
"eval_samples_per_second": 12.144, |
|
"eval_steps_per_second": 12.144, |
|
"step": 7263 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.37923476099967957, |
|
"learning_rate": 6.670384138785626e-06, |
|
"loss": 1.3353, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.577464759349823, |
|
"eval_loss": 2.2150189876556396, |
|
"eval_runtime": 5.8548, |
|
"eval_samples_per_second": 12.127, |
|
"eval_steps_per_second": 12.127, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 19.937904357910156, |
|
"learning_rate": 6.337463857909954e-06, |
|
"loss": 1.367, |
|
"step": 8877 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.6338028311729431, |
|
"eval_loss": 1.9825525283813477, |
|
"eval_runtime": 5.9069, |
|
"eval_samples_per_second": 12.02, |
|
"eval_steps_per_second": 12.02, |
|
"step": 8877 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 454.2120666503906, |
|
"learning_rate": 6.004543577034284e-06, |
|
"loss": 1.1848, |
|
"step": 9684 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.6478873491287231, |
|
"eval_loss": 1.9545217752456665, |
|
"eval_runtime": 5.8618, |
|
"eval_samples_per_second": 12.112, |
|
"eval_steps_per_second": 12.112, |
|
"step": 9684 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 0.09417306631803513, |
|
"learning_rate": 5.671210243700951e-06, |
|
"loss": 1.1355, |
|
"step": 10491 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.6619718074798584, |
|
"eval_loss": 1.9863765239715576, |
|
"eval_runtime": 5.8578, |
|
"eval_samples_per_second": 12.12, |
|
"eval_steps_per_second": 12.12, |
|
"step": 10491 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 33.132957458496094, |
|
"learning_rate": 5.337876910367618e-06, |
|
"loss": 1.1549, |
|
"step": 11298 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.6338028311729431, |
|
"eval_loss": 1.9428231716156006, |
|
"eval_runtime": 5.8665, |
|
"eval_samples_per_second": 12.103, |
|
"eval_steps_per_second": 12.103, |
|
"step": 11298 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.12686021625995636, |
|
"learning_rate": 5.004543577034285e-06, |
|
"loss": 1.0505, |
|
"step": 12105 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.6901408433914185, |
|
"eval_loss": 1.9100552797317505, |
|
"eval_runtime": 5.8853, |
|
"eval_samples_per_second": 12.064, |
|
"eval_steps_per_second": 12.064, |
|
"step": 12105 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 1043.1195068359375, |
|
"learning_rate": 4.6712102437009505e-06, |
|
"loss": 1.0442, |
|
"step": 12912 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.6478873491287231, |
|
"eval_loss": 2.1706087589263916, |
|
"eval_runtime": 5.8878, |
|
"eval_samples_per_second": 12.059, |
|
"eval_steps_per_second": 12.059, |
|
"step": 12912 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 26.500118255615234, |
|
"learning_rate": 4.338289962825279e-06, |
|
"loss": 0.9922, |
|
"step": 13719 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.6197183132171631, |
|
"eval_loss": 2.462040901184082, |
|
"eval_runtime": 5.8428, |
|
"eval_samples_per_second": 12.152, |
|
"eval_steps_per_second": 12.152, |
|
"step": 13719 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 162.029052734375, |
|
"learning_rate": 4.004956629491946e-06, |
|
"loss": 0.8698, |
|
"step": 14526 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.6619718074798584, |
|
"eval_loss": 2.142850637435913, |
|
"eval_runtime": 5.9312, |
|
"eval_samples_per_second": 11.971, |
|
"eval_steps_per_second": 11.971, |
|
"step": 14526 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.0350579209625721, |
|
"learning_rate": 3.6716232961586124e-06, |
|
"loss": 0.8202, |
|
"step": 15333 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.6197183132171631, |
|
"eval_loss": 2.372546911239624, |
|
"eval_runtime": 5.8453, |
|
"eval_samples_per_second": 12.147, |
|
"eval_steps_per_second": 12.147, |
|
"step": 15333 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.042548421770334244, |
|
"learning_rate": 3.3387030152829415e-06, |
|
"loss": 0.8612, |
|
"step": 16140 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.6619718074798584, |
|
"eval_loss": 2.1631431579589844, |
|
"eval_runtime": 5.8804, |
|
"eval_samples_per_second": 12.074, |
|
"eval_steps_per_second": 12.074, |
|
"step": 16140 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 0.09584546834230423, |
|
"learning_rate": 3.0053696819496083e-06, |
|
"loss": 0.8197, |
|
"step": 16947 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.6338028311729431, |
|
"eval_loss": 2.393216848373413, |
|
"eval_runtime": 5.8299, |
|
"eval_samples_per_second": 12.179, |
|
"eval_steps_per_second": 12.179, |
|
"step": 16947 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.0448361337184906, |
|
"learning_rate": 2.672036348616274e-06, |
|
"loss": 0.7858, |
|
"step": 17754 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.6478873491287231, |
|
"eval_loss": 2.25321364402771, |
|
"eval_runtime": 5.8111, |
|
"eval_samples_per_second": 12.218, |
|
"eval_steps_per_second": 12.218, |
|
"step": 17754 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.057099100202322006, |
|
"learning_rate": 2.3391160677406034e-06, |
|
"loss": 0.7717, |
|
"step": 18561 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.5633803009986877, |
|
"eval_loss": 2.813150644302368, |
|
"eval_runtime": 5.8476, |
|
"eval_samples_per_second": 12.142, |
|
"eval_steps_per_second": 12.142, |
|
"step": 18561 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 202.8040008544922, |
|
"learning_rate": 2.00578273440727e-06, |
|
"loss": 0.6282, |
|
"step": 19368 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.6197183132171631, |
|
"eval_loss": 2.549309492111206, |
|
"eval_runtime": 5.839, |
|
"eval_samples_per_second": 12.16, |
|
"eval_steps_per_second": 12.16, |
|
"step": 19368 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 0.06747495383024216, |
|
"learning_rate": 1.6728624535315987e-06, |
|
"loss": 0.7394, |
|
"step": 20175 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.6619718074798584, |
|
"eval_loss": 2.3195266723632812, |
|
"eval_runtime": 5.8894, |
|
"eval_samples_per_second": 12.056, |
|
"eval_steps_per_second": 12.056, |
|
"step": 20175 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 4.30875301361084, |
|
"learning_rate": 1.3395291201982652e-06, |
|
"loss": 0.5895, |
|
"step": 20982 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.6619718074798584, |
|
"eval_loss": 2.43306565284729, |
|
"eval_runtime": 5.8283, |
|
"eval_samples_per_second": 12.182, |
|
"eval_steps_per_second": 12.182, |
|
"step": 20982 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 0.05214543268084526, |
|
"learning_rate": 1.006195786864932e-06, |
|
"loss": 0.5854, |
|
"step": 21789 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.6760563254356384, |
|
"eval_loss": 2.428130626678467, |
|
"eval_runtime": 5.8154, |
|
"eval_samples_per_second": 12.209, |
|
"eval_steps_per_second": 12.209, |
|
"step": 21789 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.1047598198056221, |
|
"learning_rate": 6.732755059892606e-07, |
|
"loss": 0.6911, |
|
"step": 22596 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.6619718074798584, |
|
"eval_loss": 2.499279737472534, |
|
"eval_runtime": 5.8567, |
|
"eval_samples_per_second": 12.123, |
|
"eval_steps_per_second": 12.123, |
|
"step": 22596 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"grad_norm": 92.8244857788086, |
|
"learning_rate": 3.3994217265592734e-07, |
|
"loss": 0.5502, |
|
"step": 23403 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.6338028311729431, |
|
"eval_loss": 2.64582896232605, |
|
"eval_runtime": 5.8325, |
|
"eval_samples_per_second": 12.173, |
|
"eval_steps_per_second": 12.173, |
|
"step": 23403 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.047546908259391785, |
|
"learning_rate": 6.60883932259397e-09, |
|
"loss": 0.584, |
|
"step": 24210 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.6338028311729431, |
|
"eval_loss": 2.586568593978882, |
|
"eval_runtime": 5.8563, |
|
"eval_samples_per_second": 12.124, |
|
"eval_steps_per_second": 12.124, |
|
"step": 24210 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"step": 24210, |
|
"total_flos": 2.2475408052256202e+18, |
|
"train_loss": 1.0049422518175921, |
|
"train_runtime": 3971.9322, |
|
"train_samples_per_second": 6.095, |
|
"train_steps_per_second": 6.095 |
|
} |
|
], |
|
"logging_steps": 12, |
|
"max_steps": 24210, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.2475408052256202e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|