|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.943454686289698, |
|
"eval_steps": 500, |
|
"global_step": 38000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.038729666924864445, |
|
"grad_norm": 5.300307750701904, |
|
"learning_rate": 1.9741802220500904e-05, |
|
"loss": 0.008, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.07745933384972889, |
|
"grad_norm": 0.046279702335596085, |
|
"learning_rate": 1.9483604441001807e-05, |
|
"loss": 0.0079, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11618900077459333, |
|
"grad_norm": 0.16018961369991302, |
|
"learning_rate": 1.9225406661502713e-05, |
|
"loss": 0.0077, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.15491866769945778, |
|
"grad_norm": 0.12051185965538025, |
|
"learning_rate": 1.8967208882003616e-05, |
|
"loss": 0.0078, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.19364833462432224, |
|
"grad_norm": 0.16797564923763275, |
|
"learning_rate": 1.870901110250452e-05, |
|
"loss": 0.007, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.23237800154918667, |
|
"grad_norm": 0.20184361934661865, |
|
"learning_rate": 1.845081332300542e-05, |
|
"loss": 0.0072, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2711076684740511, |
|
"grad_norm": 0.3552689552307129, |
|
"learning_rate": 1.8192615543506328e-05, |
|
"loss": 0.0069, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.30983733539891556, |
|
"grad_norm": 0.14999011158943176, |
|
"learning_rate": 1.7937516137361218e-05, |
|
"loss": 0.0185, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.34856700232378, |
|
"grad_norm": 0.04176361858844757, |
|
"learning_rate": 1.7679318357862124e-05, |
|
"loss": 0.007, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.3872966692486445, |
|
"grad_norm": 0.11695006489753723, |
|
"learning_rate": 1.7421120578363027e-05, |
|
"loss": 0.0062, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.42602633617350893, |
|
"grad_norm": 0.11298699676990509, |
|
"learning_rate": 1.716292279886393e-05, |
|
"loss": 0.0077, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.46475600309837334, |
|
"grad_norm": 0.7519420385360718, |
|
"learning_rate": 1.6904725019364833e-05, |
|
"loss": 0.0065, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5034856700232379, |
|
"grad_norm": 0.146876260638237, |
|
"learning_rate": 1.664652723986574e-05, |
|
"loss": 0.0055, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.5422153369481022, |
|
"grad_norm": 0.18907134234905243, |
|
"learning_rate": 1.638832946036664e-05, |
|
"loss": 0.0061, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.5809450038729667, |
|
"grad_norm": 0.3313717842102051, |
|
"learning_rate": 1.6130131680867548e-05, |
|
"loss": 0.0062, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.6196746707978311, |
|
"grad_norm": 0.07646404951810837, |
|
"learning_rate": 1.587193390136845e-05, |
|
"loss": 0.0059, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.6584043377226956, |
|
"grad_norm": 0.09517477452754974, |
|
"learning_rate": 1.5613736121869353e-05, |
|
"loss": 0.0058, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.69713400464756, |
|
"grad_norm": 0.15235310792922974, |
|
"learning_rate": 1.5355538342370256e-05, |
|
"loss": 0.0055, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7358636715724245, |
|
"grad_norm": 0.07088156044483185, |
|
"learning_rate": 1.509734056287116e-05, |
|
"loss": 0.0058, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.774593338497289, |
|
"grad_norm": 0.19365285336971283, |
|
"learning_rate": 1.4839142783372065e-05, |
|
"loss": 0.0059, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8133230054221534, |
|
"grad_norm": 0.6815012097358704, |
|
"learning_rate": 1.4580945003872968e-05, |
|
"loss": 0.0058, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.8520526723470179, |
|
"grad_norm": 0.050135429948568344, |
|
"learning_rate": 1.4322747224373872e-05, |
|
"loss": 0.006, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.8907823392718822, |
|
"grad_norm": 0.21455714106559753, |
|
"learning_rate": 1.4064549444874775e-05, |
|
"loss": 0.0061, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.9295120061967467, |
|
"grad_norm": 0.1364729255437851, |
|
"learning_rate": 1.380635166537568e-05, |
|
"loss": 0.0053, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.9682416731216111, |
|
"grad_norm": 0.07110429555177689, |
|
"learning_rate": 1.3548670281435581e-05, |
|
"loss": 0.0054, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_bleu": 91.1948, |
|
"eval_gen_len": 13.3484, |
|
"eval_loss": 0.007155969273298979, |
|
"eval_runtime": 76.0184, |
|
"eval_samples_per_second": 603.801, |
|
"eval_steps_per_second": 18.877, |
|
"step": 12910 |
|
}, |
|
{ |
|
"epoch": 1.0069713400464757, |
|
"grad_norm": 0.2353477030992508, |
|
"learning_rate": 1.3290988897495483e-05, |
|
"loss": 0.0072, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.04570100697134, |
|
"grad_norm": 0.10523347556591034, |
|
"learning_rate": 1.3032791117996386e-05, |
|
"loss": 0.0059, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.0844306738962044, |
|
"grad_norm": 0.13372991979122162, |
|
"learning_rate": 1.277459333849729e-05, |
|
"loss": 0.0052, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.1231603408210689, |
|
"grad_norm": 0.06887730956077576, |
|
"learning_rate": 1.2516395558998193e-05, |
|
"loss": 0.0059, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.1618900077459333, |
|
"grad_norm": 0.11208628118038177, |
|
"learning_rate": 1.2258197779499098e-05, |
|
"loss": 0.0065, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.2006196746707978, |
|
"grad_norm": 0.057284511625766754, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.0053, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.2393493415956622, |
|
"grad_norm": 0.30577805638313293, |
|
"learning_rate": 1.1741802220500905e-05, |
|
"loss": 0.006, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.2780790085205267, |
|
"grad_norm": 0.023146886378526688, |
|
"learning_rate": 1.1483604441001808e-05, |
|
"loss": 0.0058, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.3168086754453912, |
|
"grad_norm": 0.06862912327051163, |
|
"learning_rate": 1.1225406661502712e-05, |
|
"loss": 0.0055, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.3555383423702556, |
|
"grad_norm": 0.11294469982385635, |
|
"learning_rate": 1.0967208882003615e-05, |
|
"loss": 0.0055, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.39426800929512, |
|
"grad_norm": 0.0633481964468956, |
|
"learning_rate": 1.070901110250452e-05, |
|
"loss": 0.0056, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.4329976762199845, |
|
"grad_norm": 0.20660839974880219, |
|
"learning_rate": 1.0450813323005422e-05, |
|
"loss": 0.0055, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.471727343144849, |
|
"grad_norm": 0.2304856777191162, |
|
"learning_rate": 1.0192615543506326e-05, |
|
"loss": 0.0056, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.5104570100697134, |
|
"grad_norm": 0.07686398923397064, |
|
"learning_rate": 9.934417764007231e-06, |
|
"loss": 0.0057, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.549186676994578, |
|
"grad_norm": 0.184184268116951, |
|
"learning_rate": 9.676219984508134e-06, |
|
"loss": 0.0057, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.5879163439194421, |
|
"grad_norm": 0.16291634738445282, |
|
"learning_rate": 9.418022205009038e-06, |
|
"loss": 0.0054, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.6266460108443068, |
|
"grad_norm": 0.08518970757722855, |
|
"learning_rate": 9.159824425509941e-06, |
|
"loss": 0.0054, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.665375677769171, |
|
"grad_norm": 0.15436352789402008, |
|
"learning_rate": 8.901626646010845e-06, |
|
"loss": 0.0054, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.7041053446940357, |
|
"grad_norm": 0.036033745855093, |
|
"learning_rate": 8.643428866511748e-06, |
|
"loss": 0.0054, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.7428350116189, |
|
"grad_norm": 0.10140281915664673, |
|
"learning_rate": 8.385231087012653e-06, |
|
"loss": 0.0058, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.7815646785437647, |
|
"grad_norm": 0.10392390936613083, |
|
"learning_rate": 8.127549703072554e-06, |
|
"loss": 0.0052, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.820294345468629, |
|
"grad_norm": 0.4044785499572754, |
|
"learning_rate": 7.869351923573459e-06, |
|
"loss": 0.0052, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.8590240123934936, |
|
"grad_norm": 0.04380480572581291, |
|
"learning_rate": 7.611154144074362e-06, |
|
"loss": 0.0054, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.8977536793183578, |
|
"grad_norm": 0.49817517399787903, |
|
"learning_rate": 7.352956364575265e-06, |
|
"loss": 0.005, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.9364833462432223, |
|
"grad_norm": 0.1552819013595581, |
|
"learning_rate": 7.094758585076169e-06, |
|
"loss": 0.0052, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.9752130131680867, |
|
"grad_norm": 0.3041650056838989, |
|
"learning_rate": 6.8365608055770725e-06, |
|
"loss": 0.0052, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_bleu": 91.3684, |
|
"eval_gen_len": 13.3464, |
|
"eval_loss": 0.0063232011161744595, |
|
"eval_runtime": 76.6579, |
|
"eval_samples_per_second": 598.764, |
|
"eval_steps_per_second": 18.72, |
|
"step": 25820 |
|
}, |
|
{ |
|
"epoch": 2.0139426800929514, |
|
"grad_norm": 0.07072452455759048, |
|
"learning_rate": 6.578363026077976e-06, |
|
"loss": 0.0053, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.0526723470178156, |
|
"grad_norm": 0.08410033583641052, |
|
"learning_rate": 6.32016524657888e-06, |
|
"loss": 0.0043, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.09140201394268, |
|
"grad_norm": 0.10657070577144623, |
|
"learning_rate": 6.061967467079783e-06, |
|
"loss": 0.0043, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.1301316808675446, |
|
"grad_norm": 0.10941112786531448, |
|
"learning_rate": 5.803769687580687e-06, |
|
"loss": 0.0043, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.168861347792409, |
|
"grad_norm": 0.1383851021528244, |
|
"learning_rate": 5.5455719080815905e-06, |
|
"loss": 0.0044, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.2075910147172735, |
|
"grad_norm": 0.09801546484231949, |
|
"learning_rate": 5.287374128582494e-06, |
|
"loss": 0.0043, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 2.2463206816421377, |
|
"grad_norm": 0.10826662182807922, |
|
"learning_rate": 5.029176349083398e-06, |
|
"loss": 0.0047, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 2.2850503485670024, |
|
"grad_norm": 0.1411546766757965, |
|
"learning_rate": 4.770978569584302e-06, |
|
"loss": 0.0046, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 2.3237800154918666, |
|
"grad_norm": 0.1722804307937622, |
|
"learning_rate": 4.512780790085206e-06, |
|
"loss": 0.0044, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 2.3625096824167313, |
|
"grad_norm": 0.21791347861289978, |
|
"learning_rate": 4.254583010586109e-06, |
|
"loss": 0.0042, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 2.4012393493415956, |
|
"grad_norm": 0.05044485256075859, |
|
"learning_rate": 3.996385231087013e-06, |
|
"loss": 0.0044, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 2.4399690162664602, |
|
"grad_norm": 0.16592107713222504, |
|
"learning_rate": 3.738703847146915e-06, |
|
"loss": 0.0043, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 2.4786986831913245, |
|
"grad_norm": 0.041500113904476166, |
|
"learning_rate": 3.4805060676478185e-06, |
|
"loss": 0.0045, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.517428350116189, |
|
"grad_norm": 0.06472677737474442, |
|
"learning_rate": 3.222308288148722e-06, |
|
"loss": 0.0042, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 2.5561580170410534, |
|
"grad_norm": 0.17371675372123718, |
|
"learning_rate": 2.9641105086496257e-06, |
|
"loss": 0.0044, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 2.5948876839659176, |
|
"grad_norm": 0.17464491724967957, |
|
"learning_rate": 2.7059127291505293e-06, |
|
"loss": 0.0042, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 2.6336173508907823, |
|
"grad_norm": 0.04553200677037239, |
|
"learning_rate": 2.4477149496514333e-06, |
|
"loss": 0.0043, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 2.672347017815647, |
|
"grad_norm": 0.07096046209335327, |
|
"learning_rate": 2.190033565711335e-06, |
|
"loss": 0.0041, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 2.7110766847405112, |
|
"grad_norm": 0.06601293385028839, |
|
"learning_rate": 1.931835786212239e-06, |
|
"loss": 0.0043, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.7498063516653755, |
|
"grad_norm": 0.07469964027404785, |
|
"learning_rate": 1.6736380067131424e-06, |
|
"loss": 0.0042, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.78853601859024, |
|
"grad_norm": 0.14455720782279968, |
|
"learning_rate": 1.415440227214046e-06, |
|
"loss": 0.0044, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.827265685515105, |
|
"grad_norm": 0.04146069288253784, |
|
"learning_rate": 1.1577588432739479e-06, |
|
"loss": 0.0043, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 2.865995352439969, |
|
"grad_norm": 0.038276560604572296, |
|
"learning_rate": 8.995610637748516e-07, |
|
"loss": 0.0044, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.9047250193648333, |
|
"grad_norm": 0.1604069173336029, |
|
"learning_rate": 6.413632842757553e-07, |
|
"loss": 0.0043, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 2.943454686289698, |
|
"grad_norm": 0.055847618728876114, |
|
"learning_rate": 3.831655047766589e-07, |
|
"loss": 0.0043, |
|
"step": 38000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 38730, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.633798918275072e+16, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|