{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.943454686289698, "eval_steps": 500, "global_step": 38000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.038729666924864445, "grad_norm": 5.300307750701904, "learning_rate": 1.9741802220500904e-05, "loss": 0.008, "step": 500 }, { "epoch": 0.07745933384972889, "grad_norm": 0.046279702335596085, "learning_rate": 1.9483604441001807e-05, "loss": 0.0079, "step": 1000 }, { "epoch": 0.11618900077459333, "grad_norm": 0.16018961369991302, "learning_rate": 1.9225406661502713e-05, "loss": 0.0077, "step": 1500 }, { "epoch": 0.15491866769945778, "grad_norm": 0.12051185965538025, "learning_rate": 1.8967208882003616e-05, "loss": 0.0078, "step": 2000 }, { "epoch": 0.19364833462432224, "grad_norm": 0.16797564923763275, "learning_rate": 1.870901110250452e-05, "loss": 0.007, "step": 2500 }, { "epoch": 0.23237800154918667, "grad_norm": 0.20184361934661865, "learning_rate": 1.845081332300542e-05, "loss": 0.0072, "step": 3000 }, { "epoch": 0.2711076684740511, "grad_norm": 0.3552689552307129, "learning_rate": 1.8192615543506328e-05, "loss": 0.0069, "step": 3500 }, { "epoch": 0.30983733539891556, "grad_norm": 0.14999011158943176, "learning_rate": 1.7937516137361218e-05, "loss": 0.0185, "step": 4000 }, { "epoch": 0.34856700232378, "grad_norm": 0.04176361858844757, "learning_rate": 1.7679318357862124e-05, "loss": 0.007, "step": 4500 }, { "epoch": 0.3872966692486445, "grad_norm": 0.11695006489753723, "learning_rate": 1.7421120578363027e-05, "loss": 0.0062, "step": 5000 }, { "epoch": 0.42602633617350893, "grad_norm": 0.11298699676990509, "learning_rate": 1.716292279886393e-05, "loss": 0.0077, "step": 5500 }, { "epoch": 0.46475600309837334, "grad_norm": 0.7519420385360718, "learning_rate": 1.6904725019364833e-05, "loss": 0.0065, "step": 6000 }, { "epoch": 0.5034856700232379, "grad_norm": 0.146876260638237, "learning_rate": 1.664652723986574e-05, "loss": 0.0055, "step": 6500 }, { "epoch": 0.5422153369481022, "grad_norm": 0.18907134234905243, "learning_rate": 1.638832946036664e-05, "loss": 0.0061, "step": 7000 }, { "epoch": 0.5809450038729667, "grad_norm": 0.3313717842102051, "learning_rate": 1.6130131680867548e-05, "loss": 0.0062, "step": 7500 }, { "epoch": 0.6196746707978311, "grad_norm": 0.07646404951810837, "learning_rate": 1.587193390136845e-05, "loss": 0.0059, "step": 8000 }, { "epoch": 0.6584043377226956, "grad_norm": 0.09517477452754974, "learning_rate": 1.5613736121869353e-05, "loss": 0.0058, "step": 8500 }, { "epoch": 0.69713400464756, "grad_norm": 0.15235310792922974, "learning_rate": 1.5355538342370256e-05, "loss": 0.0055, "step": 9000 }, { "epoch": 0.7358636715724245, "grad_norm": 0.07088156044483185, "learning_rate": 1.509734056287116e-05, "loss": 0.0058, "step": 9500 }, { "epoch": 0.774593338497289, "grad_norm": 0.19365285336971283, "learning_rate": 1.4839142783372065e-05, "loss": 0.0059, "step": 10000 }, { "epoch": 0.8133230054221534, "grad_norm": 0.6815012097358704, "learning_rate": 1.4580945003872968e-05, "loss": 0.0058, "step": 10500 }, { "epoch": 0.8520526723470179, "grad_norm": 0.050135429948568344, "learning_rate": 1.4322747224373872e-05, "loss": 0.006, "step": 11000 }, { "epoch": 0.8907823392718822, "grad_norm": 0.21455714106559753, "learning_rate": 1.4064549444874775e-05, "loss": 0.0061, "step": 11500 }, { "epoch": 0.9295120061967467, "grad_norm": 0.1364729255437851, "learning_rate": 1.380635166537568e-05, "loss": 0.0053, "step": 12000 }, { "epoch": 0.9682416731216111, "grad_norm": 0.07110429555177689, "learning_rate": 1.3548670281435581e-05, "loss": 0.0054, "step": 12500 }, { "epoch": 1.0, "eval_bleu": 91.1948, "eval_gen_len": 13.3484, "eval_loss": 0.007155969273298979, "eval_runtime": 76.0184, "eval_samples_per_second": 603.801, "eval_steps_per_second": 18.877, "step": 12910 }, { "epoch": 1.0069713400464757, "grad_norm": 0.2353477030992508, "learning_rate": 1.3290988897495483e-05, "loss": 0.0072, "step": 13000 }, { "epoch": 1.04570100697134, "grad_norm": 0.10523347556591034, "learning_rate": 1.3032791117996386e-05, "loss": 0.0059, "step": 13500 }, { "epoch": 1.0844306738962044, "grad_norm": 0.13372991979122162, "learning_rate": 1.277459333849729e-05, "loss": 0.0052, "step": 14000 }, { "epoch": 1.1231603408210689, "grad_norm": 0.06887730956077576, "learning_rate": 1.2516395558998193e-05, "loss": 0.0059, "step": 14500 }, { "epoch": 1.1618900077459333, "grad_norm": 0.11208628118038177, "learning_rate": 1.2258197779499098e-05, "loss": 0.0065, "step": 15000 }, { "epoch": 1.2006196746707978, "grad_norm": 0.057284511625766754, "learning_rate": 1.2e-05, "loss": 0.0053, "step": 15500 }, { "epoch": 1.2393493415956622, "grad_norm": 0.30577805638313293, "learning_rate": 1.1741802220500905e-05, "loss": 0.006, "step": 16000 }, { "epoch": 1.2780790085205267, "grad_norm": 0.023146886378526688, "learning_rate": 1.1483604441001808e-05, "loss": 0.0058, "step": 16500 }, { "epoch": 1.3168086754453912, "grad_norm": 0.06862912327051163, "learning_rate": 1.1225406661502712e-05, "loss": 0.0055, "step": 17000 }, { "epoch": 1.3555383423702556, "grad_norm": 0.11294469982385635, "learning_rate": 1.0967208882003615e-05, "loss": 0.0055, "step": 17500 }, { "epoch": 1.39426800929512, "grad_norm": 0.0633481964468956, "learning_rate": 1.070901110250452e-05, "loss": 0.0056, "step": 18000 }, { "epoch": 1.4329976762199845, "grad_norm": 0.20660839974880219, "learning_rate": 1.0450813323005422e-05, "loss": 0.0055, "step": 18500 }, { "epoch": 1.471727343144849, "grad_norm": 0.2304856777191162, "learning_rate": 1.0192615543506326e-05, "loss": 0.0056, "step": 19000 }, { "epoch": 1.5104570100697134, "grad_norm": 0.07686398923397064, "learning_rate": 9.934417764007231e-06, "loss": 0.0057, "step": 19500 }, { "epoch": 1.549186676994578, "grad_norm": 0.184184268116951, "learning_rate": 9.676219984508134e-06, "loss": 0.0057, "step": 20000 }, { "epoch": 1.5879163439194421, "grad_norm": 0.16291634738445282, "learning_rate": 9.418022205009038e-06, "loss": 0.0054, "step": 20500 }, { "epoch": 1.6266460108443068, "grad_norm": 0.08518970757722855, "learning_rate": 9.159824425509941e-06, "loss": 0.0054, "step": 21000 }, { "epoch": 1.665375677769171, "grad_norm": 0.15436352789402008, "learning_rate": 8.901626646010845e-06, "loss": 0.0054, "step": 21500 }, { "epoch": 1.7041053446940357, "grad_norm": 0.036033745855093, "learning_rate": 8.643428866511748e-06, "loss": 0.0054, "step": 22000 }, { "epoch": 1.7428350116189, "grad_norm": 0.10140281915664673, "learning_rate": 8.385231087012653e-06, "loss": 0.0058, "step": 22500 }, { "epoch": 1.7815646785437647, "grad_norm": 0.10392390936613083, "learning_rate": 8.127549703072554e-06, "loss": 0.0052, "step": 23000 }, { "epoch": 1.820294345468629, "grad_norm": 0.4044785499572754, "learning_rate": 7.869351923573459e-06, "loss": 0.0052, "step": 23500 }, { "epoch": 1.8590240123934936, "grad_norm": 0.04380480572581291, "learning_rate": 7.611154144074362e-06, "loss": 0.0054, "step": 24000 }, { "epoch": 1.8977536793183578, "grad_norm": 0.49817517399787903, "learning_rate": 7.352956364575265e-06, "loss": 0.005, "step": 24500 }, { "epoch": 1.9364833462432223, "grad_norm": 0.1552819013595581, "learning_rate": 7.094758585076169e-06, "loss": 0.0052, "step": 25000 }, { "epoch": 1.9752130131680867, "grad_norm": 0.3041650056838989, "learning_rate": 6.8365608055770725e-06, "loss": 0.0052, "step": 25500 }, { "epoch": 2.0, "eval_bleu": 91.3684, "eval_gen_len": 13.3464, "eval_loss": 0.0063232011161744595, "eval_runtime": 76.6579, "eval_samples_per_second": 598.764, "eval_steps_per_second": 18.72, "step": 25820 }, { "epoch": 2.0139426800929514, "grad_norm": 0.07072452455759048, "learning_rate": 6.578363026077976e-06, "loss": 0.0053, "step": 26000 }, { "epoch": 2.0526723470178156, "grad_norm": 0.08410033583641052, "learning_rate": 6.32016524657888e-06, "loss": 0.0043, "step": 26500 }, { "epoch": 2.09140201394268, "grad_norm": 0.10657070577144623, "learning_rate": 6.061967467079783e-06, "loss": 0.0043, "step": 27000 }, { "epoch": 2.1301316808675446, "grad_norm": 0.10941112786531448, "learning_rate": 5.803769687580687e-06, "loss": 0.0043, "step": 27500 }, { "epoch": 2.168861347792409, "grad_norm": 0.1383851021528244, "learning_rate": 5.5455719080815905e-06, "loss": 0.0044, "step": 28000 }, { "epoch": 2.2075910147172735, "grad_norm": 0.09801546484231949, "learning_rate": 5.287374128582494e-06, "loss": 0.0043, "step": 28500 }, { "epoch": 2.2463206816421377, "grad_norm": 0.10826662182807922, "learning_rate": 5.029176349083398e-06, "loss": 0.0047, "step": 29000 }, { "epoch": 2.2850503485670024, "grad_norm": 0.1411546766757965, "learning_rate": 4.770978569584302e-06, "loss": 0.0046, "step": 29500 }, { "epoch": 2.3237800154918666, "grad_norm": 0.1722804307937622, "learning_rate": 4.512780790085206e-06, "loss": 0.0044, "step": 30000 }, { "epoch": 2.3625096824167313, "grad_norm": 0.21791347861289978, "learning_rate": 4.254583010586109e-06, "loss": 0.0042, "step": 30500 }, { "epoch": 2.4012393493415956, "grad_norm": 0.05044485256075859, "learning_rate": 3.996385231087013e-06, "loss": 0.0044, "step": 31000 }, { "epoch": 2.4399690162664602, "grad_norm": 0.16592107713222504, "learning_rate": 3.738703847146915e-06, "loss": 0.0043, "step": 31500 }, { "epoch": 2.4786986831913245, "grad_norm": 0.041500113904476166, "learning_rate": 3.4805060676478185e-06, "loss": 0.0045, "step": 32000 }, { "epoch": 2.517428350116189, "grad_norm": 0.06472677737474442, "learning_rate": 3.222308288148722e-06, "loss": 0.0042, "step": 32500 }, { "epoch": 2.5561580170410534, "grad_norm": 0.17371675372123718, "learning_rate": 2.9641105086496257e-06, "loss": 0.0044, "step": 33000 }, { "epoch": 2.5948876839659176, "grad_norm": 0.17464491724967957, "learning_rate": 2.7059127291505293e-06, "loss": 0.0042, "step": 33500 }, { "epoch": 2.6336173508907823, "grad_norm": 0.04553200677037239, "learning_rate": 2.4477149496514333e-06, "loss": 0.0043, "step": 34000 }, { "epoch": 2.672347017815647, "grad_norm": 0.07096046209335327, "learning_rate": 2.190033565711335e-06, "loss": 0.0041, "step": 34500 }, { "epoch": 2.7110766847405112, "grad_norm": 0.06601293385028839, "learning_rate": 1.931835786212239e-06, "loss": 0.0043, "step": 35000 }, { "epoch": 2.7498063516653755, "grad_norm": 0.07469964027404785, "learning_rate": 1.6736380067131424e-06, "loss": 0.0042, "step": 35500 }, { "epoch": 2.78853601859024, "grad_norm": 0.14455720782279968, "learning_rate": 1.415440227214046e-06, "loss": 0.0044, "step": 36000 }, { "epoch": 2.827265685515105, "grad_norm": 0.04146069288253784, "learning_rate": 1.1577588432739479e-06, "loss": 0.0043, "step": 36500 }, { "epoch": 2.865995352439969, "grad_norm": 0.038276560604572296, "learning_rate": 8.995610637748516e-07, "loss": 0.0044, "step": 37000 }, { "epoch": 2.9047250193648333, "grad_norm": 0.1604069173336029, "learning_rate": 6.413632842757553e-07, "loss": 0.0043, "step": 37500 }, { "epoch": 2.943454686289698, "grad_norm": 0.055847618728876114, "learning_rate": 3.831655047766589e-07, "loss": 0.0043, "step": 38000 } ], "logging_steps": 500, "max_steps": 38730, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.633798918275072e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }