{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0257510729613735,
  "eval_steps": 500,
  "global_step": 240,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004291845493562232,
      "grad_norm": 0.48923006653785706,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 3.0401,
      "step": 1
    },
    {
      "epoch": 0.008583690987124463,
      "grad_norm": 0.4999135434627533,
      "learning_rate": 4.800000000000001e-06,
      "loss": 3.0045,
      "step": 2
    },
    {
      "epoch": 0.012875536480686695,
      "grad_norm": 0.5719347596168518,
      "learning_rate": 7.2e-06,
      "loss": 3.233,
      "step": 3
    },
    {
      "epoch": 0.017167381974248927,
      "grad_norm": 0.4990224242210388,
      "learning_rate": 9.600000000000001e-06,
      "loss": 3.0813,
      "step": 4
    },
    {
      "epoch": 0.02145922746781116,
      "grad_norm": 0.5448071360588074,
      "learning_rate": 1.2e-05,
      "loss": 3.1513,
      "step": 5
    },
    {
      "epoch": 0.02575107296137339,
      "grad_norm": 0.4316995441913605,
      "learning_rate": 1.44e-05,
      "loss": 2.9247,
      "step": 6
    },
    {
      "epoch": 0.030042918454935622,
      "grad_norm": 0.39892056584358215,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 3.0529,
      "step": 7
    },
    {
      "epoch": 0.034334763948497854,
      "grad_norm": 0.3437541723251343,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 2.742,
      "step": 8
    },
    {
      "epoch": 0.03862660944206009,
      "grad_norm": 0.3232952356338501,
      "learning_rate": 2.16e-05,
      "loss": 2.8807,
      "step": 9
    },
    {
      "epoch": 0.04291845493562232,
      "grad_norm": 0.2316465526819229,
      "learning_rate": 2.4e-05,
      "loss": 2.8284,
      "step": 10
    },
    {
      "epoch": 0.04721030042918455,
      "grad_norm": 0.1895979940891266,
      "learning_rate": 2.64e-05,
      "loss": 2.846,
      "step": 11
    },
    {
      "epoch": 0.05150214592274678,
      "grad_norm": 0.18949951231479645,
      "learning_rate": 2.88e-05,
      "loss": 2.6129,
      "step": 12
    },
    {
      "epoch": 0.055793991416309016,
      "grad_norm": 0.29111406207084656,
      "learning_rate": 3.12e-05,
      "loss": 2.8553,
      "step": 13
    },
    {
      "epoch": 0.060085836909871244,
      "grad_norm": 0.3171645998954773,
      "learning_rate": 3.3600000000000004e-05,
      "loss": 2.7663,
      "step": 14
    },
    {
      "epoch": 0.06437768240343347,
      "grad_norm": 0.3853575587272644,
      "learning_rate": 3.6e-05,
      "loss": 2.8468,
      "step": 15
    },
    {
      "epoch": 0.06866952789699571,
      "grad_norm": 0.33584126830101013,
      "learning_rate": 3.8400000000000005e-05,
      "loss": 2.8286,
      "step": 16
    },
    {
      "epoch": 0.07296137339055794,
      "grad_norm": 0.22323380410671234,
      "learning_rate": 4.08e-05,
      "loss": 2.9605,
      "step": 17
    },
    {
      "epoch": 0.07725321888412018,
      "grad_norm": 0.20761723816394806,
      "learning_rate": 4.32e-05,
      "loss": 2.5953,
      "step": 18
    },
    {
      "epoch": 0.0815450643776824,
      "grad_norm": 0.18277060985565186,
      "learning_rate": 4.5600000000000004e-05,
      "loss": 2.5327,
      "step": 19
    },
    {
      "epoch": 0.08583690987124463,
      "grad_norm": 0.25170719623565674,
      "learning_rate": 4.8e-05,
      "loss": 2.7127,
      "step": 20
    },
    {
      "epoch": 0.09012875536480687,
      "grad_norm": 0.2634769678115845,
      "learning_rate": 5.04e-05,
      "loss": 2.7035,
      "step": 21
    },
    {
      "epoch": 0.0944206008583691,
      "grad_norm": 0.1749686598777771,
      "learning_rate": 5.28e-05,
      "loss": 2.7313,
      "step": 22
    },
    {
      "epoch": 0.09871244635193133,
      "grad_norm": 0.19000954926013947,
      "learning_rate": 5.520000000000001e-05,
      "loss": 2.6551,
      "step": 23
    },
    {
      "epoch": 0.10300429184549356,
      "grad_norm": 0.16048619151115417,
      "learning_rate": 5.76e-05,
      "loss": 2.6901,
      "step": 24
    },
    {
      "epoch": 0.1072961373390558,
      "grad_norm": 0.1346961259841919,
      "learning_rate": 6e-05,
      "loss": 2.6762,
      "step": 25
    },
    {
      "epoch": 0.11158798283261803,
      "grad_norm": 0.10837673395872116,
      "learning_rate": 5.9999238776847435e-05,
      "loss": 2.5797,
      "step": 26
    },
    {
      "epoch": 0.11587982832618025,
      "grad_norm": 0.12566141784191132,
      "learning_rate": 5.9996955146020456e-05,
      "loss": 2.6367,
      "step": 27
    },
    {
      "epoch": 0.12017167381974249,
      "grad_norm": 0.10796438157558441,
      "learning_rate": 5.999314922340924e-05,
      "loss": 2.6986,
      "step": 28
    },
    {
      "epoch": 0.12446351931330472,
      "grad_norm": 0.1174619048833847,
      "learning_rate": 5.9987821202157545e-05,
      "loss": 2.4532,
      "step": 29
    },
    {
      "epoch": 0.12875536480686695,
      "grad_norm": 0.12774387001991272,
      "learning_rate": 5.998097135265291e-05,
      "loss": 2.7154,
      "step": 30
    },
    {
      "epoch": 0.13304721030042918,
      "grad_norm": 0.12038439512252808,
      "learning_rate": 5.9972600022512946e-05,
      "loss": 2.6464,
      "step": 31
    },
    {
      "epoch": 0.13733905579399142,
      "grad_norm": 0.1207851693034172,
      "learning_rate": 5.996270763656767e-05,
      "loss": 2.745,
      "step": 32
    },
    {
      "epoch": 0.14163090128755365,
      "grad_norm": 0.12519803643226624,
      "learning_rate": 5.9951294696837966e-05,
      "loss": 2.4953,
      "step": 33
    },
    {
      "epoch": 0.1459227467811159,
      "grad_norm": 0.11793860048055649,
      "learning_rate": 5.993836178251009e-05,
      "loss": 2.5615,
      "step": 34
    },
    {
      "epoch": 0.15021459227467812,
      "grad_norm": 0.10079298913478851,
      "learning_rate": 5.99239095499063e-05,
      "loss": 2.5886,
      "step": 35
    },
    {
      "epoch": 0.15450643776824036,
      "grad_norm": 0.1022179052233696,
      "learning_rate": 5.990793873245154e-05,
      "loss": 2.4649,
      "step": 36
    },
    {
      "epoch": 0.15879828326180256,
      "grad_norm": 0.1409706175327301,
      "learning_rate": 5.989045014063621e-05,
      "loss": 2.7143,
      "step": 37
    },
    {
      "epoch": 0.1630901287553648,
      "grad_norm": 0.11522237211465836,
      "learning_rate": 5.9871444661975037e-05,
      "loss": 2.6908,
      "step": 38
    },
    {
      "epoch": 0.16738197424892703,
      "grad_norm": 0.11969179660081863,
      "learning_rate": 5.9850923260962045e-05,
      "loss": 2.6429,
      "step": 39
    },
    {
      "epoch": 0.17167381974248927,
      "grad_norm": 0.13146190345287323,
      "learning_rate": 5.982888697902161e-05,
      "loss": 2.5398,
      "step": 40
    },
    {
      "epoch": 0.1759656652360515,
      "grad_norm": 0.08940872550010681,
      "learning_rate": 5.98053369344556e-05,
      "loss": 2.6048,
      "step": 41
    },
    {
      "epoch": 0.18025751072961374,
      "grad_norm": 0.0957597866654396,
      "learning_rate": 5.978027432238662e-05,
      "loss": 2.758,
      "step": 42
    },
    {
      "epoch": 0.18454935622317598,
      "grad_norm": 0.12192820012569427,
      "learning_rate": 5.975370041469738e-05,
      "loss": 2.6317,
      "step": 43
    },
    {
      "epoch": 0.1888412017167382,
      "grad_norm": 0.11058337986469269,
      "learning_rate": 5.972561655996614e-05,
      "loss": 2.631,
      "step": 44
    },
    {
      "epoch": 0.19313304721030042,
      "grad_norm": 0.0976683646440506,
      "learning_rate": 5.969602418339825e-05,
      "loss": 2.5953,
      "step": 45
    },
    {
      "epoch": 0.19742489270386265,
      "grad_norm": 0.07907278090715408,
      "learning_rate": 5.966492478675385e-05,
      "loss": 2.5996,
      "step": 46
    },
    {
      "epoch": 0.2017167381974249,
      "grad_norm": 0.0848410427570343,
      "learning_rate": 5.963231994827169e-05,
      "loss": 2.5325,
      "step": 47
    },
    {
      "epoch": 0.20600858369098712,
      "grad_norm": 0.0901293084025383,
      "learning_rate": 5.9598211322588925e-05,
      "loss": 2.4015,
      "step": 48
    },
    {
      "epoch": 0.21030042918454936,
      "grad_norm": 0.09382897615432739,
      "learning_rate": 5.956260064065728e-05,
      "loss": 2.6818,
      "step": 49
    },
    {
      "epoch": 0.2145922746781116,
      "grad_norm": 0.10944745689630508,
      "learning_rate": 5.952548970965513e-05,
      "loss": 2.4901,
      "step": 50
    },
    {
      "epoch": 0.21888412017167383,
      "grad_norm": 0.1057049036026001,
      "learning_rate": 5.948688041289578e-05,
      "loss": 2.5959,
      "step": 51
    },
    {
      "epoch": 0.22317596566523606,
      "grad_norm": 0.10103785246610641,
      "learning_rate": 5.944677470973196e-05,
      "loss": 2.493,
      "step": 52
    },
    {
      "epoch": 0.22746781115879827,
      "grad_norm": 0.11647158861160278,
      "learning_rate": 5.9405174635456315e-05,
      "loss": 2.6496,
      "step": 53
    },
    {
      "epoch": 0.2317596566523605,
      "grad_norm": 0.09211481362581253,
      "learning_rate": 5.9362082301198156e-05,
      "loss": 2.6669,
      "step": 54
    },
    {
      "epoch": 0.23605150214592274,
      "grad_norm": 0.09014479070901871,
      "learning_rate": 5.931749989381632e-05,
      "loss": 2.6098,
      "step": 55
    },
    {
      "epoch": 0.24034334763948498,
      "grad_norm": 0.16046330332756042,
      "learning_rate": 5.9271429675788184e-05,
      "loss": 2.4713,
      "step": 56
    },
    {
      "epoch": 0.2446351931330472,
      "grad_norm": 0.1069481372833252,
      "learning_rate": 5.9223873985094866e-05,
      "loss": 2.3664,
      "step": 57
    },
    {
      "epoch": 0.24892703862660945,
      "grad_norm": 0.10389809310436249,
      "learning_rate": 5.9174835235102536e-05,
      "loss": 2.5278,
      "step": 58
    },
    {
      "epoch": 0.2532188841201717,
      "grad_norm": 0.11560378223657608,
      "learning_rate": 5.912431591443999e-05,
      "loss": 2.4859,
      "step": 59
    },
    {
      "epoch": 0.2575107296137339,
      "grad_norm": 0.09909158945083618,
      "learning_rate": 5.9072318586872344e-05,
      "loss": 2.4819,
      "step": 60
    },
    {
      "epoch": 0.26180257510729615,
      "grad_norm": 0.08729315549135208,
      "learning_rate": 5.901884589117089e-05,
      "loss": 2.5432,
      "step": 61
    },
    {
      "epoch": 0.26609442060085836,
      "grad_norm": 0.11550580710172653,
      "learning_rate": 5.896390054097922e-05,
      "loss": 2.592,
      "step": 62
    },
    {
      "epoch": 0.2703862660944206,
      "grad_norm": 0.10853075236082077,
      "learning_rate": 5.8907485324675545e-05,
      "loss": 2.3256,
      "step": 63
    },
    {
      "epoch": 0.27467811158798283,
      "grad_norm": 0.09386483579874039,
      "learning_rate": 5.884960310523109e-05,
      "loss": 2.676,
      "step": 64
    },
    {
      "epoch": 0.27896995708154504,
      "grad_norm": 0.12031152844429016,
      "learning_rate": 5.879025682006491e-05,
      "loss": 2.5849,
      "step": 65
    },
    {
      "epoch": 0.2832618025751073,
      "grad_norm": 0.09846463054418564,
      "learning_rate": 5.872944948089474e-05,
      "loss": 2.542,
      "step": 66
    },
    {
      "epoch": 0.2875536480686695,
      "grad_norm": 0.09881763160228729,
      "learning_rate": 5.8667184173584226e-05,
      "loss": 2.6389,
      "step": 67
    },
    {
      "epoch": 0.2918454935622318,
      "grad_norm": 0.12872399389743805,
      "learning_rate": 5.860346405798625e-05,
      "loss": 2.5906,
      "step": 68
    },
    {
      "epoch": 0.296137339055794,
      "grad_norm": 0.0927918404340744,
      "learning_rate": 5.853829236778266e-05,
      "loss": 2.3233,
      "step": 69
    },
    {
      "epoch": 0.30042918454935624,
      "grad_norm": 0.1097516193985939,
      "learning_rate": 5.847167241032006e-05,
      "loss": 2.3085,
      "step": 70
    },
    {
      "epoch": 0.30472103004291845,
      "grad_norm": 0.12274881452322006,
      "learning_rate": 5.8403607566442066e-05,
      "loss": 2.4758,
      "step": 71
    },
    {
      "epoch": 0.3090128755364807,
      "grad_norm": 0.10112845152616501,
      "learning_rate": 5.833410129031768e-05,
      "loss": 2.6385,
      "step": 72
    },
    {
      "epoch": 0.3133047210300429,
      "grad_norm": 0.107538603246212,
      "learning_rate": 5.8263157109266e-05,
      "loss": 2.4394,
      "step": 73
    },
    {
      "epoch": 0.31759656652360513,
      "grad_norm": 0.15851671993732452,
      "learning_rate": 5.819077862357725e-05,
      "loss": 2.348,
      "step": 74
    },
    {
      "epoch": 0.3218884120171674,
      "grad_norm": 0.09074151515960693,
      "learning_rate": 5.811696950633003e-05,
      "loss": 2.4884,
      "step": 75
    },
    {
      "epoch": 0.3261802575107296,
      "grad_norm": 0.10217540711164474,
      "learning_rate": 5.8041733503204934e-05,
      "loss": 2.5566,
      "step": 76
    },
    {
      "epoch": 0.33047210300429186,
      "grad_norm": 0.12177087366580963,
      "learning_rate": 5.796507443229445e-05,
      "loss": 2.4852,
      "step": 77
    },
    {
      "epoch": 0.33476394849785407,
      "grad_norm": 0.10884562134742737,
      "learning_rate": 5.788699618390924e-05,
      "loss": 2.6596,
      "step": 78
    },
    {
      "epoch": 0.33905579399141633,
      "grad_norm": 0.09898590296506882,
      "learning_rate": 5.7807502720380655e-05,
      "loss": 2.5299,
      "step": 79
    },
    {
      "epoch": 0.34334763948497854,
      "grad_norm": 0.16023777425289154,
      "learning_rate": 5.772659807585968e-05,
      "loss": 2.5859,
      "step": 80
    },
    {
      "epoch": 0.34763948497854075,
      "grad_norm": 0.1152929812669754,
      "learning_rate": 5.764428635611223e-05,
      "loss": 2.6133,
      "step": 81
    },
    {
      "epoch": 0.351931330472103,
      "grad_norm": 0.11540824174880981,
      "learning_rate": 5.756057173831075e-05,
      "loss": 2.4024,
      "step": 82
    },
    {
      "epoch": 0.3562231759656652,
      "grad_norm": 0.10451588034629822,
      "learning_rate": 5.7475458470822275e-05,
      "loss": 2.4877,
      "step": 83
    },
    {
      "epoch": 0.3605150214592275,
      "grad_norm": 0.09626314043998718,
      "learning_rate": 5.7388950872992764e-05,
      "loss": 2.67,
      "step": 84
    },
    {
      "epoch": 0.3648068669527897,
      "grad_norm": 0.11973975598812103,
      "learning_rate": 5.7301053334928e-05,
      "loss": 2.3961,
      "step": 85
    },
    {
      "epoch": 0.36909871244635195,
      "grad_norm": 0.09656506031751633,
      "learning_rate": 5.7211770317270696e-05,
      "loss": 2.5181,
      "step": 86
    },
    {
      "epoch": 0.37339055793991416,
      "grad_norm": 0.09045439958572388,
      "learning_rate": 5.712110635097422e-05,
      "loss": 2.6467,
      "step": 87
    },
    {
      "epoch": 0.3776824034334764,
      "grad_norm": 0.11997085064649582,
      "learning_rate": 5.702906603707257e-05,
      "loss": 2.6225,
      "step": 88
    },
    {
      "epoch": 0.38197424892703863,
      "grad_norm": 0.10198812931776047,
      "learning_rate": 5.6935654046446955e-05,
      "loss": 2.5756,
      "step": 89
    },
    {
      "epoch": 0.38626609442060084,
      "grad_norm": 0.12353700399398804,
      "learning_rate": 5.684087511958869e-05,
      "loss": 2.5561,
      "step": 90
    },
    {
      "epoch": 0.3905579399141631,
      "grad_norm": 0.09727780520915985,
      "learning_rate": 5.674473406635869e-05,
      "loss": 2.3674,
      "step": 91
    },
    {
      "epoch": 0.3948497854077253,
      "grad_norm": 0.13741353154182434,
      "learning_rate": 5.664723576574332e-05,
      "loss": 2.4249,
      "step": 92
    },
    {
      "epoch": 0.39914163090128757,
      "grad_norm": 0.09821099787950516,
      "learning_rate": 5.6548385165606835e-05,
      "loss": 2.5469,
      "step": 93
    },
    {
      "epoch": 0.4034334763948498,
      "grad_norm": 0.12704302370548248,
      "learning_rate": 5.644818728244027e-05,
      "loss": 2.5243,
      "step": 94
    },
    {
      "epoch": 0.40772532188841204,
      "grad_norm": 0.14363928139209747,
      "learning_rate": 5.634664720110686e-05,
      "loss": 2.3621,
      "step": 95
    },
    {
      "epoch": 0.41201716738197425,
      "grad_norm": 0.09331246465444565,
      "learning_rate": 5.6243770074583985e-05,
      "loss": 2.4098,
      "step": 96
    },
    {
      "epoch": 0.41630901287553645,
      "grad_norm": 0.13661201298236847,
      "learning_rate": 5.613956112370168e-05,
      "loss": 2.3998,
      "step": 97
    },
    {
      "epoch": 0.4206008583690987,
      "grad_norm": 0.15692031383514404,
      "learning_rate": 5.60340256368777e-05,
      "loss": 2.6108,
      "step": 98
    },
    {
      "epoch": 0.4248927038626609,
      "grad_norm": 0.1194879561662674,
      "learning_rate": 5.59271689698491e-05,
      "loss": 2.609,
      "step": 99
    },
    {
      "epoch": 0.4291845493562232,
      "grad_norm": 0.14594995975494385,
      "learning_rate": 5.581899654540049e-05,
      "loss": 2.3956,
      "step": 100
    },
    {
      "epoch": 0.4334763948497854,
      "grad_norm": 0.11137986928224564,
      "learning_rate": 5.570951385308879e-05,
      "loss": 2.6656,
      "step": 101
    },
    {
      "epoch": 0.43776824034334766,
      "grad_norm": 0.16587291657924652,
      "learning_rate": 5.559872644896467e-05,
      "loss": 2.4815,
      "step": 102
    },
    {
      "epoch": 0.44206008583690987,
      "grad_norm": 0.1582924723625183,
      "learning_rate": 5.548663995529062e-05,
      "loss": 2.4112,
      "step": 103
    },
    {
      "epoch": 0.44635193133047213,
      "grad_norm": 0.12027094513177872,
      "learning_rate": 5.5373260060255563e-05,
      "loss": 2.4611,
      "step": 104
    },
    {
      "epoch": 0.45064377682403434,
      "grad_norm": 0.17147387564182281,
      "learning_rate": 5.525859251768625e-05,
      "loss": 2.4084,
      "step": 105
    },
    {
      "epoch": 0.45493562231759654,
      "grad_norm": 0.138417050242424,
      "learning_rate": 5.5142643146755215e-05,
      "loss": 2.655,
      "step": 106
    },
    {
      "epoch": 0.4592274678111588,
      "grad_norm": 0.15900184214115143,
      "learning_rate": 5.5025417831685533e-05,
      "loss": 2.4029,
      "step": 107
    },
    {
      "epoch": 0.463519313304721,
      "grad_norm": 0.1893249750137329,
      "learning_rate": 5.4906922521452105e-05,
      "loss": 2.367,
      "step": 108
    },
    {
      "epoch": 0.4678111587982833,
      "grad_norm": 0.13178469240665436,
      "learning_rate": 5.478716322947985e-05,
      "loss": 2.4976,
      "step": 109
    },
    {
      "epoch": 0.4721030042918455,
      "grad_norm": 0.22058036923408508,
      "learning_rate": 5.466614603333848e-05,
      "loss": 2.4154,
      "step": 110
    },
    {
      "epoch": 0.47639484978540775,
      "grad_norm": 0.10874748975038528,
      "learning_rate": 5.4543877074434106e-05,
      "loss": 2.3568,
      "step": 111
    },
    {
      "epoch": 0.48068669527896996,
      "grad_norm": 0.1645105481147766,
      "learning_rate": 5.4420362557697546e-05,
      "loss": 2.5891,
      "step": 112
    },
    {
      "epoch": 0.48497854077253216,
      "grad_norm": 0.10352698713541031,
      "learning_rate": 5.429560875126946e-05,
      "loss": 2.2487,
      "step": 113
    },
    {
      "epoch": 0.4892703862660944,
      "grad_norm": 0.10524651408195496,
      "learning_rate": 5.4169621986182234e-05,
      "loss": 2.5208,
      "step": 114
    },
    {
      "epoch": 0.49356223175965663,
      "grad_norm": 0.10389380156993866,
      "learning_rate": 5.40424086560387e-05,
      "loss": 2.3373,
      "step": 115
    },
    {
      "epoch": 0.4978540772532189,
      "grad_norm": 0.11282224953174591,
      "learning_rate": 5.3913975216687675e-05,
      "loss": 2.4336,
      "step": 116
    },
    {
      "epoch": 0.5021459227467812,
      "grad_norm": 0.14653658866882324,
      "learning_rate": 5.378432818589633e-05,
      "loss": 2.5201,
      "step": 117
    },
    {
      "epoch": 0.5064377682403434,
      "grad_norm": 0.10697951167821884,
      "learning_rate": 5.365347414301943e-05,
      "loss": 2.5,
      "step": 118
    },
    {
      "epoch": 0.5107296137339056,
      "grad_norm": 0.08726029098033905,
      "learning_rate": 5.352141972866545e-05,
      "loss": 2.4833,
      "step": 119
    },
    {
      "epoch": 0.5150214592274678,
      "grad_norm": 0.1065029427409172,
      "learning_rate": 5.3388171644359565e-05,
      "loss": 2.3921,
      "step": 120
    },
    {
      "epoch": 0.51931330472103,
      "grad_norm": 0.12069697678089142,
      "learning_rate": 5.325373665220356e-05,
      "loss": 2.4335,
      "step": 121
    },
    {
      "epoch": 0.5236051502145923,
      "grad_norm": 0.11373002082109451,
      "learning_rate": 5.311812157453266e-05,
      "loss": 2.5685,
      "step": 122
    },
    {
      "epoch": 0.5278969957081545,
      "grad_norm": 0.13439607620239258,
      "learning_rate": 5.298133329356934e-05,
      "loss": 2.4198,
      "step": 123
    },
    {
      "epoch": 0.5321888412017167,
      "grad_norm": 0.10372010618448257,
      "learning_rate": 5.284337875107403e-05,
      "loss": 2.4413,
      "step": 124
    },
    {
      "epoch": 0.5364806866952789,
      "grad_norm": 0.1336512565612793,
      "learning_rate": 5.2704264947992855e-05,
      "loss": 2.5762,
      "step": 125
    },
    {
      "epoch": 0.5407725321888412,
      "grad_norm": 0.12671151757240295,
      "learning_rate": 5.256399894410232e-05,
      "loss": 2.5616,
      "step": 126
    },
    {
      "epoch": 0.5450643776824035,
      "grad_norm": 0.10133809596300125,
      "learning_rate": 5.242258785765106e-05,
      "loss": 2.3512,
      "step": 127
    },
    {
      "epoch": 0.5493562231759657,
      "grad_norm": 0.10729261487722397,
      "learning_rate": 5.228003886499863e-05,
      "loss": 2.4487,
      "step": 128
    },
    {
      "epoch": 0.5536480686695279,
      "grad_norm": 0.10313385725021362,
      "learning_rate": 5.213635920025127e-05,
      "loss": 2.3446,
      "step": 129
    },
    {
      "epoch": 0.5579399141630901,
      "grad_norm": 0.11684712022542953,
      "learning_rate": 5.1991556154894786e-05,
      "loss": 2.4377,
      "step": 130
    },
    {
      "epoch": 0.5622317596566524,
      "grad_norm": 0.12217995524406433,
      "learning_rate": 5.1845637077424576e-05,
      "loss": 2.4721,
      "step": 131
    },
    {
      "epoch": 0.5665236051502146,
      "grad_norm": 0.1254771202802658,
      "learning_rate": 5.169860937297264e-05,
      "loss": 2.3565,
      "step": 132
    },
    {
      "epoch": 0.5708154506437768,
      "grad_norm": 0.10808940976858139,
      "learning_rate": 5.155048050293183e-05,
      "loss": 2.464,
      "step": 133
    },
    {
      "epoch": 0.575107296137339,
      "grad_norm": 0.1136014387011528,
      "learning_rate": 5.140125798457716e-05,
      "loss": 2.4179,
      "step": 134
    },
    {
      "epoch": 0.5793991416309013,
      "grad_norm": 0.12022742629051208,
      "learning_rate": 5.125094939068439e-05,
      "loss": 2.6821,
      "step": 135
    },
    {
      "epoch": 0.5836909871244635,
      "grad_norm": 0.11005590111017227,
      "learning_rate": 5.109956234914558e-05,
      "loss": 2.5086,
      "step": 136
    },
    {
      "epoch": 0.5879828326180258,
      "grad_norm": 0.14693163335323334,
      "learning_rate": 5.0947104542582184e-05,
      "loss": 2.5458,
      "step": 137
    },
    {
      "epoch": 0.592274678111588,
      "grad_norm": 0.09363128244876862,
      "learning_rate": 5.0793583707954984e-05,
      "loss": 2.5145,
      "step": 138
    },
    {
      "epoch": 0.5965665236051502,
      "grad_norm": 0.19561366736888885,
      "learning_rate": 5.063900763617156e-05,
      "loss": 2.5379,
      "step": 139
    },
    {
      "epoch": 0.6008583690987125,
      "grad_norm": 0.10757976770401001,
      "learning_rate": 5.04833841716909e-05,
      "loss": 2.5076,
      "step": 140
    },
    {
      "epoch": 0.6051502145922747,
      "grad_norm": 0.13970068097114563,
      "learning_rate": 5.032672121212529e-05,
      "loss": 2.472,
      "step": 141
    },
    {
      "epoch": 0.6094420600858369,
      "grad_norm": 0.13635942339897156,
      "learning_rate": 5.0169026707839506e-05,
      "loss": 2.4703,
      "step": 142
    },
    {
      "epoch": 0.6137339055793991,
      "grad_norm": 0.10295607149600983,
      "learning_rate": 5.001030866154741e-05,
      "loss": 2.3987,
      "step": 143
    },
    {
      "epoch": 0.6180257510729614,
      "grad_norm": 0.1324918419122696,
      "learning_rate": 4.985057512790579e-05,
      "loss": 2.4456,
      "step": 144
    },
    {
      "epoch": 0.6223175965665236,
      "grad_norm": 0.10353874415159225,
      "learning_rate": 4.968983421310555e-05,
      "loss": 2.3827,
      "step": 145
    },
    {
      "epoch": 0.6266094420600858,
      "grad_norm": 0.10886164009571075,
      "learning_rate": 4.952809407446043e-05,
      "loss": 2.4664,
      "step": 146
    },
    {
      "epoch": 0.630901287553648,
      "grad_norm": 0.1082598865032196,
      "learning_rate": 4.9365362919992994e-05,
      "loss": 2.423,
      "step": 147
    },
    {
      "epoch": 0.6351931330472103,
      "grad_norm": 0.14654968678951263,
      "learning_rate": 4.9201649008018055e-05,
      "loss": 2.3452,
      "step": 148
    },
    {
      "epoch": 0.6394849785407726,
      "grad_norm": 0.10624393820762634,
      "learning_rate": 4.9036960646723617e-05,
      "loss": 2.4337,
      "step": 149
    },
    {
      "epoch": 0.6437768240343348,
      "grad_norm": 0.10619154572486877,
      "learning_rate": 4.887130619374927e-05,
      "loss": 2.2317,
      "step": 150
    },
    {
      "epoch": 0.648068669527897,
      "grad_norm": 0.1240013837814331,
      "learning_rate": 4.870469405576201e-05,
      "loss": 2.4641,
      "step": 151
    },
    {
      "epoch": 0.6523605150214592,
      "grad_norm": 0.11749018728733063,
      "learning_rate": 4.853713268802962e-05,
      "loss": 2.6394,
      "step": 152
    },
    {
      "epoch": 0.6566523605150214,
      "grad_norm": 0.11559685319662094,
      "learning_rate": 4.836863059399161e-05,
      "loss": 2.6346,
      "step": 153
    },
    {
      "epoch": 0.6609442060085837,
      "grad_norm": 0.12158619612455368,
      "learning_rate": 4.819919632482766e-05,
      "loss": 2.5024,
      "step": 154
    },
    {
      "epoch": 0.6652360515021459,
      "grad_norm": 0.10826972126960754,
      "learning_rate": 4.802883847902368e-05,
      "loss": 2.5089,
      "step": 155
    },
    {
      "epoch": 0.6695278969957081,
      "grad_norm": 0.12029755115509033,
      "learning_rate": 4.785756570193543e-05,
      "loss": 2.5017,
      "step": 156
    },
    {
      "epoch": 0.6738197424892703,
      "grad_norm": 0.11460699141025543,
      "learning_rate": 4.76853866853498e-05,
      "loss": 2.4295,
      "step": 157
    },
    {
      "epoch": 0.6781115879828327,
      "grad_norm": 0.10718906670808792,
      "learning_rate": 4.75123101670437e-05,
      "loss": 2.4886,
      "step": 158
    },
    {
      "epoch": 0.6824034334763949,
      "grad_norm": 0.11268241703510284,
      "learning_rate": 4.733834493034066e-05,
      "loss": 2.5896,
      "step": 159
    },
    {
      "epoch": 0.6866952789699571,
      "grad_norm": 0.12579713761806488,
      "learning_rate": 4.716349980366509e-05,
      "loss": 2.3132,
      "step": 160
    },
    {
      "epoch": 0.6909871244635193,
      "grad_norm": 0.11618609726428986,
      "learning_rate": 4.698778366009421e-05,
      "loss": 2.4478,
      "step": 161
    },
    {
      "epoch": 0.6952789699570815,
      "grad_norm": 0.12247326225042343,
      "learning_rate": 4.681120541690781e-05,
      "loss": 2.4638,
      "step": 162
    },
    {
      "epoch": 0.6995708154506438,
      "grad_norm": 0.11064168810844421,
      "learning_rate": 4.663377403513569e-05,
      "loss": 2.6396,
      "step": 163
    },
    {
      "epoch": 0.703862660944206,
      "grad_norm": 0.1138993427157402,
      "learning_rate": 4.64554985191029e-05,
      "loss": 2.3691,
      "step": 164
    },
    {
      "epoch": 0.7081545064377682,
      "grad_norm": 0.11352220177650452,
      "learning_rate": 4.6276387915972783e-05,
      "loss": 2.5472,
      "step": 165
    },
    {
      "epoch": 0.7124463519313304,
      "grad_norm": 0.13823212683200836,
      "learning_rate": 4.609645131528789e-05,
      "loss": 2.3696,
      "step": 166
    },
    {
      "epoch": 0.7167381974248928,
      "grad_norm": 0.16149644553661346,
      "learning_rate": 4.5915697848508645e-05,
      "loss": 2.3184,
      "step": 167
    },
    {
      "epoch": 0.721030042918455,
      "grad_norm": 0.10547155141830444,
      "learning_rate": 4.5734136688549964e-05,
      "loss": 2.4221,
      "step": 168
    },
    {
      "epoch": 0.7253218884120172,
      "grad_norm": 0.1176508441567421,
      "learning_rate": 4.555177704931576e-05,
      "loss": 2.4223,
      "step": 169
    },
    {
      "epoch": 0.7296137339055794,
      "grad_norm": 0.12838046252727509,
      "learning_rate": 4.5368628185231314e-05,
      "loss": 2.5376,
      "step": 170
    },
    {
      "epoch": 0.7339055793991416,
      "grad_norm": 0.1296073943376541,
      "learning_rate": 4.518469939077369e-05,
      "loss": 2.4453,
      "step": 171
    },
    {
      "epoch": 0.7381974248927039,
      "grad_norm": 0.13209278881549835,
      "learning_rate": 4.5e-05,
      "loss": 2.4902,
      "step": 172
    },
    {
      "epoch": 0.7424892703862661,
      "grad_norm": 0.13168583810329437,
      "learning_rate": 4.4814539386073744e-05,
      "loss": 2.2446,
      "step": 173
    },
    {
      "epoch": 0.7467811158798283,
      "grad_norm": 0.12592634558677673,
      "learning_rate": 4.462832696078915e-05,
      "loss": 2.5433,
      "step": 174
    },
    {
      "epoch": 0.7510729613733905,
      "grad_norm": 0.12793345749378204,
      "learning_rate": 4.4441372174093495e-05,
      "loss": 2.4155,
      "step": 175
    },
    {
      "epoch": 0.7553648068669528,
      "grad_norm": 0.12817440927028656,
      "learning_rate": 4.4253684513607585e-05,
      "loss": 2.4488,
      "step": 176
    },
    {
      "epoch": 0.759656652360515,
      "grad_norm": 0.12905055284500122,
      "learning_rate": 4.406527350414427e-05,
      "loss": 2.3792,
      "step": 177
    },
    {
      "epoch": 0.7639484978540773,
      "grad_norm": 0.11602860689163208,
      "learning_rate": 4.3876148707225067e-05,
      "loss": 2.2747,
      "step": 178
    },
    {
      "epoch": 0.7682403433476395,
      "grad_norm": 0.1343417763710022,
      "learning_rate": 4.368631972059489e-05,
      "loss": 2.3509,
      "step": 179
    },
    {
      "epoch": 0.7725321888412017,
      "grad_norm": 0.17558203637599945,
      "learning_rate": 4.349579617773507e-05,
      "loss": 2.2545,
      "step": 180
    },
    {
      "epoch": 0.776824034334764,
      "grad_norm": 0.11374380439519882,
      "learning_rate": 4.33045877473744e-05,
      "loss": 2.4592,
      "step": 181
    },
    {
      "epoch": 0.7811158798283262,
      "grad_norm": 0.16170468926429749,
      "learning_rate": 4.31127041329985e-05,
      "loss": 2.5294,
      "step": 182
    },
    {
      "epoch": 0.7854077253218884,
      "grad_norm": 0.1064993217587471,
      "learning_rate": 4.2920155072357335e-05,
      "loss": 2.4469,
      "step": 183
    },
    {
      "epoch": 0.7896995708154506,
      "grad_norm": 0.14201977849006653,
      "learning_rate": 4.2726950336971115e-05,
      "loss": 2.4416,
      "step": 184
    },
    {
      "epoch": 0.7939914163090128,
      "grad_norm": 0.12264000624418259,
      "learning_rate": 4.2533099731634376e-05,
      "loss": 2.5181,
      "step": 185
    },
    {
      "epoch": 0.7982832618025751,
      "grad_norm": 0.12580525875091553,
      "learning_rate": 4.233861309391835e-05,
      "loss": 2.3767,
      "step": 186
    },
    {
      "epoch": 0.8025751072961373,
      "grad_norm": 0.15109586715698242,
      "learning_rate": 4.214350029367181e-05,
      "loss": 2.3987,
      "step": 187
    },
    {
      "epoch": 0.8068669527896996,
      "grad_norm": 0.15967005491256714,
      "learning_rate": 4.1947771232520165e-05,
      "loss": 2.4359,
      "step": 188
    },
    {
      "epoch": 0.8111587982832618,
      "grad_norm": 0.10725586116313934,
      "learning_rate": 4.175143584336295e-05,
      "loss": 2.4294,
      "step": 189
    },
    {
      "epoch": 0.8154506437768241,
      "grad_norm": 0.12302330881357193,
      "learning_rate": 4.155450408986972e-05,
      "loss": 2.4109,
      "step": 190
    },
    {
      "epoch": 0.8197424892703863,
      "grad_norm": 0.13229899108409882,
      "learning_rate": 4.1356985965974536e-05,
      "loss": 2.5981,
      "step": 191
    },
    {
      "epoch": 0.8240343347639485,
      "grad_norm": 0.17274914681911469,
      "learning_rate": 4.115889149536863e-05,
      "loss": 2.3754,
      "step": 192
    },
    {
      "epoch": 0.8283261802575107,
      "grad_norm": 0.10760139673948288,
      "learning_rate": 4.0960230730991856e-05,
      "loss": 2.3867,
      "step": 193
    },
    {
      "epoch": 0.8326180257510729,
      "grad_norm": 0.17432305216789246,
      "learning_rate": 4.076101375452241e-05,
      "loss": 2.4265,
      "step": 194
    },
    {
      "epoch": 0.8369098712446352,
      "grad_norm": 0.15838249027729034,
      "learning_rate": 4.05612506758653e-05,
      "loss": 2.4993,
      "step": 195
    },
    {
      "epoch": 0.8412017167381974,
      "grad_norm": 0.12234227359294891,
      "learning_rate": 4.0360951632639226e-05,
      "loss": 2.3877,
      "step": 196
    },
    {
      "epoch": 0.8454935622317596,
      "grad_norm": 0.1838800013065338,
      "learning_rate": 4.016012678966213e-05,
      "loss": 2.2853,
      "step": 197
    },
    {
      "epoch": 0.8497854077253219,
      "grad_norm": 0.15433116257190704,
      "learning_rate": 3.995878633843535e-05,
      "loss": 2.4563,
      "step": 198
    },
    {
      "epoch": 0.8540772532188842,
      "grad_norm": 0.1247561052441597,
      "learning_rate": 3.9756940496626416e-05,
      "loss": 2.4479,
      "step": 199
    },
    {
      "epoch": 0.8583690987124464,
      "grad_norm": 0.1395588368177414,
      "learning_rate": 3.955459950755054e-05,
      "loss": 2.5016,
      "step": 200
    },
    {
      "epoch": 0.8626609442060086,
      "grad_norm": 0.1456160992383957,
      "learning_rate": 3.9351773639650745e-05,
      "loss": 2.3222,
      "step": 201
    },
    {
      "epoch": 0.8669527896995708,
      "grad_norm": 0.1138000562787056,
      "learning_rate": 3.914847318597682e-05,
      "loss": 2.503,
      "step": 202
    },
    {
      "epoch": 0.871244635193133,
      "grad_norm": 0.18854430317878723,
      "learning_rate": 3.894470846366291e-05,
      "loss": 2.3799,
      "step": 203
    },
    {
      "epoch": 0.8755364806866953,
      "grad_norm": 0.14914338290691376,
      "learning_rate": 3.874048981340397e-05,
      "loss": 2.4057,
      "step": 204
    },
    {
      "epoch": 0.8798283261802575,
      "grad_norm": 0.1462559849023819,
      "learning_rate": 3.853582759893097e-05,
      "loss": 2.3917,
      "step": 205
    },
    {
      "epoch": 0.8841201716738197,
      "grad_norm": 0.13625341653823853,
      "learning_rate": 3.833073220648501e-05,
      "loss": 2.3863,
      "step": 206
    },
    {
      "epoch": 0.8884120171673819,
      "grad_norm": 0.1319255530834198,
      "learning_rate": 3.812521404429016e-05,
      "loss": 2.3733,
      "step": 207
    },
    {
      "epoch": 0.8927038626609443,
      "grad_norm": 0.12159706652164459,
      "learning_rate": 3.7919283542025295e-05,
      "loss": 2.402,
      "step": 208
    },
    {
      "epoch": 0.8969957081545065,
      "grad_norm": 0.12551629543304443,
      "learning_rate": 3.7712951150294845e-05,
      "loss": 2.3653,
      "step": 209
    },
    {
      "epoch": 0.9012875536480687,
      "grad_norm": 0.12352804839611053,
      "learning_rate": 3.7506227340098376e-05,
      "loss": 2.2787,
      "step": 210
    },
    {
      "epoch": 0.9055793991416309,
      "grad_norm": 0.1033705621957779,
      "learning_rate": 3.729912260229926e-05,
      "loss": 2.2657,
      "step": 211
    },
    {
      "epoch": 0.9098712446351931,
      "grad_norm": 0.13322466611862183,
      "learning_rate": 3.7091647447092265e-05,
      "loss": 2.4372,
      "step": 212
    },
    {
      "epoch": 0.9141630901287554,
      "grad_norm": 0.1130763366818428,
      "learning_rate": 3.6883812403470165e-05,
      "loss": 2.3048,
      "step": 213
    },
    {
      "epoch": 0.9184549356223176,
      "grad_norm": 0.13779044151306152,
      "learning_rate": 3.667562801868943e-05,
      "loss": 2.4289,
      "step": 214
    },
    {
      "epoch": 0.9227467811158798,
      "grad_norm": 0.11905843019485474,
      "learning_rate": 3.646710485773499e-05,
      "loss": 2.513,
      "step": 215
    },
    {
      "epoch": 0.927038626609442,
      "grad_norm": 0.14390388131141663,
      "learning_rate": 3.625825350278403e-05,
      "loss": 2.3498,
      "step": 216
    },
    {
      "epoch": 0.9313304721030042,
      "grad_norm": 0.11969032138586044,
      "learning_rate": 3.6049084552669e-05,
      "loss": 2.3459,
      "step": 217
    },
    {
      "epoch": 0.9356223175965666,
      "grad_norm": 0.12738005816936493,
      "learning_rate": 3.5839608622339755e-05,
      "loss": 2.2709,
      "step": 218
    },
    {
      "epoch": 0.9399141630901288,
      "grad_norm": 0.12393856048583984,
      "learning_rate": 3.562983634232483e-05,
      "loss": 2.3483,
      "step": 219
    },
    {
      "epoch": 0.944206008583691,
      "grad_norm": 0.12841208279132843,
      "learning_rate": 3.541977835819197e-05,
      "loss": 2.3226,
      "step": 220
    },
    {
      "epoch": 0.9484978540772532,
      "grad_norm": 0.12215293943881989,
      "learning_rate": 3.520944533000792e-05,
      "loss": 2.4929,
      "step": 221
    },
    {
      "epoch": 0.9527896995708155,
      "grad_norm": 0.10832389444112778,
      "learning_rate": 3.4998847931797374e-05,
      "loss": 2.4507,
      "step": 222
    },
    {
      "epoch": 0.9570815450643777,
      "grad_norm": 0.10891756415367126,
      "learning_rate": 3.478799685100138e-05,
      "loss": 2.4425,
      "step": 223
    },
    {
      "epoch": 0.9613733905579399,
      "grad_norm": 0.124311164021492,
      "learning_rate": 3.457690278793489e-05,
      "loss": 2.5339,
      "step": 224
    },
    {
      "epoch": 0.9656652360515021,
      "grad_norm": 0.17225196957588196,
      "learning_rate": 3.436557645524379e-05,
      "loss": 2.4238,
      "step": 225
    },
    {
      "epoch": 0.9699570815450643,
      "grad_norm": 0.12012173235416412,
      "learning_rate": 3.415402857736122e-05,
      "loss": 2.5209,
      "step": 226
    },
    {
      "epoch": 0.9742489270386266,
      "grad_norm": 0.17129479348659515,
      "learning_rate": 3.394226988996336e-05,
      "loss": 2.5539,
      "step": 227
    },
    {
      "epoch": 0.9785407725321889,
      "grad_norm": 0.13525094091892242,
      "learning_rate": 3.373031113942456e-05,
      "loss": 2.2249,
      "step": 228
    },
    {
      "epoch": 0.9828326180257511,
      "grad_norm": 0.11054141074419022,
      "learning_rate": 3.351816308227206e-05,
      "loss": 2.4088,
      "step": 229
    },
    {
      "epoch": 0.9871244635193133,
      "grad_norm": 0.11343766748905182,
      "learning_rate": 3.330583648464004e-05,
      "loss": 2.0978,
      "step": 230
    },
    {
      "epoch": 0.9914163090128756,
      "grad_norm": 0.22153422236442566,
      "learning_rate": 3.309334212172331e-05,
      "loss": 2.5402,
      "step": 231
    },
    {
      "epoch": 0.9957081545064378,
      "grad_norm": 0.11208418756723404,
      "learning_rate": 3.288069077723045e-05,
      "loss": 2.3747,
      "step": 232
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.1252536177635193,
      "learning_rate": 3.26678932428366e-05,
      "loss": 2.4297,
      "step": 233
    },
    {
      "epoch": 1.0042918454935623,
      "grad_norm": 0.23914779722690582,
      "learning_rate": 3.2454960317635747e-05,
      "loss": 2.5602,
      "step": 234
    },
    {
      "epoch": 1.0042918454935623,
      "grad_norm": 0.15806083381175995,
      "learning_rate": 3.2241902807592734e-05,
      "loss": 2.3144,
      "step": 235
    },
    {
      "epoch": 1.0085836909871244,
      "grad_norm": 0.11224311590194702,
      "learning_rate": 3.202873152499485e-05,
      "loss": 2.3083,
      "step": 236
    },
    {
      "epoch": 1.0128755364806867,
      "grad_norm": 0.12876267731189728,
      "learning_rate": 3.181545728790317e-05,
      "loss": 2.2468,
      "step": 237
    },
    {
      "epoch": 1.0171673819742488,
      "grad_norm": 0.16172175109386444,
      "learning_rate": 3.160209091960347e-05,
      "loss": 2.4206,
      "step": 238
    },
    {
      "epoch": 1.0214592274678111,
      "grad_norm": 0.10915794968605042,
      "learning_rate": 3.138864324805707e-05,
      "loss": 2.3789,
      "step": 239
    },
    {
      "epoch": 1.0257510729613735,
      "grad_norm": 0.18216541409492493,
      "learning_rate": 3.117512510535128e-05,
      "loss": 2.3726,
      "step": 240
    }
  ],
  "logging_steps": 1,
  "max_steps": 466,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 24,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3380404450615624e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}