{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.2631939684715556,
  "eval_steps": 500,
  "global_step": 150,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02193283070596299,
      "grad_norm": 0.4870755076408386,
      "learning_rate": 8.695652173913044e-07,
      "loss": 2.9525,
      "step": 1
    },
    {
      "epoch": 0.04386566141192598,
      "grad_norm": 0.35328975319862366,
      "learning_rate": 1.7391304347826088e-06,
      "loss": 2.8102,
      "step": 2
    },
    {
      "epoch": 0.06579849211788896,
      "grad_norm": 0.4670841097831726,
      "learning_rate": 2.6086956521739132e-06,
      "loss": 2.8848,
      "step": 3
    },
    {
      "epoch": 0.08773132282385196,
      "grad_norm": 0.6589818000793457,
      "learning_rate": 3.4782608695652175e-06,
      "loss": 3.0569,
      "step": 4
    },
    {
      "epoch": 0.10966415352981494,
      "grad_norm": 0.47537556290626526,
      "learning_rate": 4.347826086956522e-06,
      "loss": 2.8093,
      "step": 5
    },
    {
      "epoch": 0.13159698423577793,
      "grad_norm": 0.6465238928794861,
      "learning_rate": 5.2173913043478265e-06,
      "loss": 3.06,
      "step": 6
    },
    {
      "epoch": 0.15352981494174092,
      "grad_norm": 0.4992265999317169,
      "learning_rate": 6.086956521739132e-06,
      "loss": 2.98,
      "step": 7
    },
    {
      "epoch": 0.17546264564770392,
      "grad_norm": 0.5841557383537292,
      "learning_rate": 6.956521739130435e-06,
      "loss": 2.9831,
      "step": 8
    },
    {
      "epoch": 0.1973954763536669,
      "grad_norm": 0.41789695620536804,
      "learning_rate": 7.82608695652174e-06,
      "loss": 2.8239,
      "step": 9
    },
    {
      "epoch": 0.21932830705962988,
      "grad_norm": 0.43609240651130676,
      "learning_rate": 8.695652173913044e-06,
      "loss": 2.8341,
      "step": 10
    },
    {
      "epoch": 0.24126113776559288,
      "grad_norm": 0.3185977041721344,
      "learning_rate": 9.565217391304349e-06,
      "loss": 2.7383,
      "step": 11
    },
    {
      "epoch": 0.26319396847155585,
      "grad_norm": 0.43681928515434265,
      "learning_rate": 1.0434782608695653e-05,
      "loss": 2.9235,
      "step": 12
    },
    {
      "epoch": 0.2851267991775189,
      "grad_norm": 0.3825719952583313,
      "learning_rate": 1.1304347826086957e-05,
      "loss": 2.6964,
      "step": 13
    },
    {
      "epoch": 0.30705962988348184,
      "grad_norm": 0.5028628706932068,
      "learning_rate": 1.2173913043478263e-05,
      "loss": 2.8067,
      "step": 14
    },
    {
      "epoch": 0.3289924605894448,
      "grad_norm": 0.5002133846282959,
      "learning_rate": 1.3043478260869566e-05,
      "loss": 2.8252,
      "step": 15
    },
    {
      "epoch": 0.35092529129540784,
      "grad_norm": 0.3002101182937622,
      "learning_rate": 1.391304347826087e-05,
      "loss": 2.7313,
      "step": 16
    },
    {
      "epoch": 0.3728581220013708,
      "grad_norm": 0.4624859392642975,
      "learning_rate": 1.4782608695652174e-05,
      "loss": 2.7972,
      "step": 17
    },
    {
      "epoch": 0.3947909527073338,
      "grad_norm": 0.57045578956604,
      "learning_rate": 1.565217391304348e-05,
      "loss": 3.0034,
      "step": 18
    },
    {
      "epoch": 0.4167237834132968,
      "grad_norm": 0.4259621500968933,
      "learning_rate": 1.6521739130434785e-05,
      "loss": 2.8309,
      "step": 19
    },
    {
      "epoch": 0.43865661411925977,
      "grad_norm": 0.46009618043899536,
      "learning_rate": 1.739130434782609e-05,
      "loss": 2.8408,
      "step": 20
    },
    {
      "epoch": 0.46058944482522274,
      "grad_norm": 0.5475765466690063,
      "learning_rate": 1.8260869565217393e-05,
      "loss": 2.8943,
      "step": 21
    },
    {
      "epoch": 0.48252227553118576,
      "grad_norm": 0.7618455290794373,
      "learning_rate": 1.9130434782608697e-05,
      "loss": 2.9959,
      "step": 22
    },
    {
      "epoch": 0.5044551062371487,
      "grad_norm": 0.6684309840202332,
      "learning_rate": 2e-05,
      "loss": 2.9095,
      "step": 23
    },
    {
      "epoch": 0.5263879369431117,
      "grad_norm": 0.545538604259491,
      "learning_rate": 1.9999729347501484e-05,
      "loss": 2.8353,
      "step": 24
    },
    {
      "epoch": 0.5483207676490747,
      "grad_norm": 0.6128362417221069,
      "learning_rate": 1.9998917404656488e-05,
      "loss": 2.8127,
      "step": 25
    },
    {
      "epoch": 0.5702535983550377,
      "grad_norm": 0.7084450125694275,
      "learning_rate": 1.9997564215415886e-05,
      "loss": 2.9335,
      "step": 26
    },
    {
      "epoch": 0.5921864290610007,
      "grad_norm": 0.5246658325195312,
      "learning_rate": 1.9995669853028485e-05,
      "loss": 2.8186,
      "step": 27
    },
    {
      "epoch": 0.6141192597669637,
      "grad_norm": 0.7249352335929871,
      "learning_rate": 1.9993234420037072e-05,
      "loss": 2.8336,
      "step": 28
    },
    {
      "epoch": 0.6360520904729267,
      "grad_norm": 0.5946571826934814,
      "learning_rate": 1.999025804827285e-05,
      "loss": 2.8233,
      "step": 29
    },
    {
      "epoch": 0.6579849211788896,
      "grad_norm": 0.42332401871681213,
      "learning_rate": 1.9986740898848306e-05,
      "loss": 2.7259,
      "step": 30
    },
    {
      "epoch": 0.6799177518848526,
      "grad_norm": 0.9102485775947571,
      "learning_rate": 1.99826831621485e-05,
      "loss": 2.9075,
      "step": 31
    },
    {
      "epoch": 0.7018505825908157,
      "grad_norm": 0.6430304646492004,
      "learning_rate": 1.997808505782075e-05,
      "loss": 2.9084,
      "step": 32
    },
    {
      "epoch": 0.7237834132967786,
      "grad_norm": 0.5709037780761719,
      "learning_rate": 1.9972946834762732e-05,
      "loss": 2.8107,
      "step": 33
    },
    {
      "epoch": 0.7457162440027416,
      "grad_norm": 0.49835285544395447,
      "learning_rate": 1.9967268771109037e-05,
      "loss": 2.7338,
      "step": 34
    },
    {
      "epoch": 0.7676490747087046,
      "grad_norm": 0.5714299082756042,
      "learning_rate": 1.996105117421608e-05,
      "loss": 2.7816,
      "step": 35
    },
    {
      "epoch": 0.7895819054146676,
      "grad_norm": 0.48475122451782227,
      "learning_rate": 1.9954294380645497e-05,
      "loss": 2.7685,
      "step": 36
    },
    {
      "epoch": 0.8115147361206305,
      "grad_norm": 0.5309097766876221,
      "learning_rate": 1.9946998756145894e-05,
      "loss": 2.7778,
      "step": 37
    },
    {
      "epoch": 0.8334475668265936,
      "grad_norm": 0.45952990651130676,
      "learning_rate": 1.9939164695633067e-05,
      "loss": 2.7954,
      "step": 38
    },
    {
      "epoch": 0.8553803975325566,
      "grad_norm": 0.48351016640663147,
      "learning_rate": 1.9930792623168638e-05,
      "loss": 2.6792,
      "step": 39
    },
    {
      "epoch": 0.8773132282385195,
      "grad_norm": 0.4013523459434509,
      "learning_rate": 1.992188299193706e-05,
      "loss": 2.7983,
      "step": 40
    },
    {
      "epoch": 0.8992460589444825,
      "grad_norm": 0.318466454744339,
      "learning_rate": 1.9912436284221134e-05,
      "loss": 2.5984,
      "step": 41
    },
    {
      "epoch": 0.9211788896504455,
      "grad_norm": 0.3661440908908844,
      "learning_rate": 1.9902453011375865e-05,
      "loss": 2.8038,
      "step": 42
    },
    {
      "epoch": 0.9431117203564084,
      "grad_norm": 0.4626653790473938,
      "learning_rate": 1.98919337138008e-05,
      "loss": 2.7242,
      "step": 43
    },
    {
      "epoch": 0.9650445510623715,
      "grad_norm": 0.30195215344429016,
      "learning_rate": 1.9880878960910772e-05,
      "loss": 2.6637,
      "step": 44
    },
    {
      "epoch": 0.9869773817683345,
      "grad_norm": 0.40314823389053345,
      "learning_rate": 1.9869289351105087e-05,
      "loss": 2.7072,
      "step": 45
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.4138323962688446,
      "learning_rate": 1.9857165511735105e-05,
      "loss": 2.6019,
      "step": 46
    },
    {
      "epoch": 1.021932830705963,
      "grad_norm": 0.29498204588890076,
      "learning_rate": 1.9844508099070313e-05,
      "loss": 2.5943,
      "step": 47
    },
    {
      "epoch": 1.043865661411926,
      "grad_norm": 0.29517868161201477,
      "learning_rate": 1.9831317798262787e-05,
      "loss": 2.6963,
      "step": 48
    },
    {
      "epoch": 1.065798492117889,
      "grad_norm": 0.29591700434684753,
      "learning_rate": 1.98175953233101e-05,
      "loss": 2.7676,
      "step": 49
    },
    {
      "epoch": 1.0877313228238519,
      "grad_norm": 0.2561758756637573,
      "learning_rate": 1.980334141701667e-05,
      "loss": 2.629,
      "step": 50
    },
    {
      "epoch": 1.1096641535298148,
      "grad_norm": 0.2604333460330963,
      "learning_rate": 1.978855685095358e-05,
      "loss": 2.7115,
      "step": 51
    },
    {
      "epoch": 1.1315969842357778,
      "grad_norm": 0.4252321422100067,
      "learning_rate": 1.977324242541677e-05,
      "loss": 2.6442,
      "step": 52
    },
    {
      "epoch": 1.153529814941741,
      "grad_norm": 0.23818732798099518,
      "learning_rate": 1.9757398969383752e-05,
      "loss": 2.7172,
      "step": 53
    },
    {
      "epoch": 1.175462645647704,
      "grad_norm": 0.16449472308158875,
      "learning_rate": 1.974102734046872e-05,
      "loss": 2.6405,
      "step": 54
    },
    {
      "epoch": 1.197395476353667,
      "grad_norm": 0.25156456232070923,
      "learning_rate": 1.9724128424876117e-05,
      "loss": 2.6376,
      "step": 55
    },
    {
      "epoch": 1.21932830705963,
      "grad_norm": 0.1786828637123108,
      "learning_rate": 1.9706703137352695e-05,
      "loss": 2.5754,
      "step": 56
    },
    {
      "epoch": 1.2412611377655929,
      "grad_norm": 0.21457381546497345,
      "learning_rate": 1.968875242113798e-05,
      "loss": 2.6309,
      "step": 57
    },
    {
      "epoch": 1.2631939684715559,
      "grad_norm": 0.16352739930152893,
      "learning_rate": 1.9670277247913205e-05,
      "loss": 2.5966,
      "step": 58
    },
    {
      "epoch": 1.2851267991775188,
      "grad_norm": 0.19751280546188354,
      "learning_rate": 1.965127861774873e-05,
      "loss": 2.7516,
      "step": 59
    },
    {
      "epoch": 1.3070596298834818,
      "grad_norm": 0.14813798666000366,
      "learning_rate": 1.96317575590499e-05,
      "loss": 2.6464,
      "step": 60
    },
    {
      "epoch": 1.3289924605894448,
      "grad_norm": 0.15344278514385223,
      "learning_rate": 1.9611715128501378e-05,
      "loss": 2.7081,
      "step": 61
    },
    {
      "epoch": 1.350925291295408,
      "grad_norm": 0.1399158239364624,
      "learning_rate": 1.9591152411009942e-05,
      "loss": 2.6382,
      "step": 62
    },
    {
      "epoch": 1.3728581220013707,
      "grad_norm": 0.1705392748117447,
      "learning_rate": 1.9570070519645767e-05,
      "loss": 2.6442,
      "step": 63
    },
    {
      "epoch": 1.3947909527073339,
      "grad_norm": 0.16903157532215118,
      "learning_rate": 1.9548470595582166e-05,
      "loss": 2.5898,
      "step": 64
    },
    {
      "epoch": 1.4167237834132969,
      "grad_norm": 0.13648824393749237,
      "learning_rate": 1.9526353808033827e-05,
      "loss": 2.5725,
      "step": 65
    },
    {
      "epoch": 1.4386566141192598,
      "grad_norm": 0.129099041223526,
      "learning_rate": 1.9503721354193507e-05,
      "loss": 2.6863,
      "step": 66
    },
    {
      "epoch": 1.4605894448252228,
      "grad_norm": 0.12659573554992676,
      "learning_rate": 1.948057445916724e-05,
      "loss": 2.6224,
      "step": 67
    },
    {
      "epoch": 1.4825222755311858,
      "grad_norm": 0.1909603327512741,
      "learning_rate": 1.9456914375908026e-05,
      "loss": 2.6401,
      "step": 68
    },
    {
      "epoch": 1.5044551062371487,
      "grad_norm": 0.25878530740737915,
      "learning_rate": 1.9432742385147988e-05,
      "loss": 2.6704,
      "step": 69
    },
    {
      "epoch": 1.5263879369431117,
      "grad_norm": 0.13662640750408173,
      "learning_rate": 1.9408059795329073e-05,
      "loss": 2.6154,
      "step": 70
    },
    {
      "epoch": 1.5483207676490747,
      "grad_norm": 0.13560184836387634,
      "learning_rate": 1.9382867942532195e-05,
      "loss": 2.6227,
      "step": 71
    },
    {
      "epoch": 1.5702535983550376,
      "grad_norm": 0.1390749216079712,
      "learning_rate": 1.9357168190404937e-05,
      "loss": 2.6884,
      "step": 72
    },
    {
      "epoch": 1.5921864290610008,
      "grad_norm": 0.12690310180187225,
      "learning_rate": 1.9330961930087724e-05,
      "loss": 2.674,
      "step": 73
    },
    {
      "epoch": 1.6141192597669636,
      "grad_norm": 0.09753144532442093,
      "learning_rate": 1.9304250580138524e-05,
      "loss": 2.6209,
      "step": 74
    },
    {
      "epoch": 1.6360520904729268,
      "grad_norm": 0.12032578140497208,
      "learning_rate": 1.9277035586456056e-05,
      "loss": 2.6915,
      "step": 75
    },
    {
      "epoch": 1.6579849211788895,
      "grad_norm": 0.16928426921367645,
      "learning_rate": 1.9249318422201524e-05,
      "loss": 2.6132,
      "step": 76
    },
    {
      "epoch": 1.6799177518848527,
      "grad_norm": 0.09693765640258789,
      "learning_rate": 1.9221100587718884e-05,
      "loss": 2.6633,
      "step": 77
    },
    {
      "epoch": 1.7018505825908157,
      "grad_norm": 0.16512952744960785,
      "learning_rate": 1.919238361045362e-05,
      "loss": 2.6585,
      "step": 78
    },
    {
      "epoch": 1.7237834132967786,
      "grad_norm": 0.14026080071926117,
      "learning_rate": 1.916316904487005e-05,
      "loss": 2.7693,
      "step": 79
    },
    {
      "epoch": 1.7457162440027416,
      "grad_norm": 0.14885276556015015,
      "learning_rate": 1.9133458472367216e-05,
      "loss": 2.7144,
      "step": 80
    },
    {
      "epoch": 1.7676490747087046,
      "grad_norm": 0.11436336487531662,
      "learning_rate": 1.9103253501193256e-05,
      "loss": 2.5687,
      "step": 81
    },
    {
      "epoch": 1.7895819054146676,
      "grad_norm": 0.24450720846652985,
      "learning_rate": 1.9072555766358346e-05,
      "loss": 2.5209,
      "step": 82
    },
    {
      "epoch": 1.8115147361206305,
      "grad_norm": 0.19669459760189056,
      "learning_rate": 1.904136692954622e-05,
      "loss": 2.5729,
      "step": 83
    },
    {
      "epoch": 1.8334475668265937,
      "grad_norm": 0.1368049532175064,
      "learning_rate": 1.900968867902419e-05,
      "loss": 2.6268,
      "step": 84
    },
    {
      "epoch": 1.8553803975325565,
      "grad_norm": 0.1277005970478058,
      "learning_rate": 1.89775227295518e-05,
      "loss": 2.6601,
      "step": 85
    },
    {
      "epoch": 1.8773132282385196,
      "grad_norm": 0.09511096775531769,
      "learning_rate": 1.8944870822287957e-05,
      "loss": 2.5918,
      "step": 86
    },
    {
      "epoch": 1.8992460589444824,
      "grad_norm": 0.06676003336906433,
      "learning_rate": 1.891173472469672e-05,
      "loss": 2.5671,
      "step": 87
    },
    {
      "epoch": 1.9211788896504456,
      "grad_norm": 0.13803276419639587,
      "learning_rate": 1.8878116230451615e-05,
      "loss": 2.6257,
      "step": 88
    },
    {
      "epoch": 1.9431117203564083,
      "grad_norm": 0.10050716996192932,
      "learning_rate": 1.884401715933853e-05,
      "loss": 2.6772,
      "step": 89
    },
    {
      "epoch": 1.9650445510623715,
      "grad_norm": 0.1283024400472641,
      "learning_rate": 1.8809439357157226e-05,
      "loss": 2.6121,
      "step": 90
    },
    {
      "epoch": 1.9869773817683345,
      "grad_norm": 0.13763798773288727,
      "learning_rate": 1.8774384695621407e-05,
      "loss": 2.568,
      "step": 91
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.22271642088890076,
      "learning_rate": 1.8738855072257428e-05,
      "loss": 2.5865,
      "step": 92
    },
    {
      "epoch": 2.021932830705963,
      "grad_norm": 0.11131051927804947,
      "learning_rate": 1.8702852410301556e-05,
      "loss": 2.5144,
      "step": 93
    },
    {
      "epoch": 2.043865661411926,
      "grad_norm": 0.11161550134420395,
      "learning_rate": 1.8666378658595863e-05,
      "loss": 2.5182,
      "step": 94
    },
    {
      "epoch": 2.065798492117889,
      "grad_norm": 0.08552844822406769,
      "learning_rate": 1.8629435791482765e-05,
      "loss": 2.6402,
      "step": 95
    },
    {
      "epoch": 2.087731322823852,
      "grad_norm": 0.11428907513618469,
      "learning_rate": 1.8592025808698116e-05,
      "loss": 2.6265,
      "step": 96
    },
    {
      "epoch": 2.109664153529815,
      "grad_norm": 0.09980299323797226,
      "learning_rate": 1.8554150735262975e-05,
      "loss": 2.623,
      "step": 97
    },
    {
      "epoch": 2.131596984235778,
      "grad_norm": 0.08849837630987167,
      "learning_rate": 1.8515812621373998e-05,
      "loss": 2.627,
      "step": 98
    },
    {
      "epoch": 2.153529814941741,
      "grad_norm": 0.09756634384393692,
      "learning_rate": 1.8477013542292446e-05,
      "loss": 2.6233,
      "step": 99
    },
    {
      "epoch": 2.1754626456477038,
      "grad_norm": 0.07598986476659775,
      "learning_rate": 1.8437755598231857e-05,
      "loss": 2.6633,
      "step": 100
    },
    {
      "epoch": 2.197395476353667,
      "grad_norm": 0.18077166378498077,
      "learning_rate": 1.8398040914244363e-05,
      "loss": 2.6253,
      "step": 101
    },
    {
      "epoch": 2.2193283070596297,
      "grad_norm": 0.14006111025810242,
      "learning_rate": 1.8357871640105648e-05,
      "loss": 2.6086,
      "step": 102
    },
    {
      "epoch": 2.241261137765593,
      "grad_norm": 0.08927474915981293,
      "learning_rate": 1.8317249950198598e-05,
      "loss": 2.4877,
      "step": 103
    },
    {
      "epoch": 2.2631939684715556,
      "grad_norm": 0.08106345683336258,
      "learning_rate": 1.8276178043395588e-05,
      "loss": 2.6523,
      "step": 104
    },
    {
      "epoch": 2.285126799177519,
      "grad_norm": 0.09083954989910126,
      "learning_rate": 1.8234658142939454e-05,
      "loss": 2.6456,
      "step": 105
    },
    {
      "epoch": 2.307059629883482,
      "grad_norm": 0.07053636014461517,
      "learning_rate": 1.8192692496323158e-05,
      "loss": 2.6306,
      "step": 106
    },
    {
      "epoch": 2.3289924605894448,
      "grad_norm": 0.06668028235435486,
      "learning_rate": 1.8150283375168112e-05,
      "loss": 2.5175,
      "step": 107
    },
    {
      "epoch": 2.350925291295408,
      "grad_norm": 0.13295046985149384,
      "learning_rate": 1.8107433075101254e-05,
      "loss": 2.473,
      "step": 108
    },
    {
      "epoch": 2.3728581220013707,
      "grad_norm": 0.09093775600194931,
      "learning_rate": 1.8064143915630723e-05,
      "loss": 2.547,
      "step": 109
    },
    {
      "epoch": 2.394790952707334,
      "grad_norm": 0.18302415311336517,
      "learning_rate": 1.8020418240020362e-05,
      "loss": 2.5652,
      "step": 110
    },
    {
      "epoch": 2.4167237834132966,
      "grad_norm": 0.07250549644231796,
      "learning_rate": 1.7976258415162836e-05,
      "loss": 2.5828,
      "step": 111
    },
    {
      "epoch": 2.43865661411926,
      "grad_norm": 0.3828829526901245,
      "learning_rate": 1.7931666831451536e-05,
      "loss": 2.5063,
      "step": 112
    },
    {
      "epoch": 2.4605894448252226,
      "grad_norm": 0.5052575469017029,
      "learning_rate": 1.7886645902651166e-05,
      "loss": 2.6184,
      "step": 113
    },
    {
      "epoch": 2.4825222755311858,
      "grad_norm": 0.07245208323001862,
      "learning_rate": 1.7841198065767107e-05,
      "loss": 2.5883,
      "step": 114
    },
    {
      "epoch": 2.504455106237149,
      "grad_norm": 0.05822201445698738,
      "learning_rate": 1.779532578091347e-05,
      "loss": 2.6,
      "step": 115
    },
    {
      "epoch": 2.5263879369431117,
      "grad_norm": 0.08909498155117035,
      "learning_rate": 1.7749031531179962e-05,
      "loss": 2.549,
      "step": 116
    },
    {
      "epoch": 2.5483207676490744,
      "grad_norm": 0.07801003754138947,
      "learning_rate": 1.7702317822497457e-05,
      "loss": 2.5591,
      "step": 117
    },
    {
      "epoch": 2.5702535983550376,
      "grad_norm": 0.2434745877981186,
      "learning_rate": 1.7655187183502344e-05,
      "loss": 2.6557,
      "step": 118
    },
    {
      "epoch": 2.592186429061001,
      "grad_norm": 0.06782998889684677,
      "learning_rate": 1.7607642165399665e-05,
      "loss": 2.5268,
      "step": 119
    },
    {
      "epoch": 2.6141192597669636,
      "grad_norm": 0.0920061469078064,
      "learning_rate": 1.755968534182501e-05,
      "loss": 2.5052,
      "step": 120
    },
    {
      "epoch": 2.6360520904729268,
      "grad_norm": 0.0655408501625061,
      "learning_rate": 1.7511319308705198e-05,
      "loss": 2.5784,
      "step": 121
    },
    {
      "epoch": 2.6579849211788895,
      "grad_norm": 0.13081596791744232,
      "learning_rate": 1.746254668411778e-05,
      "loss": 2.5305,
      "step": 122
    },
    {
      "epoch": 2.6799177518848527,
      "grad_norm": 0.3778601884841919,
      "learning_rate": 1.7413370108149288e-05,
      "loss": 2.5846,
      "step": 123
    },
    {
      "epoch": 2.701850582590816,
      "grad_norm": 0.07771953195333481,
      "learning_rate": 1.7363792242752354e-05,
      "loss": 2.563,
      "step": 124
    },
    {
      "epoch": 2.7237834132967786,
      "grad_norm": 0.06316570937633514,
      "learning_rate": 1.731381577160161e-05,
      "loss": 2.5705,
      "step": 125
    },
    {
      "epoch": 2.7457162440027414,
      "grad_norm": 0.10998786240816116,
      "learning_rate": 1.726344339994841e-05,
      "loss": 2.6364,
      "step": 126
    },
    {
      "epoch": 2.7676490747087046,
      "grad_norm": 0.08724892884492874,
      "learning_rate": 1.7212677854474402e-05,
      "loss": 2.4682,
      "step": 127
    },
    {
      "epoch": 2.7895819054146678,
      "grad_norm": 0.10323834419250488,
      "learning_rate": 1.7161521883143936e-05,
      "loss": 2.6225,
      "step": 128
    },
    {
      "epoch": 2.8115147361206305,
      "grad_norm": 0.07787463814020157,
      "learning_rate": 1.7109978255055295e-05,
      "loss": 2.5786,
      "step": 129
    },
    {
      "epoch": 2.8334475668265937,
      "grad_norm": 0.07260199636220932,
      "learning_rate": 1.705804976029083e-05,
      "loss": 2.678,
      "step": 130
    },
    {
      "epoch": 2.8553803975325565,
      "grad_norm": 0.07246023416519165,
      "learning_rate": 1.7005739209765906e-05,
      "loss": 2.4259,
      "step": 131
    },
    {
      "epoch": 2.8773132282385196,
      "grad_norm": 0.09231323003768921,
      "learning_rate": 1.6953049435076768e-05,
      "loss": 2.6071,
      "step": 132
    },
    {
      "epoch": 2.8992460589444824,
      "grad_norm": 0.08409086614847183,
      "learning_rate": 1.6899983288347248e-05,
      "loss": 2.4503,
      "step": 133
    },
    {
      "epoch": 2.9211788896504456,
      "grad_norm": 0.10303400456905365,
      "learning_rate": 1.6846543642074382e-05,
      "loss": 2.5125,
      "step": 134
    },
    {
      "epoch": 2.9431117203564083,
      "grad_norm": 0.053891055285930634,
      "learning_rate": 1.679273338897293e-05,
      "loss": 2.6059,
      "step": 135
    },
    {
      "epoch": 2.9650445510623715,
      "grad_norm": 0.08152981102466583,
      "learning_rate": 1.6738555441818785e-05,
      "loss": 2.62,
      "step": 136
    },
    {
      "epoch": 2.9869773817683347,
      "grad_norm": 0.06032385304570198,
      "learning_rate": 1.668401273329129e-05,
      "loss": 2.534,
      "step": 137
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.08430641144514084,
      "learning_rate": 1.6629108215814523e-05,
      "loss": 2.5903,
      "step": 138
    },
    {
      "epoch": 3.021932830705963,
      "grad_norm": 0.0818452313542366,
      "learning_rate": 1.6573844861397444e-05,
      "loss": 2.7053,
      "step": 139
    },
    {
      "epoch": 3.043865661411926,
      "grad_norm": 0.11237948387861252,
      "learning_rate": 1.6518225661473045e-05,
      "loss": 2.4826,
      "step": 140
    },
    {
      "epoch": 3.065798492117889,
      "grad_norm": 0.0827915221452713,
      "learning_rate": 1.6462253626736413e-05,
      "loss": 2.5895,
      "step": 141
    },
    {
      "epoch": 3.087731322823852,
      "grad_norm": 0.11660678684711456,
      "learning_rate": 1.6405931786981753e-05,
      "loss": 2.5654,
      "step": 142
    },
    {
      "epoch": 3.109664153529815,
      "grad_norm": 0.09041120111942291,
      "learning_rate": 1.63492631909384e-05,
      "loss": 2.5197,
      "step": 143
    },
    {
      "epoch": 3.131596984235778,
      "grad_norm": 0.06614544242620468,
      "learning_rate": 1.629225090610577e-05,
      "loss": 2.5294,
      "step": 144
    },
    {
      "epoch": 3.153529814941741,
      "grad_norm": 0.04271303862333298,
      "learning_rate": 1.6234898018587336e-05,
      "loss": 2.5046,
      "step": 145
    },
    {
      "epoch": 3.1754626456477038,
      "grad_norm": 0.08055449277162552,
      "learning_rate": 1.6177207632923558e-05,
      "loss": 2.5966,
      "step": 146
    },
    {
      "epoch": 3.197395476353667,
      "grad_norm": 0.06253743171691895,
      "learning_rate": 1.6119182871923834e-05,
      "loss": 2.5158,
      "step": 147
    },
    {
      "epoch": 3.2193283070596297,
      "grad_norm": 0.08224272727966309,
      "learning_rate": 1.606082687649748e-05,
      "loss": 2.6654,
      "step": 148
    },
    {
      "epoch": 3.241261137765593,
      "grad_norm": 0.09529576450586319,
      "learning_rate": 1.6002142805483686e-05,
      "loss": 2.5656,
      "step": 149
    },
    {
      "epoch": 3.2631939684715556,
      "grad_norm": 0.04647281765937805,
      "learning_rate": 1.5943133835480536e-05,
      "loss": 2.5524,
      "step": 150
    }
  ],
  "logging_steps": 1,
  "max_steps": 450,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.2639143951466496e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}