{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.3509252912954075, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02193283070596299, "grad_norm": 0.4870755076408386, "learning_rate": 8.695652173913044e-07, "loss": 2.9525, "step": 1 }, { "epoch": 0.04386566141192598, "grad_norm": 0.35328975319862366, "learning_rate": 1.7391304347826088e-06, "loss": 2.8102, "step": 2 }, { "epoch": 0.06579849211788896, "grad_norm": 0.4670841097831726, "learning_rate": 2.6086956521739132e-06, "loss": 2.8848, "step": 3 }, { "epoch": 0.08773132282385196, "grad_norm": 0.6589818000793457, "learning_rate": 3.4782608695652175e-06, "loss": 3.0569, "step": 4 }, { "epoch": 0.10966415352981494, "grad_norm": 0.47537556290626526, "learning_rate": 4.347826086956522e-06, "loss": 2.8093, "step": 5 }, { "epoch": 0.13159698423577793, "grad_norm": 0.6465238928794861, "learning_rate": 5.2173913043478265e-06, "loss": 3.06, "step": 6 }, { "epoch": 0.15352981494174092, "grad_norm": 0.4992265999317169, "learning_rate": 6.086956521739132e-06, "loss": 2.98, "step": 7 }, { "epoch": 0.17546264564770392, "grad_norm": 0.5841557383537292, "learning_rate": 6.956521739130435e-06, "loss": 2.9831, "step": 8 }, { "epoch": 0.1973954763536669, "grad_norm": 0.41789695620536804, "learning_rate": 7.82608695652174e-06, "loss": 2.8239, "step": 9 }, { "epoch": 0.21932830705962988, "grad_norm": 0.43609240651130676, "learning_rate": 8.695652173913044e-06, "loss": 2.8341, "step": 10 }, { "epoch": 0.24126113776559288, "grad_norm": 0.3185977041721344, "learning_rate": 9.565217391304349e-06, "loss": 2.7383, "step": 11 }, { "epoch": 0.26319396847155585, "grad_norm": 0.43681928515434265, "learning_rate": 1.0434782608695653e-05, "loss": 2.9235, "step": 12 }, { "epoch": 0.2851267991775189, "grad_norm": 0.3825719952583313, "learning_rate": 1.1304347826086957e-05, "loss": 2.6964, "step": 13 }, { "epoch": 0.30705962988348184, "grad_norm": 0.5028628706932068, "learning_rate": 1.2173913043478263e-05, "loss": 2.8067, "step": 14 }, { "epoch": 0.3289924605894448, "grad_norm": 0.5002133846282959, "learning_rate": 1.3043478260869566e-05, "loss": 2.8252, "step": 15 }, { "epoch": 0.35092529129540784, "grad_norm": 0.3002101182937622, "learning_rate": 1.391304347826087e-05, "loss": 2.7313, "step": 16 }, { "epoch": 0.3728581220013708, "grad_norm": 0.4624859392642975, "learning_rate": 1.4782608695652174e-05, "loss": 2.7972, "step": 17 }, { "epoch": 0.3947909527073338, "grad_norm": 0.57045578956604, "learning_rate": 1.565217391304348e-05, "loss": 3.0034, "step": 18 }, { "epoch": 0.4167237834132968, "grad_norm": 0.4259621500968933, "learning_rate": 1.6521739130434785e-05, "loss": 2.8309, "step": 19 }, { "epoch": 0.43865661411925977, "grad_norm": 0.46009618043899536, "learning_rate": 1.739130434782609e-05, "loss": 2.8408, "step": 20 }, { "epoch": 0.46058944482522274, "grad_norm": 0.5475765466690063, "learning_rate": 1.8260869565217393e-05, "loss": 2.8943, "step": 21 }, { "epoch": 0.48252227553118576, "grad_norm": 0.7618455290794373, "learning_rate": 1.9130434782608697e-05, "loss": 2.9959, "step": 22 }, { "epoch": 0.5044551062371487, "grad_norm": 0.6684309840202332, "learning_rate": 2e-05, "loss": 2.9095, "step": 23 }, { "epoch": 0.5263879369431117, "grad_norm": 0.545538604259491, "learning_rate": 1.9999729347501484e-05, "loss": 2.8353, "step": 24 }, { "epoch": 0.5483207676490747, "grad_norm": 0.6128362417221069, "learning_rate": 1.9998917404656488e-05, "loss": 2.8127, "step": 25 }, { "epoch": 0.5702535983550377, "grad_norm": 0.7084450125694275, "learning_rate": 1.9997564215415886e-05, "loss": 2.9335, "step": 26 }, { "epoch": 0.5921864290610007, "grad_norm": 0.5246658325195312, "learning_rate": 1.9995669853028485e-05, "loss": 2.8186, "step": 27 }, { "epoch": 0.6141192597669637, "grad_norm": 0.7249352335929871, "learning_rate": 1.9993234420037072e-05, "loss": 2.8336, "step": 28 }, { "epoch": 0.6360520904729267, "grad_norm": 0.5946571826934814, "learning_rate": 1.999025804827285e-05, "loss": 2.8233, "step": 29 }, { "epoch": 0.6579849211788896, "grad_norm": 0.42332401871681213, "learning_rate": 1.9986740898848306e-05, "loss": 2.7259, "step": 30 }, { "epoch": 0.6799177518848526, "grad_norm": 0.9102485775947571, "learning_rate": 1.99826831621485e-05, "loss": 2.9075, "step": 31 }, { "epoch": 0.7018505825908157, "grad_norm": 0.6430304646492004, "learning_rate": 1.997808505782075e-05, "loss": 2.9084, "step": 32 }, { "epoch": 0.7237834132967786, "grad_norm": 0.5709037780761719, "learning_rate": 1.9972946834762732e-05, "loss": 2.8107, "step": 33 }, { "epoch": 0.7457162440027416, "grad_norm": 0.49835285544395447, "learning_rate": 1.9967268771109037e-05, "loss": 2.7338, "step": 34 }, { "epoch": 0.7676490747087046, "grad_norm": 0.5714299082756042, "learning_rate": 1.996105117421608e-05, "loss": 2.7816, "step": 35 }, { "epoch": 0.7895819054146676, "grad_norm": 0.48475122451782227, "learning_rate": 1.9954294380645497e-05, "loss": 2.7685, "step": 36 }, { "epoch": 0.8115147361206305, "grad_norm": 0.5309097766876221, "learning_rate": 1.9946998756145894e-05, "loss": 2.7778, "step": 37 }, { "epoch": 0.8334475668265936, "grad_norm": 0.45952990651130676, "learning_rate": 1.9939164695633067e-05, "loss": 2.7954, "step": 38 }, { "epoch": 0.8553803975325566, "grad_norm": 0.48351016640663147, "learning_rate": 1.9930792623168638e-05, "loss": 2.6792, "step": 39 }, { "epoch": 0.8773132282385195, "grad_norm": 0.4013523459434509, "learning_rate": 1.992188299193706e-05, "loss": 2.7983, "step": 40 }, { "epoch": 0.8992460589444825, "grad_norm": 0.318466454744339, "learning_rate": 1.9912436284221134e-05, "loss": 2.5984, "step": 41 }, { "epoch": 0.9211788896504455, "grad_norm": 0.3661440908908844, "learning_rate": 1.9902453011375865e-05, "loss": 2.8038, "step": 42 }, { "epoch": 0.9431117203564084, "grad_norm": 0.4626653790473938, "learning_rate": 1.98919337138008e-05, "loss": 2.7242, "step": 43 }, { "epoch": 0.9650445510623715, "grad_norm": 0.30195215344429016, "learning_rate": 1.9880878960910772e-05, "loss": 2.6637, "step": 44 }, { "epoch": 0.9869773817683345, "grad_norm": 0.40314823389053345, "learning_rate": 1.9869289351105087e-05, "loss": 2.7072, "step": 45 }, { "epoch": 1.0, "grad_norm": 0.4138323962688446, "learning_rate": 1.9857165511735105e-05, "loss": 2.6019, "step": 46 }, { "epoch": 1.021932830705963, "grad_norm": 0.29498204588890076, "learning_rate": 1.9844508099070313e-05, "loss": 2.5943, "step": 47 }, { "epoch": 1.043865661411926, "grad_norm": 0.29517868161201477, "learning_rate": 1.9831317798262787e-05, "loss": 2.6963, "step": 48 }, { "epoch": 1.065798492117889, "grad_norm": 0.29591700434684753, "learning_rate": 1.98175953233101e-05, "loss": 2.7676, "step": 49 }, { "epoch": 1.0877313228238519, "grad_norm": 0.2561758756637573, "learning_rate": 1.980334141701667e-05, "loss": 2.629, "step": 50 }, { "epoch": 1.1096641535298148, "grad_norm": 0.2604333460330963, "learning_rate": 1.978855685095358e-05, "loss": 2.7115, "step": 51 }, { "epoch": 1.1315969842357778, "grad_norm": 0.4252321422100067, "learning_rate": 1.977324242541677e-05, "loss": 2.6442, "step": 52 }, { "epoch": 1.153529814941741, "grad_norm": 0.23818732798099518, "learning_rate": 1.9757398969383752e-05, "loss": 2.7172, "step": 53 }, { "epoch": 1.175462645647704, "grad_norm": 0.16449472308158875, "learning_rate": 1.974102734046872e-05, "loss": 2.6405, "step": 54 }, { "epoch": 1.197395476353667, "grad_norm": 0.25156456232070923, "learning_rate": 1.9724128424876117e-05, "loss": 2.6376, "step": 55 }, { "epoch": 1.21932830705963, "grad_norm": 0.1786828637123108, "learning_rate": 1.9706703137352695e-05, "loss": 2.5754, "step": 56 }, { "epoch": 1.2412611377655929, "grad_norm": 0.21457381546497345, "learning_rate": 1.968875242113798e-05, "loss": 2.6309, "step": 57 }, { "epoch": 1.2631939684715559, "grad_norm": 0.16352739930152893, "learning_rate": 1.9670277247913205e-05, "loss": 2.5966, "step": 58 }, { "epoch": 1.2851267991775188, "grad_norm": 0.19751280546188354, "learning_rate": 1.965127861774873e-05, "loss": 2.7516, "step": 59 }, { "epoch": 1.3070596298834818, "grad_norm": 0.14813798666000366, "learning_rate": 1.96317575590499e-05, "loss": 2.6464, "step": 60 }, { "epoch": 1.3289924605894448, "grad_norm": 0.15344278514385223, "learning_rate": 1.9611715128501378e-05, "loss": 2.7081, "step": 61 }, { "epoch": 1.350925291295408, "grad_norm": 0.1399158239364624, "learning_rate": 1.9591152411009942e-05, "loss": 2.6382, "step": 62 }, { "epoch": 1.3728581220013707, "grad_norm": 0.1705392748117447, "learning_rate": 1.9570070519645767e-05, "loss": 2.6442, "step": 63 }, { "epoch": 1.3947909527073339, "grad_norm": 0.16903157532215118, "learning_rate": 1.9548470595582166e-05, "loss": 2.5898, "step": 64 }, { "epoch": 1.4167237834132969, "grad_norm": 0.13648824393749237, "learning_rate": 1.9526353808033827e-05, "loss": 2.5725, "step": 65 }, { "epoch": 1.4386566141192598, "grad_norm": 0.129099041223526, "learning_rate": 1.9503721354193507e-05, "loss": 2.6863, "step": 66 }, { "epoch": 1.4605894448252228, "grad_norm": 0.12659573554992676, "learning_rate": 1.948057445916724e-05, "loss": 2.6224, "step": 67 }, { "epoch": 1.4825222755311858, "grad_norm": 0.1909603327512741, "learning_rate": 1.9456914375908026e-05, "loss": 2.6401, "step": 68 }, { "epoch": 1.5044551062371487, "grad_norm": 0.25878530740737915, "learning_rate": 1.9432742385147988e-05, "loss": 2.6704, "step": 69 }, { "epoch": 1.5263879369431117, "grad_norm": 0.13662640750408173, "learning_rate": 1.9408059795329073e-05, "loss": 2.6154, "step": 70 }, { "epoch": 1.5483207676490747, "grad_norm": 0.13560184836387634, "learning_rate": 1.9382867942532195e-05, "loss": 2.6227, "step": 71 }, { "epoch": 1.5702535983550376, "grad_norm": 0.1390749216079712, "learning_rate": 1.9357168190404937e-05, "loss": 2.6884, "step": 72 }, { "epoch": 1.5921864290610008, "grad_norm": 0.12690310180187225, "learning_rate": 1.9330961930087724e-05, "loss": 2.674, "step": 73 }, { "epoch": 1.6141192597669636, "grad_norm": 0.09753144532442093, "learning_rate": 1.9304250580138524e-05, "loss": 2.6209, "step": 74 }, { "epoch": 1.6360520904729268, "grad_norm": 0.12032578140497208, "learning_rate": 1.9277035586456056e-05, "loss": 2.6915, "step": 75 }, { "epoch": 1.6579849211788895, "grad_norm": 0.16928426921367645, "learning_rate": 1.9249318422201524e-05, "loss": 2.6132, "step": 76 }, { "epoch": 1.6799177518848527, "grad_norm": 0.09693765640258789, "learning_rate": 1.9221100587718884e-05, "loss": 2.6633, "step": 77 }, { "epoch": 1.7018505825908157, "grad_norm": 0.16512952744960785, "learning_rate": 1.919238361045362e-05, "loss": 2.6585, "step": 78 }, { "epoch": 1.7237834132967786, "grad_norm": 0.14026080071926117, "learning_rate": 1.916316904487005e-05, "loss": 2.7693, "step": 79 }, { "epoch": 1.7457162440027416, "grad_norm": 0.14885276556015015, "learning_rate": 1.9133458472367216e-05, "loss": 2.7144, "step": 80 }, { "epoch": 1.7676490747087046, "grad_norm": 0.11436336487531662, "learning_rate": 1.9103253501193256e-05, "loss": 2.5687, "step": 81 }, { "epoch": 1.7895819054146676, "grad_norm": 0.24450720846652985, "learning_rate": 1.9072555766358346e-05, "loss": 2.5209, "step": 82 }, { "epoch": 1.8115147361206305, "grad_norm": 0.19669459760189056, "learning_rate": 1.904136692954622e-05, "loss": 2.5729, "step": 83 }, { "epoch": 1.8334475668265937, "grad_norm": 0.1368049532175064, "learning_rate": 1.900968867902419e-05, "loss": 2.6268, "step": 84 }, { "epoch": 1.8553803975325565, "grad_norm": 0.1277005970478058, "learning_rate": 1.89775227295518e-05, "loss": 2.6601, "step": 85 }, { "epoch": 1.8773132282385196, "grad_norm": 0.09511096775531769, "learning_rate": 1.8944870822287957e-05, "loss": 2.5918, "step": 86 }, { "epoch": 1.8992460589444824, "grad_norm": 0.06676003336906433, "learning_rate": 1.891173472469672e-05, "loss": 2.5671, "step": 87 }, { "epoch": 1.9211788896504456, "grad_norm": 0.13803276419639587, "learning_rate": 1.8878116230451615e-05, "loss": 2.6257, "step": 88 }, { "epoch": 1.9431117203564083, "grad_norm": 0.10050716996192932, "learning_rate": 1.884401715933853e-05, "loss": 2.6772, "step": 89 }, { "epoch": 1.9650445510623715, "grad_norm": 0.1283024400472641, "learning_rate": 1.8809439357157226e-05, "loss": 2.6121, "step": 90 }, { "epoch": 1.9869773817683345, "grad_norm": 0.13763798773288727, "learning_rate": 1.8774384695621407e-05, "loss": 2.568, "step": 91 }, { "epoch": 2.0, "grad_norm": 0.22271642088890076, "learning_rate": 1.8738855072257428e-05, "loss": 2.5865, "step": 92 }, { "epoch": 2.021932830705963, "grad_norm": 0.11131051927804947, "learning_rate": 1.8702852410301556e-05, "loss": 2.5144, "step": 93 }, { "epoch": 2.043865661411926, "grad_norm": 0.11161550134420395, "learning_rate": 1.8666378658595863e-05, "loss": 2.5182, "step": 94 }, { "epoch": 2.065798492117889, "grad_norm": 0.08552844822406769, "learning_rate": 1.8629435791482765e-05, "loss": 2.6402, "step": 95 }, { "epoch": 2.087731322823852, "grad_norm": 0.11428907513618469, "learning_rate": 1.8592025808698116e-05, "loss": 2.6265, "step": 96 }, { "epoch": 2.109664153529815, "grad_norm": 0.09980299323797226, "learning_rate": 1.8554150735262975e-05, "loss": 2.623, "step": 97 }, { "epoch": 2.131596984235778, "grad_norm": 0.08849837630987167, "learning_rate": 1.8515812621373998e-05, "loss": 2.627, "step": 98 }, { "epoch": 2.153529814941741, "grad_norm": 0.09756634384393692, "learning_rate": 1.8477013542292446e-05, "loss": 2.6233, "step": 99 }, { "epoch": 2.1754626456477038, "grad_norm": 0.07598986476659775, "learning_rate": 1.8437755598231857e-05, "loss": 2.6633, "step": 100 }, { "epoch": 2.197395476353667, "grad_norm": 0.18077166378498077, "learning_rate": 1.8398040914244363e-05, "loss": 2.6253, "step": 101 }, { "epoch": 2.2193283070596297, "grad_norm": 0.14006111025810242, "learning_rate": 1.8357871640105648e-05, "loss": 2.6086, "step": 102 }, { "epoch": 2.241261137765593, "grad_norm": 0.08927474915981293, "learning_rate": 1.8317249950198598e-05, "loss": 2.4877, "step": 103 }, { "epoch": 2.2631939684715556, "grad_norm": 0.08106345683336258, "learning_rate": 1.8276178043395588e-05, "loss": 2.6523, "step": 104 }, { "epoch": 2.285126799177519, "grad_norm": 0.09083954989910126, "learning_rate": 1.8234658142939454e-05, "loss": 2.6456, "step": 105 }, { "epoch": 2.307059629883482, "grad_norm": 0.07053636014461517, "learning_rate": 1.8192692496323158e-05, "loss": 2.6306, "step": 106 }, { "epoch": 2.3289924605894448, "grad_norm": 0.06668028235435486, "learning_rate": 1.8150283375168112e-05, "loss": 2.5175, "step": 107 }, { "epoch": 2.350925291295408, "grad_norm": 0.13295046985149384, "learning_rate": 1.8107433075101254e-05, "loss": 2.473, "step": 108 }, { "epoch": 2.3728581220013707, "grad_norm": 0.09093775600194931, "learning_rate": 1.8064143915630723e-05, "loss": 2.547, "step": 109 }, { "epoch": 2.394790952707334, "grad_norm": 0.18302415311336517, "learning_rate": 1.8020418240020362e-05, "loss": 2.5652, "step": 110 }, { "epoch": 2.4167237834132966, "grad_norm": 0.07250549644231796, "learning_rate": 1.7976258415162836e-05, "loss": 2.5828, "step": 111 }, { "epoch": 2.43865661411926, "grad_norm": 0.3828829526901245, "learning_rate": 1.7931666831451536e-05, "loss": 2.5063, "step": 112 }, { "epoch": 2.4605894448252226, "grad_norm": 0.5052575469017029, "learning_rate": 1.7886645902651166e-05, "loss": 2.6184, "step": 113 }, { "epoch": 2.4825222755311858, "grad_norm": 0.07245208323001862, "learning_rate": 1.7841198065767107e-05, "loss": 2.5883, "step": 114 }, { "epoch": 2.504455106237149, "grad_norm": 0.05822201445698738, "learning_rate": 1.779532578091347e-05, "loss": 2.6, "step": 115 }, { "epoch": 2.5263879369431117, "grad_norm": 0.08909498155117035, "learning_rate": 1.7749031531179962e-05, "loss": 2.549, "step": 116 }, { "epoch": 2.5483207676490744, "grad_norm": 0.07801003754138947, "learning_rate": 1.7702317822497457e-05, "loss": 2.5591, "step": 117 }, { "epoch": 2.5702535983550376, "grad_norm": 0.2434745877981186, "learning_rate": 1.7655187183502344e-05, "loss": 2.6557, "step": 118 }, { "epoch": 2.592186429061001, "grad_norm": 0.06782998889684677, "learning_rate": 1.7607642165399665e-05, "loss": 2.5268, "step": 119 }, { "epoch": 2.6141192597669636, "grad_norm": 0.0920061469078064, "learning_rate": 1.755968534182501e-05, "loss": 2.5052, "step": 120 }, { "epoch": 2.6360520904729268, "grad_norm": 0.0655408501625061, "learning_rate": 1.7511319308705198e-05, "loss": 2.5784, "step": 121 }, { "epoch": 2.6579849211788895, "grad_norm": 0.13081596791744232, "learning_rate": 1.746254668411778e-05, "loss": 2.5305, "step": 122 }, { "epoch": 2.6799177518848527, "grad_norm": 0.3778601884841919, "learning_rate": 1.7413370108149288e-05, "loss": 2.5846, "step": 123 }, { "epoch": 2.701850582590816, "grad_norm": 0.07771953195333481, "learning_rate": 1.7363792242752354e-05, "loss": 2.563, "step": 124 }, { "epoch": 2.7237834132967786, "grad_norm": 0.06316570937633514, "learning_rate": 1.731381577160161e-05, "loss": 2.5705, "step": 125 }, { "epoch": 2.7457162440027414, "grad_norm": 0.10998786240816116, "learning_rate": 1.726344339994841e-05, "loss": 2.6364, "step": 126 }, { "epoch": 2.7676490747087046, "grad_norm": 0.08724892884492874, "learning_rate": 1.7212677854474402e-05, "loss": 2.4682, "step": 127 }, { "epoch": 2.7895819054146678, "grad_norm": 0.10323834419250488, "learning_rate": 1.7161521883143936e-05, "loss": 2.6225, "step": 128 }, { "epoch": 2.8115147361206305, "grad_norm": 0.07787463814020157, "learning_rate": 1.7109978255055295e-05, "loss": 2.5786, "step": 129 }, { "epoch": 2.8334475668265937, "grad_norm": 0.07260199636220932, "learning_rate": 1.705804976029083e-05, "loss": 2.678, "step": 130 }, { "epoch": 2.8553803975325565, "grad_norm": 0.07246023416519165, "learning_rate": 1.7005739209765906e-05, "loss": 2.4259, "step": 131 }, { "epoch": 2.8773132282385196, "grad_norm": 0.09231323003768921, "learning_rate": 1.6953049435076768e-05, "loss": 2.6071, "step": 132 }, { "epoch": 2.8992460589444824, "grad_norm": 0.08409086614847183, "learning_rate": 1.6899983288347248e-05, "loss": 2.4503, "step": 133 }, { "epoch": 2.9211788896504456, "grad_norm": 0.10303400456905365, "learning_rate": 1.6846543642074382e-05, "loss": 2.5125, "step": 134 }, { "epoch": 2.9431117203564083, "grad_norm": 0.053891055285930634, "learning_rate": 1.679273338897293e-05, "loss": 2.6059, "step": 135 }, { "epoch": 2.9650445510623715, "grad_norm": 0.08152981102466583, "learning_rate": 1.6738555441818785e-05, "loss": 2.62, "step": 136 }, { "epoch": 2.9869773817683347, "grad_norm": 0.06032385304570198, "learning_rate": 1.668401273329129e-05, "loss": 2.534, "step": 137 }, { "epoch": 3.0, "grad_norm": 0.08430641144514084, "learning_rate": 1.6629108215814523e-05, "loss": 2.5903, "step": 138 }, { "epoch": 3.021932830705963, "grad_norm": 0.0818452313542366, "learning_rate": 1.6573844861397444e-05, "loss": 2.7053, "step": 139 }, { "epoch": 3.043865661411926, "grad_norm": 0.11237948387861252, "learning_rate": 1.6518225661473045e-05, "loss": 2.4826, "step": 140 }, { "epoch": 3.065798492117889, "grad_norm": 0.0827915221452713, "learning_rate": 1.6462253626736413e-05, "loss": 2.5895, "step": 141 }, { "epoch": 3.087731322823852, "grad_norm": 0.11660678684711456, "learning_rate": 1.6405931786981753e-05, "loss": 2.5654, "step": 142 }, { "epoch": 3.109664153529815, "grad_norm": 0.09041120111942291, "learning_rate": 1.63492631909384e-05, "loss": 2.5197, "step": 143 }, { "epoch": 3.131596984235778, "grad_norm": 0.06614544242620468, "learning_rate": 1.629225090610577e-05, "loss": 2.5294, "step": 144 }, { "epoch": 3.153529814941741, "grad_norm": 0.04271303862333298, "learning_rate": 1.6234898018587336e-05, "loss": 2.5046, "step": 145 }, { "epoch": 3.1754626456477038, "grad_norm": 0.08055449277162552, "learning_rate": 1.6177207632923558e-05, "loss": 2.5966, "step": 146 }, { "epoch": 3.197395476353667, "grad_norm": 0.06253743171691895, "learning_rate": 1.6119182871923834e-05, "loss": 2.5158, "step": 147 }, { "epoch": 3.2193283070596297, "grad_norm": 0.08224272727966309, "learning_rate": 1.606082687649748e-05, "loss": 2.6654, "step": 148 }, { "epoch": 3.241261137765593, "grad_norm": 0.09529576450586319, "learning_rate": 1.6002142805483686e-05, "loss": 2.5656, "step": 149 }, { "epoch": 3.2631939684715556, "grad_norm": 0.04647281765937805, "learning_rate": 1.5943133835480536e-05, "loss": 2.5524, "step": 150 }, { "epoch": 3.285126799177519, "grad_norm": 0.07187870889902115, "learning_rate": 1.588380316067307e-05, "loss": 2.5522, "step": 151 }, { "epoch": 3.307059629883482, "grad_norm": 0.06179989129304886, "learning_rate": 1.582415399266036e-05, "loss": 2.6259, "step": 152 }, { "epoch": 3.3289924605894448, "grad_norm": 0.0755513459444046, "learning_rate": 1.5764189560281677e-05, "loss": 2.5568, "step": 153 }, { "epoch": 3.350925291295408, "grad_norm": 0.08130097389221191, "learning_rate": 1.5703913109441715e-05, "loss": 2.6148, "step": 154 }, { "epoch": 3.3728581220013707, "grad_norm": 0.07435321062803268, "learning_rate": 1.564332790293487e-05, "loss": 2.5511, "step": 155 }, { "epoch": 3.394790952707334, "grad_norm": 0.05641249567270279, "learning_rate": 1.5582437220268648e-05, "loss": 2.4647, "step": 156 }, { "epoch": 3.4167237834132966, "grad_norm": 0.05596217140555382, "learning_rate": 1.5521244357486132e-05, "loss": 2.5502, "step": 157 }, { "epoch": 3.43865661411926, "grad_norm": 0.06887087225914001, "learning_rate": 1.5459752626987563e-05, "loss": 2.4662, "step": 158 }, { "epoch": 3.4605894448252226, "grad_norm": 0.06323248893022537, "learning_rate": 1.5397965357351035e-05, "loss": 2.5123, "step": 159 }, { "epoch": 3.4825222755311858, "grad_norm": 0.09084805101156235, "learning_rate": 1.5335885893152335e-05, "loss": 2.4644, "step": 160 }, { "epoch": 3.504455106237149, "grad_norm": 0.07094909995794296, "learning_rate": 1.5273517594783878e-05, "loss": 2.493, "step": 161 }, { "epoch": 3.5263879369431117, "grad_norm": 0.04841725155711174, "learning_rate": 1.521086383827282e-05, "loss": 2.4868, "step": 162 }, { "epoch": 3.5483207676490744, "grad_norm": 0.062218405306339264, "learning_rate": 1.5147928015098309e-05, "loss": 2.4924, "step": 163 }, { "epoch": 3.5702535983550376, "grad_norm": 0.06784085184335709, "learning_rate": 1.5084713532007906e-05, "loss": 2.5649, "step": 164 }, { "epoch": 3.592186429061001, "grad_norm": 0.07132422924041748, "learning_rate": 1.5021223810833165e-05, "loss": 2.5442, "step": 165 }, { "epoch": 3.6141192597669636, "grad_norm": 0.05512155964970589, "learning_rate": 1.4957462288304421e-05, "loss": 2.4669, "step": 166 }, { "epoch": 3.6360520904729268, "grad_norm": 0.060434065759181976, "learning_rate": 1.489343241586475e-05, "loss": 2.549, "step": 167 }, { "epoch": 3.6579849211788895, "grad_norm": 0.05476135015487671, "learning_rate": 1.4829137659483144e-05, "loss": 2.5643, "step": 168 }, { "epoch": 3.6799177518848527, "grad_norm": 0.06881222128868103, "learning_rate": 1.4764581499466895e-05, "loss": 2.3814, "step": 169 }, { "epoch": 3.701850582590816, "grad_norm": 0.08369456231594086, "learning_rate": 1.4699767430273202e-05, "loss": 2.5987, "step": 170 }, { "epoch": 3.7237834132967786, "grad_norm": 0.05814111605286598, "learning_rate": 1.4634698960320018e-05, "loss": 2.5668, "step": 171 }, { "epoch": 3.7457162440027414, "grad_norm": 0.05372776836156845, "learning_rate": 1.4569379611796137e-05, "loss": 2.5888, "step": 172 }, { "epoch": 3.7676490747087046, "grad_norm": 0.060161709785461426, "learning_rate": 1.4503812920470535e-05, "loss": 2.5052, "step": 173 }, { "epoch": 3.7895819054146678, "grad_norm": 0.06631386280059814, "learning_rate": 1.443800243550098e-05, "loss": 2.4963, "step": 174 }, { "epoch": 3.8115147361206305, "grad_norm": 0.06452737748622894, "learning_rate": 1.4371951719241906e-05, "loss": 2.4251, "step": 175 }, { "epoch": 3.8334475668265937, "grad_norm": 0.09428981691598892, "learning_rate": 1.4305664347051586e-05, "loss": 2.5676, "step": 176 }, { "epoch": 3.8553803975325565, "grad_norm": 0.050980083644390106, "learning_rate": 1.423914390709861e-05, "loss": 2.5683, "step": 177 }, { "epoch": 3.8773132282385196, "grad_norm": 0.05442551150918007, "learning_rate": 1.4172394000167625e-05, "loss": 2.5449, "step": 178 }, { "epoch": 3.8992460589444824, "grad_norm": 0.04739952087402344, "learning_rate": 1.4105418239464452e-05, "loss": 2.5018, "step": 179 }, { "epoch": 3.9211788896504456, "grad_norm": 0.06977739185094833, "learning_rate": 1.4038220250420487e-05, "loss": 2.5065, "step": 180 }, { "epoch": 3.9431117203564083, "grad_norm": 0.09621317684650421, "learning_rate": 1.3970803670496453e-05, "loss": 2.5293, "step": 181 }, { "epoch": 3.9650445510623715, "grad_norm": 0.07425831258296967, "learning_rate": 1.390317214898551e-05, "loss": 2.5197, "step": 182 }, { "epoch": 3.9869773817683347, "grad_norm": 0.05756077170372009, "learning_rate": 1.3835329346815716e-05, "loss": 2.4661, "step": 183 }, { "epoch": 4.0, "grad_norm": 0.07039082795381546, "learning_rate": 1.3767278936351853e-05, "loss": 2.7058, "step": 184 }, { "epoch": 4.021932830705963, "grad_norm": 0.06809078902006149, "learning_rate": 1.3699024601196641e-05, "loss": 2.5684, "step": 185 }, { "epoch": 4.043865661411926, "grad_norm": 0.044164013117551804, "learning_rate": 1.3630570035991352e-05, "loss": 2.4878, "step": 186 }, { "epoch": 4.065798492117889, "grad_norm": 0.07478977739810944, "learning_rate": 1.3561918946215807e-05, "loss": 2.5928, "step": 187 }, { "epoch": 4.087731322823852, "grad_norm": 0.053608592599630356, "learning_rate": 1.34930750479878e-05, "loss": 2.5654, "step": 188 }, { "epoch": 4.109664153529815, "grad_norm": 0.049672599881887436, "learning_rate": 1.3424042067861944e-05, "loss": 2.4353, "step": 189 }, { "epoch": 4.131596984235778, "grad_norm": 0.07206689566373825, "learning_rate": 1.335482374262795e-05, "loss": 2.5453, "step": 190 }, { "epoch": 4.153529814941741, "grad_norm": 0.059480294585227966, "learning_rate": 1.3285423819108349e-05, "loss": 2.5138, "step": 191 }, { "epoch": 4.175462645647704, "grad_norm": 0.05027122050523758, "learning_rate": 1.3215846053955683e-05, "loss": 2.441, "step": 192 }, { "epoch": 4.197395476353667, "grad_norm": 0.04946836456656456, "learning_rate": 1.3146094213449148e-05, "loss": 2.5398, "step": 193 }, { "epoch": 4.21932830705963, "grad_norm": 0.073307566344738, "learning_rate": 1.3076172073290726e-05, "loss": 2.5996, "step": 194 }, { "epoch": 4.241261137765592, "grad_norm": 0.04758650064468384, "learning_rate": 1.3006083418400799e-05, "loss": 2.5598, "step": 195 }, { "epoch": 4.263193968471556, "grad_norm": 0.0642954632639885, "learning_rate": 1.2935832042713288e-05, "loss": 2.53, "step": 196 }, { "epoch": 4.285126799177519, "grad_norm": 0.06495189666748047, "learning_rate": 1.2865421748970257e-05, "loss": 2.6442, "step": 197 }, { "epoch": 4.307059629883482, "grad_norm": 0.05023903772234917, "learning_rate": 1.2794856348516095e-05, "loss": 2.5318, "step": 198 }, { "epoch": 4.328992460589445, "grad_norm": 0.04791835695505142, "learning_rate": 1.2724139661091188e-05, "loss": 2.5011, "step": 199 }, { "epoch": 4.3509252912954075, "grad_norm": 0.0522720031440258, "learning_rate": 1.2653275514625165e-05, "loss": 2.5169, "step": 200 } ], "logging_steps": 1, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.351885860195533e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }