{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 14889, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00040298206729800525, "grad_norm": 0.26894956827163696, "learning_rate": 1.6116035455278e-07, "loss": 0.3796, "step": 2 }, { "epoch": 0.0008059641345960105, "grad_norm": 0.26727548241615295, "learning_rate": 3.2232070910556e-07, "loss": 0.4024, "step": 4 }, { "epoch": 0.0012089462018940156, "grad_norm": 0.3290499448776245, "learning_rate": 4.834810636583401e-07, "loss": 0.3773, "step": 6 }, { "epoch": 0.001611928269192021, "grad_norm": 0.2747355103492737, "learning_rate": 6.4464141821112e-07, "loss": 0.4127, "step": 8 }, { "epoch": 0.0020149103364900263, "grad_norm": 0.19865649938583374, "learning_rate": 8.058017727639e-07, "loss": 0.3967, "step": 10 }, { "epoch": 0.0024178924037880313, "grad_norm": 0.20322462916374207, "learning_rate": 9.669621273166802e-07, "loss": 0.4543, "step": 12 }, { "epoch": 0.0028208744710860366, "grad_norm": 0.19949519634246826, "learning_rate": 1.1281224818694602e-06, "loss": 0.3594, "step": 14 }, { "epoch": 0.003223856538384042, "grad_norm": 0.20132654905319214, "learning_rate": 1.28928283642224e-06, "loss": 0.418, "step": 16 }, { "epoch": 0.0036268386056820473, "grad_norm": 0.22182698547840118, "learning_rate": 1.4504431909750204e-06, "loss": 0.4144, "step": 18 }, { "epoch": 0.004029820672980053, "grad_norm": 0.2903820872306824, "learning_rate": 1.6116035455278e-06, "loss": 0.3906, "step": 20 }, { "epoch": 0.004432802740278058, "grad_norm": 0.20233897864818573, "learning_rate": 1.7727639000805805e-06, "loss": 0.3694, "step": 22 }, { "epoch": 0.0048357848075760625, "grad_norm": 0.21239183843135834, "learning_rate": 1.9339242546333603e-06, "loss": 0.4169, "step": 24 }, { "epoch": 0.005238766874874068, "grad_norm": 0.3125857710838318, "learning_rate": 2.09508460918614e-06, "loss": 0.4125, "step": 26 }, { "epoch": 0.005641748942172073, "grad_norm": 0.1859792172908783, "learning_rate": 2.2562449637389205e-06, "loss": 0.3919, "step": 28 }, { "epoch": 0.006044731009470079, "grad_norm": 0.38755470514297485, "learning_rate": 2.4174053182917003e-06, "loss": 0.4378, "step": 30 }, { "epoch": 0.006447713076768084, "grad_norm": 0.2895958125591278, "learning_rate": 2.57856567284448e-06, "loss": 0.3866, "step": 32 }, { "epoch": 0.006850695144066089, "grad_norm": 0.20762863755226135, "learning_rate": 2.7397260273972604e-06, "loss": 0.4852, "step": 34 }, { "epoch": 0.007253677211364095, "grad_norm": 0.1820192039012909, "learning_rate": 2.9008863819500407e-06, "loss": 0.3758, "step": 36 }, { "epoch": 0.007656659278662099, "grad_norm": 0.21095845103263855, "learning_rate": 3.0620467365028206e-06, "loss": 0.3388, "step": 38 }, { "epoch": 0.008059641345960105, "grad_norm": 0.17223376035690308, "learning_rate": 3.2232070910556e-06, "loss": 0.3846, "step": 40 }, { "epoch": 0.00846262341325811, "grad_norm": 0.28751063346862793, "learning_rate": 3.3843674456083807e-06, "loss": 0.424, "step": 42 }, { "epoch": 0.008865605480556116, "grad_norm": 0.22931885719299316, "learning_rate": 3.545527800161161e-06, "loss": 0.3927, "step": 44 }, { "epoch": 0.00926858754785412, "grad_norm": 0.17054371535778046, "learning_rate": 3.706688154713941e-06, "loss": 0.3698, "step": 46 }, { "epoch": 0.009671569615152125, "grad_norm": 0.1970645934343338, "learning_rate": 3.867848509266721e-06, "loss": 0.3825, "step": 48 }, { "epoch": 0.01007455168245013, "grad_norm": 0.226581409573555, "learning_rate": 4.0290088638195005e-06, "loss": 0.388, "step": 50 }, { "epoch": 0.010477533749748136, "grad_norm": 0.23808637261390686, "learning_rate": 4.19016921837228e-06, "loss": 0.3802, "step": 52 }, { "epoch": 0.010880515817046141, "grad_norm": 0.23428556323051453, "learning_rate": 4.351329572925061e-06, "loss": 0.4011, "step": 54 }, { "epoch": 0.011283497884344146, "grad_norm": 0.22363170981407166, "learning_rate": 4.512489927477841e-06, "loss": 0.388, "step": 56 }, { "epoch": 0.011686479951642152, "grad_norm": 0.19610151648521423, "learning_rate": 4.673650282030621e-06, "loss": 0.3456, "step": 58 }, { "epoch": 0.012089462018940157, "grad_norm": 0.32007136940956116, "learning_rate": 4.834810636583401e-06, "loss": 0.4075, "step": 60 }, { "epoch": 0.012492444086238163, "grad_norm": 0.2625230550765991, "learning_rate": 4.9959709911361805e-06, "loss": 0.3101, "step": 62 }, { "epoch": 0.012895426153536168, "grad_norm": 0.25986766815185547, "learning_rate": 5.15713134568896e-06, "loss": 0.3616, "step": 64 }, { "epoch": 0.013298408220834173, "grad_norm": 0.3018483817577362, "learning_rate": 5.31829170024174e-06, "loss": 0.3482, "step": 66 }, { "epoch": 0.013701390288132179, "grad_norm": 0.23337864875793457, "learning_rate": 5.479452054794521e-06, "loss": 0.3467, "step": 68 }, { "epoch": 0.014104372355430184, "grad_norm": 0.19600503146648407, "learning_rate": 5.6406124093473016e-06, "loss": 0.3617, "step": 70 }, { "epoch": 0.01450735442272819, "grad_norm": 0.24549278616905212, "learning_rate": 5.801772763900081e-06, "loss": 0.355, "step": 72 }, { "epoch": 0.014910336490026195, "grad_norm": 0.2724650502204895, "learning_rate": 5.962933118452861e-06, "loss": 0.366, "step": 74 }, { "epoch": 0.015313318557324198, "grad_norm": 0.179411381483078, "learning_rate": 6.124093473005641e-06, "loss": 0.3963, "step": 76 }, { "epoch": 0.015716300624622204, "grad_norm": 0.2288782000541687, "learning_rate": 6.285253827558421e-06, "loss": 0.3481, "step": 78 }, { "epoch": 0.01611928269192021, "grad_norm": 0.27673980593681335, "learning_rate": 6.4464141821112e-06, "loss": 0.3715, "step": 80 }, { "epoch": 0.016522264759218214, "grad_norm": 0.19291406869888306, "learning_rate": 6.607574536663981e-06, "loss": 0.2958, "step": 82 }, { "epoch": 0.01692524682651622, "grad_norm": 0.2432788461446762, "learning_rate": 6.768734891216761e-06, "loss": 0.3135, "step": 84 }, { "epoch": 0.017328228893814225, "grad_norm": 0.35042449831962585, "learning_rate": 6.929895245769541e-06, "loss": 0.289, "step": 86 }, { "epoch": 0.017731210961112232, "grad_norm": 0.2282453328371048, "learning_rate": 7.091055600322322e-06, "loss": 0.3255, "step": 88 }, { "epoch": 0.018134193028410236, "grad_norm": 0.2614617347717285, "learning_rate": 7.252215954875101e-06, "loss": 0.3164, "step": 90 }, { "epoch": 0.01853717509570824, "grad_norm": 0.20971183478832245, "learning_rate": 7.413376309427882e-06, "loss": 0.2639, "step": 92 }, { "epoch": 0.018940157163006247, "grad_norm": 0.2980507016181946, "learning_rate": 7.574536663980661e-06, "loss": 0.2453, "step": 94 }, { "epoch": 0.01934313923030425, "grad_norm": 0.27161258459091187, "learning_rate": 7.735697018533441e-06, "loss": 0.3087, "step": 96 }, { "epoch": 0.019746121297602257, "grad_norm": 0.2935725450515747, "learning_rate": 7.89685737308622e-06, "loss": 0.2627, "step": 98 }, { "epoch": 0.02014910336490026, "grad_norm": 0.20042681694030762, "learning_rate": 8.058017727639001e-06, "loss": 0.3023, "step": 100 }, { "epoch": 0.020552085432198268, "grad_norm": 0.2027212679386139, "learning_rate": 8.21917808219178e-06, "loss": 0.263, "step": 102 }, { "epoch": 0.02095506749949627, "grad_norm": 0.23784109950065613, "learning_rate": 8.38033843674456e-06, "loss": 0.2263, "step": 104 }, { "epoch": 0.02135804956679428, "grad_norm": 0.2175307422876358, "learning_rate": 8.541498791297341e-06, "loss": 0.2717, "step": 106 }, { "epoch": 0.021761031634092282, "grad_norm": 0.38441169261932373, "learning_rate": 8.702659145850122e-06, "loss": 0.253, "step": 108 }, { "epoch": 0.02216401370139029, "grad_norm": 0.32977694272994995, "learning_rate": 8.863819500402901e-06, "loss": 0.2557, "step": 110 }, { "epoch": 0.022566995768688293, "grad_norm": 0.35294121503829956, "learning_rate": 9.024979854955682e-06, "loss": 0.2767, "step": 112 }, { "epoch": 0.0229699778359863, "grad_norm": 0.2736724019050598, "learning_rate": 9.186140209508463e-06, "loss": 0.2394, "step": 114 }, { "epoch": 0.023372959903284304, "grad_norm": 0.48598694801330566, "learning_rate": 9.347300564061242e-06, "loss": 0.2349, "step": 116 }, { "epoch": 0.02377594197058231, "grad_norm": 0.2282780408859253, "learning_rate": 9.508460918614022e-06, "loss": 0.284, "step": 118 }, { "epoch": 0.024178924037880314, "grad_norm": 0.297885537147522, "learning_rate": 9.669621273166801e-06, "loss": 0.2087, "step": 120 }, { "epoch": 0.024581906105178318, "grad_norm": 0.1727965772151947, "learning_rate": 9.830781627719582e-06, "loss": 0.2552, "step": 122 }, { "epoch": 0.024984888172476325, "grad_norm": 0.2835056781768799, "learning_rate": 9.991941982272361e-06, "loss": 0.2698, "step": 124 }, { "epoch": 0.02538787023977433, "grad_norm": 0.29209110140800476, "learning_rate": 1.0153102336825142e-05, "loss": 0.1992, "step": 126 }, { "epoch": 0.025790852307072336, "grad_norm": 0.31140977144241333, "learning_rate": 1.031426269137792e-05, "loss": 0.224, "step": 128 }, { "epoch": 0.02619383437437034, "grad_norm": 0.3317878842353821, "learning_rate": 1.0475423045930701e-05, "loss": 0.216, "step": 130 }, { "epoch": 0.026596816441668347, "grad_norm": 0.22915571928024292, "learning_rate": 1.063658340048348e-05, "loss": 0.2123, "step": 132 }, { "epoch": 0.02699979850896635, "grad_norm": 0.2686645984649658, "learning_rate": 1.0797743755036261e-05, "loss": 0.2238, "step": 134 }, { "epoch": 0.027402780576264357, "grad_norm": 0.22468291223049164, "learning_rate": 1.0958904109589042e-05, "loss": 0.251, "step": 136 }, { "epoch": 0.02780576264356236, "grad_norm": 0.5836780667304993, "learning_rate": 1.1120064464141822e-05, "loss": 0.2242, "step": 138 }, { "epoch": 0.028208744710860368, "grad_norm": 0.6434018611907959, "learning_rate": 1.1281224818694603e-05, "loss": 0.251, "step": 140 }, { "epoch": 0.02861172677815837, "grad_norm": 0.2891378700733185, "learning_rate": 1.1442385173247382e-05, "loss": 0.2184, "step": 142 }, { "epoch": 0.02901470884545638, "grad_norm": 0.3141033351421356, "learning_rate": 1.1603545527800163e-05, "loss": 0.2608, "step": 144 }, { "epoch": 0.029417690912754382, "grad_norm": 0.20865300297737122, "learning_rate": 1.1764705882352942e-05, "loss": 0.2162, "step": 146 }, { "epoch": 0.02982067298005239, "grad_norm": 0.2990216016769409, "learning_rate": 1.1925866236905723e-05, "loss": 0.2296, "step": 148 }, { "epoch": 0.030223655047350393, "grad_norm": 0.28841957449913025, "learning_rate": 1.2087026591458502e-05, "loss": 0.2101, "step": 150 }, { "epoch": 0.030626637114648397, "grad_norm": 0.22070175409317017, "learning_rate": 1.2248186946011282e-05, "loss": 0.2492, "step": 152 }, { "epoch": 0.031029619181946404, "grad_norm": 0.26178812980651855, "learning_rate": 1.2409347300564061e-05, "loss": 0.1963, "step": 154 }, { "epoch": 0.03143260124924441, "grad_norm": 0.5203900933265686, "learning_rate": 1.2570507655116842e-05, "loss": 0.2219, "step": 156 }, { "epoch": 0.03183558331654241, "grad_norm": 0.1924479454755783, "learning_rate": 1.2731668009669623e-05, "loss": 0.2366, "step": 158 }, { "epoch": 0.03223856538384042, "grad_norm": 0.4090641140937805, "learning_rate": 1.28928283642224e-05, "loss": 0.2283, "step": 160 }, { "epoch": 0.032641547451138425, "grad_norm": 0.25200772285461426, "learning_rate": 1.305398871877518e-05, "loss": 0.2175, "step": 162 }, { "epoch": 0.03304452951843643, "grad_norm": 0.2138717919588089, "learning_rate": 1.3215149073327961e-05, "loss": 0.2178, "step": 164 }, { "epoch": 0.03344751158573443, "grad_norm": 0.23985524475574493, "learning_rate": 1.3376309427880742e-05, "loss": 0.2733, "step": 166 }, { "epoch": 0.03385049365303244, "grad_norm": 0.2478920966386795, "learning_rate": 1.3537469782433523e-05, "loss": 0.2489, "step": 168 }, { "epoch": 0.03425347572033045, "grad_norm": 0.3181647062301636, "learning_rate": 1.3698630136986302e-05, "loss": 0.2129, "step": 170 }, { "epoch": 0.03465645778762845, "grad_norm": 0.3164592683315277, "learning_rate": 1.3859790491539082e-05, "loss": 0.238, "step": 172 }, { "epoch": 0.035059439854926454, "grad_norm": 0.2163039594888687, "learning_rate": 1.4020950846091863e-05, "loss": 0.1853, "step": 174 }, { "epoch": 0.035462421922224464, "grad_norm": 0.2521723508834839, "learning_rate": 1.4182111200644644e-05, "loss": 0.2292, "step": 176 }, { "epoch": 0.03586540398952247, "grad_norm": 0.22221054136753082, "learning_rate": 1.4343271555197421e-05, "loss": 0.216, "step": 178 }, { "epoch": 0.03626838605682047, "grad_norm": 0.7538085579872131, "learning_rate": 1.4504431909750202e-05, "loss": 0.2608, "step": 180 }, { "epoch": 0.036671368124118475, "grad_norm": 0.20844882726669312, "learning_rate": 1.4665592264302983e-05, "loss": 0.2367, "step": 182 }, { "epoch": 0.03707435019141648, "grad_norm": 0.2020358443260193, "learning_rate": 1.4826752618855763e-05, "loss": 0.2516, "step": 184 }, { "epoch": 0.03747733225871449, "grad_norm": 0.3234182298183441, "learning_rate": 1.498791297340854e-05, "loss": 0.1662, "step": 186 }, { "epoch": 0.03788031432601249, "grad_norm": 0.18517746031284332, "learning_rate": 1.5149073327961321e-05, "loss": 0.2732, "step": 188 }, { "epoch": 0.0382832963933105, "grad_norm": 0.2105470597743988, "learning_rate": 1.5310233682514102e-05, "loss": 0.2293, "step": 190 }, { "epoch": 0.0386862784606085, "grad_norm": 0.29840391874313354, "learning_rate": 1.5471394037066883e-05, "loss": 0.2547, "step": 192 }, { "epoch": 0.03908926052790651, "grad_norm": 0.22715766727924347, "learning_rate": 1.563255439161966e-05, "loss": 0.1969, "step": 194 }, { "epoch": 0.039492242595204514, "grad_norm": 0.36480122804641724, "learning_rate": 1.579371474617244e-05, "loss": 0.2212, "step": 196 }, { "epoch": 0.03989522466250252, "grad_norm": 0.15495972335338593, "learning_rate": 1.595487510072522e-05, "loss": 0.2534, "step": 198 }, { "epoch": 0.04029820672980052, "grad_norm": 0.2647131383419037, "learning_rate": 1.6116035455278002e-05, "loss": 0.1396, "step": 200 }, { "epoch": 0.04070118879709853, "grad_norm": 0.430426687002182, "learning_rate": 1.6277195809830783e-05, "loss": 0.2301, "step": 202 }, { "epoch": 0.041104170864396536, "grad_norm": 0.19295406341552734, "learning_rate": 1.643835616438356e-05, "loss": 0.1858, "step": 204 }, { "epoch": 0.04150715293169454, "grad_norm": 0.20830343663692474, "learning_rate": 1.659951651893634e-05, "loss": 0.2315, "step": 206 }, { "epoch": 0.04191013499899254, "grad_norm": 0.4402204155921936, "learning_rate": 1.676067687348912e-05, "loss": 0.1993, "step": 208 }, { "epoch": 0.04231311706629055, "grad_norm": 0.40240907669067383, "learning_rate": 1.6921837228041902e-05, "loss": 0.1841, "step": 210 }, { "epoch": 0.04271609913358856, "grad_norm": 0.21966539323329926, "learning_rate": 1.7082997582594683e-05, "loss": 0.193, "step": 212 }, { "epoch": 0.04311908120088656, "grad_norm": 0.2012406885623932, "learning_rate": 1.7244157937147464e-05, "loss": 0.1679, "step": 214 }, { "epoch": 0.043522063268184565, "grad_norm": 0.2646319270133972, "learning_rate": 1.7405318291700244e-05, "loss": 0.2292, "step": 216 }, { "epoch": 0.04392504533548257, "grad_norm": 0.25195521116256714, "learning_rate": 1.7566478646253025e-05, "loss": 0.1845, "step": 218 }, { "epoch": 0.04432802740278058, "grad_norm": 0.24844960868358612, "learning_rate": 1.7727639000805802e-05, "loss": 0.2392, "step": 220 }, { "epoch": 0.04473100947007858, "grad_norm": 0.21988102793693542, "learning_rate": 1.7888799355358583e-05, "loss": 0.227, "step": 222 }, { "epoch": 0.045133991537376586, "grad_norm": 0.34720098972320557, "learning_rate": 1.8049959709911364e-05, "loss": 0.2223, "step": 224 }, { "epoch": 0.04553697360467459, "grad_norm": 0.2286384403705597, "learning_rate": 1.8211120064464144e-05, "loss": 0.195, "step": 226 }, { "epoch": 0.0459399556719726, "grad_norm": 0.5426890850067139, "learning_rate": 1.8372280419016925e-05, "loss": 0.1991, "step": 228 }, { "epoch": 0.046342937739270604, "grad_norm": 0.2465570718050003, "learning_rate": 1.8533440773569702e-05, "loss": 0.2399, "step": 230 }, { "epoch": 0.04674591980656861, "grad_norm": 0.25233685970306396, "learning_rate": 1.8694601128122483e-05, "loss": 0.1794, "step": 232 }, { "epoch": 0.04714890187386661, "grad_norm": 0.29172617197036743, "learning_rate": 1.8855761482675264e-05, "loss": 0.2746, "step": 234 }, { "epoch": 0.04755188394116462, "grad_norm": 0.22605657577514648, "learning_rate": 1.9016921837228044e-05, "loss": 0.2205, "step": 236 }, { "epoch": 0.047954866008462625, "grad_norm": 0.22374406456947327, "learning_rate": 1.9178082191780822e-05, "loss": 0.1858, "step": 238 }, { "epoch": 0.04835784807576063, "grad_norm": 0.3003512918949127, "learning_rate": 1.9339242546333602e-05, "loss": 0.1881, "step": 240 }, { "epoch": 0.04876083014305863, "grad_norm": 0.24595895409584045, "learning_rate": 1.9500402900886383e-05, "loss": 0.2389, "step": 242 }, { "epoch": 0.049163812210356636, "grad_norm": 0.1715419590473175, "learning_rate": 1.9661563255439164e-05, "loss": 0.2205, "step": 244 }, { "epoch": 0.04956679427765465, "grad_norm": 0.1657872498035431, "learning_rate": 1.982272360999194e-05, "loss": 0.2125, "step": 246 }, { "epoch": 0.04996977634495265, "grad_norm": 0.2673228085041046, "learning_rate": 1.9983883964544722e-05, "loss": 0.2688, "step": 248 }, { "epoch": 0.050372758412250654, "grad_norm": 0.2491181194782257, "learning_rate": 2.0145044319097503e-05, "loss": 0.2558, "step": 250 }, { "epoch": 0.05077574047954866, "grad_norm": 0.1648625135421753, "learning_rate": 2.0306204673650283e-05, "loss": 0.2304, "step": 252 }, { "epoch": 0.05117872254684667, "grad_norm": 0.21671715378761292, "learning_rate": 2.0467365028203064e-05, "loss": 0.1811, "step": 254 }, { "epoch": 0.05158170461414467, "grad_norm": 0.19184859097003937, "learning_rate": 2.062852538275584e-05, "loss": 0.1923, "step": 256 }, { "epoch": 0.051984686681442675, "grad_norm": 0.3803746998310089, "learning_rate": 2.0789685737308622e-05, "loss": 0.1985, "step": 258 }, { "epoch": 0.05238766874874068, "grad_norm": 0.2140813171863556, "learning_rate": 2.0950846091861403e-05, "loss": 0.2185, "step": 260 }, { "epoch": 0.05279065081603869, "grad_norm": 0.16832232475280762, "learning_rate": 2.1112006446414183e-05, "loss": 0.2087, "step": 262 }, { "epoch": 0.05319363288333669, "grad_norm": 0.21901485323905945, "learning_rate": 2.127316680096696e-05, "loss": 0.1855, "step": 264 }, { "epoch": 0.0535966149506347, "grad_norm": 0.2979806959629059, "learning_rate": 2.143432715551974e-05, "loss": 0.2444, "step": 266 }, { "epoch": 0.0539995970179327, "grad_norm": 0.2841276228427887, "learning_rate": 2.1595487510072522e-05, "loss": 0.2148, "step": 268 }, { "epoch": 0.054402579085230704, "grad_norm": 0.1896170824766159, "learning_rate": 2.1756647864625303e-05, "loss": 0.2656, "step": 270 }, { "epoch": 0.054805561152528715, "grad_norm": 0.24286943674087524, "learning_rate": 2.1917808219178083e-05, "loss": 0.2079, "step": 272 }, { "epoch": 0.05520854321982672, "grad_norm": 0.41237831115722656, "learning_rate": 2.2078968573730864e-05, "loss": 0.135, "step": 274 }, { "epoch": 0.05561152528712472, "grad_norm": 0.17708082497119904, "learning_rate": 2.2240128928283645e-05, "loss": 0.2, "step": 276 }, { "epoch": 0.056014507354422725, "grad_norm": 0.1603289693593979, "learning_rate": 2.2401289282836426e-05, "loss": 0.1929, "step": 278 }, { "epoch": 0.056417489421720736, "grad_norm": 0.25106561183929443, "learning_rate": 2.2562449637389206e-05, "loss": 0.1673, "step": 280 }, { "epoch": 0.05682047148901874, "grad_norm": 0.19034595787525177, "learning_rate": 2.2723609991941984e-05, "loss": 0.2536, "step": 282 }, { "epoch": 0.05722345355631674, "grad_norm": 0.2286035567522049, "learning_rate": 2.2884770346494764e-05, "loss": 0.1878, "step": 284 }, { "epoch": 0.05762643562361475, "grad_norm": 0.21165654063224792, "learning_rate": 2.3045930701047545e-05, "loss": 0.1909, "step": 286 }, { "epoch": 0.05802941769091276, "grad_norm": 0.18575643002986908, "learning_rate": 2.3207091055600326e-05, "loss": 0.2177, "step": 288 }, { "epoch": 0.05843239975821076, "grad_norm": 0.23574145138263702, "learning_rate": 2.3368251410153103e-05, "loss": 0.1526, "step": 290 }, { "epoch": 0.058835381825508765, "grad_norm": 0.25691771507263184, "learning_rate": 2.3529411764705884e-05, "loss": 0.2041, "step": 292 }, { "epoch": 0.05923836389280677, "grad_norm": 0.16939234733581543, "learning_rate": 2.3690572119258664e-05, "loss": 0.1546, "step": 294 }, { "epoch": 0.05964134596010478, "grad_norm": 0.15170009434223175, "learning_rate": 2.3851732473811445e-05, "loss": 0.2195, "step": 296 }, { "epoch": 0.06004432802740278, "grad_norm": 0.22137551009655, "learning_rate": 2.4012892828364222e-05, "loss": 0.2026, "step": 298 }, { "epoch": 0.060447310094700786, "grad_norm": 0.16329768300056458, "learning_rate": 2.4174053182917003e-05, "loss": 0.1763, "step": 300 }, { "epoch": 0.06085029216199879, "grad_norm": 0.17110270261764526, "learning_rate": 2.4335213537469784e-05, "loss": 0.2008, "step": 302 }, { "epoch": 0.06125327422929679, "grad_norm": 0.2618560492992401, "learning_rate": 2.4496373892022564e-05, "loss": 0.2408, "step": 304 }, { "epoch": 0.061656256296594804, "grad_norm": 0.14731261134147644, "learning_rate": 2.4657534246575342e-05, "loss": 0.2269, "step": 306 }, { "epoch": 0.06205923836389281, "grad_norm": 0.19489218294620514, "learning_rate": 2.4818694601128122e-05, "loss": 0.2106, "step": 308 }, { "epoch": 0.06246222043119081, "grad_norm": 0.18849453330039978, "learning_rate": 2.4979854955680903e-05, "loss": 0.1856, "step": 310 }, { "epoch": 0.06286520249848881, "grad_norm": 0.83469557762146, "learning_rate": 2.5141015310233684e-05, "loss": 0.2558, "step": 312 }, { "epoch": 0.06326818456578683, "grad_norm": 0.17429494857788086, "learning_rate": 2.5302175664786465e-05, "loss": 0.2025, "step": 314 }, { "epoch": 0.06367116663308482, "grad_norm": 0.17523562908172607, "learning_rate": 2.5463336019339245e-05, "loss": 0.2015, "step": 316 }, { "epoch": 0.06407414870038283, "grad_norm": 0.1575326770544052, "learning_rate": 2.5624496373892026e-05, "loss": 0.2234, "step": 318 }, { "epoch": 0.06447713076768084, "grad_norm": 0.19641879200935364, "learning_rate": 2.57856567284448e-05, "loss": 0.1846, "step": 320 }, { "epoch": 0.06488011283497884, "grad_norm": 0.2497224360704422, "learning_rate": 2.594681708299758e-05, "loss": 0.1492, "step": 322 }, { "epoch": 0.06528309490227685, "grad_norm": 0.36354440450668335, "learning_rate": 2.610797743755036e-05, "loss": 0.1787, "step": 324 }, { "epoch": 0.06568607696957485, "grad_norm": 0.16666480898857117, "learning_rate": 2.6269137792103142e-05, "loss": 0.2055, "step": 326 }, { "epoch": 0.06608905903687286, "grad_norm": 0.20041917264461517, "learning_rate": 2.6430298146655923e-05, "loss": 0.2166, "step": 328 }, { "epoch": 0.06649204110417087, "grad_norm": 0.3149804472923279, "learning_rate": 2.6591458501208703e-05, "loss": 0.1834, "step": 330 }, { "epoch": 0.06689502317146886, "grad_norm": 0.17761199176311493, "learning_rate": 2.6752618855761484e-05, "loss": 0.174, "step": 332 }, { "epoch": 0.06729800523876688, "grad_norm": 0.1416761577129364, "learning_rate": 2.6913779210314265e-05, "loss": 0.2131, "step": 334 }, { "epoch": 0.06770098730606489, "grad_norm": 0.207278773188591, "learning_rate": 2.7074939564867045e-05, "loss": 0.265, "step": 336 }, { "epoch": 0.06810396937336288, "grad_norm": 0.3748891055583954, "learning_rate": 2.7236099919419823e-05, "loss": 0.1741, "step": 338 }, { "epoch": 0.0685069514406609, "grad_norm": 0.38350576162338257, "learning_rate": 2.7397260273972603e-05, "loss": 0.165, "step": 340 }, { "epoch": 0.06890993350795889, "grad_norm": 0.2217901349067688, "learning_rate": 2.7558420628525384e-05, "loss": 0.1614, "step": 342 }, { "epoch": 0.0693129155752569, "grad_norm": 0.16407828032970428, "learning_rate": 2.7719580983078165e-05, "loss": 0.1877, "step": 344 }, { "epoch": 0.06971589764255491, "grad_norm": 0.19459925591945648, "learning_rate": 2.7880741337630946e-05, "loss": 0.1603, "step": 346 }, { "epoch": 0.07011887970985291, "grad_norm": 0.1655324548482895, "learning_rate": 2.8041901692183726e-05, "loss": 0.2015, "step": 348 }, { "epoch": 0.07052186177715092, "grad_norm": 0.2289515882730484, "learning_rate": 2.8203062046736507e-05, "loss": 0.2403, "step": 350 }, { "epoch": 0.07092484384444893, "grad_norm": 0.19211649894714355, "learning_rate": 2.8364222401289288e-05, "loss": 0.2097, "step": 352 }, { "epoch": 0.07132782591174693, "grad_norm": 0.17989704012870789, "learning_rate": 2.852538275584206e-05, "loss": 0.19, "step": 354 }, { "epoch": 0.07173080797904494, "grad_norm": 0.1994090974330902, "learning_rate": 2.8686543110394842e-05, "loss": 0.1731, "step": 356 }, { "epoch": 0.07213379004634293, "grad_norm": 0.20826664566993713, "learning_rate": 2.8847703464947623e-05, "loss": 0.1895, "step": 358 }, { "epoch": 0.07253677211364094, "grad_norm": 0.28034508228302, "learning_rate": 2.9008863819500404e-05, "loss": 0.2372, "step": 360 }, { "epoch": 0.07293975418093895, "grad_norm": 0.24738198518753052, "learning_rate": 2.9170024174053184e-05, "loss": 0.1983, "step": 362 }, { "epoch": 0.07334273624823695, "grad_norm": 0.2001752257347107, "learning_rate": 2.9331184528605965e-05, "loss": 0.1969, "step": 364 }, { "epoch": 0.07374571831553496, "grad_norm": 0.2115003913640976, "learning_rate": 2.9492344883158746e-05, "loss": 0.2637, "step": 366 }, { "epoch": 0.07414870038283296, "grad_norm": 0.1739916056394577, "learning_rate": 2.9653505237711526e-05, "loss": 0.2119, "step": 368 }, { "epoch": 0.07455168245013097, "grad_norm": 0.19847404956817627, "learning_rate": 2.9814665592264307e-05, "loss": 0.178, "step": 370 }, { "epoch": 0.07495466451742898, "grad_norm": 0.27303946018218994, "learning_rate": 2.997582594681708e-05, "loss": 0.203, "step": 372 }, { "epoch": 0.07535764658472698, "grad_norm": 0.20624373853206635, "learning_rate": 3.0136986301369862e-05, "loss": 0.2282, "step": 374 }, { "epoch": 0.07576062865202499, "grad_norm": 0.15636469423770905, "learning_rate": 3.0298146655922643e-05, "loss": 0.1499, "step": 376 }, { "epoch": 0.076163610719323, "grad_norm": 0.15241625905036926, "learning_rate": 3.0459307010475423e-05, "loss": 0.234, "step": 378 }, { "epoch": 0.076566592786621, "grad_norm": 0.2701398432254791, "learning_rate": 3.0620467365028204e-05, "loss": 0.232, "step": 380 }, { "epoch": 0.076969574853919, "grad_norm": 0.23230019211769104, "learning_rate": 3.078162771958099e-05, "loss": 0.1942, "step": 382 }, { "epoch": 0.077372556921217, "grad_norm": 0.16999439895153046, "learning_rate": 3.0942788074133765e-05, "loss": 0.1608, "step": 384 }, { "epoch": 0.07777553898851501, "grad_norm": 0.2917148470878601, "learning_rate": 3.110394842868655e-05, "loss": 0.2297, "step": 386 }, { "epoch": 0.07817852105581302, "grad_norm": 0.2013273537158966, "learning_rate": 3.126510878323932e-05, "loss": 0.193, "step": 388 }, { "epoch": 0.07858150312311102, "grad_norm": 0.2821531295776367, "learning_rate": 3.1426269137792104e-05, "loss": 0.1884, "step": 390 }, { "epoch": 0.07898448519040903, "grad_norm": 0.17071853578090668, "learning_rate": 3.158742949234488e-05, "loss": 0.2326, "step": 392 }, { "epoch": 0.07938746725770703, "grad_norm": 0.2293412685394287, "learning_rate": 3.1748589846897665e-05, "loss": 0.2068, "step": 394 }, { "epoch": 0.07979044932500504, "grad_norm": 0.15788160264492035, "learning_rate": 3.190975020145044e-05, "loss": 0.2345, "step": 396 }, { "epoch": 0.08019343139230305, "grad_norm": 0.16741882264614105, "learning_rate": 3.207091055600323e-05, "loss": 0.2019, "step": 398 }, { "epoch": 0.08059641345960104, "grad_norm": 0.17850607633590698, "learning_rate": 3.2232070910556004e-05, "loss": 0.2225, "step": 400 }, { "epoch": 0.08099939552689905, "grad_norm": 0.19540104269981384, "learning_rate": 3.239323126510879e-05, "loss": 0.2101, "step": 402 }, { "epoch": 0.08140237759419706, "grad_norm": 0.26288026571273804, "learning_rate": 3.2554391619661566e-05, "loss": 0.2099, "step": 404 }, { "epoch": 0.08180535966149506, "grad_norm": 0.16300523281097412, "learning_rate": 3.271555197421434e-05, "loss": 0.2115, "step": 406 }, { "epoch": 0.08220834172879307, "grad_norm": 0.14380481839179993, "learning_rate": 3.287671232876712e-05, "loss": 0.1825, "step": 408 }, { "epoch": 0.08261132379609107, "grad_norm": 0.1476619839668274, "learning_rate": 3.3037872683319904e-05, "loss": 0.1802, "step": 410 }, { "epoch": 0.08301430586338908, "grad_norm": 0.2590979337692261, "learning_rate": 3.319903303787268e-05, "loss": 0.2175, "step": 412 }, { "epoch": 0.08341728793068709, "grad_norm": 0.20722797513008118, "learning_rate": 3.3360193392425466e-05, "loss": 0.184, "step": 414 }, { "epoch": 0.08382026999798509, "grad_norm": 0.17478062212467194, "learning_rate": 3.352135374697824e-05, "loss": 0.1962, "step": 416 }, { "epoch": 0.0842232520652831, "grad_norm": 0.18867933750152588, "learning_rate": 3.368251410153103e-05, "loss": 0.2151, "step": 418 }, { "epoch": 0.0846262341325811, "grad_norm": 0.24660547077655792, "learning_rate": 3.3843674456083804e-05, "loss": 0.2336, "step": 420 }, { "epoch": 0.0850292161998791, "grad_norm": 0.26829826831817627, "learning_rate": 3.400483481063659e-05, "loss": 0.2369, "step": 422 }, { "epoch": 0.08543219826717711, "grad_norm": 0.1370697319507599, "learning_rate": 3.4165995165189366e-05, "loss": 0.1818, "step": 424 }, { "epoch": 0.08583518033447511, "grad_norm": 1.2252482175827026, "learning_rate": 3.432715551974214e-05, "loss": 0.2081, "step": 426 }, { "epoch": 0.08623816240177312, "grad_norm": 0.14920452237129211, "learning_rate": 3.448831587429493e-05, "loss": 0.2401, "step": 428 }, { "epoch": 0.08664114446907113, "grad_norm": 0.1496407687664032, "learning_rate": 3.4649476228847704e-05, "loss": 0.2269, "step": 430 }, { "epoch": 0.08704412653636913, "grad_norm": 0.12217120826244354, "learning_rate": 3.481063658340049e-05, "loss": 0.1497, "step": 432 }, { "epoch": 0.08744710860366714, "grad_norm": 0.21569272875785828, "learning_rate": 3.4971796937953266e-05, "loss": 0.2101, "step": 434 }, { "epoch": 0.08785009067096514, "grad_norm": 0.1281503438949585, "learning_rate": 3.513295729250605e-05, "loss": 0.2057, "step": 436 }, { "epoch": 0.08825307273826315, "grad_norm": 0.14296643435955048, "learning_rate": 3.529411764705883e-05, "loss": 0.2512, "step": 438 }, { "epoch": 0.08865605480556116, "grad_norm": 0.12831254303455353, "learning_rate": 3.5455278001611605e-05, "loss": 0.1635, "step": 440 }, { "epoch": 0.08905903687285915, "grad_norm": 0.17410136759281158, "learning_rate": 3.561643835616438e-05, "loss": 0.2307, "step": 442 }, { "epoch": 0.08946201894015716, "grad_norm": 0.12199216336011887, "learning_rate": 3.5777598710717166e-05, "loss": 0.2194, "step": 444 }, { "epoch": 0.08986500100745516, "grad_norm": 0.17704665660858154, "learning_rate": 3.593875906526994e-05, "loss": 0.1931, "step": 446 }, { "epoch": 0.09026798307475317, "grad_norm": 0.20914296805858612, "learning_rate": 3.609991941982273e-05, "loss": 0.256, "step": 448 }, { "epoch": 0.09067096514205118, "grad_norm": 0.14003607630729675, "learning_rate": 3.6261079774375505e-05, "loss": 0.1724, "step": 450 }, { "epoch": 0.09107394720934918, "grad_norm": 0.23989439010620117, "learning_rate": 3.642224012892829e-05, "loss": 0.1871, "step": 452 }, { "epoch": 0.09147692927664719, "grad_norm": 0.17189793288707733, "learning_rate": 3.6583400483481066e-05, "loss": 0.1911, "step": 454 }, { "epoch": 0.0918799113439452, "grad_norm": 0.18356207013130188, "learning_rate": 3.674456083803385e-05, "loss": 0.2036, "step": 456 }, { "epoch": 0.0922828934112432, "grad_norm": 0.19528169929981232, "learning_rate": 3.690572119258662e-05, "loss": 0.2428, "step": 458 }, { "epoch": 0.09268587547854121, "grad_norm": 0.12380246818065643, "learning_rate": 3.7066881547139405e-05, "loss": 0.1746, "step": 460 }, { "epoch": 0.0930888575458392, "grad_norm": 0.20087508857250214, "learning_rate": 3.722804190169218e-05, "loss": 0.257, "step": 462 }, { "epoch": 0.09349183961313721, "grad_norm": 0.22255493700504303, "learning_rate": 3.7389202256244966e-05, "loss": 0.1897, "step": 464 }, { "epoch": 0.09389482168043523, "grad_norm": 0.2199143022298813, "learning_rate": 3.7550362610797743e-05, "loss": 0.208, "step": 466 }, { "epoch": 0.09429780374773322, "grad_norm": 0.12534235417842865, "learning_rate": 3.771152296535053e-05, "loss": 0.2261, "step": 468 }, { "epoch": 0.09470078581503123, "grad_norm": 0.14178584516048431, "learning_rate": 3.7872683319903305e-05, "loss": 0.1724, "step": 470 }, { "epoch": 0.09510376788232924, "grad_norm": 0.192081019282341, "learning_rate": 3.803384367445609e-05, "loss": 0.235, "step": 472 }, { "epoch": 0.09550674994962724, "grad_norm": 0.24138332903385162, "learning_rate": 3.8195004029008866e-05, "loss": 0.187, "step": 474 }, { "epoch": 0.09590973201692525, "grad_norm": 0.20738311111927032, "learning_rate": 3.8356164383561644e-05, "loss": 0.1986, "step": 476 }, { "epoch": 0.09631271408422325, "grad_norm": 0.16928641498088837, "learning_rate": 3.851732473811442e-05, "loss": 0.1959, "step": 478 }, { "epoch": 0.09671569615152126, "grad_norm": 0.1994764506816864, "learning_rate": 3.8678485092667205e-05, "loss": 0.2657, "step": 480 }, { "epoch": 0.09711867821881927, "grad_norm": 0.12018430978059769, "learning_rate": 3.883964544721998e-05, "loss": 0.2247, "step": 482 }, { "epoch": 0.09752166028611726, "grad_norm": 0.12260066717863083, "learning_rate": 3.9000805801772766e-05, "loss": 0.2436, "step": 484 }, { "epoch": 0.09792464235341528, "grad_norm": 0.20381806790828705, "learning_rate": 3.9161966156325544e-05, "loss": 0.2528, "step": 486 }, { "epoch": 0.09832762442071327, "grad_norm": 0.1553252935409546, "learning_rate": 3.932312651087833e-05, "loss": 0.1424, "step": 488 }, { "epoch": 0.09873060648801128, "grad_norm": 0.12027744948863983, "learning_rate": 3.9484286865431105e-05, "loss": 0.1758, "step": 490 }, { "epoch": 0.0991335885553093, "grad_norm": 0.19894269108772278, "learning_rate": 3.964544721998388e-05, "loss": 0.2018, "step": 492 }, { "epoch": 0.09953657062260729, "grad_norm": 0.23655803501605988, "learning_rate": 3.9806607574536666e-05, "loss": 0.2014, "step": 494 }, { "epoch": 0.0999395526899053, "grad_norm": 0.14982983469963074, "learning_rate": 3.9967767929089444e-05, "loss": 0.1603, "step": 496 }, { "epoch": 0.10034253475720331, "grad_norm": 0.17792493104934692, "learning_rate": 4.012892828364223e-05, "loss": 0.2296, "step": 498 }, { "epoch": 0.10074551682450131, "grad_norm": 0.13077042996883392, "learning_rate": 4.0290088638195005e-05, "loss": 0.2162, "step": 500 }, { "epoch": 0.10114849889179932, "grad_norm": 0.15423962473869324, "learning_rate": 4.045124899274779e-05, "loss": 0.2, "step": 502 }, { "epoch": 0.10155148095909731, "grad_norm": 0.1420324593782425, "learning_rate": 4.0612409347300567e-05, "loss": 0.1833, "step": 504 }, { "epoch": 0.10195446302639533, "grad_norm": 0.20684389770030975, "learning_rate": 4.077356970185335e-05, "loss": 0.211, "step": 506 }, { "epoch": 0.10235744509369334, "grad_norm": 0.1461561918258667, "learning_rate": 4.093473005640613e-05, "loss": 0.1594, "step": 508 }, { "epoch": 0.10276042716099133, "grad_norm": 0.34511980414390564, "learning_rate": 4.1095890410958905e-05, "loss": 0.1807, "step": 510 }, { "epoch": 0.10316340922828934, "grad_norm": 0.35092246532440186, "learning_rate": 4.125705076551168e-05, "loss": 0.1953, "step": 512 }, { "epoch": 0.10356639129558734, "grad_norm": 0.1539539396762848, "learning_rate": 4.141821112006447e-05, "loss": 0.2323, "step": 514 }, { "epoch": 0.10396937336288535, "grad_norm": 0.11350340396165848, "learning_rate": 4.1579371474617244e-05, "loss": 0.1686, "step": 516 }, { "epoch": 0.10437235543018336, "grad_norm": 0.21042321622371674, "learning_rate": 4.174053182917003e-05, "loss": 0.1826, "step": 518 }, { "epoch": 0.10477533749748136, "grad_norm": 0.15128977596759796, "learning_rate": 4.1901692183722805e-05, "loss": 0.1553, "step": 520 }, { "epoch": 0.10517831956477937, "grad_norm": 0.13588109612464905, "learning_rate": 4.206285253827559e-05, "loss": 0.1567, "step": 522 }, { "epoch": 0.10558130163207738, "grad_norm": 0.3547620475292206, "learning_rate": 4.222401289282837e-05, "loss": 0.1821, "step": 524 }, { "epoch": 0.10598428369937538, "grad_norm": 0.2869426906108856, "learning_rate": 4.2385173247381144e-05, "loss": 0.209, "step": 526 }, { "epoch": 0.10638726576667339, "grad_norm": 0.2165244221687317, "learning_rate": 4.254633360193392e-05, "loss": 0.1807, "step": 528 }, { "epoch": 0.10679024783397138, "grad_norm": 0.13855288922786713, "learning_rate": 4.2707493956486705e-05, "loss": 0.2015, "step": 530 }, { "epoch": 0.1071932299012694, "grad_norm": 0.16502977907657623, "learning_rate": 4.286865431103948e-05, "loss": 0.2467, "step": 532 }, { "epoch": 0.1075962119685674, "grad_norm": 0.21364782750606537, "learning_rate": 4.302981466559227e-05, "loss": 0.2445, "step": 534 }, { "epoch": 0.1079991940358654, "grad_norm": 0.1428193747997284, "learning_rate": 4.3190975020145044e-05, "loss": 0.2289, "step": 536 }, { "epoch": 0.10840217610316341, "grad_norm": 0.16634321212768555, "learning_rate": 4.335213537469783e-05, "loss": 0.1819, "step": 538 }, { "epoch": 0.10880515817046141, "grad_norm": 0.15989083051681519, "learning_rate": 4.3513295729250606e-05, "loss": 0.1597, "step": 540 }, { "epoch": 0.10920814023775942, "grad_norm": 0.10890760272741318, "learning_rate": 4.367445608380339e-05, "loss": 0.1679, "step": 542 }, { "epoch": 0.10961112230505743, "grad_norm": 0.1206541359424591, "learning_rate": 4.383561643835617e-05, "loss": 0.2062, "step": 544 }, { "epoch": 0.11001410437235543, "grad_norm": 0.18289783596992493, "learning_rate": 4.3996776792908944e-05, "loss": 0.2238, "step": 546 }, { "epoch": 0.11041708643965344, "grad_norm": 0.24548190832138062, "learning_rate": 4.415793714746173e-05, "loss": 0.2025, "step": 548 }, { "epoch": 0.11082006850695145, "grad_norm": 0.14362087845802307, "learning_rate": 4.4319097502014506e-05, "loss": 0.1806, "step": 550 }, { "epoch": 0.11122305057424944, "grad_norm": 0.12100233882665634, "learning_rate": 4.448025785656729e-05, "loss": 0.2202, "step": 552 }, { "epoch": 0.11162603264154745, "grad_norm": 0.12502926588058472, "learning_rate": 4.464141821112007e-05, "loss": 0.2509, "step": 554 }, { "epoch": 0.11202901470884545, "grad_norm": 0.4697296619415283, "learning_rate": 4.480257856567285e-05, "loss": 0.2091, "step": 556 }, { "epoch": 0.11243199677614346, "grad_norm": 0.13163422048091888, "learning_rate": 4.496373892022563e-05, "loss": 0.1593, "step": 558 }, { "epoch": 0.11283497884344147, "grad_norm": 0.11262835562229156, "learning_rate": 4.512489927477841e-05, "loss": 0.2005, "step": 560 }, { "epoch": 0.11323796091073947, "grad_norm": 0.13250380754470825, "learning_rate": 4.528605962933118e-05, "loss": 0.2237, "step": 562 }, { "epoch": 0.11364094297803748, "grad_norm": 0.17639359831809998, "learning_rate": 4.544721998388397e-05, "loss": 0.2141, "step": 564 }, { "epoch": 0.11404392504533548, "grad_norm": 0.16560794413089752, "learning_rate": 4.5608380338436744e-05, "loss": 0.1954, "step": 566 }, { "epoch": 0.11444690711263349, "grad_norm": 0.20894743502140045, "learning_rate": 4.576954069298953e-05, "loss": 0.1817, "step": 568 }, { "epoch": 0.1148498891799315, "grad_norm": 0.17747287452220917, "learning_rate": 4.5930701047542306e-05, "loss": 0.1519, "step": 570 }, { "epoch": 0.1152528712472295, "grad_norm": 0.32621023058891296, "learning_rate": 4.609186140209509e-05, "loss": 0.1936, "step": 572 }, { "epoch": 0.1156558533145275, "grad_norm": 0.169255793094635, "learning_rate": 4.625302175664787e-05, "loss": 0.2114, "step": 574 }, { "epoch": 0.11605883538182551, "grad_norm": 0.11003939807415009, "learning_rate": 4.641418211120065e-05, "loss": 0.1852, "step": 576 }, { "epoch": 0.11646181744912351, "grad_norm": 0.10437988489866257, "learning_rate": 4.657534246575342e-05, "loss": 0.1732, "step": 578 }, { "epoch": 0.11686479951642152, "grad_norm": 0.1323920041322708, "learning_rate": 4.6736502820306206e-05, "loss": 0.1749, "step": 580 }, { "epoch": 0.11726778158371952, "grad_norm": 0.16257719695568085, "learning_rate": 4.689766317485898e-05, "loss": 0.2036, "step": 582 }, { "epoch": 0.11767076365101753, "grad_norm": 0.13824671506881714, "learning_rate": 4.705882352941177e-05, "loss": 0.2065, "step": 584 }, { "epoch": 0.11807374571831554, "grad_norm": 0.14059747755527496, "learning_rate": 4.7219983883964545e-05, "loss": 0.2077, "step": 586 }, { "epoch": 0.11847672778561354, "grad_norm": 0.1127152293920517, "learning_rate": 4.738114423851733e-05, "loss": 0.2296, "step": 588 }, { "epoch": 0.11887970985291155, "grad_norm": 0.1408272385597229, "learning_rate": 4.7542304593070106e-05, "loss": 0.2205, "step": 590 }, { "epoch": 0.11928269192020956, "grad_norm": 0.14287517964839935, "learning_rate": 4.770346494762289e-05, "loss": 0.2082, "step": 592 }, { "epoch": 0.11968567398750755, "grad_norm": 0.11590491980314255, "learning_rate": 4.786462530217567e-05, "loss": 0.1657, "step": 594 }, { "epoch": 0.12008865605480556, "grad_norm": 0.13355225324630737, "learning_rate": 4.8025785656728445e-05, "loss": 0.2237, "step": 596 }, { "epoch": 0.12049163812210356, "grad_norm": 0.3162963092327118, "learning_rate": 4.818694601128122e-05, "loss": 0.197, "step": 598 }, { "epoch": 0.12089462018940157, "grad_norm": 0.19958584010601044, "learning_rate": 4.8348106365834006e-05, "loss": 0.1635, "step": 600 }, { "epoch": 0.12129760225669958, "grad_norm": 0.13679653406143188, "learning_rate": 4.8509266720386783e-05, "loss": 0.2212, "step": 602 }, { "epoch": 0.12170058432399758, "grad_norm": 0.1415790170431137, "learning_rate": 4.867042707493957e-05, "loss": 0.1708, "step": 604 }, { "epoch": 0.12210356639129559, "grad_norm": 0.1621982902288437, "learning_rate": 4.8831587429492345e-05, "loss": 0.2567, "step": 606 }, { "epoch": 0.12250654845859359, "grad_norm": 0.19178320467472076, "learning_rate": 4.899274778404513e-05, "loss": 0.2501, "step": 608 }, { "epoch": 0.1229095305258916, "grad_norm": 0.3538917005062103, "learning_rate": 4.9153908138597906e-05, "loss": 0.1876, "step": 610 }, { "epoch": 0.12331251259318961, "grad_norm": 0.17144201695919037, "learning_rate": 4.9315068493150684e-05, "loss": 0.2218, "step": 612 }, { "epoch": 0.1237154946604876, "grad_norm": 0.12202147394418716, "learning_rate": 4.947622884770347e-05, "loss": 0.2277, "step": 614 }, { "epoch": 0.12411847672778561, "grad_norm": 0.4998626708984375, "learning_rate": 4.9637389202256245e-05, "loss": 0.1924, "step": 616 }, { "epoch": 0.12452145879508363, "grad_norm": 0.15225987136363983, "learning_rate": 4.979854955680903e-05, "loss": 0.1767, "step": 618 }, { "epoch": 0.12492444086238162, "grad_norm": 0.11958550661802292, "learning_rate": 4.9959709911361806e-05, "loss": 0.1952, "step": 620 }, { "epoch": 0.12532742292967963, "grad_norm": 0.1595647633075714, "learning_rate": 5.012087026591459e-05, "loss": 0.2142, "step": 622 }, { "epoch": 0.12573040499697763, "grad_norm": 0.09010529518127441, "learning_rate": 5.028203062046737e-05, "loss": 0.183, "step": 624 }, { "epoch": 0.12613338706427563, "grad_norm": 0.15077313780784607, "learning_rate": 5.044319097502015e-05, "loss": 0.2067, "step": 626 }, { "epoch": 0.12653636913157365, "grad_norm": 0.10020453482866287, "learning_rate": 5.060435132957293e-05, "loss": 0.1862, "step": 628 }, { "epoch": 0.12693935119887165, "grad_norm": 0.11209502071142197, "learning_rate": 5.076551168412571e-05, "loss": 0.238, "step": 630 }, { "epoch": 0.12734233326616964, "grad_norm": 0.38250023126602173, "learning_rate": 5.092667203867849e-05, "loss": 0.2162, "step": 632 }, { "epoch": 0.12774531533346767, "grad_norm": 0.0977146178483963, "learning_rate": 5.1087832393231275e-05, "loss": 0.2049, "step": 634 }, { "epoch": 0.12814829740076566, "grad_norm": 0.49452531337738037, "learning_rate": 5.124899274778405e-05, "loss": 0.1717, "step": 636 }, { "epoch": 0.12855127946806366, "grad_norm": 0.15723511576652527, "learning_rate": 5.141015310233682e-05, "loss": 0.207, "step": 638 }, { "epoch": 0.12895426153536169, "grad_norm": 0.15820923447608948, "learning_rate": 5.15713134568896e-05, "loss": 0.17, "step": 640 }, { "epoch": 0.12935724360265968, "grad_norm": 0.2272777259349823, "learning_rate": 5.1732473811442384e-05, "loss": 0.2383, "step": 642 }, { "epoch": 0.12976022566995768, "grad_norm": 0.18713967502117157, "learning_rate": 5.189363416599516e-05, "loss": 0.2485, "step": 644 }, { "epoch": 0.1301632077372557, "grad_norm": 0.18062381446361542, "learning_rate": 5.2054794520547945e-05, "loss": 0.1667, "step": 646 }, { "epoch": 0.1305661898045537, "grad_norm": 0.1531069129705429, "learning_rate": 5.221595487510072e-05, "loss": 0.2006, "step": 648 }, { "epoch": 0.1309691718718517, "grad_norm": 0.1526888906955719, "learning_rate": 5.237711522965351e-05, "loss": 0.2549, "step": 650 }, { "epoch": 0.1313721539391497, "grad_norm": 0.21363383531570435, "learning_rate": 5.2538275584206284e-05, "loss": 0.1518, "step": 652 }, { "epoch": 0.13177513600644772, "grad_norm": 0.10798677057027817, "learning_rate": 5.269943593875907e-05, "loss": 0.1979, "step": 654 }, { "epoch": 0.13217811807374572, "grad_norm": 0.15461646020412445, "learning_rate": 5.2860596293311845e-05, "loss": 0.1976, "step": 656 }, { "epoch": 0.1325811001410437, "grad_norm": 0.14786763489246368, "learning_rate": 5.302175664786463e-05, "loss": 0.1827, "step": 658 }, { "epoch": 0.13298408220834174, "grad_norm": 0.14497677981853485, "learning_rate": 5.318291700241741e-05, "loss": 0.2326, "step": 660 }, { "epoch": 0.13338706427563973, "grad_norm": 0.13996298611164093, "learning_rate": 5.334407735697019e-05, "loss": 0.174, "step": 662 }, { "epoch": 0.13379004634293773, "grad_norm": 0.13437992334365845, "learning_rate": 5.350523771152297e-05, "loss": 0.191, "step": 664 }, { "epoch": 0.13419302841023575, "grad_norm": 0.08243107795715332, "learning_rate": 5.366639806607575e-05, "loss": 0.2212, "step": 666 }, { "epoch": 0.13459601047753375, "grad_norm": 0.18093906342983246, "learning_rate": 5.382755842062853e-05, "loss": 0.2098, "step": 668 }, { "epoch": 0.13499899254483175, "grad_norm": 0.15489110350608826, "learning_rate": 5.3988718775181314e-05, "loss": 0.2106, "step": 670 }, { "epoch": 0.13540197461212977, "grad_norm": 0.30413106083869934, "learning_rate": 5.414987912973409e-05, "loss": 0.1632, "step": 672 }, { "epoch": 0.13580495667942777, "grad_norm": 0.11697816103696823, "learning_rate": 5.431103948428686e-05, "loss": 0.2011, "step": 674 }, { "epoch": 0.13620793874672577, "grad_norm": 0.15263471007347107, "learning_rate": 5.4472199838839646e-05, "loss": 0.2353, "step": 676 }, { "epoch": 0.1366109208140238, "grad_norm": 0.1420680284500122, "learning_rate": 5.463336019339242e-05, "loss": 0.1445, "step": 678 }, { "epoch": 0.1370139028813218, "grad_norm": 0.1247248575091362, "learning_rate": 5.479452054794521e-05, "loss": 0.2184, "step": 680 }, { "epoch": 0.13741688494861978, "grad_norm": 0.1049036830663681, "learning_rate": 5.4955680902497984e-05, "loss": 0.2056, "step": 682 }, { "epoch": 0.13781986701591778, "grad_norm": 0.15257203578948975, "learning_rate": 5.511684125705077e-05, "loss": 0.2059, "step": 684 }, { "epoch": 0.1382228490832158, "grad_norm": 0.14933043718338013, "learning_rate": 5.5278001611603546e-05, "loss": 0.1938, "step": 686 }, { "epoch": 0.1386258311505138, "grad_norm": 0.12864787876605988, "learning_rate": 5.543916196615633e-05, "loss": 0.169, "step": 688 }, { "epoch": 0.1390288132178118, "grad_norm": 0.1254183053970337, "learning_rate": 5.560032232070911e-05, "loss": 0.1997, "step": 690 }, { "epoch": 0.13943179528510982, "grad_norm": 0.12011466920375824, "learning_rate": 5.576148267526189e-05, "loss": 0.2001, "step": 692 }, { "epoch": 0.13983477735240782, "grad_norm": 0.09766723215579987, "learning_rate": 5.592264302981467e-05, "loss": 0.2324, "step": 694 }, { "epoch": 0.14023775941970582, "grad_norm": 0.14080072939395905, "learning_rate": 5.608380338436745e-05, "loss": 0.1545, "step": 696 }, { "epoch": 0.14064074148700384, "grad_norm": 0.1432671695947647, "learning_rate": 5.624496373892023e-05, "loss": 0.2256, "step": 698 }, { "epoch": 0.14104372355430184, "grad_norm": 0.11519324779510498, "learning_rate": 5.6406124093473014e-05, "loss": 0.2022, "step": 700 }, { "epoch": 0.14144670562159983, "grad_norm": 0.12213943898677826, "learning_rate": 5.656728444802579e-05, "loss": 0.1849, "step": 702 }, { "epoch": 0.14184968768889786, "grad_norm": 0.13887788355350494, "learning_rate": 5.6728444802578575e-05, "loss": 0.2355, "step": 704 }, { "epoch": 0.14225266975619585, "grad_norm": 0.15707039833068848, "learning_rate": 5.688960515713135e-05, "loss": 0.2055, "step": 706 }, { "epoch": 0.14265565182349385, "grad_norm": 0.09798435121774673, "learning_rate": 5.705076551168412e-05, "loss": 0.2247, "step": 708 }, { "epoch": 0.14305863389079185, "grad_norm": 0.12091681361198425, "learning_rate": 5.721192586623691e-05, "loss": 0.2108, "step": 710 }, { "epoch": 0.14346161595808987, "grad_norm": 0.11818325519561768, "learning_rate": 5.7373086220789685e-05, "loss": 0.2348, "step": 712 }, { "epoch": 0.14386459802538787, "grad_norm": 0.11795388907194138, "learning_rate": 5.753424657534247e-05, "loss": 0.2175, "step": 714 }, { "epoch": 0.14426758009268587, "grad_norm": 0.08712694048881531, "learning_rate": 5.7695406929895246e-05, "loss": 0.1859, "step": 716 }, { "epoch": 0.1446705621599839, "grad_norm": 0.10926694422960281, "learning_rate": 5.785656728444802e-05, "loss": 0.2296, "step": 718 }, { "epoch": 0.1450735442272819, "grad_norm": 0.12363690137863159, "learning_rate": 5.801772763900081e-05, "loss": 0.2363, "step": 720 }, { "epoch": 0.14547652629457988, "grad_norm": 0.10204677283763885, "learning_rate": 5.8178887993553585e-05, "loss": 0.1557, "step": 722 }, { "epoch": 0.1458795083618779, "grad_norm": 0.08687115460634232, "learning_rate": 5.834004834810637e-05, "loss": 0.2098, "step": 724 }, { "epoch": 0.1462824904291759, "grad_norm": 0.12376438081264496, "learning_rate": 5.8501208702659146e-05, "loss": 0.1462, "step": 726 }, { "epoch": 0.1466854724964739, "grad_norm": 0.08748535066843033, "learning_rate": 5.866236905721193e-05, "loss": 0.2304, "step": 728 }, { "epoch": 0.14708845456377193, "grad_norm": 0.09505739063024521, "learning_rate": 5.882352941176471e-05, "loss": 0.1962, "step": 730 }, { "epoch": 0.14749143663106992, "grad_norm": 0.11972901225090027, "learning_rate": 5.898468976631749e-05, "loss": 0.1907, "step": 732 }, { "epoch": 0.14789441869836792, "grad_norm": 0.08803921192884445, "learning_rate": 5.914585012087027e-05, "loss": 0.2302, "step": 734 }, { "epoch": 0.14829740076566592, "grad_norm": 0.18843410909175873, "learning_rate": 5.930701047542305e-05, "loss": 0.2544, "step": 736 }, { "epoch": 0.14870038283296394, "grad_norm": 0.1302308738231659, "learning_rate": 5.946817082997583e-05, "loss": 0.231, "step": 738 }, { "epoch": 0.14910336490026194, "grad_norm": 0.10559621453285217, "learning_rate": 5.9629331184528614e-05, "loss": 0.1879, "step": 740 }, { "epoch": 0.14950634696755993, "grad_norm": 0.11322048306465149, "learning_rate": 5.9790491539081385e-05, "loss": 0.2098, "step": 742 }, { "epoch": 0.14990932903485796, "grad_norm": 0.10071557015180588, "learning_rate": 5.995165189363416e-05, "loss": 0.182, "step": 744 }, { "epoch": 0.15031231110215595, "grad_norm": 0.10590151697397232, "learning_rate": 6.0112812248186946e-05, "loss": 0.2738, "step": 746 }, { "epoch": 0.15071529316945395, "grad_norm": 0.11812455207109451, "learning_rate": 6.0273972602739724e-05, "loss": 0.1983, "step": 748 }, { "epoch": 0.15111827523675198, "grad_norm": 0.15133807063102722, "learning_rate": 6.043513295729251e-05, "loss": 0.2795, "step": 750 }, { "epoch": 0.15152125730404997, "grad_norm": 0.0995708629488945, "learning_rate": 6.0596293311845285e-05, "loss": 0.178, "step": 752 }, { "epoch": 0.15192423937134797, "grad_norm": 0.0896756649017334, "learning_rate": 6.075745366639807e-05, "loss": 0.2164, "step": 754 }, { "epoch": 0.152327221438646, "grad_norm": 0.10863078385591507, "learning_rate": 6.0918614020950846e-05, "loss": 0.2283, "step": 756 }, { "epoch": 0.152730203505944, "grad_norm": 0.18915948271751404, "learning_rate": 6.107977437550362e-05, "loss": 0.1713, "step": 758 }, { "epoch": 0.153133185573242, "grad_norm": 0.2530260682106018, "learning_rate": 6.124093473005641e-05, "loss": 0.1929, "step": 760 }, { "epoch": 0.15353616764053998, "grad_norm": 0.091359943151474, "learning_rate": 6.140209508460919e-05, "loss": 0.1789, "step": 762 }, { "epoch": 0.153939149707838, "grad_norm": 0.08231671154499054, "learning_rate": 6.156325543916198e-05, "loss": 0.2092, "step": 764 }, { "epoch": 0.154342131775136, "grad_norm": 0.10517023503780365, "learning_rate": 6.172441579371475e-05, "loss": 0.1662, "step": 766 }, { "epoch": 0.154745113842434, "grad_norm": 0.31121233105659485, "learning_rate": 6.188557614826753e-05, "loss": 0.1885, "step": 768 }, { "epoch": 0.15514809590973203, "grad_norm": 0.12587322294712067, "learning_rate": 6.204673650282031e-05, "loss": 0.1954, "step": 770 }, { "epoch": 0.15555107797703002, "grad_norm": 0.11300528049468994, "learning_rate": 6.22078968573731e-05, "loss": 0.2143, "step": 772 }, { "epoch": 0.15595406004432802, "grad_norm": 0.12358961999416351, "learning_rate": 6.236905721192587e-05, "loss": 0.2489, "step": 774 }, { "epoch": 0.15635704211162604, "grad_norm": 0.07826519012451172, "learning_rate": 6.253021756647864e-05, "loss": 0.1567, "step": 776 }, { "epoch": 0.15676002417892404, "grad_norm": 0.14281342923641205, "learning_rate": 6.269137792103142e-05, "loss": 0.25, "step": 778 }, { "epoch": 0.15716300624622204, "grad_norm": 0.11091539263725281, "learning_rate": 6.285253827558421e-05, "loss": 0.202, "step": 780 }, { "epoch": 0.15756598831352006, "grad_norm": 0.10273347795009613, "learning_rate": 6.301369863013699e-05, "loss": 0.238, "step": 782 }, { "epoch": 0.15796897038081806, "grad_norm": 0.12522001564502716, "learning_rate": 6.317485898468976e-05, "loss": 0.1256, "step": 784 }, { "epoch": 0.15837195244811605, "grad_norm": 0.13455836474895477, "learning_rate": 6.333601933924255e-05, "loss": 0.2008, "step": 786 }, { "epoch": 0.15877493451541405, "grad_norm": 0.1040363609790802, "learning_rate": 6.349717969379533e-05, "loss": 0.2343, "step": 788 }, { "epoch": 0.15917791658271208, "grad_norm": 0.13617351651191711, "learning_rate": 6.365834004834811e-05, "loss": 0.1752, "step": 790 }, { "epoch": 0.15958089865001007, "grad_norm": 0.11176435649394989, "learning_rate": 6.381950040290089e-05, "loss": 0.207, "step": 792 }, { "epoch": 0.15998388071730807, "grad_norm": 0.11505793035030365, "learning_rate": 6.398066075745367e-05, "loss": 0.1928, "step": 794 }, { "epoch": 0.1603868627846061, "grad_norm": 0.10995722562074661, "learning_rate": 6.414182111200645e-05, "loss": 0.2455, "step": 796 }, { "epoch": 0.1607898448519041, "grad_norm": 0.09231299161911011, "learning_rate": 6.430298146655924e-05, "loss": 0.1634, "step": 798 }, { "epoch": 0.1611928269192021, "grad_norm": 0.09691483527421951, "learning_rate": 6.446414182111201e-05, "loss": 0.1466, "step": 800 }, { "epoch": 0.1615958089865001, "grad_norm": 0.11957940459251404, "learning_rate": 6.462530217566479e-05, "loss": 0.1902, "step": 802 }, { "epoch": 0.1619987910537981, "grad_norm": 0.11081273853778839, "learning_rate": 6.478646253021758e-05, "loss": 0.1711, "step": 804 }, { "epoch": 0.1624017731210961, "grad_norm": 0.10039583593606949, "learning_rate": 6.494762288477036e-05, "loss": 0.1578, "step": 806 }, { "epoch": 0.16280475518839413, "grad_norm": 0.08877286314964294, "learning_rate": 6.510878323932313e-05, "loss": 0.1832, "step": 808 }, { "epoch": 0.16320773725569213, "grad_norm": 0.1771819144487381, "learning_rate": 6.526994359387592e-05, "loss": 0.1901, "step": 810 }, { "epoch": 0.16361071932299012, "grad_norm": 0.13575054705142975, "learning_rate": 6.543110394842869e-05, "loss": 0.2324, "step": 812 }, { "epoch": 0.16401370139028812, "grad_norm": 0.10554531216621399, "learning_rate": 6.559226430298147e-05, "loss": 0.2033, "step": 814 }, { "epoch": 0.16441668345758614, "grad_norm": 0.09214624017477036, "learning_rate": 6.575342465753424e-05, "loss": 0.2308, "step": 816 }, { "epoch": 0.16481966552488414, "grad_norm": 0.09753014147281647, "learning_rate": 6.591458501208702e-05, "loss": 0.2118, "step": 818 }, { "epoch": 0.16522264759218214, "grad_norm": 0.10157620906829834, "learning_rate": 6.607574536663981e-05, "loss": 0.2282, "step": 820 }, { "epoch": 0.16562562965948016, "grad_norm": 0.09195137768983841, "learning_rate": 6.623690572119259e-05, "loss": 0.2375, "step": 822 }, { "epoch": 0.16602861172677816, "grad_norm": 0.09710432589054108, "learning_rate": 6.639806607574536e-05, "loss": 0.1578, "step": 824 }, { "epoch": 0.16643159379407615, "grad_norm": 0.0950450599193573, "learning_rate": 6.655922643029815e-05, "loss": 0.1785, "step": 826 }, { "epoch": 0.16683457586137418, "grad_norm": 0.09225623309612274, "learning_rate": 6.672038678485093e-05, "loss": 0.1755, "step": 828 }, { "epoch": 0.16723755792867218, "grad_norm": 0.10279329121112823, "learning_rate": 6.688154713940372e-05, "loss": 0.1872, "step": 830 }, { "epoch": 0.16764053999597017, "grad_norm": 0.08554810285568237, "learning_rate": 6.704270749395649e-05, "loss": 0.2222, "step": 832 }, { "epoch": 0.1680435220632682, "grad_norm": 0.08733980357646942, "learning_rate": 6.720386784850927e-05, "loss": 0.139, "step": 834 }, { "epoch": 0.1684465041305662, "grad_norm": 0.09240876138210297, "learning_rate": 6.736502820306205e-05, "loss": 0.2171, "step": 836 }, { "epoch": 0.1688494861978642, "grad_norm": 0.08311144262552261, "learning_rate": 6.752618855761484e-05, "loss": 0.1865, "step": 838 }, { "epoch": 0.1692524682651622, "grad_norm": 0.11759477853775024, "learning_rate": 6.768734891216761e-05, "loss": 0.2025, "step": 840 }, { "epoch": 0.1696554503324602, "grad_norm": 0.15229858458042145, "learning_rate": 6.784850926672039e-05, "loss": 0.1846, "step": 842 }, { "epoch": 0.1700584323997582, "grad_norm": 0.08780387789011002, "learning_rate": 6.800966962127318e-05, "loss": 0.1839, "step": 844 }, { "epoch": 0.1704614144670562, "grad_norm": 0.11263580620288849, "learning_rate": 6.817082997582595e-05, "loss": 0.1762, "step": 846 }, { "epoch": 0.17086439653435423, "grad_norm": 0.0929393470287323, "learning_rate": 6.833199033037873e-05, "loss": 0.1633, "step": 848 }, { "epoch": 0.17126737860165223, "grad_norm": 0.09778440743684769, "learning_rate": 6.84931506849315e-05, "loss": 0.1693, "step": 850 }, { "epoch": 0.17167036066895022, "grad_norm": 0.12297005206346512, "learning_rate": 6.865431103948429e-05, "loss": 0.1497, "step": 852 }, { "epoch": 0.17207334273624825, "grad_norm": 0.17671999335289001, "learning_rate": 6.881547139403707e-05, "loss": 0.2439, "step": 854 }, { "epoch": 0.17247632480354624, "grad_norm": 0.08522593975067139, "learning_rate": 6.897663174858985e-05, "loss": 0.1595, "step": 856 }, { "epoch": 0.17287930687084424, "grad_norm": 0.1255025416612625, "learning_rate": 6.913779210314262e-05, "loss": 0.2527, "step": 858 }, { "epoch": 0.17328228893814226, "grad_norm": 0.3059910535812378, "learning_rate": 6.929895245769541e-05, "loss": 0.2162, "step": 860 }, { "epoch": 0.17368527100544026, "grad_norm": 0.1549808382987976, "learning_rate": 6.946011281224819e-05, "loss": 0.2101, "step": 862 }, { "epoch": 0.17408825307273826, "grad_norm": 0.08645348250865936, "learning_rate": 6.962127316680098e-05, "loss": 0.1709, "step": 864 }, { "epoch": 0.17449123514003625, "grad_norm": 0.09869391471147537, "learning_rate": 6.978243352135375e-05, "loss": 0.2026, "step": 866 }, { "epoch": 0.17489421720733428, "grad_norm": 0.08920720964670181, "learning_rate": 6.994359387590653e-05, "loss": 0.2328, "step": 868 }, { "epoch": 0.17529719927463228, "grad_norm": 0.10059194266796112, "learning_rate": 7.010475423045932e-05, "loss": 0.2471, "step": 870 }, { "epoch": 0.17570018134193027, "grad_norm": 0.13767802715301514, "learning_rate": 7.02659145850121e-05, "loss": 0.2062, "step": 872 }, { "epoch": 0.1761031634092283, "grad_norm": 0.11204895377159119, "learning_rate": 7.042707493956487e-05, "loss": 0.1811, "step": 874 }, { "epoch": 0.1765061454765263, "grad_norm": 0.08391435444355011, "learning_rate": 7.058823529411765e-05, "loss": 0.1874, "step": 876 }, { "epoch": 0.1769091275438243, "grad_norm": 0.09149591624736786, "learning_rate": 7.074939564867044e-05, "loss": 0.2454, "step": 878 }, { "epoch": 0.17731210961112231, "grad_norm": 0.09233218431472778, "learning_rate": 7.091055600322321e-05, "loss": 0.2473, "step": 880 }, { "epoch": 0.1777150916784203, "grad_norm": 0.08432731032371521, "learning_rate": 7.107171635777598e-05, "loss": 0.1843, "step": 882 }, { "epoch": 0.1781180737457183, "grad_norm": 0.12103287875652313, "learning_rate": 7.123287671232876e-05, "loss": 0.2372, "step": 884 }, { "epoch": 0.17852105581301633, "grad_norm": 0.08081512898206711, "learning_rate": 7.139403706688155e-05, "loss": 0.2181, "step": 886 }, { "epoch": 0.17892403788031433, "grad_norm": 0.09800492227077484, "learning_rate": 7.155519742143433e-05, "loss": 0.1669, "step": 888 }, { "epoch": 0.17932701994761233, "grad_norm": 0.0976727157831192, "learning_rate": 7.17163577759871e-05, "loss": 0.2072, "step": 890 }, { "epoch": 0.17973000201491032, "grad_norm": 0.1146702691912651, "learning_rate": 7.187751813053989e-05, "loss": 0.1986, "step": 892 }, { "epoch": 0.18013298408220835, "grad_norm": 0.10681789368391037, "learning_rate": 7.203867848509267e-05, "loss": 0.1632, "step": 894 }, { "epoch": 0.18053596614950634, "grad_norm": 0.10094150900840759, "learning_rate": 7.219983883964545e-05, "loss": 0.1643, "step": 896 }, { "epoch": 0.18093894821680434, "grad_norm": 0.09761274605989456, "learning_rate": 7.236099919419823e-05, "loss": 0.1779, "step": 898 }, { "epoch": 0.18134193028410237, "grad_norm": 0.1324063241481781, "learning_rate": 7.252215954875101e-05, "loss": 0.2365, "step": 900 }, { "epoch": 0.18174491235140036, "grad_norm": 0.11601895093917847, "learning_rate": 7.26833199033038e-05, "loss": 0.1712, "step": 902 }, { "epoch": 0.18214789441869836, "grad_norm": 0.14145302772521973, "learning_rate": 7.284448025785658e-05, "loss": 0.2013, "step": 904 }, { "epoch": 0.18255087648599638, "grad_norm": 0.09013397246599197, "learning_rate": 7.300564061240935e-05, "loss": 0.2259, "step": 906 }, { "epoch": 0.18295385855329438, "grad_norm": 0.09207538515329361, "learning_rate": 7.316680096696213e-05, "loss": 0.2087, "step": 908 }, { "epoch": 0.18335684062059238, "grad_norm": 0.07779651135206223, "learning_rate": 7.332796132151492e-05, "loss": 0.234, "step": 910 }, { "epoch": 0.1837598226878904, "grad_norm": 0.08593969792127609, "learning_rate": 7.34891216760677e-05, "loss": 0.1854, "step": 912 }, { "epoch": 0.1841628047551884, "grad_norm": 0.09124486148357391, "learning_rate": 7.365028203062047e-05, "loss": 0.2279, "step": 914 }, { "epoch": 0.1845657868224864, "grad_norm": 0.11255534738302231, "learning_rate": 7.381144238517324e-05, "loss": 0.1733, "step": 916 }, { "epoch": 0.18496876888978442, "grad_norm": 0.1038624569773674, "learning_rate": 7.397260273972603e-05, "loss": 0.2392, "step": 918 }, { "epoch": 0.18537175095708242, "grad_norm": 0.10044854134321213, "learning_rate": 7.413376309427881e-05, "loss": 0.195, "step": 920 }, { "epoch": 0.1857747330243804, "grad_norm": 0.0884871855378151, "learning_rate": 7.42949234488316e-05, "loss": 0.2164, "step": 922 }, { "epoch": 0.1861777150916784, "grad_norm": 0.1108056977391243, "learning_rate": 7.445608380338436e-05, "loss": 0.1694, "step": 924 }, { "epoch": 0.18658069715897643, "grad_norm": 0.07165519148111343, "learning_rate": 7.461724415793715e-05, "loss": 0.1398, "step": 926 }, { "epoch": 0.18698367922627443, "grad_norm": 0.09175916761159897, "learning_rate": 7.477840451248993e-05, "loss": 0.1955, "step": 928 }, { "epoch": 0.18738666129357243, "grad_norm": 0.07176446169614792, "learning_rate": 7.493956486704272e-05, "loss": 0.1766, "step": 930 }, { "epoch": 0.18778964336087045, "grad_norm": 0.11476302146911621, "learning_rate": 7.510072522159549e-05, "loss": 0.1848, "step": 932 }, { "epoch": 0.18819262542816845, "grad_norm": 0.107746422290802, "learning_rate": 7.526188557614827e-05, "loss": 0.2264, "step": 934 }, { "epoch": 0.18859560749546644, "grad_norm": 0.10922015458345413, "learning_rate": 7.542304593070106e-05, "loss": 0.1803, "step": 936 }, { "epoch": 0.18899858956276447, "grad_norm": 0.08000432699918747, "learning_rate": 7.558420628525384e-05, "loss": 0.1601, "step": 938 }, { "epoch": 0.18940157163006247, "grad_norm": 0.07894396036863327, "learning_rate": 7.574536663980661e-05, "loss": 0.1968, "step": 940 }, { "epoch": 0.18980455369736046, "grad_norm": 0.13226218521595, "learning_rate": 7.59065269943594e-05, "loss": 0.1964, "step": 942 }, { "epoch": 0.1902075357646585, "grad_norm": 0.13322897255420685, "learning_rate": 7.606768734891218e-05, "loss": 0.2813, "step": 944 }, { "epoch": 0.19061051783195648, "grad_norm": 0.07467541843652725, "learning_rate": 7.622884770346496e-05, "loss": 0.2153, "step": 946 }, { "epoch": 0.19101349989925448, "grad_norm": 0.1104121133685112, "learning_rate": 7.639000805801773e-05, "loss": 0.2025, "step": 948 }, { "epoch": 0.19141648196655248, "grad_norm": 0.06779658049345016, "learning_rate": 7.65511684125705e-05, "loss": 0.2228, "step": 950 }, { "epoch": 0.1918194640338505, "grad_norm": 0.19166550040245056, "learning_rate": 7.671232876712329e-05, "loss": 0.1907, "step": 952 }, { "epoch": 0.1922224461011485, "grad_norm": 0.06244197115302086, "learning_rate": 7.687348912167607e-05, "loss": 0.1687, "step": 954 }, { "epoch": 0.1926254281684465, "grad_norm": 0.07573673874139786, "learning_rate": 7.703464947622884e-05, "loss": 0.1598, "step": 956 }, { "epoch": 0.19302841023574452, "grad_norm": 0.0870039090514183, "learning_rate": 7.719580983078163e-05, "loss": 0.2082, "step": 958 }, { "epoch": 0.19343139230304252, "grad_norm": 0.08709016442298889, "learning_rate": 7.735697018533441e-05, "loss": 0.1739, "step": 960 }, { "epoch": 0.1938343743703405, "grad_norm": 0.2350974977016449, "learning_rate": 7.75181305398872e-05, "loss": 0.2275, "step": 962 }, { "epoch": 0.19423735643763854, "grad_norm": 0.1652485728263855, "learning_rate": 7.767929089443996e-05, "loss": 0.2102, "step": 964 }, { "epoch": 0.19464033850493653, "grad_norm": 0.087095707654953, "learning_rate": 7.784045124899275e-05, "loss": 0.2238, "step": 966 }, { "epoch": 0.19504332057223453, "grad_norm": 0.11548943817615509, "learning_rate": 7.800161160354553e-05, "loss": 0.2168, "step": 968 }, { "epoch": 0.19544630263953255, "grad_norm": 0.0807507336139679, "learning_rate": 7.816277195809832e-05, "loss": 0.1806, "step": 970 }, { "epoch": 0.19584928470683055, "grad_norm": 0.393595814704895, "learning_rate": 7.832393231265109e-05, "loss": 0.2387, "step": 972 }, { "epoch": 0.19625226677412855, "grad_norm": 0.08075542002916336, "learning_rate": 7.848509266720387e-05, "loss": 0.2201, "step": 974 }, { "epoch": 0.19665524884142654, "grad_norm": 0.09349818527698517, "learning_rate": 7.864625302175666e-05, "loss": 0.1415, "step": 976 }, { "epoch": 0.19705823090872457, "grad_norm": 0.21948008239269257, "learning_rate": 7.880741337630944e-05, "loss": 0.2032, "step": 978 }, { "epoch": 0.19746121297602257, "grad_norm": 0.09178763628005981, "learning_rate": 7.896857373086221e-05, "loss": 0.2024, "step": 980 }, { "epoch": 0.19786419504332056, "grad_norm": 0.09847205132246017, "learning_rate": 7.9129734085415e-05, "loss": 0.226, "step": 982 }, { "epoch": 0.1982671771106186, "grad_norm": 0.15902641415596008, "learning_rate": 7.929089443996776e-05, "loss": 0.2055, "step": 984 }, { "epoch": 0.19867015917791658, "grad_norm": 0.15558022260665894, "learning_rate": 7.945205479452055e-05, "loss": 0.1926, "step": 986 }, { "epoch": 0.19907314124521458, "grad_norm": 0.09379275888204575, "learning_rate": 7.961321514907333e-05, "loss": 0.1761, "step": 988 }, { "epoch": 0.1994761233125126, "grad_norm": 0.17286166548728943, "learning_rate": 7.97743755036261e-05, "loss": 0.2303, "step": 990 }, { "epoch": 0.1998791053798106, "grad_norm": 0.11570542305707932, "learning_rate": 7.993553585817889e-05, "loss": 0.2176, "step": 992 }, { "epoch": 0.2002820874471086, "grad_norm": 0.13672104477882385, "learning_rate": 8.009669621273167e-05, "loss": 0.2171, "step": 994 }, { "epoch": 0.20068506951440662, "grad_norm": 0.12963563203811646, "learning_rate": 8.025785656728446e-05, "loss": 0.2296, "step": 996 }, { "epoch": 0.20108805158170462, "grad_norm": 0.13109353184700012, "learning_rate": 8.041901692183723e-05, "loss": 0.2, "step": 998 }, { "epoch": 0.20149103364900262, "grad_norm": 0.08438586443662643, "learning_rate": 8.058017727639001e-05, "loss": 0.1921, "step": 1000 }, { "epoch": 0.2018940157163006, "grad_norm": 0.08162175118923187, "learning_rate": 8.07413376309428e-05, "loss": 0.2028, "step": 1002 }, { "epoch": 0.20229699778359864, "grad_norm": 0.08619034290313721, "learning_rate": 8.090249798549558e-05, "loss": 0.1999, "step": 1004 }, { "epoch": 0.20269997985089663, "grad_norm": 0.07941418886184692, "learning_rate": 8.106365834004835e-05, "loss": 0.2118, "step": 1006 }, { "epoch": 0.20310296191819463, "grad_norm": 0.12020314484834671, "learning_rate": 8.122481869460113e-05, "loss": 0.223, "step": 1008 }, { "epoch": 0.20350594398549265, "grad_norm": 0.08442337810993195, "learning_rate": 8.138597904915392e-05, "loss": 0.2103, "step": 1010 }, { "epoch": 0.20390892605279065, "grad_norm": 0.1368478238582611, "learning_rate": 8.15471394037067e-05, "loss": 0.2011, "step": 1012 }, { "epoch": 0.20431190812008865, "grad_norm": 0.12291720509529114, "learning_rate": 8.170829975825947e-05, "loss": 0.2377, "step": 1014 }, { "epoch": 0.20471489018738667, "grad_norm": 0.09744734317064285, "learning_rate": 8.186946011281226e-05, "loss": 0.1922, "step": 1016 }, { "epoch": 0.20511787225468467, "grad_norm": 0.08120467513799667, "learning_rate": 8.203062046736503e-05, "loss": 0.2241, "step": 1018 }, { "epoch": 0.20552085432198267, "grad_norm": 0.10533369332551956, "learning_rate": 8.219178082191781e-05, "loss": 0.2189, "step": 1020 }, { "epoch": 0.2059238363892807, "grad_norm": 0.10071130096912384, "learning_rate": 8.23529411764706e-05, "loss": 0.1639, "step": 1022 }, { "epoch": 0.2063268184565787, "grad_norm": 0.1534520983695984, "learning_rate": 8.251410153102337e-05, "loss": 0.1691, "step": 1024 }, { "epoch": 0.20672980052387668, "grad_norm": 0.08435958623886108, "learning_rate": 8.267526188557615e-05, "loss": 0.2177, "step": 1026 }, { "epoch": 0.20713278259117468, "grad_norm": 0.11280474066734314, "learning_rate": 8.283642224012893e-05, "loss": 0.1767, "step": 1028 }, { "epoch": 0.2075357646584727, "grad_norm": 0.09684017300605774, "learning_rate": 8.299758259468172e-05, "loss": 0.2139, "step": 1030 }, { "epoch": 0.2079387467257707, "grad_norm": 0.08194670081138611, "learning_rate": 8.315874294923449e-05, "loss": 0.1917, "step": 1032 }, { "epoch": 0.2083417287930687, "grad_norm": 0.15235085785388947, "learning_rate": 8.331990330378727e-05, "loss": 0.2436, "step": 1034 }, { "epoch": 0.20874471086036672, "grad_norm": 0.08844275772571564, "learning_rate": 8.348106365834006e-05, "loss": 0.1862, "step": 1036 }, { "epoch": 0.20914769292766472, "grad_norm": 0.1334722340106964, "learning_rate": 8.364222401289284e-05, "loss": 0.1837, "step": 1038 }, { "epoch": 0.20955067499496272, "grad_norm": 0.08106778562068939, "learning_rate": 8.380338436744561e-05, "loss": 0.2101, "step": 1040 }, { "epoch": 0.20995365706226074, "grad_norm": 0.0860428661108017, "learning_rate": 8.39645447219984e-05, "loss": 0.1754, "step": 1042 }, { "epoch": 0.21035663912955874, "grad_norm": 0.0777168869972229, "learning_rate": 8.412570507655118e-05, "loss": 0.2105, "step": 1044 }, { "epoch": 0.21075962119685673, "grad_norm": 0.08495823293924332, "learning_rate": 8.428686543110396e-05, "loss": 0.187, "step": 1046 }, { "epoch": 0.21116260326415476, "grad_norm": 0.07410518079996109, "learning_rate": 8.444802578565673e-05, "loss": 0.1962, "step": 1048 }, { "epoch": 0.21156558533145275, "grad_norm": 0.0910082459449768, "learning_rate": 8.460918614020952e-05, "loss": 0.2341, "step": 1050 }, { "epoch": 0.21196856739875075, "grad_norm": 0.11832420527935028, "learning_rate": 8.477034649476229e-05, "loss": 0.2532, "step": 1052 }, { "epoch": 0.21237154946604875, "grad_norm": 0.09605500847101212, "learning_rate": 8.493150684931507e-05, "loss": 0.1657, "step": 1054 }, { "epoch": 0.21277453153334677, "grad_norm": 0.07742031663656235, "learning_rate": 8.509266720386784e-05, "loss": 0.1792, "step": 1056 }, { "epoch": 0.21317751360064477, "grad_norm": 0.07660829275846481, "learning_rate": 8.525382755842063e-05, "loss": 0.1488, "step": 1058 }, { "epoch": 0.21358049566794277, "grad_norm": 0.09640536457300186, "learning_rate": 8.541498791297341e-05, "loss": 0.2233, "step": 1060 }, { "epoch": 0.2139834777352408, "grad_norm": 0.12653407454490662, "learning_rate": 8.55761482675262e-05, "loss": 0.2007, "step": 1062 }, { "epoch": 0.2143864598025388, "grad_norm": 0.09995963424444199, "learning_rate": 8.573730862207897e-05, "loss": 0.1933, "step": 1064 }, { "epoch": 0.21478944186983678, "grad_norm": 0.08510065078735352, "learning_rate": 8.589846897663175e-05, "loss": 0.2423, "step": 1066 }, { "epoch": 0.2151924239371348, "grad_norm": 0.09552331268787384, "learning_rate": 8.605962933118453e-05, "loss": 0.2171, "step": 1068 }, { "epoch": 0.2155954060044328, "grad_norm": 0.09067709743976593, "learning_rate": 8.622078968573732e-05, "loss": 0.2328, "step": 1070 }, { "epoch": 0.2159983880717308, "grad_norm": 0.09756525605916977, "learning_rate": 8.638195004029009e-05, "loss": 0.1938, "step": 1072 }, { "epoch": 0.21640137013902883, "grad_norm": 0.10737069696187973, "learning_rate": 8.654311039484287e-05, "loss": 0.1905, "step": 1074 }, { "epoch": 0.21680435220632682, "grad_norm": 0.08812405914068222, "learning_rate": 8.670427074939566e-05, "loss": 0.2276, "step": 1076 }, { "epoch": 0.21720733427362482, "grad_norm": 0.08840040117502213, "learning_rate": 8.686543110394844e-05, "loss": 0.1989, "step": 1078 }, { "epoch": 0.21761031634092282, "grad_norm": 0.1236484944820404, "learning_rate": 8.702659145850121e-05, "loss": 0.1772, "step": 1080 }, { "epoch": 0.21801329840822084, "grad_norm": 0.0713542103767395, "learning_rate": 8.7187751813054e-05, "loss": 0.1878, "step": 1082 }, { "epoch": 0.21841628047551884, "grad_norm": 0.0982968658208847, "learning_rate": 8.734891216760678e-05, "loss": 0.1925, "step": 1084 }, { "epoch": 0.21881926254281683, "grad_norm": 0.07813037931919098, "learning_rate": 8.751007252215955e-05, "loss": 0.2141, "step": 1086 }, { "epoch": 0.21922224461011486, "grad_norm": 0.10197921842336655, "learning_rate": 8.767123287671233e-05, "loss": 0.2662, "step": 1088 }, { "epoch": 0.21962522667741285, "grad_norm": 0.0717720240354538, "learning_rate": 8.78323932312651e-05, "loss": 0.1753, "step": 1090 }, { "epoch": 0.22002820874471085, "grad_norm": 0.08220771700143814, "learning_rate": 8.799355358581789e-05, "loss": 0.2293, "step": 1092 }, { "epoch": 0.22043119081200888, "grad_norm": 0.10889850556850433, "learning_rate": 8.815471394037067e-05, "loss": 0.1917, "step": 1094 }, { "epoch": 0.22083417287930687, "grad_norm": 0.06890220940113068, "learning_rate": 8.831587429492346e-05, "loss": 0.184, "step": 1096 }, { "epoch": 0.22123715494660487, "grad_norm": 0.11168145388364792, "learning_rate": 8.847703464947623e-05, "loss": 0.2341, "step": 1098 }, { "epoch": 0.2216401370139029, "grad_norm": 0.06456907838582993, "learning_rate": 8.863819500402901e-05, "loss": 0.1529, "step": 1100 }, { "epoch": 0.2220431190812009, "grad_norm": 0.07093362510204315, "learning_rate": 8.87993553585818e-05, "loss": 0.2238, "step": 1102 }, { "epoch": 0.2224461011484989, "grad_norm": 0.08005674928426743, "learning_rate": 8.896051571313458e-05, "loss": 0.183, "step": 1104 }, { "epoch": 0.22284908321579688, "grad_norm": 0.09400587528944016, "learning_rate": 8.912167606768735e-05, "loss": 0.1935, "step": 1106 }, { "epoch": 0.2232520652830949, "grad_norm": 0.09655874222517014, "learning_rate": 8.928283642224013e-05, "loss": 0.1628, "step": 1108 }, { "epoch": 0.2236550473503929, "grad_norm": 0.10121942311525345, "learning_rate": 8.944399677679292e-05, "loss": 0.1718, "step": 1110 }, { "epoch": 0.2240580294176909, "grad_norm": 0.1059177815914154, "learning_rate": 8.96051571313457e-05, "loss": 0.2144, "step": 1112 }, { "epoch": 0.22446101148498893, "grad_norm": 0.07645639777183533, "learning_rate": 8.976631748589847e-05, "loss": 0.212, "step": 1114 }, { "epoch": 0.22486399355228692, "grad_norm": 0.07680249214172363, "learning_rate": 8.992747784045126e-05, "loss": 0.184, "step": 1116 }, { "epoch": 0.22526697561958492, "grad_norm": 0.10838435590267181, "learning_rate": 9.008863819500404e-05, "loss": 0.2153, "step": 1118 }, { "epoch": 0.22566995768688294, "grad_norm": 0.0947759747505188, "learning_rate": 9.024979854955683e-05, "loss": 0.1972, "step": 1120 }, { "epoch": 0.22607293975418094, "grad_norm": 0.12324242293834686, "learning_rate": 9.041095890410958e-05, "loss": 0.2042, "step": 1122 }, { "epoch": 0.22647592182147894, "grad_norm": 0.09450756758451462, "learning_rate": 9.057211925866237e-05, "loss": 0.2614, "step": 1124 }, { "epoch": 0.22687890388877696, "grad_norm": 0.092324398458004, "learning_rate": 9.073327961321515e-05, "loss": 0.1664, "step": 1126 }, { "epoch": 0.22728188595607496, "grad_norm": 0.12351454794406891, "learning_rate": 9.089443996776793e-05, "loss": 0.2146, "step": 1128 }, { "epoch": 0.22768486802337295, "grad_norm": 0.07259409874677658, "learning_rate": 9.10556003223207e-05, "loss": 0.1875, "step": 1130 }, { "epoch": 0.22808785009067095, "grad_norm": 0.060035668313503265, "learning_rate": 9.121676067687349e-05, "loss": 0.2133, "step": 1132 }, { "epoch": 0.22849083215796898, "grad_norm": 0.06675513088703156, "learning_rate": 9.137792103142627e-05, "loss": 0.1959, "step": 1134 }, { "epoch": 0.22889381422526697, "grad_norm": 0.10324272513389587, "learning_rate": 9.153908138597906e-05, "loss": 0.2426, "step": 1136 }, { "epoch": 0.22929679629256497, "grad_norm": 0.06724414229393005, "learning_rate": 9.170024174053183e-05, "loss": 0.1731, "step": 1138 }, { "epoch": 0.229699778359863, "grad_norm": 0.07515553385019302, "learning_rate": 9.186140209508461e-05, "loss": 0.1696, "step": 1140 }, { "epoch": 0.230102760427161, "grad_norm": 0.08454802632331848, "learning_rate": 9.20225624496374e-05, "loss": 0.2387, "step": 1142 }, { "epoch": 0.230505742494459, "grad_norm": 0.06945478171110153, "learning_rate": 9.218372280419018e-05, "loss": 0.2107, "step": 1144 }, { "epoch": 0.230908724561757, "grad_norm": 0.06837344914674759, "learning_rate": 9.234488315874295e-05, "loss": 0.2317, "step": 1146 }, { "epoch": 0.231311706629055, "grad_norm": 0.07493479549884796, "learning_rate": 9.250604351329573e-05, "loss": 0.2259, "step": 1148 }, { "epoch": 0.231714688696353, "grad_norm": 0.08560243993997574, "learning_rate": 9.266720386784852e-05, "loss": 0.212, "step": 1150 }, { "epoch": 0.23211767076365103, "grad_norm": 0.07562673836946487, "learning_rate": 9.28283642224013e-05, "loss": 0.221, "step": 1152 }, { "epoch": 0.23252065283094903, "grad_norm": 0.10255958139896393, "learning_rate": 9.298952457695407e-05, "loss": 0.2069, "step": 1154 }, { "epoch": 0.23292363489824702, "grad_norm": 0.06924106925725937, "learning_rate": 9.315068493150684e-05, "loss": 0.1942, "step": 1156 }, { "epoch": 0.23332661696554502, "grad_norm": 0.08090320974588394, "learning_rate": 9.331184528605963e-05, "loss": 0.1634, "step": 1158 }, { "epoch": 0.23372959903284304, "grad_norm": 0.05619840696454048, "learning_rate": 9.347300564061241e-05, "loss": 0.2432, "step": 1160 }, { "epoch": 0.23413258110014104, "grad_norm": 0.0675722137093544, "learning_rate": 9.36341659951652e-05, "loss": 0.2034, "step": 1162 }, { "epoch": 0.23453556316743904, "grad_norm": 0.07722295820713043, "learning_rate": 9.379532634971797e-05, "loss": 0.1589, "step": 1164 }, { "epoch": 0.23493854523473706, "grad_norm": 0.06578662246465683, "learning_rate": 9.395648670427075e-05, "loss": 0.1686, "step": 1166 }, { "epoch": 0.23534152730203506, "grad_norm": 0.08277074992656708, "learning_rate": 9.411764705882353e-05, "loss": 0.1611, "step": 1168 }, { "epoch": 0.23574450936933306, "grad_norm": 0.07715737074613571, "learning_rate": 9.427880741337632e-05, "loss": 0.1441, "step": 1170 }, { "epoch": 0.23614749143663108, "grad_norm": 0.08344750106334686, "learning_rate": 9.443996776792909e-05, "loss": 0.2076, "step": 1172 }, { "epoch": 0.23655047350392908, "grad_norm": 0.07293462008237839, "learning_rate": 9.460112812248187e-05, "loss": 0.1415, "step": 1174 }, { "epoch": 0.23695345557122707, "grad_norm": 0.08313830941915512, "learning_rate": 9.476228847703466e-05, "loss": 0.2245, "step": 1176 }, { "epoch": 0.2373564376385251, "grad_norm": 0.08135011047124863, "learning_rate": 9.492344883158744e-05, "loss": 0.2603, "step": 1178 }, { "epoch": 0.2377594197058231, "grad_norm": 0.090848907828331, "learning_rate": 9.508460918614021e-05, "loss": 0.2008, "step": 1180 }, { "epoch": 0.2381624017731211, "grad_norm": 0.05730780214071274, "learning_rate": 9.5245769540693e-05, "loss": 0.2229, "step": 1182 }, { "epoch": 0.23856538384041912, "grad_norm": 0.10784997791051865, "learning_rate": 9.540692989524578e-05, "loss": 0.2067, "step": 1184 }, { "epoch": 0.2389683659077171, "grad_norm": 0.07142709195613861, "learning_rate": 9.556809024979856e-05, "loss": 0.2081, "step": 1186 }, { "epoch": 0.2393713479750151, "grad_norm": 0.08427638560533524, "learning_rate": 9.572925060435133e-05, "loss": 0.1884, "step": 1188 }, { "epoch": 0.2397743300423131, "grad_norm": 0.06093582883477211, "learning_rate": 9.58904109589041e-05, "loss": 0.2077, "step": 1190 }, { "epoch": 0.24017731210961113, "grad_norm": 0.12947604060173035, "learning_rate": 9.605157131345689e-05, "loss": 0.2013, "step": 1192 }, { "epoch": 0.24058029417690913, "grad_norm": 0.07346334308385849, "learning_rate": 9.621273166800967e-05, "loss": 0.2296, "step": 1194 }, { "epoch": 0.24098327624420712, "grad_norm": 0.07245267927646637, "learning_rate": 9.637389202256244e-05, "loss": 0.2416, "step": 1196 }, { "epoch": 0.24138625831150515, "grad_norm": 0.0768049955368042, "learning_rate": 9.653505237711523e-05, "loss": 0.1814, "step": 1198 }, { "epoch": 0.24178924037880314, "grad_norm": 0.09695810824632645, "learning_rate": 9.669621273166801e-05, "loss": 0.2085, "step": 1200 }, { "epoch": 0.24219222244610114, "grad_norm": 0.10410469025373459, "learning_rate": 9.68573730862208e-05, "loss": 0.1793, "step": 1202 }, { "epoch": 0.24259520451339917, "grad_norm": 0.08499378710985184, "learning_rate": 9.701853344077357e-05, "loss": 0.1548, "step": 1204 }, { "epoch": 0.24299818658069716, "grad_norm": 0.07553906738758087, "learning_rate": 9.717969379532635e-05, "loss": 0.1562, "step": 1206 }, { "epoch": 0.24340116864799516, "grad_norm": 0.06801648437976837, "learning_rate": 9.734085414987914e-05, "loss": 0.1601, "step": 1208 }, { "epoch": 0.24380415071529318, "grad_norm": 0.07008855044841766, "learning_rate": 9.750201450443192e-05, "loss": 0.2134, "step": 1210 }, { "epoch": 0.24420713278259118, "grad_norm": 0.0757250189781189, "learning_rate": 9.766317485898469e-05, "loss": 0.2255, "step": 1212 }, { "epoch": 0.24461011484988918, "grad_norm": 0.07179669290781021, "learning_rate": 9.782433521353747e-05, "loss": 0.1668, "step": 1214 }, { "epoch": 0.24501309691718717, "grad_norm": 0.08034947514533997, "learning_rate": 9.798549556809026e-05, "loss": 0.2016, "step": 1216 }, { "epoch": 0.2454160789844852, "grad_norm": 0.07972761243581772, "learning_rate": 9.814665592264304e-05, "loss": 0.2055, "step": 1218 }, { "epoch": 0.2458190610517832, "grad_norm": 0.09259269386529922, "learning_rate": 9.830781627719581e-05, "loss": 0.2188, "step": 1220 }, { "epoch": 0.2462220431190812, "grad_norm": 0.09266602993011475, "learning_rate": 9.84689766317486e-05, "loss": 0.1846, "step": 1222 }, { "epoch": 0.24662502518637922, "grad_norm": 0.09400223940610886, "learning_rate": 9.863013698630137e-05, "loss": 0.2265, "step": 1224 }, { "epoch": 0.2470280072536772, "grad_norm": 0.11411723494529724, "learning_rate": 9.879129734085415e-05, "loss": 0.2147, "step": 1226 }, { "epoch": 0.2474309893209752, "grad_norm": 0.08758651465177536, "learning_rate": 9.895245769540694e-05, "loss": 0.1507, "step": 1228 }, { "epoch": 0.24783397138827323, "grad_norm": 0.09494465589523315, "learning_rate": 9.91136180499597e-05, "loss": 0.2167, "step": 1230 }, { "epoch": 0.24823695345557123, "grad_norm": 0.0853755846619606, "learning_rate": 9.927477840451249e-05, "loss": 0.2135, "step": 1232 }, { "epoch": 0.24863993552286923, "grad_norm": 0.1234959214925766, "learning_rate": 9.943593875906527e-05, "loss": 0.2236, "step": 1234 }, { "epoch": 0.24904291759016725, "grad_norm": 0.0904574766755104, "learning_rate": 9.959709911361806e-05, "loss": 0.2291, "step": 1236 }, { "epoch": 0.24944589965746525, "grad_norm": 0.06450655311346054, "learning_rate": 9.975825946817083e-05, "loss": 0.2311, "step": 1238 }, { "epoch": 0.24984888172476324, "grad_norm": 0.08350057154893875, "learning_rate": 9.991941982272361e-05, "loss": 0.2108, "step": 1240 }, { "epoch": 0.25025186379206127, "grad_norm": 0.06631229817867279, "learning_rate": 9.999999955601e-05, "loss": 0.1971, "step": 1242 }, { "epoch": 0.25065484585935927, "grad_norm": 0.09298577904701233, "learning_rate": 9.999999600409e-05, "loss": 0.2115, "step": 1244 }, { "epoch": 0.25105782792665726, "grad_norm": 0.0637964978814125, "learning_rate": 9.999998890025024e-05, "loss": 0.2547, "step": 1246 }, { "epoch": 0.25146080999395526, "grad_norm": 0.08004257082939148, "learning_rate": 9.999997824449123e-05, "loss": 0.2039, "step": 1248 }, { "epoch": 0.25186379206125326, "grad_norm": 0.06306260079145432, "learning_rate": 9.999996403681373e-05, "loss": 0.2126, "step": 1250 }, { "epoch": 0.25226677412855125, "grad_norm": 0.06148292124271393, "learning_rate": 9.999994627721875e-05, "loss": 0.1845, "step": 1252 }, { "epoch": 0.2526697561958493, "grad_norm": 0.06277811527252197, "learning_rate": 9.999992496570755e-05, "loss": 0.1993, "step": 1254 }, { "epoch": 0.2530727382631473, "grad_norm": 0.06535515934228897, "learning_rate": 9.999990010228164e-05, "loss": 0.1837, "step": 1256 }, { "epoch": 0.2534757203304453, "grad_norm": 0.07780063897371292, "learning_rate": 9.99998716869428e-05, "loss": 0.1825, "step": 1258 }, { "epoch": 0.2538787023977433, "grad_norm": 0.0873618796467781, "learning_rate": 9.999983971969302e-05, "loss": 0.2225, "step": 1260 }, { "epoch": 0.2542816844650413, "grad_norm": 0.08165138959884644, "learning_rate": 9.99998042005346e-05, "loss": 0.2264, "step": 1262 }, { "epoch": 0.2546846665323393, "grad_norm": 0.07608946412801743, "learning_rate": 9.999976512947007e-05, "loss": 0.2499, "step": 1264 }, { "epoch": 0.25508764859963734, "grad_norm": 0.08392094075679779, "learning_rate": 9.999972250650215e-05, "loss": 0.1677, "step": 1266 }, { "epoch": 0.25549063066693534, "grad_norm": 0.2400495409965515, "learning_rate": 9.999967633163394e-05, "loss": 0.1398, "step": 1268 }, { "epoch": 0.25589361273423333, "grad_norm": 0.09449879825115204, "learning_rate": 9.999962660486868e-05, "loss": 0.2168, "step": 1270 }, { "epoch": 0.25629659480153133, "grad_norm": 0.063988097012043, "learning_rate": 9.999957332620989e-05, "loss": 0.2091, "step": 1272 }, { "epoch": 0.2566995768688293, "grad_norm": 0.27040895819664, "learning_rate": 9.999951649566139e-05, "loss": 0.1982, "step": 1274 }, { "epoch": 0.2571025589361273, "grad_norm": 0.07855580747127533, "learning_rate": 9.999945611322719e-05, "loss": 0.2506, "step": 1276 }, { "epoch": 0.2575055410034253, "grad_norm": 0.2895398437976837, "learning_rate": 9.99993921789116e-05, "loss": 0.2104, "step": 1278 }, { "epoch": 0.25790852307072337, "grad_norm": 0.06772362440824509, "learning_rate": 9.999932469271915e-05, "loss": 0.2338, "step": 1280 }, { "epoch": 0.25831150513802137, "grad_norm": 0.14966493844985962, "learning_rate": 9.999925365465463e-05, "loss": 0.2066, "step": 1282 }, { "epoch": 0.25871448720531937, "grad_norm": 0.06578774750232697, "learning_rate": 9.99991790647231e-05, "loss": 0.1908, "step": 1284 }, { "epoch": 0.25911746927261736, "grad_norm": 0.17319980263710022, "learning_rate": 9.999910092292985e-05, "loss": 0.1874, "step": 1286 }, { "epoch": 0.25952045133991536, "grad_norm": 0.06658493727445602, "learning_rate": 9.999901922928042e-05, "loss": 0.2024, "step": 1288 }, { "epoch": 0.25992343340721336, "grad_norm": 0.08135207742452621, "learning_rate": 9.999893398378064e-05, "loss": 0.2109, "step": 1290 }, { "epoch": 0.2603264154745114, "grad_norm": 0.061190057545900345, "learning_rate": 9.999884518643654e-05, "loss": 0.1343, "step": 1292 }, { "epoch": 0.2607293975418094, "grad_norm": 0.08722230792045593, "learning_rate": 9.999875283725444e-05, "loss": 0.2028, "step": 1294 }, { "epoch": 0.2611323796091074, "grad_norm": 0.09891603142023087, "learning_rate": 9.999865693624091e-05, "loss": 0.1758, "step": 1296 }, { "epoch": 0.2615353616764054, "grad_norm": 0.10503659397363663, "learning_rate": 9.999855748340274e-05, "loss": 0.1831, "step": 1298 }, { "epoch": 0.2619383437437034, "grad_norm": 0.08791584521532059, "learning_rate": 9.999845447874702e-05, "loss": 0.2349, "step": 1300 }, { "epoch": 0.2623413258110014, "grad_norm": 0.10015831142663956, "learning_rate": 9.999834792228105e-05, "loss": 0.2115, "step": 1302 }, { "epoch": 0.2627443078782994, "grad_norm": 0.08647844940423965, "learning_rate": 9.99982378140124e-05, "loss": 0.1913, "step": 1304 }, { "epoch": 0.26314728994559744, "grad_norm": 0.15039680898189545, "learning_rate": 9.999812415394891e-05, "loss": 0.2008, "step": 1306 }, { "epoch": 0.26355027201289544, "grad_norm": 0.08669546246528625, "learning_rate": 9.999800694209862e-05, "loss": 0.2544, "step": 1308 }, { "epoch": 0.26395325408019343, "grad_norm": 0.07216177880764008, "learning_rate": 9.999788617846989e-05, "loss": 0.1602, "step": 1310 }, { "epoch": 0.26435623614749143, "grad_norm": 0.1571531444787979, "learning_rate": 9.999776186307129e-05, "loss": 0.2214, "step": 1312 }, { "epoch": 0.2647592182147894, "grad_norm": 0.07496760785579681, "learning_rate": 9.999763399591162e-05, "loss": 0.2304, "step": 1314 }, { "epoch": 0.2651622002820874, "grad_norm": 0.06391184777021408, "learning_rate": 9.999750257700002e-05, "loss": 0.2287, "step": 1316 }, { "epoch": 0.2655651823493855, "grad_norm": 0.08557723462581635, "learning_rate": 9.999736760634578e-05, "loss": 0.2596, "step": 1318 }, { "epoch": 0.2659681644166835, "grad_norm": 0.0653078481554985, "learning_rate": 9.999722908395851e-05, "loss": 0.193, "step": 1320 }, { "epoch": 0.26637114648398147, "grad_norm": 0.07708454132080078, "learning_rate": 9.999708700984804e-05, "loss": 0.2117, "step": 1322 }, { "epoch": 0.26677412855127947, "grad_norm": 0.07978586852550507, "learning_rate": 9.999694138402448e-05, "loss": 0.1499, "step": 1324 }, { "epoch": 0.26717711061857746, "grad_norm": 0.18177969753742218, "learning_rate": 9.999679220649815e-05, "loss": 0.1992, "step": 1326 }, { "epoch": 0.26758009268587546, "grad_norm": 0.08391742408275604, "learning_rate": 9.999663947727966e-05, "loss": 0.1551, "step": 1328 }, { "epoch": 0.2679830747531735, "grad_norm": 0.06364039331674576, "learning_rate": 9.999648319637986e-05, "loss": 0.2108, "step": 1330 }, { "epoch": 0.2683860568204715, "grad_norm": 0.08180107921361923, "learning_rate": 9.999632336380986e-05, "loss": 0.2298, "step": 1332 }, { "epoch": 0.2687890388877695, "grad_norm": 0.08764016628265381, "learning_rate": 9.999615997958101e-05, "loss": 0.2359, "step": 1334 }, { "epoch": 0.2691920209550675, "grad_norm": 0.07102248817682266, "learning_rate": 9.999599304370489e-05, "loss": 0.2487, "step": 1336 }, { "epoch": 0.2695950030223655, "grad_norm": 0.07571367919445038, "learning_rate": 9.99958225561934e-05, "loss": 0.2138, "step": 1338 }, { "epoch": 0.2699979850896635, "grad_norm": 0.09333647787570953, "learning_rate": 9.999564851705862e-05, "loss": 0.1944, "step": 1340 }, { "epoch": 0.2704009671569615, "grad_norm": 0.06818639487028122, "learning_rate": 9.999547092631293e-05, "loss": 0.1859, "step": 1342 }, { "epoch": 0.27080394922425954, "grad_norm": 0.07819613069295883, "learning_rate": 9.999528978396895e-05, "loss": 0.2134, "step": 1344 }, { "epoch": 0.27120693129155754, "grad_norm": 0.07632673531770706, "learning_rate": 9.999510509003953e-05, "loss": 0.1645, "step": 1346 }, { "epoch": 0.27160991335885554, "grad_norm": 0.08994800597429276, "learning_rate": 9.99949168445378e-05, "loss": 0.2005, "step": 1348 }, { "epoch": 0.27201289542615353, "grad_norm": 0.07124733179807663, "learning_rate": 9.999472504747714e-05, "loss": 0.215, "step": 1350 }, { "epoch": 0.27241587749345153, "grad_norm": 0.0745796486735344, "learning_rate": 9.999452969887116e-05, "loss": 0.2201, "step": 1352 }, { "epoch": 0.2728188595607495, "grad_norm": 0.09241003543138504, "learning_rate": 9.999433079873372e-05, "loss": 0.2026, "step": 1354 }, { "epoch": 0.2732218416280476, "grad_norm": 0.07417726516723633, "learning_rate": 9.999412834707902e-05, "loss": 0.1683, "step": 1356 }, { "epoch": 0.2736248236953456, "grad_norm": 0.07573942840099335, "learning_rate": 9.999392234392138e-05, "loss": 0.2068, "step": 1358 }, { "epoch": 0.2740278057626436, "grad_norm": 0.08692026883363724, "learning_rate": 9.999371278927543e-05, "loss": 0.23, "step": 1360 }, { "epoch": 0.27443078782994157, "grad_norm": 0.07430460304021835, "learning_rate": 9.99934996831561e-05, "loss": 0.2022, "step": 1362 }, { "epoch": 0.27483376989723957, "grad_norm": 0.09575946629047394, "learning_rate": 9.99932830255785e-05, "loss": 0.1642, "step": 1364 }, { "epoch": 0.27523675196453756, "grad_norm": 0.11394454538822174, "learning_rate": 9.999306281655803e-05, "loss": 0.2194, "step": 1366 }, { "epoch": 0.27563973403183556, "grad_norm": 0.09447552263736725, "learning_rate": 9.99928390561103e-05, "loss": 0.1799, "step": 1368 }, { "epoch": 0.2760427160991336, "grad_norm": 0.09351827204227448, "learning_rate": 9.999261174425127e-05, "loss": 0.1997, "step": 1370 }, { "epoch": 0.2764456981664316, "grad_norm": 0.06894346326589584, "learning_rate": 9.999238088099704e-05, "loss": 0.1359, "step": 1372 }, { "epoch": 0.2768486802337296, "grad_norm": 0.10162418335676193, "learning_rate": 9.999214646636404e-05, "loss": 0.2037, "step": 1374 }, { "epoch": 0.2772516623010276, "grad_norm": 0.08390204608440399, "learning_rate": 9.999190850036889e-05, "loss": 0.2224, "step": 1376 }, { "epoch": 0.2776546443683256, "grad_norm": 0.06677880138158798, "learning_rate": 9.99916669830285e-05, "loss": 0.1331, "step": 1378 }, { "epoch": 0.2780576264356236, "grad_norm": 0.06353922933340073, "learning_rate": 9.999142191436004e-05, "loss": 0.1537, "step": 1380 }, { "epoch": 0.27846060850292165, "grad_norm": 0.079979807138443, "learning_rate": 9.999117329438092e-05, "loss": 0.2154, "step": 1382 }, { "epoch": 0.27886359057021964, "grad_norm": 0.08919129520654678, "learning_rate": 9.999092112310881e-05, "loss": 0.2191, "step": 1384 }, { "epoch": 0.27926657263751764, "grad_norm": 0.06999266147613525, "learning_rate": 9.99906654005616e-05, "loss": 0.2094, "step": 1386 }, { "epoch": 0.27966955470481564, "grad_norm": 0.16508431732654572, "learning_rate": 9.999040612675748e-05, "loss": 0.2204, "step": 1388 }, { "epoch": 0.28007253677211363, "grad_norm": 0.07799568772315979, "learning_rate": 9.999014330171485e-05, "loss": 0.1778, "step": 1390 }, { "epoch": 0.28047551883941163, "grad_norm": 0.08466193079948425, "learning_rate": 9.998987692545239e-05, "loss": 0.1751, "step": 1392 }, { "epoch": 0.2808785009067096, "grad_norm": 0.06041021645069122, "learning_rate": 9.998960699798902e-05, "loss": 0.2082, "step": 1394 }, { "epoch": 0.2812814829740077, "grad_norm": 0.08678558468818665, "learning_rate": 9.99893335193439e-05, "loss": 0.2406, "step": 1396 }, { "epoch": 0.2816844650413057, "grad_norm": 0.07207660377025604, "learning_rate": 9.998905648953649e-05, "loss": 0.1713, "step": 1398 }, { "epoch": 0.2820874471086037, "grad_norm": 0.0904468297958374, "learning_rate": 9.998877590858646e-05, "loss": 0.18, "step": 1400 }, { "epoch": 0.28249042917590167, "grad_norm": 0.09611000120639801, "learning_rate": 9.998849177651371e-05, "loss": 0.218, "step": 1402 }, { "epoch": 0.28289341124319967, "grad_norm": 0.07803361862897873, "learning_rate": 9.998820409333847e-05, "loss": 0.1766, "step": 1404 }, { "epoch": 0.28329639331049766, "grad_norm": 0.07811973243951797, "learning_rate": 9.998791285908115e-05, "loss": 0.1674, "step": 1406 }, { "epoch": 0.2836993753777957, "grad_norm": 0.0652318000793457, "learning_rate": 9.998761807376245e-05, "loss": 0.2013, "step": 1408 }, { "epoch": 0.2841023574450937, "grad_norm": 0.08281879127025604, "learning_rate": 9.998731973740329e-05, "loss": 0.2161, "step": 1410 }, { "epoch": 0.2845053395123917, "grad_norm": 0.06643449515104294, "learning_rate": 9.998701785002489e-05, "loss": 0.1932, "step": 1412 }, { "epoch": 0.2849083215796897, "grad_norm": 0.06392282247543335, "learning_rate": 9.998671241164868e-05, "loss": 0.1612, "step": 1414 }, { "epoch": 0.2853113036469877, "grad_norm": 0.06907442212104797, "learning_rate": 9.998640342229636e-05, "loss": 0.2002, "step": 1416 }, { "epoch": 0.2857142857142857, "grad_norm": 0.07263526320457458, "learning_rate": 9.998609088198988e-05, "loss": 0.2152, "step": 1418 }, { "epoch": 0.2861172677815837, "grad_norm": 0.058846525847911835, "learning_rate": 9.998577479075145e-05, "loss": 0.1925, "step": 1420 }, { "epoch": 0.28652024984888175, "grad_norm": 0.05752931907773018, "learning_rate": 9.998545514860352e-05, "loss": 0.1744, "step": 1422 }, { "epoch": 0.28692323191617974, "grad_norm": 0.07596255093812943, "learning_rate": 9.99851319555688e-05, "loss": 0.1915, "step": 1424 }, { "epoch": 0.28732621398347774, "grad_norm": 0.08544960618019104, "learning_rate": 9.998480521167025e-05, "loss": 0.1749, "step": 1426 }, { "epoch": 0.28772919605077574, "grad_norm": 0.07587330788373947, "learning_rate": 9.998447491693105e-05, "loss": 0.1518, "step": 1428 }, { "epoch": 0.28813217811807373, "grad_norm": 0.07118236273527145, "learning_rate": 9.998414107137471e-05, "loss": 0.1754, "step": 1430 }, { "epoch": 0.28853516018537173, "grad_norm": 0.06920721381902695, "learning_rate": 9.998380367502493e-05, "loss": 0.1693, "step": 1432 }, { "epoch": 0.2889381422526698, "grad_norm": 0.07532205432653427, "learning_rate": 9.998346272790567e-05, "loss": 0.2045, "step": 1434 }, { "epoch": 0.2893411243199678, "grad_norm": 0.07022588700056076, "learning_rate": 9.998311823004114e-05, "loss": 0.1946, "step": 1436 }, { "epoch": 0.2897441063872658, "grad_norm": 0.0767081007361412, "learning_rate": 9.998277018145585e-05, "loss": 0.1599, "step": 1438 }, { "epoch": 0.2901470884545638, "grad_norm": 0.09657610207796097, "learning_rate": 9.998241858217449e-05, "loss": 0.1972, "step": 1440 }, { "epoch": 0.29055007052186177, "grad_norm": 0.06490109115839005, "learning_rate": 9.998206343222205e-05, "loss": 0.2065, "step": 1442 }, { "epoch": 0.29095305258915977, "grad_norm": 0.09028349071741104, "learning_rate": 9.998170473162376e-05, "loss": 0.187, "step": 1444 }, { "epoch": 0.29135603465645776, "grad_norm": 0.08025387674570084, "learning_rate": 9.99813424804051e-05, "loss": 0.2205, "step": 1446 }, { "epoch": 0.2917590167237558, "grad_norm": 0.08773567527532578, "learning_rate": 9.99809766785918e-05, "loss": 0.2921, "step": 1448 }, { "epoch": 0.2921619987910538, "grad_norm": 0.07190251350402832, "learning_rate": 9.998060732620985e-05, "loss": 0.2012, "step": 1450 }, { "epoch": 0.2925649808583518, "grad_norm": 0.08821823447942734, "learning_rate": 9.998023442328549e-05, "loss": 0.1774, "step": 1452 }, { "epoch": 0.2929679629256498, "grad_norm": 0.07339081913232803, "learning_rate": 9.99798579698452e-05, "loss": 0.2081, "step": 1454 }, { "epoch": 0.2933709449929478, "grad_norm": 0.07733795791864395, "learning_rate": 9.997947796591573e-05, "loss": 0.2352, "step": 1456 }, { "epoch": 0.2937739270602458, "grad_norm": 0.061597902327775955, "learning_rate": 9.99790944115241e-05, "loss": 0.2202, "step": 1458 }, { "epoch": 0.29417690912754385, "grad_norm": 0.06928958743810654, "learning_rate": 9.997870730669752e-05, "loss": 0.2337, "step": 1460 }, { "epoch": 0.29457989119484185, "grad_norm": 0.07841575145721436, "learning_rate": 9.997831665146348e-05, "loss": 0.2247, "step": 1462 }, { "epoch": 0.29498287326213984, "grad_norm": 0.08794905245304108, "learning_rate": 9.997792244584978e-05, "loss": 0.2142, "step": 1464 }, { "epoch": 0.29538585532943784, "grad_norm": 0.09079600870609283, "learning_rate": 9.997752468988439e-05, "loss": 0.2001, "step": 1466 }, { "epoch": 0.29578883739673584, "grad_norm": 0.06976237893104553, "learning_rate": 9.997712338359557e-05, "loss": 0.1637, "step": 1468 }, { "epoch": 0.29619181946403383, "grad_norm": 0.06324432045221329, "learning_rate": 9.997671852701185e-05, "loss": 0.1678, "step": 1470 }, { "epoch": 0.29659480153133183, "grad_norm": 0.05822910740971565, "learning_rate": 9.997631012016195e-05, "loss": 0.1582, "step": 1472 }, { "epoch": 0.2969977835986299, "grad_norm": 0.07130776345729828, "learning_rate": 9.997589816307491e-05, "loss": 0.1777, "step": 1474 }, { "epoch": 0.2974007656659279, "grad_norm": 0.0914449617266655, "learning_rate": 9.997548265577998e-05, "loss": 0.2279, "step": 1476 }, { "epoch": 0.2978037477332259, "grad_norm": 0.07395707070827484, "learning_rate": 9.99750635983067e-05, "loss": 0.1786, "step": 1478 }, { "epoch": 0.2982067298005239, "grad_norm": 0.0627974271774292, "learning_rate": 9.997464099068484e-05, "loss": 0.2187, "step": 1480 }, { "epoch": 0.29860971186782187, "grad_norm": 0.08622007071971893, "learning_rate": 9.997421483294438e-05, "loss": 0.2272, "step": 1482 }, { "epoch": 0.29901269393511987, "grad_norm": 0.05741937831044197, "learning_rate": 9.997378512511561e-05, "loss": 0.1968, "step": 1484 }, { "epoch": 0.2994156760024179, "grad_norm": 0.07579632848501205, "learning_rate": 9.997335186722909e-05, "loss": 0.2003, "step": 1486 }, { "epoch": 0.2998186580697159, "grad_norm": 0.06345254182815552, "learning_rate": 9.997291505931558e-05, "loss": 0.2027, "step": 1488 }, { "epoch": 0.3002216401370139, "grad_norm": 0.05749038606882095, "learning_rate": 9.997247470140608e-05, "loss": 0.1728, "step": 1490 }, { "epoch": 0.3006246222043119, "grad_norm": 0.06285730749368668, "learning_rate": 9.99720307935319e-05, "loss": 0.1839, "step": 1492 }, { "epoch": 0.3010276042716099, "grad_norm": 0.08099797368049622, "learning_rate": 9.99715833357246e-05, "loss": 0.2482, "step": 1494 }, { "epoch": 0.3014305863389079, "grad_norm": 0.06453213840723038, "learning_rate": 9.997113232801592e-05, "loss": 0.1932, "step": 1496 }, { "epoch": 0.3018335684062059, "grad_norm": 0.07795906066894531, "learning_rate": 9.99706777704379e-05, "loss": 0.2067, "step": 1498 }, { "epoch": 0.30223655047350395, "grad_norm": 0.06192241236567497, "learning_rate": 9.997021966302287e-05, "loss": 0.2083, "step": 1500 }, { "epoch": 0.30263953254080195, "grad_norm": 0.07832437753677368, "learning_rate": 9.996975800580333e-05, "loss": 0.2086, "step": 1502 }, { "epoch": 0.30304251460809994, "grad_norm": 0.06281822919845581, "learning_rate": 9.996929279881211e-05, "loss": 0.1819, "step": 1504 }, { "epoch": 0.30344549667539794, "grad_norm": 0.08558258414268494, "learning_rate": 9.996882404208224e-05, "loss": 0.1979, "step": 1506 }, { "epoch": 0.30384847874269594, "grad_norm": 0.07973451912403107, "learning_rate": 9.996835173564703e-05, "loss": 0.1916, "step": 1508 }, { "epoch": 0.30425146080999393, "grad_norm": 0.09966539591550827, "learning_rate": 9.996787587954001e-05, "loss": 0.212, "step": 1510 }, { "epoch": 0.304654442877292, "grad_norm": 0.06780754774808884, "learning_rate": 9.996739647379501e-05, "loss": 0.1834, "step": 1512 }, { "epoch": 0.30505742494459, "grad_norm": 0.054154492914676666, "learning_rate": 9.996691351844608e-05, "loss": 0.1977, "step": 1514 }, { "epoch": 0.305460407011888, "grad_norm": 0.08135867863893509, "learning_rate": 9.996642701352752e-05, "loss": 0.2166, "step": 1516 }, { "epoch": 0.305863389079186, "grad_norm": 0.06651759892702103, "learning_rate": 9.99659369590739e-05, "loss": 0.1611, "step": 1518 }, { "epoch": 0.306266371146484, "grad_norm": 0.07462992519140244, "learning_rate": 9.996544335512001e-05, "loss": 0.2129, "step": 1520 }, { "epoch": 0.30666935321378197, "grad_norm": 0.0842273160815239, "learning_rate": 9.996494620170094e-05, "loss": 0.1879, "step": 1522 }, { "epoch": 0.30707233528107997, "grad_norm": 0.06992574781179428, "learning_rate": 9.9964445498852e-05, "loss": 0.2261, "step": 1524 }, { "epoch": 0.307475317348378, "grad_norm": 0.1636413186788559, "learning_rate": 9.996394124660876e-05, "loss": 0.2208, "step": 1526 }, { "epoch": 0.307878299415676, "grad_norm": 0.07403954118490219, "learning_rate": 9.996343344500705e-05, "loss": 0.1993, "step": 1528 }, { "epoch": 0.308281281482974, "grad_norm": 0.058403730392456055, "learning_rate": 9.996292209408291e-05, "loss": 0.2463, "step": 1530 }, { "epoch": 0.308684263550272, "grad_norm": 0.06864384561777115, "learning_rate": 9.996240719387271e-05, "loss": 0.2136, "step": 1532 }, { "epoch": 0.30908724561757, "grad_norm": 0.1078968495130539, "learning_rate": 9.996188874441298e-05, "loss": 0.2107, "step": 1534 }, { "epoch": 0.309490227684868, "grad_norm": 0.06026478856801987, "learning_rate": 9.996136674574059e-05, "loss": 0.1582, "step": 1536 }, { "epoch": 0.30989320975216605, "grad_norm": 0.059033844619989395, "learning_rate": 9.996084119789262e-05, "loss": 0.1873, "step": 1538 }, { "epoch": 0.31029619181946405, "grad_norm": 0.05242142453789711, "learning_rate": 9.996031210090637e-05, "loss": 0.2194, "step": 1540 }, { "epoch": 0.31069917388676205, "grad_norm": 0.07254286110401154, "learning_rate": 9.995977945481946e-05, "loss": 0.2227, "step": 1542 }, { "epoch": 0.31110215595406004, "grad_norm": 0.07154665887355804, "learning_rate": 9.995924325966973e-05, "loss": 0.178, "step": 1544 }, { "epoch": 0.31150513802135804, "grad_norm": 0.08451339602470398, "learning_rate": 9.995870351549523e-05, "loss": 0.2361, "step": 1546 }, { "epoch": 0.31190812008865604, "grad_norm": 0.06354124844074249, "learning_rate": 9.995816022233435e-05, "loss": 0.1935, "step": 1548 }, { "epoch": 0.31231110215595403, "grad_norm": 0.05122746527194977, "learning_rate": 9.995761338022566e-05, "loss": 0.1893, "step": 1550 }, { "epoch": 0.3127140842232521, "grad_norm": 0.06881532818078995, "learning_rate": 9.9957062989208e-05, "loss": 0.1638, "step": 1552 }, { "epoch": 0.3131170662905501, "grad_norm": 0.06362321227788925, "learning_rate": 9.99565090493205e-05, "loss": 0.1889, "step": 1554 }, { "epoch": 0.3135200483578481, "grad_norm": 0.07629745453596115, "learning_rate": 9.995595156060246e-05, "loss": 0.2519, "step": 1556 }, { "epoch": 0.3139230304251461, "grad_norm": 0.05831436440348625, "learning_rate": 9.995539052309353e-05, "loss": 0.1613, "step": 1558 }, { "epoch": 0.3143260124924441, "grad_norm": 0.08071257919073105, "learning_rate": 9.995482593683356e-05, "loss": 0.2155, "step": 1560 }, { "epoch": 0.31472899455974207, "grad_norm": 0.06999702006578445, "learning_rate": 9.995425780186263e-05, "loss": 0.2026, "step": 1562 }, { "epoch": 0.3151319766270401, "grad_norm": 0.08394176512956619, "learning_rate": 9.995368611822113e-05, "loss": 0.2288, "step": 1564 }, { "epoch": 0.3155349586943381, "grad_norm": 0.0686328262090683, "learning_rate": 9.995311088594966e-05, "loss": 0.1879, "step": 1566 }, { "epoch": 0.3159379407616361, "grad_norm": 0.06367763131856918, "learning_rate": 9.995253210508906e-05, "loss": 0.198, "step": 1568 }, { "epoch": 0.3163409228289341, "grad_norm": 0.0799856036901474, "learning_rate": 9.995194977568047e-05, "loss": 0.2044, "step": 1570 }, { "epoch": 0.3167439048962321, "grad_norm": 0.059465620666742325, "learning_rate": 9.995136389776527e-05, "loss": 0.2186, "step": 1572 }, { "epoch": 0.3171468869635301, "grad_norm": 0.07960893958806992, "learning_rate": 9.995077447138506e-05, "loss": 0.2338, "step": 1574 }, { "epoch": 0.3175498690308281, "grad_norm": 0.05532738193869591, "learning_rate": 9.995018149658171e-05, "loss": 0.1919, "step": 1576 }, { "epoch": 0.31795285109812615, "grad_norm": 0.06862778216600418, "learning_rate": 9.994958497339735e-05, "loss": 0.165, "step": 1578 }, { "epoch": 0.31835583316542415, "grad_norm": 0.07541234791278839, "learning_rate": 9.994898490187434e-05, "loss": 0.211, "step": 1580 }, { "epoch": 0.31875881523272215, "grad_norm": 0.05714013800024986, "learning_rate": 9.994838128205535e-05, "loss": 0.2176, "step": 1582 }, { "epoch": 0.31916179730002014, "grad_norm": 0.08829407393932343, "learning_rate": 9.994777411398323e-05, "loss": 0.1978, "step": 1584 }, { "epoch": 0.31956477936731814, "grad_norm": 0.11317098140716553, "learning_rate": 9.994716339770111e-05, "loss": 0.2177, "step": 1586 }, { "epoch": 0.31996776143461614, "grad_norm": 0.05632421001791954, "learning_rate": 9.994654913325239e-05, "loss": 0.1938, "step": 1588 }, { "epoch": 0.3203707435019142, "grad_norm": 0.1002504974603653, "learning_rate": 9.994593132068068e-05, "loss": 0.1982, "step": 1590 }, { "epoch": 0.3207737255692122, "grad_norm": 0.07449984550476074, "learning_rate": 9.99453099600299e-05, "loss": 0.1514, "step": 1592 }, { "epoch": 0.3211767076365102, "grad_norm": 0.06601531058549881, "learning_rate": 9.994468505134417e-05, "loss": 0.2112, "step": 1594 }, { "epoch": 0.3215796897038082, "grad_norm": 0.08076255768537521, "learning_rate": 9.994405659466791e-05, "loss": 0.2217, "step": 1596 }, { "epoch": 0.3219826717711062, "grad_norm": 0.06299552321434021, "learning_rate": 9.994342459004571e-05, "loss": 0.2193, "step": 1598 }, { "epoch": 0.3223856538384042, "grad_norm": 0.07977569848299026, "learning_rate": 9.994278903752252e-05, "loss": 0.1955, "step": 1600 }, { "epoch": 0.32278863590570217, "grad_norm": 0.06248839944601059, "learning_rate": 9.994214993714346e-05, "loss": 0.1976, "step": 1602 }, { "epoch": 0.3231916179730002, "grad_norm": 0.09939960390329361, "learning_rate": 9.994150728895394e-05, "loss": 0.1709, "step": 1604 }, { "epoch": 0.3235946000402982, "grad_norm": 0.07984374463558197, "learning_rate": 9.994086109299961e-05, "loss": 0.2249, "step": 1606 }, { "epoch": 0.3239975821075962, "grad_norm": 0.08359546959400177, "learning_rate": 9.994021134932638e-05, "loss": 0.234, "step": 1608 }, { "epoch": 0.3244005641748942, "grad_norm": 0.05540497601032257, "learning_rate": 9.993955805798041e-05, "loss": 0.1827, "step": 1610 }, { "epoch": 0.3248035462421922, "grad_norm": 0.06334353983402252, "learning_rate": 9.993890121900809e-05, "loss": 0.1989, "step": 1612 }, { "epoch": 0.3252065283094902, "grad_norm": 0.0763341635465622, "learning_rate": 9.99382408324561e-05, "loss": 0.1968, "step": 1614 }, { "epoch": 0.32560951037678826, "grad_norm": 0.0655263289809227, "learning_rate": 9.993757689837135e-05, "loss": 0.1874, "step": 1616 }, { "epoch": 0.32601249244408625, "grad_norm": 0.06662862002849579, "learning_rate": 9.993690941680098e-05, "loss": 0.1996, "step": 1618 }, { "epoch": 0.32641547451138425, "grad_norm": 0.06894282251596451, "learning_rate": 9.993623838779244e-05, "loss": 0.2167, "step": 1620 }, { "epoch": 0.32681845657868225, "grad_norm": 0.10926174372434616, "learning_rate": 9.993556381139339e-05, "loss": 0.2293, "step": 1622 }, { "epoch": 0.32722143864598024, "grad_norm": 0.057909030467271805, "learning_rate": 9.993488568765175e-05, "loss": 0.192, "step": 1624 }, { "epoch": 0.32762442071327824, "grad_norm": 0.06387782841920853, "learning_rate": 9.993420401661569e-05, "loss": 0.1674, "step": 1626 }, { "epoch": 0.32802740278057624, "grad_norm": 0.07918224483728409, "learning_rate": 9.993351879833363e-05, "loss": 0.1965, "step": 1628 }, { "epoch": 0.3284303848478743, "grad_norm": 0.07043313980102539, "learning_rate": 9.993283003285425e-05, "loss": 0.1587, "step": 1630 }, { "epoch": 0.3288333669151723, "grad_norm": 0.06445840746164322, "learning_rate": 9.993213772022648e-05, "loss": 0.1999, "step": 1632 }, { "epoch": 0.3292363489824703, "grad_norm": 0.07198817282915115, "learning_rate": 9.99314418604995e-05, "loss": 0.224, "step": 1634 }, { "epoch": 0.3296393310497683, "grad_norm": 0.062218908220529556, "learning_rate": 9.993074245372276e-05, "loss": 0.2172, "step": 1636 }, { "epoch": 0.3300423131170663, "grad_norm": 0.09242686629295349, "learning_rate": 9.99300394999459e-05, "loss": 0.1877, "step": 1638 }, { "epoch": 0.3304452951843643, "grad_norm": 0.05355711653828621, "learning_rate": 9.992933299921891e-05, "loss": 0.2071, "step": 1640 }, { "epoch": 0.3308482772516623, "grad_norm": 0.05850759148597717, "learning_rate": 9.992862295159193e-05, "loss": 0.157, "step": 1642 }, { "epoch": 0.3312512593189603, "grad_norm": 0.06185588613152504, "learning_rate": 9.992790935711544e-05, "loss": 0.1621, "step": 1644 }, { "epoch": 0.3316542413862583, "grad_norm": 0.07613895833492279, "learning_rate": 9.992719221584012e-05, "loss": 0.2315, "step": 1646 }, { "epoch": 0.3320572234535563, "grad_norm": 0.09143196791410446, "learning_rate": 9.99264715278169e-05, "loss": 0.2347, "step": 1648 }, { "epoch": 0.3324602055208543, "grad_norm": 0.08874918520450592, "learning_rate": 9.992574729309701e-05, "loss": 0.2206, "step": 1650 }, { "epoch": 0.3328631875881523, "grad_norm": 0.11054286360740662, "learning_rate": 9.992501951173186e-05, "loss": 0.2128, "step": 1652 }, { "epoch": 0.3332661696554503, "grad_norm": 0.05944681912660599, "learning_rate": 9.992428818377318e-05, "loss": 0.1919, "step": 1654 }, { "epoch": 0.33366915172274836, "grad_norm": 0.09661544114351273, "learning_rate": 9.992355330927288e-05, "loss": 0.2246, "step": 1656 }, { "epoch": 0.33407213379004636, "grad_norm": 0.05847775191068649, "learning_rate": 9.992281488828322e-05, "loss": 0.1888, "step": 1658 }, { "epoch": 0.33447511585734435, "grad_norm": 0.05200980231165886, "learning_rate": 9.992207292085662e-05, "loss": 0.147, "step": 1660 }, { "epoch": 0.33487809792464235, "grad_norm": 0.07436300069093704, "learning_rate": 9.99213274070458e-05, "loss": 0.1989, "step": 1662 }, { "epoch": 0.33528107999194035, "grad_norm": 0.10399733483791351, "learning_rate": 9.992057834690373e-05, "loss": 0.2312, "step": 1664 }, { "epoch": 0.33568406205923834, "grad_norm": 0.05904490128159523, "learning_rate": 9.99198257404836e-05, "loss": 0.1982, "step": 1666 }, { "epoch": 0.3360870441265364, "grad_norm": 0.08844948559999466, "learning_rate": 9.991906958783887e-05, "loss": 0.1843, "step": 1668 }, { "epoch": 0.3364900261938344, "grad_norm": 0.06130826473236084, "learning_rate": 9.991830988902328e-05, "loss": 0.2058, "step": 1670 }, { "epoch": 0.3368930082611324, "grad_norm": 0.06297429651021957, "learning_rate": 9.99175466440908e-05, "loss": 0.1825, "step": 1672 }, { "epoch": 0.3372959903284304, "grad_norm": 0.058773405849933624, "learning_rate": 9.991677985309563e-05, "loss": 0.245, "step": 1674 }, { "epoch": 0.3376989723957284, "grad_norm": 0.06418730318546295, "learning_rate": 9.991600951609226e-05, "loss": 0.2072, "step": 1676 }, { "epoch": 0.3381019544630264, "grad_norm": 0.07122557610273361, "learning_rate": 9.991523563313538e-05, "loss": 0.1764, "step": 1678 }, { "epoch": 0.3385049365303244, "grad_norm": 0.08291307836771011, "learning_rate": 9.991445820428e-05, "loss": 0.1887, "step": 1680 }, { "epoch": 0.3389079185976224, "grad_norm": 0.09296026825904846, "learning_rate": 9.991367722958134e-05, "loss": 0.2177, "step": 1682 }, { "epoch": 0.3393109006649204, "grad_norm": 0.08643902093172073, "learning_rate": 9.991289270909488e-05, "loss": 0.1744, "step": 1684 }, { "epoch": 0.3397138827322184, "grad_norm": 0.06946311146020889, "learning_rate": 9.991210464287633e-05, "loss": 0.237, "step": 1686 }, { "epoch": 0.3401168647995164, "grad_norm": 0.06480636447668076, "learning_rate": 9.99113130309817e-05, "loss": 0.2472, "step": 1688 }, { "epoch": 0.3405198468668144, "grad_norm": 0.0569583997130394, "learning_rate": 9.991051787346721e-05, "loss": 0.2188, "step": 1690 }, { "epoch": 0.3409228289341124, "grad_norm": 0.06187818944454193, "learning_rate": 9.990971917038933e-05, "loss": 0.2236, "step": 1692 }, { "epoch": 0.34132581100141046, "grad_norm": 0.07523096352815628, "learning_rate": 9.990891692180485e-05, "loss": 0.233, "step": 1694 }, { "epoch": 0.34172879306870846, "grad_norm": 0.06507721543312073, "learning_rate": 9.990811112777072e-05, "loss": 0.1959, "step": 1696 }, { "epoch": 0.34213177513600646, "grad_norm": 0.08201830089092255, "learning_rate": 9.99073017883442e-05, "loss": 0.1792, "step": 1698 }, { "epoch": 0.34253475720330445, "grad_norm": 0.06180296465754509, "learning_rate": 9.990648890358277e-05, "loss": 0.18, "step": 1700 }, { "epoch": 0.34293773927060245, "grad_norm": 0.09496378898620605, "learning_rate": 9.990567247354416e-05, "loss": 0.2402, "step": 1702 }, { "epoch": 0.34334072133790045, "grad_norm": 0.06735090166330338, "learning_rate": 9.990485249828641e-05, "loss": 0.1585, "step": 1704 }, { "epoch": 0.34374370340519844, "grad_norm": 0.060190655291080475, "learning_rate": 9.990402897786775e-05, "loss": 0.22, "step": 1706 }, { "epoch": 0.3441466854724965, "grad_norm": 0.08018799871206284, "learning_rate": 9.990320191234667e-05, "loss": 0.2305, "step": 1708 }, { "epoch": 0.3445496675397945, "grad_norm": 0.09596269577741623, "learning_rate": 9.990237130178194e-05, "loss": 0.2066, "step": 1710 }, { "epoch": 0.3449526496070925, "grad_norm": 0.07448262721300125, "learning_rate": 9.990153714623257e-05, "loss": 0.1807, "step": 1712 }, { "epoch": 0.3453556316743905, "grad_norm": 0.07886528223752975, "learning_rate": 9.99006994457578e-05, "loss": 0.2259, "step": 1714 }, { "epoch": 0.3457586137416885, "grad_norm": 0.13016141951084137, "learning_rate": 9.989985820041714e-05, "loss": 0.2409, "step": 1716 }, { "epoch": 0.3461615958089865, "grad_norm": 0.06906406581401825, "learning_rate": 9.989901341027037e-05, "loss": 0.1858, "step": 1718 }, { "epoch": 0.34656457787628453, "grad_norm": 0.09606174379587173, "learning_rate": 9.989816507537748e-05, "loss": 0.2201, "step": 1720 }, { "epoch": 0.3469675599435825, "grad_norm": 0.0815054252743721, "learning_rate": 9.989731319579873e-05, "loss": 0.1897, "step": 1722 }, { "epoch": 0.3473705420108805, "grad_norm": 0.08122877776622772, "learning_rate": 9.989645777159467e-05, "loss": 0.1985, "step": 1724 }, { "epoch": 0.3477735240781785, "grad_norm": 0.06756141036748886, "learning_rate": 9.989559880282604e-05, "loss": 0.1802, "step": 1726 }, { "epoch": 0.3481765061454765, "grad_norm": 0.1047460213303566, "learning_rate": 9.989473628955387e-05, "loss": 0.1752, "step": 1728 }, { "epoch": 0.3485794882127745, "grad_norm": 0.08227745443582535, "learning_rate": 9.989387023183943e-05, "loss": 0.1597, "step": 1730 }, { "epoch": 0.3489824702800725, "grad_norm": 0.0924132764339447, "learning_rate": 9.989300062974424e-05, "loss": 0.1911, "step": 1732 }, { "epoch": 0.34938545234737056, "grad_norm": 0.05753399431705475, "learning_rate": 9.989212748333008e-05, "loss": 0.1474, "step": 1734 }, { "epoch": 0.34978843441466856, "grad_norm": 0.10338200628757477, "learning_rate": 9.989125079265896e-05, "loss": 0.1717, "step": 1736 }, { "epoch": 0.35019141648196656, "grad_norm": 0.0935504212975502, "learning_rate": 9.989037055779318e-05, "loss": 0.2091, "step": 1738 }, { "epoch": 0.35059439854926455, "grad_norm": 0.06872272491455078, "learning_rate": 9.988948677879528e-05, "loss": 0.1865, "step": 1740 }, { "epoch": 0.35099738061656255, "grad_norm": 0.08346632122993469, "learning_rate": 9.988859945572802e-05, "loss": 0.1935, "step": 1742 }, { "epoch": 0.35140036268386055, "grad_norm": 0.07062527537345886, "learning_rate": 9.988770858865441e-05, "loss": 0.2459, "step": 1744 }, { "epoch": 0.3518033447511586, "grad_norm": 0.08531786501407623, "learning_rate": 9.98868141776378e-05, "loss": 0.2299, "step": 1746 }, { "epoch": 0.3522063268184566, "grad_norm": 0.05672174692153931, "learning_rate": 9.988591622274169e-05, "loss": 0.164, "step": 1748 }, { "epoch": 0.3526093088857546, "grad_norm": 0.05787867307662964, "learning_rate": 9.988501472402984e-05, "loss": 0.2275, "step": 1750 }, { "epoch": 0.3530122909530526, "grad_norm": 0.09398354589939117, "learning_rate": 9.988410968156637e-05, "loss": 0.2173, "step": 1752 }, { "epoch": 0.3534152730203506, "grad_norm": 0.06995268166065216, "learning_rate": 9.988320109541549e-05, "loss": 0.2, "step": 1754 }, { "epoch": 0.3538182550876486, "grad_norm": 0.061387669295072556, "learning_rate": 9.98822889656418e-05, "loss": 0.2059, "step": 1756 }, { "epoch": 0.3542212371549466, "grad_norm": 0.06817382574081421, "learning_rate": 9.988137329231007e-05, "loss": 0.2139, "step": 1758 }, { "epoch": 0.35462421922224463, "grad_norm": 0.2687901258468628, "learning_rate": 9.988045407548534e-05, "loss": 0.2202, "step": 1760 }, { "epoch": 0.3550272012895426, "grad_norm": 0.07906468957662582, "learning_rate": 9.987953131523295e-05, "loss": 0.2178, "step": 1762 }, { "epoch": 0.3554301833568406, "grad_norm": 0.06844101846218109, "learning_rate": 9.987860501161841e-05, "loss": 0.197, "step": 1764 }, { "epoch": 0.3558331654241386, "grad_norm": 0.06971251219511032, "learning_rate": 9.987767516470754e-05, "loss": 0.2172, "step": 1766 }, { "epoch": 0.3562361474914366, "grad_norm": 0.06165212765336037, "learning_rate": 9.98767417745664e-05, "loss": 0.2212, "step": 1768 }, { "epoch": 0.3566391295587346, "grad_norm": 0.07242526859045029, "learning_rate": 9.987580484126129e-05, "loss": 0.2357, "step": 1770 }, { "epoch": 0.35704211162603267, "grad_norm": 0.07110414654016495, "learning_rate": 9.987486436485877e-05, "loss": 0.2393, "step": 1772 }, { "epoch": 0.35744509369333066, "grad_norm": 0.06459176540374756, "learning_rate": 9.987392034542564e-05, "loss": 0.1703, "step": 1774 }, { "epoch": 0.35784807576062866, "grad_norm": 0.0674254521727562, "learning_rate": 9.987297278302898e-05, "loss": 0.2301, "step": 1776 }, { "epoch": 0.35825105782792666, "grad_norm": 0.054965537041425705, "learning_rate": 9.987202167773609e-05, "loss": 0.1904, "step": 1778 }, { "epoch": 0.35865403989522465, "grad_norm": 0.08467627316713333, "learning_rate": 9.987106702961453e-05, "loss": 0.1696, "step": 1780 }, { "epoch": 0.35905702196252265, "grad_norm": 0.0902966558933258, "learning_rate": 9.987010883873214e-05, "loss": 0.228, "step": 1782 }, { "epoch": 0.35946000402982065, "grad_norm": 0.12665781378746033, "learning_rate": 9.986914710515697e-05, "loss": 0.2427, "step": 1784 }, { "epoch": 0.3598629860971187, "grad_norm": 0.08858704566955566, "learning_rate": 9.986818182895734e-05, "loss": 0.1922, "step": 1786 }, { "epoch": 0.3602659681644167, "grad_norm": 0.08362315595149994, "learning_rate": 9.986721301020181e-05, "loss": 0.2165, "step": 1788 }, { "epoch": 0.3606689502317147, "grad_norm": 0.06421255320310593, "learning_rate": 9.986624064895924e-05, "loss": 0.2342, "step": 1790 }, { "epoch": 0.3610719322990127, "grad_norm": 0.0623495914041996, "learning_rate": 9.986526474529868e-05, "loss": 0.1901, "step": 1792 }, { "epoch": 0.3614749143663107, "grad_norm": 0.16917704045772552, "learning_rate": 9.986428529928946e-05, "loss": 0.222, "step": 1794 }, { "epoch": 0.3618778964336087, "grad_norm": 0.19206245243549347, "learning_rate": 9.986330231100116e-05, "loss": 0.2488, "step": 1796 }, { "epoch": 0.36228087850090673, "grad_norm": 0.08742326498031616, "learning_rate": 9.986231578050361e-05, "loss": 0.2056, "step": 1798 }, { "epoch": 0.36268386056820473, "grad_norm": 0.616911768913269, "learning_rate": 9.986132570786688e-05, "loss": 0.2006, "step": 1800 }, { "epoch": 0.3630868426355027, "grad_norm": 0.06567348539829254, "learning_rate": 9.986033209316132e-05, "loss": 0.1604, "step": 1802 }, { "epoch": 0.3634898247028007, "grad_norm": 0.06277676671743393, "learning_rate": 9.98593349364575e-05, "loss": 0.1995, "step": 1804 }, { "epoch": 0.3638928067700987, "grad_norm": 0.08555983006954193, "learning_rate": 9.985833423782626e-05, "loss": 0.2083, "step": 1806 }, { "epoch": 0.3642957888373967, "grad_norm": 0.07542190700769424, "learning_rate": 9.985732999733872e-05, "loss": 0.2448, "step": 1808 }, { "epoch": 0.3646987709046947, "grad_norm": 0.06384123116731644, "learning_rate": 9.985632221506617e-05, "loss": 0.1966, "step": 1810 }, { "epoch": 0.36510175297199277, "grad_norm": 0.08267765492200851, "learning_rate": 9.985531089108023e-05, "loss": 0.1711, "step": 1812 }, { "epoch": 0.36550473503929076, "grad_norm": 0.05912201106548309, "learning_rate": 9.985429602545274e-05, "loss": 0.2477, "step": 1814 }, { "epoch": 0.36590771710658876, "grad_norm": 0.08566808700561523, "learning_rate": 9.985327761825577e-05, "loss": 0.1865, "step": 1816 }, { "epoch": 0.36631069917388676, "grad_norm": 0.06715419143438339, "learning_rate": 9.98522556695617e-05, "loss": 0.2159, "step": 1818 }, { "epoch": 0.36671368124118475, "grad_norm": 0.08008668571710587, "learning_rate": 9.985123017944311e-05, "loss": 0.177, "step": 1820 }, { "epoch": 0.36711666330848275, "grad_norm": 0.07066267728805542, "learning_rate": 9.985020114797287e-05, "loss": 0.207, "step": 1822 }, { "epoch": 0.3675196453757808, "grad_norm": 0.1034514307975769, "learning_rate": 9.984916857522404e-05, "loss": 0.1647, "step": 1824 }, { "epoch": 0.3679226274430788, "grad_norm": 0.0847577378153801, "learning_rate": 9.984813246127002e-05, "loss": 0.1412, "step": 1826 }, { "epoch": 0.3683256095103768, "grad_norm": 0.08753248304128647, "learning_rate": 9.984709280618438e-05, "loss": 0.2522, "step": 1828 }, { "epoch": 0.3687285915776748, "grad_norm": 0.0821591392159462, "learning_rate": 9.984604961004098e-05, "loss": 0.1748, "step": 1830 }, { "epoch": 0.3691315736449728, "grad_norm": 0.0583636499941349, "learning_rate": 9.984500287291393e-05, "loss": 0.1763, "step": 1832 }, { "epoch": 0.3695345557122708, "grad_norm": 0.05514378100633621, "learning_rate": 9.98439525948776e-05, "loss": 0.1618, "step": 1834 }, { "epoch": 0.36993753777956884, "grad_norm": 0.06351856887340546, "learning_rate": 9.984289877600659e-05, "loss": 0.1949, "step": 1836 }, { "epoch": 0.37034051984686683, "grad_norm": 0.06602806597948074, "learning_rate": 9.984184141637576e-05, "loss": 0.2321, "step": 1838 }, { "epoch": 0.37074350191416483, "grad_norm": 0.07348185032606125, "learning_rate": 9.984078051606022e-05, "loss": 0.1378, "step": 1840 }, { "epoch": 0.3711464839814628, "grad_norm": 0.06466913968324661, "learning_rate": 9.983971607513536e-05, "loss": 0.1866, "step": 1842 }, { "epoch": 0.3715494660487608, "grad_norm": 0.059923093765974045, "learning_rate": 9.983864809367676e-05, "loss": 0.214, "step": 1844 }, { "epoch": 0.3719524481160588, "grad_norm": 0.07116147130727768, "learning_rate": 9.983757657176032e-05, "loss": 0.2026, "step": 1846 }, { "epoch": 0.3723554301833568, "grad_norm": 0.11628149449825287, "learning_rate": 9.983650150946213e-05, "loss": 0.192, "step": 1848 }, { "epoch": 0.37275841225065487, "grad_norm": 0.0832078754901886, "learning_rate": 9.983542290685859e-05, "loss": 0.196, "step": 1850 }, { "epoch": 0.37316139431795287, "grad_norm": 0.07628065347671509, "learning_rate": 9.98343407640263e-05, "loss": 0.2624, "step": 1852 }, { "epoch": 0.37356437638525086, "grad_norm": 0.07016732543706894, "learning_rate": 9.983325508104214e-05, "loss": 0.1612, "step": 1854 }, { "epoch": 0.37396735845254886, "grad_norm": 0.0670395940542221, "learning_rate": 9.983216585798322e-05, "loss": 0.215, "step": 1856 }, { "epoch": 0.37437034051984686, "grad_norm": 0.07254261523485184, "learning_rate": 9.983107309492693e-05, "loss": 0.211, "step": 1858 }, { "epoch": 0.37477332258714485, "grad_norm": 0.08141341805458069, "learning_rate": 9.982997679195092e-05, "loss": 0.2174, "step": 1860 }, { "epoch": 0.3751763046544429, "grad_norm": 0.0632607713341713, "learning_rate": 9.982887694913306e-05, "loss": 0.1653, "step": 1862 }, { "epoch": 0.3755792867217409, "grad_norm": 0.06702928990125656, "learning_rate": 9.982777356655144e-05, "loss": 0.2359, "step": 1864 }, { "epoch": 0.3759822687890389, "grad_norm": 0.052461106330156326, "learning_rate": 9.98266666442845e-05, "loss": 0.1681, "step": 1866 }, { "epoch": 0.3763852508563369, "grad_norm": 0.05812010541558266, "learning_rate": 9.982555618241082e-05, "loss": 0.2286, "step": 1868 }, { "epoch": 0.3767882329236349, "grad_norm": 0.08102001249790192, "learning_rate": 9.982444218100935e-05, "loss": 0.2297, "step": 1870 }, { "epoch": 0.3771912149909329, "grad_norm": 0.08580035716295242, "learning_rate": 9.982332464015915e-05, "loss": 0.2389, "step": 1872 }, { "epoch": 0.3775941970582309, "grad_norm": 0.07281485199928284, "learning_rate": 9.982220355993968e-05, "loss": 0.2064, "step": 1874 }, { "epoch": 0.37799717912552894, "grad_norm": 0.0776129812002182, "learning_rate": 9.982107894043053e-05, "loss": 0.2068, "step": 1876 }, { "epoch": 0.37840016119282693, "grad_norm": 0.087554931640625, "learning_rate": 9.981995078171162e-05, "loss": 0.1778, "step": 1878 }, { "epoch": 0.37880314326012493, "grad_norm": 0.06286180019378662, "learning_rate": 9.981881908386308e-05, "loss": 0.1755, "step": 1880 }, { "epoch": 0.3792061253274229, "grad_norm": 0.08133803308010101, "learning_rate": 9.98176838469653e-05, "loss": 0.1926, "step": 1882 }, { "epoch": 0.3796091073947209, "grad_norm": 0.08006949722766876, "learning_rate": 9.981654507109893e-05, "loss": 0.1832, "step": 1884 }, { "epoch": 0.3800120894620189, "grad_norm": 0.06335432082414627, "learning_rate": 9.981540275634487e-05, "loss": 0.1761, "step": 1886 }, { "epoch": 0.380415071529317, "grad_norm": 0.0711664929986, "learning_rate": 9.981425690278426e-05, "loss": 0.173, "step": 1888 }, { "epoch": 0.38081805359661497, "grad_norm": 0.08654266595840454, "learning_rate": 9.981310751049851e-05, "loss": 0.2176, "step": 1890 }, { "epoch": 0.38122103566391297, "grad_norm": 0.06413505226373672, "learning_rate": 9.981195457956928e-05, "loss": 0.2199, "step": 1892 }, { "epoch": 0.38162401773121096, "grad_norm": 0.06541746854782104, "learning_rate": 9.981079811007845e-05, "loss": 0.2112, "step": 1894 }, { "epoch": 0.38202699979850896, "grad_norm": 0.07289738208055496, "learning_rate": 9.980963810210817e-05, "loss": 0.1691, "step": 1896 }, { "epoch": 0.38242998186580696, "grad_norm": 0.10542286932468414, "learning_rate": 9.980847455574087e-05, "loss": 0.2083, "step": 1898 }, { "epoch": 0.38283296393310495, "grad_norm": 0.08158999681472778, "learning_rate": 9.98073074710592e-05, "loss": 0.2275, "step": 1900 }, { "epoch": 0.383235946000403, "grad_norm": 0.07118821889162064, "learning_rate": 9.980613684814606e-05, "loss": 0.2156, "step": 1902 }, { "epoch": 0.383638928067701, "grad_norm": 0.06565722823143005, "learning_rate": 9.980496268708461e-05, "loss": 0.1858, "step": 1904 }, { "epoch": 0.384041910134999, "grad_norm": 0.05890239030122757, "learning_rate": 9.980378498795825e-05, "loss": 0.1843, "step": 1906 }, { "epoch": 0.384444892202297, "grad_norm": 0.06181297451257706, "learning_rate": 9.980260375085067e-05, "loss": 0.2045, "step": 1908 }, { "epoch": 0.384847874269595, "grad_norm": 0.05857539921998978, "learning_rate": 9.980141897584576e-05, "loss": 0.1791, "step": 1910 }, { "epoch": 0.385250856336893, "grad_norm": 0.07684215158224106, "learning_rate": 9.98002306630277e-05, "loss": 0.1961, "step": 1912 }, { "epoch": 0.38565383840419104, "grad_norm": 0.06899187713861465, "learning_rate": 9.979903881248088e-05, "loss": 0.1255, "step": 1914 }, { "epoch": 0.38605682047148904, "grad_norm": 0.08205246925354004, "learning_rate": 9.979784342429e-05, "loss": 0.1641, "step": 1916 }, { "epoch": 0.38645980253878703, "grad_norm": 0.08253847807645798, "learning_rate": 9.979664449853996e-05, "loss": 0.1602, "step": 1918 }, { "epoch": 0.38686278460608503, "grad_norm": 0.0585266575217247, "learning_rate": 9.979544203531592e-05, "loss": 0.1817, "step": 1920 }, { "epoch": 0.387265766673383, "grad_norm": 0.10539548099040985, "learning_rate": 9.979423603470333e-05, "loss": 0.2231, "step": 1922 }, { "epoch": 0.387668748740681, "grad_norm": 0.0943535566329956, "learning_rate": 9.979302649678783e-05, "loss": 0.2369, "step": 1924 }, { "epoch": 0.388071730807979, "grad_norm": 0.07806706428527832, "learning_rate": 9.979181342165538e-05, "loss": 0.2279, "step": 1926 }, { "epoch": 0.3884747128752771, "grad_norm": 0.06602683663368225, "learning_rate": 9.979059680939213e-05, "loss": 0.2579, "step": 1928 }, { "epoch": 0.38887769494257507, "grad_norm": 0.08012797683477402, "learning_rate": 9.97893766600845e-05, "loss": 0.2526, "step": 1930 }, { "epoch": 0.38928067700987307, "grad_norm": 0.058674756437540054, "learning_rate": 9.978815297381919e-05, "loss": 0.2077, "step": 1932 }, { "epoch": 0.38968365907717106, "grad_norm": 0.06406107544898987, "learning_rate": 9.97869257506831e-05, "loss": 0.1945, "step": 1934 }, { "epoch": 0.39008664114446906, "grad_norm": 0.06540010869503021, "learning_rate": 9.978569499076345e-05, "loss": 0.1683, "step": 1936 }, { "epoch": 0.39048962321176706, "grad_norm": 0.056828927248716354, "learning_rate": 9.978446069414763e-05, "loss": 0.2444, "step": 1938 }, { "epoch": 0.3908926052790651, "grad_norm": 0.05789874494075775, "learning_rate": 9.978322286092334e-05, "loss": 0.2166, "step": 1940 }, { "epoch": 0.3912955873463631, "grad_norm": 0.06070829555392265, "learning_rate": 9.978198149117852e-05, "loss": 0.1722, "step": 1942 }, { "epoch": 0.3916985694136611, "grad_norm": 0.06728588044643402, "learning_rate": 9.978073658500135e-05, "loss": 0.1812, "step": 1944 }, { "epoch": 0.3921015514809591, "grad_norm": 0.07258196920156479, "learning_rate": 9.977948814248028e-05, "loss": 0.1818, "step": 1946 }, { "epoch": 0.3925045335482571, "grad_norm": 0.07481992989778519, "learning_rate": 9.977823616370397e-05, "loss": 0.2135, "step": 1948 }, { "epoch": 0.3929075156155551, "grad_norm": 0.08128810673952103, "learning_rate": 9.977698064876136e-05, "loss": 0.2571, "step": 1950 }, { "epoch": 0.3933104976828531, "grad_norm": 0.09639985859394073, "learning_rate": 9.977572159774167e-05, "loss": 0.2428, "step": 1952 }, { "epoch": 0.39371347975015114, "grad_norm": 0.06070376932621002, "learning_rate": 9.977445901073431e-05, "loss": 0.1951, "step": 1954 }, { "epoch": 0.39411646181744914, "grad_norm": 0.06723980605602264, "learning_rate": 9.9773192887829e-05, "loss": 0.1721, "step": 1956 }, { "epoch": 0.39451944388474713, "grad_norm": 0.061029743403196335, "learning_rate": 9.977192322911565e-05, "loss": 0.2161, "step": 1958 }, { "epoch": 0.39492242595204513, "grad_norm": 0.06752429157495499, "learning_rate": 9.977065003468447e-05, "loss": 0.1729, "step": 1960 }, { "epoch": 0.3953254080193431, "grad_norm": 0.07469742000102997, "learning_rate": 9.976937330462593e-05, "loss": 0.1864, "step": 1962 }, { "epoch": 0.3957283900866411, "grad_norm": 0.06833580881357193, "learning_rate": 9.976809303903069e-05, "loss": 0.2202, "step": 1964 }, { "epoch": 0.3961313721539392, "grad_norm": 0.06711506843566895, "learning_rate": 9.976680923798971e-05, "loss": 0.2449, "step": 1966 }, { "epoch": 0.3965343542212372, "grad_norm": 0.0628892183303833, "learning_rate": 9.97655219015942e-05, "loss": 0.2307, "step": 1968 }, { "epoch": 0.39693733628853517, "grad_norm": 0.04486105963587761, "learning_rate": 9.97642310299356e-05, "loss": 0.1519, "step": 1970 }, { "epoch": 0.39734031835583317, "grad_norm": 0.060665931552648544, "learning_rate": 9.976293662310561e-05, "loss": 0.1907, "step": 1972 }, { "epoch": 0.39774330042313116, "grad_norm": 0.0764361023902893, "learning_rate": 9.97616386811962e-05, "loss": 0.1947, "step": 1974 }, { "epoch": 0.39814628249042916, "grad_norm": 0.08323927223682404, "learning_rate": 9.976033720429954e-05, "loss": 0.1534, "step": 1976 }, { "epoch": 0.39854926455772716, "grad_norm": 0.10088339447975159, "learning_rate": 9.97590321925081e-05, "loss": 0.2169, "step": 1978 }, { "epoch": 0.3989522466250252, "grad_norm": 0.06974364817142487, "learning_rate": 9.975772364591461e-05, "loss": 0.1841, "step": 1980 }, { "epoch": 0.3993552286923232, "grad_norm": 0.051529139280319214, "learning_rate": 9.9756411564612e-05, "loss": 0.156, "step": 1982 }, { "epoch": 0.3997582107596212, "grad_norm": 0.04539600387215614, "learning_rate": 9.97550959486935e-05, "loss": 0.1884, "step": 1984 }, { "epoch": 0.4001611928269192, "grad_norm": 0.07553747296333313, "learning_rate": 9.975377679825254e-05, "loss": 0.1764, "step": 1986 }, { "epoch": 0.4005641748942172, "grad_norm": 0.06788526475429535, "learning_rate": 9.975245411338286e-05, "loss": 0.1896, "step": 1988 }, { "epoch": 0.4009671569615152, "grad_norm": 0.0715952068567276, "learning_rate": 9.975112789417839e-05, "loss": 0.1663, "step": 1990 }, { "epoch": 0.40137013902881324, "grad_norm": 0.0693850964307785, "learning_rate": 9.974979814073335e-05, "loss": 0.1951, "step": 1992 }, { "epoch": 0.40177312109611124, "grad_norm": 0.06748230755329132, "learning_rate": 9.974846485314225e-05, "loss": 0.2539, "step": 1994 }, { "epoch": 0.40217610316340924, "grad_norm": 0.07175850868225098, "learning_rate": 9.974712803149974e-05, "loss": 0.1882, "step": 1996 }, { "epoch": 0.40257908523070723, "grad_norm": 0.05859972909092903, "learning_rate": 9.974578767590081e-05, "loss": 0.2038, "step": 1998 }, { "epoch": 0.40298206729800523, "grad_norm": 0.0738549679517746, "learning_rate": 9.97444437864407e-05, "loss": 0.2094, "step": 2000 }, { "epoch": 0.4033850493653032, "grad_norm": 0.07968976348638535, "learning_rate": 9.974309636321484e-05, "loss": 0.1927, "step": 2002 }, { "epoch": 0.4037880314326012, "grad_norm": 0.06320148706436157, "learning_rate": 9.974174540631898e-05, "loss": 0.2125, "step": 2004 }, { "epoch": 0.4041910134998993, "grad_norm": 0.06155259907245636, "learning_rate": 9.974039091584908e-05, "loss": 0.2159, "step": 2006 }, { "epoch": 0.4045939955671973, "grad_norm": 0.12573003768920898, "learning_rate": 9.973903289190134e-05, "loss": 0.2388, "step": 2008 }, { "epoch": 0.40499697763449527, "grad_norm": 0.06864972412586212, "learning_rate": 9.973767133457225e-05, "loss": 0.1782, "step": 2010 }, { "epoch": 0.40539995970179327, "grad_norm": 0.1890304684638977, "learning_rate": 9.973630624395856e-05, "loss": 0.2289, "step": 2012 }, { "epoch": 0.40580294176909126, "grad_norm": 0.06997068971395493, "learning_rate": 9.973493762015719e-05, "loss": 0.226, "step": 2014 }, { "epoch": 0.40620592383638926, "grad_norm": 0.048897821456193924, "learning_rate": 9.973356546326539e-05, "loss": 0.2236, "step": 2016 }, { "epoch": 0.4066089059036873, "grad_norm": 0.06851400434970856, "learning_rate": 9.973218977338064e-05, "loss": 0.1874, "step": 2018 }, { "epoch": 0.4070118879709853, "grad_norm": 0.07441789656877518, "learning_rate": 9.973081055060067e-05, "loss": 0.1879, "step": 2020 }, { "epoch": 0.4074148700382833, "grad_norm": 0.06250349432229996, "learning_rate": 9.972942779502345e-05, "loss": 0.269, "step": 2022 }, { "epoch": 0.4078178521055813, "grad_norm": 0.06143819913268089, "learning_rate": 9.972804150674722e-05, "loss": 0.173, "step": 2024 }, { "epoch": 0.4082208341728793, "grad_norm": 0.06300696730613708, "learning_rate": 9.972665168587043e-05, "loss": 0.2063, "step": 2026 }, { "epoch": 0.4086238162401773, "grad_norm": 0.05857381224632263, "learning_rate": 9.972525833249184e-05, "loss": 0.1931, "step": 2028 }, { "epoch": 0.4090267983074753, "grad_norm": 0.054102227091789246, "learning_rate": 9.972386144671043e-05, "loss": 0.1767, "step": 2030 }, { "epoch": 0.40942978037477334, "grad_norm": 0.08161073923110962, "learning_rate": 9.97224610286254e-05, "loss": 0.2202, "step": 2032 }, { "epoch": 0.40983276244207134, "grad_norm": 0.06625241786241531, "learning_rate": 9.972105707833628e-05, "loss": 0.2147, "step": 2034 }, { "epoch": 0.41023574450936934, "grad_norm": 0.06663915514945984, "learning_rate": 9.971964959594276e-05, "loss": 0.2014, "step": 2036 }, { "epoch": 0.41063872657666733, "grad_norm": 0.06275123357772827, "learning_rate": 9.971823858154487e-05, "loss": 0.2785, "step": 2038 }, { "epoch": 0.41104170864396533, "grad_norm": 0.06302032619714737, "learning_rate": 9.971682403524281e-05, "loss": 0.1869, "step": 2040 }, { "epoch": 0.41144469071126333, "grad_norm": 0.13110080361366272, "learning_rate": 9.971540595713709e-05, "loss": 0.1643, "step": 2042 }, { "epoch": 0.4118476727785614, "grad_norm": 0.16180118918418884, "learning_rate": 9.971398434732843e-05, "loss": 0.1396, "step": 2044 }, { "epoch": 0.4122506548458594, "grad_norm": 0.06744285672903061, "learning_rate": 9.971255920591784e-05, "loss": 0.186, "step": 2046 }, { "epoch": 0.4126536369131574, "grad_norm": 0.06002974510192871, "learning_rate": 9.971113053300653e-05, "loss": 0.1548, "step": 2048 }, { "epoch": 0.41305661898045537, "grad_norm": 0.07181243598461151, "learning_rate": 9.970969832869603e-05, "loss": 0.2426, "step": 2050 }, { "epoch": 0.41345960104775337, "grad_norm": 0.06398441642522812, "learning_rate": 9.970826259308805e-05, "loss": 0.1657, "step": 2052 }, { "epoch": 0.41386258311505136, "grad_norm": 0.06433025002479553, "learning_rate": 9.970682332628459e-05, "loss": 0.2147, "step": 2054 }, { "epoch": 0.41426556518234936, "grad_norm": 0.18160675466060638, "learning_rate": 9.970538052838789e-05, "loss": 0.2346, "step": 2056 }, { "epoch": 0.4146685472496474, "grad_norm": 0.06317636370658875, "learning_rate": 9.970393419950046e-05, "loss": 0.2058, "step": 2058 }, { "epoch": 0.4150715293169454, "grad_norm": 0.053193751722574234, "learning_rate": 9.970248433972503e-05, "loss": 0.1896, "step": 2060 }, { "epoch": 0.4154745113842434, "grad_norm": 0.05231672152876854, "learning_rate": 9.970103094916459e-05, "loss": 0.1512, "step": 2062 }, { "epoch": 0.4158774934515414, "grad_norm": 0.05676732212305069, "learning_rate": 9.96995740279224e-05, "loss": 0.1831, "step": 2064 }, { "epoch": 0.4162804755188394, "grad_norm": 0.17120619118213654, "learning_rate": 9.969811357610197e-05, "loss": 0.2052, "step": 2066 }, { "epoch": 0.4166834575861374, "grad_norm": 0.04707195237278938, "learning_rate": 9.969664959380702e-05, "loss": 0.1296, "step": 2068 }, { "epoch": 0.41708643965343545, "grad_norm": 0.05727340281009674, "learning_rate": 9.969518208114157e-05, "loss": 0.2024, "step": 2070 }, { "epoch": 0.41748942172073344, "grad_norm": 0.058648549020290375, "learning_rate": 9.969371103820983e-05, "loss": 0.2046, "step": 2072 }, { "epoch": 0.41789240378803144, "grad_norm": 0.2652423083782196, "learning_rate": 9.969223646511636e-05, "loss": 0.1867, "step": 2074 }, { "epoch": 0.41829538585532944, "grad_norm": 0.35208940505981445, "learning_rate": 9.969075836196589e-05, "loss": 0.2112, "step": 2076 }, { "epoch": 0.41869836792262743, "grad_norm": 0.06454955041408539, "learning_rate": 9.968927672886339e-05, "loss": 0.2225, "step": 2078 }, { "epoch": 0.41910134998992543, "grad_norm": 0.05421265587210655, "learning_rate": 9.968779156591414e-05, "loss": 0.1717, "step": 2080 }, { "epoch": 0.41950433205722343, "grad_norm": 0.05428704246878624, "learning_rate": 9.968630287322367e-05, "loss": 0.1961, "step": 2082 }, { "epoch": 0.4199073141245215, "grad_norm": 0.10567606985569, "learning_rate": 9.968481065089768e-05, "loss": 0.1915, "step": 2084 }, { "epoch": 0.4203102961918195, "grad_norm": 0.06691636145114899, "learning_rate": 9.96833148990422e-05, "loss": 0.1842, "step": 2086 }, { "epoch": 0.4207132782591175, "grad_norm": 0.06900250166654587, "learning_rate": 9.968181561776348e-05, "loss": 0.1391, "step": 2088 }, { "epoch": 0.42111626032641547, "grad_norm": 0.05278802663087845, "learning_rate": 9.968031280716805e-05, "loss": 0.177, "step": 2090 }, { "epoch": 0.42151924239371347, "grad_norm": 0.08458192646503448, "learning_rate": 9.967880646736265e-05, "loss": 0.2298, "step": 2092 }, { "epoch": 0.42192222446101146, "grad_norm": 0.06902390718460083, "learning_rate": 9.967729659845428e-05, "loss": 0.2137, "step": 2094 }, { "epoch": 0.4223252065283095, "grad_norm": 0.056869540363550186, "learning_rate": 9.967578320055023e-05, "loss": 0.2007, "step": 2096 }, { "epoch": 0.4227281885956075, "grad_norm": 0.07104408740997314, "learning_rate": 9.967426627375796e-05, "loss": 0.1854, "step": 2098 }, { "epoch": 0.4231311706629055, "grad_norm": 0.062130045145750046, "learning_rate": 9.967274581818524e-05, "loss": 0.1184, "step": 2100 }, { "epoch": 0.4235341527302035, "grad_norm": 0.07842563092708588, "learning_rate": 9.967122183394013e-05, "loss": 0.2273, "step": 2102 }, { "epoch": 0.4239371347975015, "grad_norm": 0.06722037494182587, "learning_rate": 9.966969432113085e-05, "loss": 0.1704, "step": 2104 }, { "epoch": 0.4243401168647995, "grad_norm": 0.07529338449239731, "learning_rate": 9.966816327986591e-05, "loss": 0.1818, "step": 2106 }, { "epoch": 0.4247430989320975, "grad_norm": 0.06540022790431976, "learning_rate": 9.96666287102541e-05, "loss": 0.2518, "step": 2108 }, { "epoch": 0.42514608099939555, "grad_norm": 0.2302924394607544, "learning_rate": 9.96650906124044e-05, "loss": 0.2158, "step": 2110 }, { "epoch": 0.42554906306669354, "grad_norm": 0.062155935913324356, "learning_rate": 9.966354898642609e-05, "loss": 0.2174, "step": 2112 }, { "epoch": 0.42595204513399154, "grad_norm": 0.09047554433345795, "learning_rate": 9.96620038324287e-05, "loss": 0.1489, "step": 2114 }, { "epoch": 0.42635502720128954, "grad_norm": 0.11305224895477295, "learning_rate": 9.966045515052197e-05, "loss": 0.1712, "step": 2116 }, { "epoch": 0.42675800926858753, "grad_norm": 0.09782232344150543, "learning_rate": 9.965890294081592e-05, "loss": 0.1961, "step": 2118 }, { "epoch": 0.42716099133588553, "grad_norm": 0.06156953424215317, "learning_rate": 9.965734720342084e-05, "loss": 0.182, "step": 2120 }, { "epoch": 0.4275639734031836, "grad_norm": 0.07488352805376053, "learning_rate": 9.965578793844723e-05, "loss": 0.2113, "step": 2122 }, { "epoch": 0.4279669554704816, "grad_norm": 0.24926647543907166, "learning_rate": 9.965422514600585e-05, "loss": 0.2136, "step": 2124 }, { "epoch": 0.4283699375377796, "grad_norm": 0.06749057024717331, "learning_rate": 9.965265882620771e-05, "loss": 0.2188, "step": 2126 }, { "epoch": 0.4287729196050776, "grad_norm": 0.07169239223003387, "learning_rate": 9.965108897916411e-05, "loss": 0.2276, "step": 2128 }, { "epoch": 0.42917590167237557, "grad_norm": 0.0644363984465599, "learning_rate": 9.964951560498657e-05, "loss": 0.171, "step": 2130 }, { "epoch": 0.42957888373967357, "grad_norm": 0.07681705802679062, "learning_rate": 9.964793870378681e-05, "loss": 0.2332, "step": 2132 }, { "epoch": 0.42998186580697156, "grad_norm": 0.047269873321056366, "learning_rate": 9.964635827567691e-05, "loss": 0.134, "step": 2134 }, { "epoch": 0.4303848478742696, "grad_norm": 0.06874144822359085, "learning_rate": 9.964477432076911e-05, "loss": 0.1645, "step": 2136 }, { "epoch": 0.4307878299415676, "grad_norm": 0.05638271942734718, "learning_rate": 9.964318683917593e-05, "loss": 0.2089, "step": 2138 }, { "epoch": 0.4311908120088656, "grad_norm": 0.07065290212631226, "learning_rate": 9.964159583101016e-05, "loss": 0.2175, "step": 2140 }, { "epoch": 0.4315937940761636, "grad_norm": 0.0586637519299984, "learning_rate": 9.96400012963848e-05, "loss": 0.207, "step": 2142 }, { "epoch": 0.4319967761434616, "grad_norm": 0.061192356050014496, "learning_rate": 9.963840323541314e-05, "loss": 0.212, "step": 2144 }, { "epoch": 0.4323997582107596, "grad_norm": 0.06529909372329712, "learning_rate": 9.96368016482087e-05, "loss": 0.1886, "step": 2146 }, { "epoch": 0.43280274027805765, "grad_norm": 0.06861956417560577, "learning_rate": 9.963519653488527e-05, "loss": 0.2226, "step": 2148 }, { "epoch": 0.43320572234535565, "grad_norm": 0.07531817257404327, "learning_rate": 9.963358789555683e-05, "loss": 0.2213, "step": 2150 }, { "epoch": 0.43360870441265364, "grad_norm": 0.0557052381336689, "learning_rate": 9.96319757303377e-05, "loss": 0.1994, "step": 2152 }, { "epoch": 0.43401168647995164, "grad_norm": 0.05331805348396301, "learning_rate": 9.963036003934238e-05, "loss": 0.2163, "step": 2154 }, { "epoch": 0.43441466854724964, "grad_norm": 0.05752795189619064, "learning_rate": 9.962874082268567e-05, "loss": 0.2135, "step": 2156 }, { "epoch": 0.43481765061454763, "grad_norm": 0.060762520879507065, "learning_rate": 9.962711808048258e-05, "loss": 0.2401, "step": 2158 }, { "epoch": 0.43522063268184563, "grad_norm": 0.05835457518696785, "learning_rate": 9.962549181284838e-05, "loss": 0.1785, "step": 2160 }, { "epoch": 0.4356236147491437, "grad_norm": 0.06465306878089905, "learning_rate": 9.96238620198986e-05, "loss": 0.1975, "step": 2162 }, { "epoch": 0.4360265968164417, "grad_norm": 0.0581306591629982, "learning_rate": 9.962222870174902e-05, "loss": 0.1602, "step": 2164 }, { "epoch": 0.4364295788837397, "grad_norm": 0.04962344840168953, "learning_rate": 9.962059185851569e-05, "loss": 0.2231, "step": 2166 }, { "epoch": 0.4368325609510377, "grad_norm": 0.0643840953707695, "learning_rate": 9.961895149031486e-05, "loss": 0.1481, "step": 2168 }, { "epoch": 0.43723554301833567, "grad_norm": 0.12278378009796143, "learning_rate": 9.961730759726307e-05, "loss": 0.2492, "step": 2170 }, { "epoch": 0.43763852508563367, "grad_norm": 0.05890028551220894, "learning_rate": 9.96156601794771e-05, "loss": 0.1423, "step": 2172 }, { "epoch": 0.4380415071529317, "grad_norm": 0.05931360647082329, "learning_rate": 9.961400923707398e-05, "loss": 0.1958, "step": 2174 }, { "epoch": 0.4384444892202297, "grad_norm": 0.07436667382717133, "learning_rate": 9.961235477017098e-05, "loss": 0.2163, "step": 2176 }, { "epoch": 0.4388474712875277, "grad_norm": 0.08503016084432602, "learning_rate": 9.961069677888566e-05, "loss": 0.2187, "step": 2178 }, { "epoch": 0.4392504533548257, "grad_norm": 0.10647798329591751, "learning_rate": 9.960903526333576e-05, "loss": 0.2981, "step": 2180 }, { "epoch": 0.4396534354221237, "grad_norm": 0.06522869318723679, "learning_rate": 9.960737022363935e-05, "loss": 0.1779, "step": 2182 }, { "epoch": 0.4400564174894217, "grad_norm": 0.050729621201753616, "learning_rate": 9.960570165991469e-05, "loss": 0.1355, "step": 2184 }, { "epoch": 0.4404593995567197, "grad_norm": 0.0786505937576294, "learning_rate": 9.960402957228032e-05, "loss": 0.175, "step": 2186 }, { "epoch": 0.44086238162401775, "grad_norm": 0.09716632962226868, "learning_rate": 9.960235396085502e-05, "loss": 0.2767, "step": 2188 }, { "epoch": 0.44126536369131575, "grad_norm": 0.13773681223392487, "learning_rate": 9.960067482575781e-05, "loss": 0.242, "step": 2190 }, { "epoch": 0.44166834575861375, "grad_norm": 0.06173882633447647, "learning_rate": 9.9598992167108e-05, "loss": 0.213, "step": 2192 }, { "epoch": 0.44207132782591174, "grad_norm": 0.06848306208848953, "learning_rate": 9.95973059850251e-05, "loss": 0.2, "step": 2194 }, { "epoch": 0.44247430989320974, "grad_norm": 0.06073886528611183, "learning_rate": 9.95956162796289e-05, "loss": 0.1993, "step": 2196 }, { "epoch": 0.44287729196050774, "grad_norm": 0.07946296781301498, "learning_rate": 9.959392305103943e-05, "loss": 0.261, "step": 2198 }, { "epoch": 0.4432802740278058, "grad_norm": 0.051856525242328644, "learning_rate": 9.959222629937699e-05, "loss": 0.2111, "step": 2200 }, { "epoch": 0.4436832560951038, "grad_norm": 0.05881345272064209, "learning_rate": 9.95905260247621e-05, "loss": 0.2147, "step": 2202 }, { "epoch": 0.4440862381624018, "grad_norm": 0.04665559157729149, "learning_rate": 9.958882222731555e-05, "loss": 0.184, "step": 2204 }, { "epoch": 0.4444892202296998, "grad_norm": 0.06761233508586884, "learning_rate": 9.958711490715838e-05, "loss": 0.2161, "step": 2206 }, { "epoch": 0.4448922022969978, "grad_norm": 0.09146778285503387, "learning_rate": 9.958540406441187e-05, "loss": 0.2292, "step": 2208 }, { "epoch": 0.44529518436429577, "grad_norm": 0.06186634674668312, "learning_rate": 9.958368969919756e-05, "loss": 0.1743, "step": 2210 }, { "epoch": 0.44569816643159377, "grad_norm": 0.06448390334844589, "learning_rate": 9.958197181163722e-05, "loss": 0.165, "step": 2212 }, { "epoch": 0.4461011484988918, "grad_norm": 0.07400643825531006, "learning_rate": 9.95802504018529e-05, "loss": 0.204, "step": 2214 }, { "epoch": 0.4465041305661898, "grad_norm": 0.0617455393075943, "learning_rate": 9.957852546996688e-05, "loss": 0.1909, "step": 2216 }, { "epoch": 0.4469071126334878, "grad_norm": 0.0583464615046978, "learning_rate": 9.957679701610171e-05, "loss": 0.1884, "step": 2218 }, { "epoch": 0.4473100947007858, "grad_norm": 0.06541068851947784, "learning_rate": 9.957506504038015e-05, "loss": 0.1666, "step": 2220 }, { "epoch": 0.4477130767680838, "grad_norm": 0.06339999288320541, "learning_rate": 9.957332954292526e-05, "loss": 0.2063, "step": 2222 }, { "epoch": 0.4481160588353818, "grad_norm": 0.0743519738316536, "learning_rate": 9.957159052386033e-05, "loss": 0.1872, "step": 2224 }, { "epoch": 0.44851904090267986, "grad_norm": 0.08613268285989761, "learning_rate": 9.956984798330888e-05, "loss": 0.2495, "step": 2226 }, { "epoch": 0.44892202296997785, "grad_norm": 0.06281512975692749, "learning_rate": 9.956810192139471e-05, "loss": 0.2057, "step": 2228 }, { "epoch": 0.44932500503727585, "grad_norm": 0.08816128969192505, "learning_rate": 9.956635233824185e-05, "loss": 0.2602, "step": 2230 }, { "epoch": 0.44972798710457385, "grad_norm": 0.10240280628204346, "learning_rate": 9.956459923397459e-05, "loss": 0.2232, "step": 2232 }, { "epoch": 0.45013096917187184, "grad_norm": 0.06470140814781189, "learning_rate": 9.956284260871745e-05, "loss": 0.2375, "step": 2234 }, { "epoch": 0.45053395123916984, "grad_norm": 0.06823807209730148, "learning_rate": 9.956108246259526e-05, "loss": 0.1698, "step": 2236 }, { "epoch": 0.45093693330646784, "grad_norm": 0.0750935971736908, "learning_rate": 9.955931879573302e-05, "loss": 0.1979, "step": 2238 }, { "epoch": 0.4513399153737659, "grad_norm": 0.09087410569190979, "learning_rate": 9.955755160825604e-05, "loss": 0.2525, "step": 2240 }, { "epoch": 0.4517428974410639, "grad_norm": 0.0783960372209549, "learning_rate": 9.955578090028983e-05, "loss": 0.1925, "step": 2242 }, { "epoch": 0.4521458795083619, "grad_norm": 0.06691782921552658, "learning_rate": 9.955400667196021e-05, "loss": 0.2186, "step": 2244 }, { "epoch": 0.4525488615756599, "grad_norm": 0.0832383930683136, "learning_rate": 9.95522289233932e-05, "loss": 0.219, "step": 2246 }, { "epoch": 0.4529518436429579, "grad_norm": 0.07930655032396317, "learning_rate": 9.95504476547151e-05, "loss": 0.2159, "step": 2248 }, { "epoch": 0.45335482571025587, "grad_norm": 0.07651390880346298, "learning_rate": 9.954866286605246e-05, "loss": 0.2016, "step": 2250 }, { "epoch": 0.4537578077775539, "grad_norm": 0.06129351630806923, "learning_rate": 9.954687455753202e-05, "loss": 0.193, "step": 2252 }, { "epoch": 0.4541607898448519, "grad_norm": 0.07738249748945236, "learning_rate": 9.954508272928087e-05, "loss": 0.1875, "step": 2254 }, { "epoch": 0.4545637719121499, "grad_norm": 0.0622461661696434, "learning_rate": 9.954328738142628e-05, "loss": 0.212, "step": 2256 }, { "epoch": 0.4549667539794479, "grad_norm": 0.08971701562404633, "learning_rate": 9.954148851409577e-05, "loss": 0.1996, "step": 2258 }, { "epoch": 0.4553697360467459, "grad_norm": 0.08519180119037628, "learning_rate": 9.953968612741717e-05, "loss": 0.2016, "step": 2260 }, { "epoch": 0.4557727181140439, "grad_norm": 0.06239600107073784, "learning_rate": 9.953788022151848e-05, "loss": 0.2145, "step": 2262 }, { "epoch": 0.4561757001813419, "grad_norm": 0.06886850297451019, "learning_rate": 9.9536070796528e-05, "loss": 0.1723, "step": 2264 }, { "epoch": 0.45657868224863996, "grad_norm": 0.07382401078939438, "learning_rate": 9.953425785257428e-05, "loss": 0.247, "step": 2266 }, { "epoch": 0.45698166431593795, "grad_norm": 0.07136175781488419, "learning_rate": 9.953244138978608e-05, "loss": 0.2037, "step": 2268 }, { "epoch": 0.45738464638323595, "grad_norm": 0.07412570714950562, "learning_rate": 9.953062140829249e-05, "loss": 0.2047, "step": 2270 }, { "epoch": 0.45778762845053395, "grad_norm": 0.07109946012496948, "learning_rate": 9.952879790822276e-05, "loss": 0.1648, "step": 2272 }, { "epoch": 0.45819061051783194, "grad_norm": 0.09064648300409317, "learning_rate": 9.952697088970642e-05, "loss": 0.2203, "step": 2274 }, { "epoch": 0.45859359258512994, "grad_norm": 0.06263390183448792, "learning_rate": 9.952514035287328e-05, "loss": 0.1801, "step": 2276 }, { "epoch": 0.458996574652428, "grad_norm": 0.06853969395160675, "learning_rate": 9.952330629785338e-05, "loss": 0.1989, "step": 2278 }, { "epoch": 0.459399556719726, "grad_norm": 0.06831579655408859, "learning_rate": 9.9521468724777e-05, "loss": 0.2149, "step": 2280 }, { "epoch": 0.459802538787024, "grad_norm": 0.07879806309938431, "learning_rate": 9.951962763377469e-05, "loss": 0.2391, "step": 2282 }, { "epoch": 0.460205520854322, "grad_norm": 0.07946325838565826, "learning_rate": 9.95177830249772e-05, "loss": 0.217, "step": 2284 }, { "epoch": 0.46060850292162, "grad_norm": 0.04904184117913246, "learning_rate": 9.951593489851562e-05, "loss": 0.2036, "step": 2286 }, { "epoch": 0.461011484988918, "grad_norm": 0.06621547043323517, "learning_rate": 9.95140832545212e-05, "loss": 0.2289, "step": 2288 }, { "epoch": 0.46141446705621597, "grad_norm": 0.3637445867061615, "learning_rate": 9.95122280931255e-05, "loss": 0.212, "step": 2290 }, { "epoch": 0.461817449123514, "grad_norm": 0.10791198909282684, "learning_rate": 9.95103694144603e-05, "loss": 0.2213, "step": 2292 }, { "epoch": 0.462220431190812, "grad_norm": 0.05832645669579506, "learning_rate": 9.950850721865763e-05, "loss": 0.1969, "step": 2294 }, { "epoch": 0.46262341325811, "grad_norm": 0.051262617111206055, "learning_rate": 9.950664150584979e-05, "loss": 0.2189, "step": 2296 }, { "epoch": 0.463026395325408, "grad_norm": 0.08473943918943405, "learning_rate": 9.950477227616931e-05, "loss": 0.1805, "step": 2298 }, { "epoch": 0.463429377392706, "grad_norm": 0.06247183680534363, "learning_rate": 9.950289952974898e-05, "loss": 0.1754, "step": 2300 }, { "epoch": 0.463832359460004, "grad_norm": 0.09339699894189835, "learning_rate": 9.950102326672184e-05, "loss": 0.2281, "step": 2302 }, { "epoch": 0.46423534152730206, "grad_norm": 0.06101800501346588, "learning_rate": 9.949914348722116e-05, "loss": 0.2179, "step": 2304 }, { "epoch": 0.46463832359460006, "grad_norm": 0.08553671091794968, "learning_rate": 9.94972601913805e-05, "loss": 0.2065, "step": 2306 }, { "epoch": 0.46504130566189805, "grad_norm": 0.07908914238214493, "learning_rate": 9.949537337933363e-05, "loss": 0.1714, "step": 2308 }, { "epoch": 0.46544428772919605, "grad_norm": 0.05011114850640297, "learning_rate": 9.949348305121459e-05, "loss": 0.1514, "step": 2310 }, { "epoch": 0.46584726979649405, "grad_norm": 0.07748831063508987, "learning_rate": 9.949158920715766e-05, "loss": 0.217, "step": 2312 }, { "epoch": 0.46625025186379204, "grad_norm": 0.07742765545845032, "learning_rate": 9.94896918472974e-05, "loss": 0.2007, "step": 2314 }, { "epoch": 0.46665323393109004, "grad_norm": 0.06443807482719421, "learning_rate": 9.948779097176857e-05, "loss": 0.2428, "step": 2316 }, { "epoch": 0.4670562159983881, "grad_norm": 0.060489460825920105, "learning_rate": 9.948588658070622e-05, "loss": 0.2127, "step": 2318 }, { "epoch": 0.4674591980656861, "grad_norm": 0.0675460621714592, "learning_rate": 9.948397867424562e-05, "loss": 0.21, "step": 2320 }, { "epoch": 0.4678621801329841, "grad_norm": 0.061212215572595596, "learning_rate": 9.948206725252231e-05, "loss": 0.1984, "step": 2322 }, { "epoch": 0.4682651622002821, "grad_norm": 0.062292277812957764, "learning_rate": 9.948015231567208e-05, "loss": 0.166, "step": 2324 }, { "epoch": 0.4686681442675801, "grad_norm": 0.048851098865270615, "learning_rate": 9.947823386383097e-05, "loss": 0.1606, "step": 2326 }, { "epoch": 0.4690711263348781, "grad_norm": 0.05528166517615318, "learning_rate": 9.947631189713524e-05, "loss": 0.1853, "step": 2328 }, { "epoch": 0.4694741084021761, "grad_norm": 0.051568541675806046, "learning_rate": 9.947438641572145e-05, "loss": 0.2046, "step": 2330 }, { "epoch": 0.4698770904694741, "grad_norm": 0.09390091896057129, "learning_rate": 9.947245741972638e-05, "loss": 0.2408, "step": 2332 }, { "epoch": 0.4702800725367721, "grad_norm": 0.055641692131757736, "learning_rate": 9.947052490928704e-05, "loss": 0.2001, "step": 2334 }, { "epoch": 0.4706830546040701, "grad_norm": 0.05879371985793114, "learning_rate": 9.946858888454072e-05, "loss": 0.2079, "step": 2336 }, { "epoch": 0.4710860366713681, "grad_norm": 0.06226501986384392, "learning_rate": 9.946664934562497e-05, "loss": 0.2495, "step": 2338 }, { "epoch": 0.4714890187386661, "grad_norm": 0.059466030448675156, "learning_rate": 9.946470629267756e-05, "loss": 0.1897, "step": 2340 }, { "epoch": 0.4718920008059641, "grad_norm": 0.05849766731262207, "learning_rate": 9.946275972583651e-05, "loss": 0.1906, "step": 2342 }, { "epoch": 0.47229498287326216, "grad_norm": 0.051197245717048645, "learning_rate": 9.946080964524013e-05, "loss": 0.1546, "step": 2344 }, { "epoch": 0.47269796494056016, "grad_norm": 0.058286767452955246, "learning_rate": 9.945885605102694e-05, "loss": 0.1748, "step": 2346 }, { "epoch": 0.47310094700785815, "grad_norm": 0.064874567091465, "learning_rate": 9.94568989433357e-05, "loss": 0.2119, "step": 2348 }, { "epoch": 0.47350392907515615, "grad_norm": 0.05664130300283432, "learning_rate": 9.945493832230546e-05, "loss": 0.22, "step": 2350 }, { "epoch": 0.47390691114245415, "grad_norm": 0.1102651059627533, "learning_rate": 9.945297418807549e-05, "loss": 0.2036, "step": 2352 }, { "epoch": 0.47430989320975214, "grad_norm": 0.10592664033174515, "learning_rate": 9.945100654078532e-05, "loss": 0.2118, "step": 2354 }, { "epoch": 0.4747128752770502, "grad_norm": 0.06491530686616898, "learning_rate": 9.944903538057473e-05, "loss": 0.1692, "step": 2356 }, { "epoch": 0.4751158573443482, "grad_norm": 0.08583839237689972, "learning_rate": 9.944706070758373e-05, "loss": 0.2252, "step": 2358 }, { "epoch": 0.4755188394116462, "grad_norm": 0.07632534205913544, "learning_rate": 9.944508252195264e-05, "loss": 0.2198, "step": 2360 }, { "epoch": 0.4759218214789442, "grad_norm": 0.08482904732227325, "learning_rate": 9.944310082382198e-05, "loss": 0.1565, "step": 2362 }, { "epoch": 0.4763248035462422, "grad_norm": 0.0885341688990593, "learning_rate": 9.944111561333248e-05, "loss": 0.202, "step": 2364 }, { "epoch": 0.4767277856135402, "grad_norm": 0.050671521574258804, "learning_rate": 9.94391268906252e-05, "loss": 0.1886, "step": 2366 }, { "epoch": 0.47713076768083823, "grad_norm": 0.058330848813056946, "learning_rate": 9.943713465584143e-05, "loss": 0.1961, "step": 2368 }, { "epoch": 0.4775337497481362, "grad_norm": 0.04063691198825836, "learning_rate": 9.943513890912266e-05, "loss": 0.1735, "step": 2370 }, { "epoch": 0.4779367318154342, "grad_norm": 0.04938462749123573, "learning_rate": 9.943313965061069e-05, "loss": 0.1799, "step": 2372 }, { "epoch": 0.4783397138827322, "grad_norm": 0.0486234650015831, "learning_rate": 9.943113688044753e-05, "loss": 0.1653, "step": 2374 }, { "epoch": 0.4787426959500302, "grad_norm": 0.05696294456720352, "learning_rate": 9.942913059877546e-05, "loss": 0.1955, "step": 2376 }, { "epoch": 0.4791456780173282, "grad_norm": 0.043870650231838226, "learning_rate": 9.9427120805737e-05, "loss": 0.156, "step": 2378 }, { "epoch": 0.4795486600846262, "grad_norm": 0.06747590750455856, "learning_rate": 9.942510750147493e-05, "loss": 0.2309, "step": 2380 }, { "epoch": 0.47995164215192426, "grad_norm": 0.05680996552109718, "learning_rate": 9.942309068613227e-05, "loss": 0.1492, "step": 2382 }, { "epoch": 0.48035462421922226, "grad_norm": 0.08178498595952988, "learning_rate": 9.942107035985229e-05, "loss": 0.2317, "step": 2384 }, { "epoch": 0.48075760628652026, "grad_norm": 0.055255163460969925, "learning_rate": 9.941904652277849e-05, "loss": 0.2344, "step": 2386 }, { "epoch": 0.48116058835381825, "grad_norm": 0.07890515774488449, "learning_rate": 9.941701917505468e-05, "loss": 0.173, "step": 2388 }, { "epoch": 0.48156357042111625, "grad_norm": 0.07036946713924408, "learning_rate": 9.941498831682486e-05, "loss": 0.1778, "step": 2390 }, { "epoch": 0.48196655248841425, "grad_norm": 0.05692208930850029, "learning_rate": 9.941295394823328e-05, "loss": 0.1556, "step": 2392 }, { "epoch": 0.4823695345557123, "grad_norm": 0.06822335720062256, "learning_rate": 9.941091606942447e-05, "loss": 0.1858, "step": 2394 }, { "epoch": 0.4827725166230103, "grad_norm": 0.05806328356266022, "learning_rate": 9.940887468054323e-05, "loss": 0.1881, "step": 2396 }, { "epoch": 0.4831754986903083, "grad_norm": 0.06417976319789886, "learning_rate": 9.940682978173455e-05, "loss": 0.1779, "step": 2398 }, { "epoch": 0.4835784807576063, "grad_norm": 0.057871218770742416, "learning_rate": 9.940478137314368e-05, "loss": 0.1556, "step": 2400 }, { "epoch": 0.4839814628249043, "grad_norm": 0.06445048749446869, "learning_rate": 9.940272945491616e-05, "loss": 0.1756, "step": 2402 }, { "epoch": 0.4843844448922023, "grad_norm": 0.05714261159300804, "learning_rate": 9.940067402719773e-05, "loss": 0.2266, "step": 2404 }, { "epoch": 0.4847874269595003, "grad_norm": 0.060727428644895554, "learning_rate": 9.939861509013444e-05, "loss": 0.2172, "step": 2406 }, { "epoch": 0.48519040902679833, "grad_norm": 0.05237874761223793, "learning_rate": 9.939655264387253e-05, "loss": 0.1958, "step": 2408 }, { "epoch": 0.4855933910940963, "grad_norm": 0.05351502448320389, "learning_rate": 9.939448668855853e-05, "loss": 0.2019, "step": 2410 }, { "epoch": 0.4859963731613943, "grad_norm": 0.07500231266021729, "learning_rate": 9.939241722433918e-05, "loss": 0.1265, "step": 2412 }, { "epoch": 0.4863993552286923, "grad_norm": 0.06734588742256165, "learning_rate": 9.939034425136152e-05, "loss": 0.2397, "step": 2414 }, { "epoch": 0.4868023372959903, "grad_norm": 0.07433614134788513, "learning_rate": 9.938826776977276e-05, "loss": 0.1875, "step": 2416 }, { "epoch": 0.4872053193632883, "grad_norm": 0.090156689286232, "learning_rate": 9.938618777972046e-05, "loss": 0.1549, "step": 2418 }, { "epoch": 0.48760830143058637, "grad_norm": 0.058888860046863556, "learning_rate": 9.938410428135236e-05, "loss": 0.2471, "step": 2420 }, { "epoch": 0.48801128349788436, "grad_norm": 0.08173494786024094, "learning_rate": 9.938201727481647e-05, "loss": 0.2159, "step": 2422 }, { "epoch": 0.48841426556518236, "grad_norm": 0.08549089729785919, "learning_rate": 9.937992676026105e-05, "loss": 0.1892, "step": 2424 }, { "epoch": 0.48881724763248036, "grad_norm": 0.06504713743925095, "learning_rate": 9.93778327378346e-05, "loss": 0.2715, "step": 2426 }, { "epoch": 0.48922022969977835, "grad_norm": 0.04682251811027527, "learning_rate": 9.937573520768589e-05, "loss": 0.1937, "step": 2428 }, { "epoch": 0.48962321176707635, "grad_norm": 0.05704076960682869, "learning_rate": 9.93736341699639e-05, "loss": 0.2076, "step": 2430 }, { "epoch": 0.49002619383437435, "grad_norm": 0.04885469004511833, "learning_rate": 9.93715296248179e-05, "loss": 0.2168, "step": 2432 }, { "epoch": 0.4904291759016724, "grad_norm": 0.05703292787075043, "learning_rate": 9.936942157239741e-05, "loss": 0.1514, "step": 2434 }, { "epoch": 0.4908321579689704, "grad_norm": 0.055790483951568604, "learning_rate": 9.936731001285215e-05, "loss": 0.2213, "step": 2436 }, { "epoch": 0.4912351400362684, "grad_norm": 0.06825339794158936, "learning_rate": 9.936519494633216e-05, "loss": 0.2115, "step": 2438 }, { "epoch": 0.4916381221035664, "grad_norm": 0.04873501509428024, "learning_rate": 9.936307637298765e-05, "loss": 0.2197, "step": 2440 }, { "epoch": 0.4920411041708644, "grad_norm": 0.09479009360074997, "learning_rate": 9.936095429296915e-05, "loss": 0.2088, "step": 2442 }, { "epoch": 0.4924440862381624, "grad_norm": 0.0469268262386322, "learning_rate": 9.93588287064274e-05, "loss": 0.1793, "step": 2444 }, { "epoch": 0.49284706830546043, "grad_norm": 0.05863165855407715, "learning_rate": 9.935669961351336e-05, "loss": 0.1689, "step": 2446 }, { "epoch": 0.49325005037275843, "grad_norm": 0.053911954164505005, "learning_rate": 9.935456701437835e-05, "loss": 0.1522, "step": 2448 }, { "epoch": 0.4936530324400564, "grad_norm": 0.055318981409072876, "learning_rate": 9.935243090917383e-05, "loss": 0.1643, "step": 2450 }, { "epoch": 0.4940560145073544, "grad_norm": 0.0797191932797432, "learning_rate": 9.935029129805153e-05, "loss": 0.2051, "step": 2452 }, { "epoch": 0.4944589965746524, "grad_norm": 0.048973795026540756, "learning_rate": 9.934814818116348e-05, "loss": 0.1867, "step": 2454 }, { "epoch": 0.4948619786419504, "grad_norm": 0.05954229086637497, "learning_rate": 9.93460015586619e-05, "loss": 0.1994, "step": 2456 }, { "epoch": 0.4952649607092484, "grad_norm": 0.075109101831913, "learning_rate": 9.934385143069927e-05, "loss": 0.2131, "step": 2458 }, { "epoch": 0.49566794277654647, "grad_norm": 0.07947821170091629, "learning_rate": 9.934169779742837e-05, "loss": 0.1959, "step": 2460 }, { "epoch": 0.49607092484384446, "grad_norm": 0.06468906998634338, "learning_rate": 9.933954065900215e-05, "loss": 0.194, "step": 2462 }, { "epoch": 0.49647390691114246, "grad_norm": 0.07549004256725311, "learning_rate": 9.933738001557386e-05, "loss": 0.204, "step": 2464 }, { "epoch": 0.49687688897844046, "grad_norm": 0.06827647238969803, "learning_rate": 9.933521586729703e-05, "loss": 0.1886, "step": 2466 }, { "epoch": 0.49727987104573845, "grad_norm": 0.053156349807977676, "learning_rate": 9.933304821432535e-05, "loss": 0.1744, "step": 2468 }, { "epoch": 0.49768285311303645, "grad_norm": 0.05513354763388634, "learning_rate": 9.933087705681281e-05, "loss": 0.1869, "step": 2470 }, { "epoch": 0.4980858351803345, "grad_norm": 0.051556315273046494, "learning_rate": 9.932870239491367e-05, "loss": 0.152, "step": 2472 }, { "epoch": 0.4984888172476325, "grad_norm": 0.054548539221286774, "learning_rate": 9.932652422878239e-05, "loss": 0.1541, "step": 2474 }, { "epoch": 0.4988917993149305, "grad_norm": 0.05240153521299362, "learning_rate": 9.932434255857372e-05, "loss": 0.2452, "step": 2476 }, { "epoch": 0.4992947813822285, "grad_norm": 0.09754245728254318, "learning_rate": 9.932215738444263e-05, "loss": 0.2503, "step": 2478 }, { "epoch": 0.4996977634495265, "grad_norm": 0.05348599702119827, "learning_rate": 9.931996870654438e-05, "loss": 0.2018, "step": 2480 }, { "epoch": 0.5001007455168245, "grad_norm": 0.055064063519239426, "learning_rate": 9.931777652503442e-05, "loss": 0.1739, "step": 2482 }, { "epoch": 0.5005037275841225, "grad_norm": 0.07443667203187943, "learning_rate": 9.931558084006849e-05, "loss": 0.2414, "step": 2484 }, { "epoch": 0.5009067096514205, "grad_norm": 0.06823594868183136, "learning_rate": 9.931338165180254e-05, "loss": 0.188, "step": 2486 }, { "epoch": 0.5013096917187185, "grad_norm": 0.0575629360973835, "learning_rate": 9.931117896039286e-05, "loss": 0.2041, "step": 2488 }, { "epoch": 0.5017126737860165, "grad_norm": 0.05056982487440109, "learning_rate": 9.930897276599587e-05, "loss": 0.1903, "step": 2490 }, { "epoch": 0.5021156558533145, "grad_norm": 0.0499715618789196, "learning_rate": 9.930676306876832e-05, "loss": 0.188, "step": 2492 }, { "epoch": 0.5025186379206126, "grad_norm": 0.06276939809322357, "learning_rate": 9.930454986886716e-05, "loss": 0.1728, "step": 2494 }, { "epoch": 0.5029216199879105, "grad_norm": 0.05897986888885498, "learning_rate": 9.930233316644963e-05, "loss": 0.1712, "step": 2496 }, { "epoch": 0.5033246020552086, "grad_norm": 0.06042663753032684, "learning_rate": 9.93001129616732e-05, "loss": 0.1711, "step": 2498 }, { "epoch": 0.5037275841225065, "grad_norm": 0.0756792277097702, "learning_rate": 9.92978892546956e-05, "loss": 0.2415, "step": 2500 }, { "epoch": 0.5041305661898046, "grad_norm": 0.07142479717731476, "learning_rate": 9.92956620456748e-05, "loss": 0.2126, "step": 2502 }, { "epoch": 0.5045335482571025, "grad_norm": 0.045365698635578156, "learning_rate": 9.929343133476898e-05, "loss": 0.2324, "step": 2504 }, { "epoch": 0.5049365303244006, "grad_norm": 0.0676538497209549, "learning_rate": 9.929119712213664e-05, "loss": 0.2497, "step": 2506 }, { "epoch": 0.5053395123916986, "grad_norm": 0.08482253551483154, "learning_rate": 9.92889594079365e-05, "loss": 0.1959, "step": 2508 }, { "epoch": 0.5057424944589965, "grad_norm": 0.05569139122962952, "learning_rate": 9.928671819232749e-05, "loss": 0.2229, "step": 2510 }, { "epoch": 0.5061454765262946, "grad_norm": 0.052241578698158264, "learning_rate": 9.928447347546885e-05, "loss": 0.2421, "step": 2512 }, { "epoch": 0.5065484585935925, "grad_norm": 0.04659108445048332, "learning_rate": 9.928222525752002e-05, "loss": 0.1722, "step": 2514 }, { "epoch": 0.5069514406608906, "grad_norm": 0.06179108843207359, "learning_rate": 9.927997353864073e-05, "loss": 0.1461, "step": 2516 }, { "epoch": 0.5073544227281886, "grad_norm": 0.05853302776813507, "learning_rate": 9.927771831899095e-05, "loss": 0.1849, "step": 2518 }, { "epoch": 0.5077574047954866, "grad_norm": 0.05103042721748352, "learning_rate": 9.927545959873086e-05, "loss": 0.2018, "step": 2520 }, { "epoch": 0.5081603868627846, "grad_norm": 0.047647956758737564, "learning_rate": 9.92731973780209e-05, "loss": 0.1589, "step": 2522 }, { "epoch": 0.5085633689300826, "grad_norm": 0.07556428760290146, "learning_rate": 9.927093165702182e-05, "loss": 0.2306, "step": 2524 }, { "epoch": 0.5089663509973806, "grad_norm": 0.06066511198878288, "learning_rate": 9.926866243589456e-05, "loss": 0.1792, "step": 2526 }, { "epoch": 0.5093693330646786, "grad_norm": 0.06090389937162399, "learning_rate": 9.92663897148003e-05, "loss": 0.2135, "step": 2528 }, { "epoch": 0.5097723151319766, "grad_norm": 0.04129406809806824, "learning_rate": 9.92641134939005e-05, "loss": 0.1798, "step": 2530 }, { "epoch": 0.5101752971992747, "grad_norm": 0.0553179495036602, "learning_rate": 9.926183377335689e-05, "loss": 0.2488, "step": 2532 }, { "epoch": 0.5105782792665726, "grad_norm": 0.04894782975316048, "learning_rate": 9.925955055333136e-05, "loss": 0.218, "step": 2534 }, { "epoch": 0.5109812613338707, "grad_norm": 0.05691911652684212, "learning_rate": 9.925726383398617e-05, "loss": 0.1882, "step": 2536 }, { "epoch": 0.5113842434011686, "grad_norm": 0.040179017931222916, "learning_rate": 9.925497361548371e-05, "loss": 0.1446, "step": 2538 }, { "epoch": 0.5117872254684667, "grad_norm": 0.06504693627357483, "learning_rate": 9.92526798979867e-05, "loss": 0.1943, "step": 2540 }, { "epoch": 0.5121902075357646, "grad_norm": 0.06121726706624031, "learning_rate": 9.925038268165808e-05, "loss": 0.1886, "step": 2542 }, { "epoch": 0.5125931896030627, "grad_norm": 0.05474965274333954, "learning_rate": 9.924808196666103e-05, "loss": 0.206, "step": 2544 }, { "epoch": 0.5129961716703607, "grad_norm": 0.06797949224710464, "learning_rate": 9.924577775315901e-05, "loss": 0.1983, "step": 2546 }, { "epoch": 0.5133991537376587, "grad_norm": 0.05731568858027458, "learning_rate": 9.924347004131568e-05, "loss": 0.226, "step": 2548 }, { "epoch": 0.5138021358049567, "grad_norm": 0.059852488338947296, "learning_rate": 9.924115883129501e-05, "loss": 0.1754, "step": 2550 }, { "epoch": 0.5142051178722546, "grad_norm": 0.05445285141468048, "learning_rate": 9.923884412326116e-05, "loss": 0.217, "step": 2552 }, { "epoch": 0.5146080999395527, "grad_norm": 0.07617855072021484, "learning_rate": 9.923652591737856e-05, "loss": 0.2535, "step": 2554 }, { "epoch": 0.5150110820068506, "grad_norm": 0.06034578010439873, "learning_rate": 9.923420421381191e-05, "loss": 0.1515, "step": 2556 }, { "epoch": 0.5154140640741487, "grad_norm": 0.05395951867103577, "learning_rate": 9.923187901272613e-05, "loss": 0.194, "step": 2558 }, { "epoch": 0.5158170461414467, "grad_norm": 0.06721216440200806, "learning_rate": 9.92295503142864e-05, "loss": 0.1354, "step": 2560 }, { "epoch": 0.5162200282087447, "grad_norm": 0.05250773951411247, "learning_rate": 9.922721811865815e-05, "loss": 0.2008, "step": 2562 }, { "epoch": 0.5166230102760427, "grad_norm": 0.06109349802136421, "learning_rate": 9.922488242600705e-05, "loss": 0.1996, "step": 2564 }, { "epoch": 0.5170259923433407, "grad_norm": 0.057626061141490936, "learning_rate": 9.922254323649902e-05, "loss": 0.2194, "step": 2566 }, { "epoch": 0.5174289744106387, "grad_norm": 0.08392177522182465, "learning_rate": 9.922020055030025e-05, "loss": 0.2169, "step": 2568 }, { "epoch": 0.5178319564779368, "grad_norm": 0.04369249939918518, "learning_rate": 9.921785436757713e-05, "loss": 0.2107, "step": 2570 }, { "epoch": 0.5182349385452347, "grad_norm": 0.04233145713806152, "learning_rate": 9.921550468849636e-05, "loss": 0.1723, "step": 2572 }, { "epoch": 0.5186379206125328, "grad_norm": 0.06697847694158554, "learning_rate": 9.921315151322486e-05, "loss": 0.18, "step": 2574 }, { "epoch": 0.5190409026798307, "grad_norm": 0.06663229316473007, "learning_rate": 9.921079484192975e-05, "loss": 0.2025, "step": 2576 }, { "epoch": 0.5194438847471288, "grad_norm": 0.05028081312775612, "learning_rate": 9.92084346747785e-05, "loss": 0.1953, "step": 2578 }, { "epoch": 0.5198468668144267, "grad_norm": 0.05140357092022896, "learning_rate": 9.920607101193875e-05, "loss": 0.1753, "step": 2580 }, { "epoch": 0.5202498488817248, "grad_norm": 0.048799578100442886, "learning_rate": 9.920370385357839e-05, "loss": 0.1184, "step": 2582 }, { "epoch": 0.5206528309490228, "grad_norm": 0.04645717889070511, "learning_rate": 9.92013331998656e-05, "loss": 0.1956, "step": 2584 }, { "epoch": 0.5210558130163208, "grad_norm": 0.06382746249437332, "learning_rate": 9.91989590509688e-05, "loss": 0.2239, "step": 2586 }, { "epoch": 0.5214587950836188, "grad_norm": 0.05598960071802139, "learning_rate": 9.919658140705662e-05, "loss": 0.2089, "step": 2588 }, { "epoch": 0.5218617771509168, "grad_norm": 0.04480404034256935, "learning_rate": 9.919420026829797e-05, "loss": 0.2062, "step": 2590 }, { "epoch": 0.5222647592182148, "grad_norm": 0.0500781387090683, "learning_rate": 9.919181563486201e-05, "loss": 0.1724, "step": 2592 }, { "epoch": 0.5226677412855127, "grad_norm": 0.07896167784929276, "learning_rate": 9.918942750691816e-05, "loss": 0.1899, "step": 2594 }, { "epoch": 0.5230707233528108, "grad_norm": 0.06729870289564133, "learning_rate": 9.918703588463603e-05, "loss": 0.197, "step": 2596 }, { "epoch": 0.5234737054201088, "grad_norm": 0.059732042253017426, "learning_rate": 9.918464076818553e-05, "loss": 0.2084, "step": 2598 }, { "epoch": 0.5238766874874068, "grad_norm": 0.07242099940776825, "learning_rate": 9.918224215773682e-05, "loss": 0.1948, "step": 2600 }, { "epoch": 0.5242796695547048, "grad_norm": 0.05805491283535957, "learning_rate": 9.917984005346027e-05, "loss": 0.2387, "step": 2602 }, { "epoch": 0.5246826516220028, "grad_norm": 0.06213317811489105, "learning_rate": 9.917743445552654e-05, "loss": 0.2208, "step": 2604 }, { "epoch": 0.5250856336893008, "grad_norm": 0.05157310143113136, "learning_rate": 9.917502536410652e-05, "loss": 0.19, "step": 2606 }, { "epoch": 0.5254886157565988, "grad_norm": 0.06837864220142365, "learning_rate": 9.917261277937133e-05, "loss": 0.2255, "step": 2608 }, { "epoch": 0.5258915978238968, "grad_norm": 0.06203755363821983, "learning_rate": 9.917019670149236e-05, "loss": 0.2151, "step": 2610 }, { "epoch": 0.5262945798911949, "grad_norm": 0.05853395164012909, "learning_rate": 9.916777713064129e-05, "loss": 0.2176, "step": 2612 }, { "epoch": 0.5266975619584928, "grad_norm": 0.0638599693775177, "learning_rate": 9.916535406698994e-05, "loss": 0.1831, "step": 2614 }, { "epoch": 0.5271005440257909, "grad_norm": 0.055155493319034576, "learning_rate": 9.916292751071046e-05, "loss": 0.231, "step": 2616 }, { "epoch": 0.5275035260930888, "grad_norm": 0.04611392319202423, "learning_rate": 9.916049746197524e-05, "loss": 0.1901, "step": 2618 }, { "epoch": 0.5279065081603869, "grad_norm": 0.05793232470750809, "learning_rate": 9.91580639209569e-05, "loss": 0.1608, "step": 2620 }, { "epoch": 0.5283094902276849, "grad_norm": 0.05881835147738457, "learning_rate": 9.915562688782832e-05, "loss": 0.2177, "step": 2622 }, { "epoch": 0.5287124722949829, "grad_norm": 0.04847261682152748, "learning_rate": 9.915318636276262e-05, "loss": 0.183, "step": 2624 }, { "epoch": 0.5291154543622809, "grad_norm": 0.0664074644446373, "learning_rate": 9.915074234593316e-05, "loss": 0.2255, "step": 2626 }, { "epoch": 0.5295184364295789, "grad_norm": 0.06530804187059402, "learning_rate": 9.914829483751358e-05, "loss": 0.1786, "step": 2628 }, { "epoch": 0.5299214184968769, "grad_norm": 0.0650443434715271, "learning_rate": 9.914584383767773e-05, "loss": 0.1792, "step": 2630 }, { "epoch": 0.5303244005641748, "grad_norm": 0.05127038061618805, "learning_rate": 9.914338934659973e-05, "loss": 0.242, "step": 2632 }, { "epoch": 0.5307273826314729, "grad_norm": 0.051562655717134476, "learning_rate": 9.914093136445395e-05, "loss": 0.1925, "step": 2634 }, { "epoch": 0.531130364698771, "grad_norm": 0.058482736349105835, "learning_rate": 9.913846989141499e-05, "loss": 0.2273, "step": 2636 }, { "epoch": 0.5315333467660689, "grad_norm": 0.058450616896152496, "learning_rate": 9.913600492765771e-05, "loss": 0.1778, "step": 2638 }, { "epoch": 0.531936328833367, "grad_norm": 0.04832917079329491, "learning_rate": 9.913353647335723e-05, "loss": 0.2222, "step": 2640 }, { "epoch": 0.5323393109006649, "grad_norm": 0.05004747956991196, "learning_rate": 9.91310645286889e-05, "loss": 0.1857, "step": 2642 }, { "epoch": 0.5327422929679629, "grad_norm": 0.05869967117905617, "learning_rate": 9.91285890938283e-05, "loss": 0.1851, "step": 2644 }, { "epoch": 0.5331452750352609, "grad_norm": 0.06229320168495178, "learning_rate": 9.912611016895131e-05, "loss": 0.1887, "step": 2646 }, { "epoch": 0.5335482571025589, "grad_norm": 0.05405684933066368, "learning_rate": 9.912362775423403e-05, "loss": 0.1712, "step": 2648 }, { "epoch": 0.533951239169857, "grad_norm": 0.06194634735584259, "learning_rate": 9.912114184985279e-05, "loss": 0.2287, "step": 2650 }, { "epoch": 0.5343542212371549, "grad_norm": 0.04791036620736122, "learning_rate": 9.911865245598419e-05, "loss": 0.1607, "step": 2652 }, { "epoch": 0.534757203304453, "grad_norm": 0.06112559512257576, "learning_rate": 9.911615957280506e-05, "loss": 0.2234, "step": 2654 }, { "epoch": 0.5351601853717509, "grad_norm": 0.04859330132603645, "learning_rate": 9.911366320049253e-05, "loss": 0.2157, "step": 2656 }, { "epoch": 0.535563167439049, "grad_norm": 0.07223501056432724, "learning_rate": 9.91111633392239e-05, "loss": 0.2384, "step": 2658 }, { "epoch": 0.535966149506347, "grad_norm": 0.04936986416578293, "learning_rate": 9.910865998917675e-05, "loss": 0.2292, "step": 2660 }, { "epoch": 0.536369131573645, "grad_norm": 0.05428776890039444, "learning_rate": 9.910615315052896e-05, "loss": 0.201, "step": 2662 }, { "epoch": 0.536772113640943, "grad_norm": 0.05943161994218826, "learning_rate": 9.910364282345857e-05, "loss": 0.219, "step": 2664 }, { "epoch": 0.537175095708241, "grad_norm": 0.058624330908060074, "learning_rate": 9.910112900814393e-05, "loss": 0.1767, "step": 2666 }, { "epoch": 0.537578077775539, "grad_norm": 0.056275349110364914, "learning_rate": 9.90986117047636e-05, "loss": 0.1772, "step": 2668 }, { "epoch": 0.537981059842837, "grad_norm": 0.06266789138317108, "learning_rate": 9.90960909134964e-05, "loss": 0.1881, "step": 2670 }, { "epoch": 0.538384041910135, "grad_norm": 0.05568632483482361, "learning_rate": 9.909356663452146e-05, "loss": 0.1773, "step": 2672 }, { "epoch": 0.538787023977433, "grad_norm": 0.050182417035102844, "learning_rate": 9.909103886801803e-05, "loss": 0.2263, "step": 2674 }, { "epoch": 0.539190006044731, "grad_norm": 0.06228543445467949, "learning_rate": 9.908850761416573e-05, "loss": 0.2069, "step": 2676 }, { "epoch": 0.539592988112029, "grad_norm": 0.06411072611808777, "learning_rate": 9.908597287314434e-05, "loss": 0.1986, "step": 2678 }, { "epoch": 0.539995970179327, "grad_norm": 0.04468056932091713, "learning_rate": 9.908343464513394e-05, "loss": 0.1636, "step": 2680 }, { "epoch": 0.540398952246625, "grad_norm": 0.05913592502474785, "learning_rate": 9.908089293031483e-05, "loss": 0.2376, "step": 2682 }, { "epoch": 0.540801934313923, "grad_norm": 0.253537118434906, "learning_rate": 9.907834772886761e-05, "loss": 0.1615, "step": 2684 }, { "epoch": 0.541204916381221, "grad_norm": 0.0542256236076355, "learning_rate": 9.907579904097305e-05, "loss": 0.165, "step": 2686 }, { "epoch": 0.5416078984485191, "grad_norm": 0.06959859281778336, "learning_rate": 9.907324686681218e-05, "loss": 0.1724, "step": 2688 }, { "epoch": 0.542010880515817, "grad_norm": 0.05335042253136635, "learning_rate": 9.907069120656636e-05, "loss": 0.2072, "step": 2690 }, { "epoch": 0.5424138625831151, "grad_norm": 0.04987449571490288, "learning_rate": 9.90681320604171e-05, "loss": 0.1566, "step": 2692 }, { "epoch": 0.542816844650413, "grad_norm": 0.04512554407119751, "learning_rate": 9.906556942854623e-05, "loss": 0.2381, "step": 2694 }, { "epoch": 0.5432198267177111, "grad_norm": 0.042101211845874786, "learning_rate": 9.906300331113576e-05, "loss": 0.1501, "step": 2696 }, { "epoch": 0.543622808785009, "grad_norm": 0.06392179429531097, "learning_rate": 9.9060433708368e-05, "loss": 0.2025, "step": 2698 }, { "epoch": 0.5440257908523071, "grad_norm": 0.05543966218829155, "learning_rate": 9.905786062042551e-05, "loss": 0.1677, "step": 2700 }, { "epoch": 0.5444287729196051, "grad_norm": 0.06768188625574112, "learning_rate": 9.905528404749102e-05, "loss": 0.197, "step": 2702 }, { "epoch": 0.5448317549869031, "grad_norm": 0.09007920324802399, "learning_rate": 9.905270398974763e-05, "loss": 0.1605, "step": 2704 }, { "epoch": 0.5452347370542011, "grad_norm": 0.06303185969591141, "learning_rate": 9.90501204473786e-05, "loss": 0.1849, "step": 2706 }, { "epoch": 0.545637719121499, "grad_norm": 0.08676113933324814, "learning_rate": 9.904753342056746e-05, "loss": 0.1749, "step": 2708 }, { "epoch": 0.5460407011887971, "grad_norm": 0.056663088500499725, "learning_rate": 9.904494290949797e-05, "loss": 0.204, "step": 2710 }, { "epoch": 0.5464436832560952, "grad_norm": 0.06061787158250809, "learning_rate": 9.904234891435416e-05, "loss": 0.1735, "step": 2712 }, { "epoch": 0.5468466653233931, "grad_norm": 0.0656784325838089, "learning_rate": 9.903975143532034e-05, "loss": 0.2053, "step": 2714 }, { "epoch": 0.5472496473906912, "grad_norm": 0.0533025786280632, "learning_rate": 9.9037150472581e-05, "loss": 0.1506, "step": 2716 }, { "epoch": 0.5476526294579891, "grad_norm": 0.060621704906225204, "learning_rate": 9.903454602632092e-05, "loss": 0.2182, "step": 2718 }, { "epoch": 0.5480556115252871, "grad_norm": 0.05371670052409172, "learning_rate": 9.903193809672509e-05, "loss": 0.145, "step": 2720 }, { "epoch": 0.5484585935925851, "grad_norm": 0.08136498928070068, "learning_rate": 9.90293266839788e-05, "loss": 0.2058, "step": 2722 }, { "epoch": 0.5488615756598831, "grad_norm": 0.06714341789484024, "learning_rate": 9.902671178826757e-05, "loss": 0.2286, "step": 2724 }, { "epoch": 0.5492645577271812, "grad_norm": 0.05731106176972389, "learning_rate": 9.902409340977713e-05, "loss": 0.1818, "step": 2726 }, { "epoch": 0.5496675397944791, "grad_norm": 0.06478509306907654, "learning_rate": 9.902147154869348e-05, "loss": 0.1992, "step": 2728 }, { "epoch": 0.5500705218617772, "grad_norm": 0.06453832238912582, "learning_rate": 9.901884620520291e-05, "loss": 0.2373, "step": 2730 }, { "epoch": 0.5504735039290751, "grad_norm": 0.06110521778464317, "learning_rate": 9.901621737949189e-05, "loss": 0.1849, "step": 2732 }, { "epoch": 0.5508764859963732, "grad_norm": 0.04943872615695, "learning_rate": 9.901358507174719e-05, "loss": 0.179, "step": 2734 }, { "epoch": 0.5512794680636711, "grad_norm": 0.046395331621170044, "learning_rate": 9.901094928215577e-05, "loss": 0.2282, "step": 2736 }, { "epoch": 0.5516824501309692, "grad_norm": 0.05042804405093193, "learning_rate": 9.900831001090491e-05, "loss": 0.2087, "step": 2738 }, { "epoch": 0.5520854321982672, "grad_norm": 0.040862563997507095, "learning_rate": 9.900566725818208e-05, "loss": 0.2409, "step": 2740 }, { "epoch": 0.5524884142655652, "grad_norm": 0.058182161301374435, "learning_rate": 9.900302102417502e-05, "loss": 0.2004, "step": 2742 }, { "epoch": 0.5528913963328632, "grad_norm": 0.05086760222911835, "learning_rate": 9.900037130907171e-05, "loss": 0.2169, "step": 2744 }, { "epoch": 0.5532943784001612, "grad_norm": 0.04748394712805748, "learning_rate": 9.89977181130604e-05, "loss": 0.1727, "step": 2746 }, { "epoch": 0.5536973604674592, "grad_norm": 0.05247688293457031, "learning_rate": 9.899506143632954e-05, "loss": 0.2065, "step": 2748 }, { "epoch": 0.5541003425347572, "grad_norm": 0.05333872139453888, "learning_rate": 9.899240127906791e-05, "loss": 0.209, "step": 2750 }, { "epoch": 0.5545033246020552, "grad_norm": 0.05926572158932686, "learning_rate": 9.89897376414644e-05, "loss": 0.2409, "step": 2752 }, { "epoch": 0.5549063066693533, "grad_norm": 0.05073244497179985, "learning_rate": 9.89870705237083e-05, "loss": 0.1838, "step": 2754 }, { "epoch": 0.5553092887366512, "grad_norm": 0.04936111718416214, "learning_rate": 9.898439992598904e-05, "loss": 0.2102, "step": 2756 }, { "epoch": 0.5557122708039492, "grad_norm": 0.06824660301208496, "learning_rate": 9.898172584849636e-05, "loss": 0.2545, "step": 2758 }, { "epoch": 0.5561152528712472, "grad_norm": 0.07397361099720001, "learning_rate": 9.89790482914202e-05, "loss": 0.2195, "step": 2760 }, { "epoch": 0.5565182349385452, "grad_norm": 0.0612940639257431, "learning_rate": 9.897636725495078e-05, "loss": 0.2221, "step": 2762 }, { "epoch": 0.5569212170058433, "grad_norm": 0.05344128981232643, "learning_rate": 9.897368273927857e-05, "loss": 0.193, "step": 2764 }, { "epoch": 0.5573241990731412, "grad_norm": 0.06365928053855896, "learning_rate": 9.897099474459424e-05, "loss": 0.1599, "step": 2766 }, { "epoch": 0.5577271811404393, "grad_norm": 0.05055849254131317, "learning_rate": 9.896830327108878e-05, "loss": 0.1981, "step": 2768 }, { "epoch": 0.5581301632077372, "grad_norm": 0.05964979901909828, "learning_rate": 9.896560831895335e-05, "loss": 0.1928, "step": 2770 }, { "epoch": 0.5585331452750353, "grad_norm": 0.05901632830500603, "learning_rate": 9.896290988837942e-05, "loss": 0.1739, "step": 2772 }, { "epoch": 0.5589361273423332, "grad_norm": 0.05880974978208542, "learning_rate": 9.896020797955868e-05, "loss": 0.212, "step": 2774 }, { "epoch": 0.5593391094096313, "grad_norm": 0.07250037789344788, "learning_rate": 9.895750259268307e-05, "loss": 0.1928, "step": 2776 }, { "epoch": 0.5597420914769293, "grad_norm": 0.05343032628297806, "learning_rate": 9.895479372794477e-05, "loss": 0.206, "step": 2778 }, { "epoch": 0.5601450735442273, "grad_norm": 0.052220240235328674, "learning_rate": 9.89520813855362e-05, "loss": 0.1788, "step": 2780 }, { "epoch": 0.5605480556115253, "grad_norm": 0.056068528443574905, "learning_rate": 9.894936556565008e-05, "loss": 0.2249, "step": 2782 }, { "epoch": 0.5609510376788233, "grad_norm": 0.050359394401311874, "learning_rate": 9.89466462684793e-05, "loss": 0.151, "step": 2784 }, { "epoch": 0.5613540197461213, "grad_norm": 0.058597322553396225, "learning_rate": 9.894392349421704e-05, "loss": 0.1641, "step": 2786 }, { "epoch": 0.5617570018134193, "grad_norm": 0.0488249696791172, "learning_rate": 9.894119724305675e-05, "loss": 0.1949, "step": 2788 }, { "epoch": 0.5621599838807173, "grad_norm": 0.0449531264603138, "learning_rate": 9.893846751519205e-05, "loss": 0.1553, "step": 2790 }, { "epoch": 0.5625629659480154, "grad_norm": 0.09443546086549759, "learning_rate": 9.89357343108169e-05, "loss": 0.214, "step": 2792 }, { "epoch": 0.5629659480153133, "grad_norm": 0.08376643806695938, "learning_rate": 9.893299763012545e-05, "loss": 0.2011, "step": 2794 }, { "epoch": 0.5633689300826114, "grad_norm": 0.05647800862789154, "learning_rate": 9.893025747331211e-05, "loss": 0.1575, "step": 2796 }, { "epoch": 0.5637719121499093, "grad_norm": 0.05209295079112053, "learning_rate": 9.89275138405715e-05, "loss": 0.2157, "step": 2798 }, { "epoch": 0.5641748942172073, "grad_norm": 0.06927008181810379, "learning_rate": 9.892476673209858e-05, "loss": 0.1606, "step": 2800 }, { "epoch": 0.5645778762845053, "grad_norm": 0.06608123332262039, "learning_rate": 9.892201614808848e-05, "loss": 0.2286, "step": 2802 }, { "epoch": 0.5649808583518033, "grad_norm": 0.05888240784406662, "learning_rate": 9.891926208873658e-05, "loss": 0.2173, "step": 2804 }, { "epoch": 0.5653838404191014, "grad_norm": 0.05698193237185478, "learning_rate": 9.891650455423854e-05, "loss": 0.2126, "step": 2806 }, { "epoch": 0.5657868224863993, "grad_norm": 0.05356952175498009, "learning_rate": 9.891374354479025e-05, "loss": 0.2118, "step": 2808 }, { "epoch": 0.5661898045536974, "grad_norm": 0.05336877331137657, "learning_rate": 9.891097906058784e-05, "loss": 0.1706, "step": 2810 }, { "epoch": 0.5665927866209953, "grad_norm": 0.04258881136775017, "learning_rate": 9.890821110182769e-05, "loss": 0.2012, "step": 2812 }, { "epoch": 0.5669957686882934, "grad_norm": 0.041180457919836044, "learning_rate": 9.890543966870646e-05, "loss": 0.1952, "step": 2814 }, { "epoch": 0.5673987507555914, "grad_norm": 0.0453479178249836, "learning_rate": 9.8902664761421e-05, "loss": 0.2232, "step": 2816 }, { "epoch": 0.5678017328228894, "grad_norm": 0.0834401398897171, "learning_rate": 9.889988638016844e-05, "loss": 0.1682, "step": 2818 }, { "epoch": 0.5682047148901874, "grad_norm": 0.053984783589839935, "learning_rate": 9.889710452514616e-05, "loss": 0.2373, "step": 2820 }, { "epoch": 0.5686076969574854, "grad_norm": 0.08120491355657578, "learning_rate": 9.889431919655176e-05, "loss": 0.1928, "step": 2822 }, { "epoch": 0.5690106790247834, "grad_norm": 0.0568217933177948, "learning_rate": 9.889153039458314e-05, "loss": 0.2374, "step": 2824 }, { "epoch": 0.5694136610920814, "grad_norm": 0.04305284842848778, "learning_rate": 9.888873811943838e-05, "loss": 0.1806, "step": 2826 }, { "epoch": 0.5698166431593794, "grad_norm": 0.05167011171579361, "learning_rate": 9.888594237131586e-05, "loss": 0.2031, "step": 2828 }, { "epoch": 0.5702196252266775, "grad_norm": 0.042756181210279465, "learning_rate": 9.888314315041417e-05, "loss": 0.218, "step": 2830 }, { "epoch": 0.5706226072939754, "grad_norm": 0.05174868926405907, "learning_rate": 9.888034045693215e-05, "loss": 0.1962, "step": 2832 }, { "epoch": 0.5710255893612735, "grad_norm": 0.04308900609612465, "learning_rate": 9.887753429106894e-05, "loss": 0.1956, "step": 2834 }, { "epoch": 0.5714285714285714, "grad_norm": 0.050183191895484924, "learning_rate": 9.887472465302386e-05, "loss": 0.1871, "step": 2836 }, { "epoch": 0.5718315534958694, "grad_norm": 0.052778877317905426, "learning_rate": 9.887191154299649e-05, "loss": 0.2299, "step": 2838 }, { "epoch": 0.5722345355631674, "grad_norm": 0.05091378092765808, "learning_rate": 9.886909496118668e-05, "loss": 0.2262, "step": 2840 }, { "epoch": 0.5726375176304654, "grad_norm": 0.054165229201316833, "learning_rate": 9.886627490779452e-05, "loss": 0.1676, "step": 2842 }, { "epoch": 0.5730404996977635, "grad_norm": 0.0649203434586525, "learning_rate": 9.886345138302035e-05, "loss": 0.1646, "step": 2844 }, { "epoch": 0.5734434817650614, "grad_norm": 0.052568770945072174, "learning_rate": 9.886062438706474e-05, "loss": 0.2328, "step": 2846 }, { "epoch": 0.5738464638323595, "grad_norm": 0.047305673360824585, "learning_rate": 9.885779392012852e-05, "loss": 0.1873, "step": 2848 }, { "epoch": 0.5742494458996574, "grad_norm": 0.05377272143959999, "learning_rate": 9.885495998241275e-05, "loss": 0.1747, "step": 2850 }, { "epoch": 0.5746524279669555, "grad_norm": 0.061727289110422134, "learning_rate": 9.885212257411875e-05, "loss": 0.2086, "step": 2852 }, { "epoch": 0.5750554100342534, "grad_norm": 0.056558411568403244, "learning_rate": 9.88492816954481e-05, "loss": 0.2031, "step": 2854 }, { "epoch": 0.5754583921015515, "grad_norm": 0.06308440864086151, "learning_rate": 9.88464373466026e-05, "loss": 0.1929, "step": 2856 }, { "epoch": 0.5758613741688495, "grad_norm": 0.06898178160190582, "learning_rate": 9.88435895277843e-05, "loss": 0.1831, "step": 2858 }, { "epoch": 0.5762643562361475, "grad_norm": 0.06173472851514816, "learning_rate": 9.884073823919553e-05, "loss": 0.2316, "step": 2860 }, { "epoch": 0.5766673383034455, "grad_norm": 0.04862646386027336, "learning_rate": 9.88378834810388e-05, "loss": 0.2015, "step": 2862 }, { "epoch": 0.5770703203707435, "grad_norm": 0.06486746668815613, "learning_rate": 9.883502525351695e-05, "loss": 0.2179, "step": 2864 }, { "epoch": 0.5774733024380415, "grad_norm": 0.0436808243393898, "learning_rate": 9.8832163556833e-05, "loss": 0.2002, "step": 2866 }, { "epoch": 0.5778762845053396, "grad_norm": 0.04756801575422287, "learning_rate": 9.882929839119025e-05, "loss": 0.1953, "step": 2868 }, { "epoch": 0.5782792665726375, "grad_norm": 0.05114509537816048, "learning_rate": 9.882642975679224e-05, "loss": 0.2263, "step": 2870 }, { "epoch": 0.5786822486399356, "grad_norm": 0.06755529344081879, "learning_rate": 9.882355765384273e-05, "loss": 0.2195, "step": 2872 }, { "epoch": 0.5790852307072335, "grad_norm": 0.049488335847854614, "learning_rate": 9.882068208254578e-05, "loss": 0.236, "step": 2874 }, { "epoch": 0.5794882127745316, "grad_norm": 0.05077878385782242, "learning_rate": 9.881780304310564e-05, "loss": 0.1497, "step": 2876 }, { "epoch": 0.5798911948418295, "grad_norm": 0.0435592457652092, "learning_rate": 9.881492053572685e-05, "loss": 0.1614, "step": 2878 }, { "epoch": 0.5802941769091275, "grad_norm": 0.060861654579639435, "learning_rate": 9.881203456061418e-05, "loss": 0.1771, "step": 2880 }, { "epoch": 0.5806971589764256, "grad_norm": 0.0579148605465889, "learning_rate": 9.880914511797262e-05, "loss": 0.2413, "step": 2882 }, { "epoch": 0.5811001410437235, "grad_norm": 0.05515383556485176, "learning_rate": 9.880625220800746e-05, "loss": 0.2095, "step": 2884 }, { "epoch": 0.5815031231110216, "grad_norm": 0.0689605101943016, "learning_rate": 9.880335583092421e-05, "loss": 0.2078, "step": 2886 }, { "epoch": 0.5819061051783195, "grad_norm": 0.05493571236729622, "learning_rate": 9.88004559869286e-05, "loss": 0.1886, "step": 2888 }, { "epoch": 0.5823090872456176, "grad_norm": 0.054588478058576584, "learning_rate": 9.879755267622664e-05, "loss": 0.201, "step": 2890 }, { "epoch": 0.5827120693129155, "grad_norm": 0.06213277950882912, "learning_rate": 9.879464589902458e-05, "loss": 0.1827, "step": 2892 }, { "epoch": 0.5831150513802136, "grad_norm": 0.05389844626188278, "learning_rate": 9.879173565552891e-05, "loss": 0.2057, "step": 2894 }, { "epoch": 0.5835180334475116, "grad_norm": 0.04522683843970299, "learning_rate": 9.878882194594637e-05, "loss": 0.2145, "step": 2896 }, { "epoch": 0.5839210155148096, "grad_norm": 0.039040908217430115, "learning_rate": 9.878590477048394e-05, "loss": 0.1494, "step": 2898 }, { "epoch": 0.5843239975821076, "grad_norm": 0.06023675948381424, "learning_rate": 9.878298412934886e-05, "loss": 0.1985, "step": 2900 }, { "epoch": 0.5847269796494056, "grad_norm": 0.07509973645210266, "learning_rate": 9.87800600227486e-05, "loss": 0.2325, "step": 2902 }, { "epoch": 0.5851299617167036, "grad_norm": 0.06361687183380127, "learning_rate": 9.877713245089089e-05, "loss": 0.2007, "step": 2904 }, { "epoch": 0.5855329437840016, "grad_norm": 0.05781874805688858, "learning_rate": 9.87742014139837e-05, "loss": 0.2256, "step": 2906 }, { "epoch": 0.5859359258512996, "grad_norm": 0.043421436101198196, "learning_rate": 9.877126691223525e-05, "loss": 0.1871, "step": 2908 }, { "epoch": 0.5863389079185977, "grad_norm": 0.05317642167210579, "learning_rate": 9.8768328945854e-05, "loss": 0.2071, "step": 2910 }, { "epoch": 0.5867418899858956, "grad_norm": 0.05214408412575722, "learning_rate": 9.876538751504865e-05, "loss": 0.2015, "step": 2912 }, { "epoch": 0.5871448720531937, "grad_norm": 0.050086211413145065, "learning_rate": 9.876244262002817e-05, "loss": 0.1946, "step": 2914 }, { "epoch": 0.5875478541204916, "grad_norm": 0.0423690602183342, "learning_rate": 9.875949426100172e-05, "loss": 0.2099, "step": 2916 }, { "epoch": 0.5879508361877896, "grad_norm": 0.0662025436758995, "learning_rate": 9.87565424381788e-05, "loss": 0.1724, "step": 2918 }, { "epoch": 0.5883538182550877, "grad_norm": 0.06472502648830414, "learning_rate": 9.875358715176908e-05, "loss": 0.2587, "step": 2920 }, { "epoch": 0.5887568003223856, "grad_norm": 0.08011851459741592, "learning_rate": 9.875062840198248e-05, "loss": 0.2292, "step": 2922 }, { "epoch": 0.5891597823896837, "grad_norm": 0.11067450046539307, "learning_rate": 9.874766618902922e-05, "loss": 0.2778, "step": 2924 }, { "epoch": 0.5895627644569816, "grad_norm": 0.05279851332306862, "learning_rate": 9.874470051311971e-05, "loss": 0.1758, "step": 2926 }, { "epoch": 0.5899657465242797, "grad_norm": 0.07390250265598297, "learning_rate": 9.874173137446463e-05, "loss": 0.2023, "step": 2928 }, { "epoch": 0.5903687285915776, "grad_norm": 0.05263578146696091, "learning_rate": 9.873875877327491e-05, "loss": 0.2448, "step": 2930 }, { "epoch": 0.5907717106588757, "grad_norm": 0.05811066925525665, "learning_rate": 9.873578270976172e-05, "loss": 0.1733, "step": 2932 }, { "epoch": 0.5911746927261737, "grad_norm": 0.05098055303096771, "learning_rate": 9.873280318413644e-05, "loss": 0.1623, "step": 2934 }, { "epoch": 0.5915776747934717, "grad_norm": 0.050901107490062714, "learning_rate": 9.87298201966108e-05, "loss": 0.2252, "step": 2936 }, { "epoch": 0.5919806568607697, "grad_norm": 0.06555388867855072, "learning_rate": 9.872683374739662e-05, "loss": 0.1906, "step": 2938 }, { "epoch": 0.5923836389280677, "grad_norm": 0.05196414515376091, "learning_rate": 9.872384383670611e-05, "loss": 0.1696, "step": 2940 }, { "epoch": 0.5927866209953657, "grad_norm": 0.15211619436740875, "learning_rate": 9.872085046475169e-05, "loss": 0.2249, "step": 2942 }, { "epoch": 0.5931896030626637, "grad_norm": 0.07226165384054184, "learning_rate": 9.871785363174592e-05, "loss": 0.2197, "step": 2944 }, { "epoch": 0.5935925851299617, "grad_norm": 0.07411706447601318, "learning_rate": 9.871485333790178e-05, "loss": 0.235, "step": 2946 }, { "epoch": 0.5939955671972598, "grad_norm": 0.07494895905256271, "learning_rate": 9.871184958343234e-05, "loss": 0.2677, "step": 2948 }, { "epoch": 0.5943985492645577, "grad_norm": 0.06455809623003006, "learning_rate": 9.870884236855103e-05, "loss": 0.2008, "step": 2950 }, { "epoch": 0.5948015313318558, "grad_norm": 0.07862219214439392, "learning_rate": 9.870583169347146e-05, "loss": 0.203, "step": 2952 }, { "epoch": 0.5952045133991537, "grad_norm": 0.09578139334917068, "learning_rate": 9.870281755840747e-05, "loss": 0.1717, "step": 2954 }, { "epoch": 0.5956074954664518, "grad_norm": 0.07982958853244781, "learning_rate": 9.869979996357323e-05, "loss": 0.2288, "step": 2956 }, { "epoch": 0.5960104775337497, "grad_norm": 0.15855936706066132, "learning_rate": 9.869677890918307e-05, "loss": 0.1945, "step": 2958 }, { "epoch": 0.5964134596010477, "grad_norm": 0.04864702373743057, "learning_rate": 9.869375439545164e-05, "loss": 0.1835, "step": 2960 }, { "epoch": 0.5968164416683458, "grad_norm": 0.08095841109752655, "learning_rate": 9.869072642259375e-05, "loss": 0.2169, "step": 2962 }, { "epoch": 0.5972194237356437, "grad_norm": 0.051869262009859085, "learning_rate": 9.868769499082453e-05, "loss": 0.1718, "step": 2964 }, { "epoch": 0.5976224058029418, "grad_norm": 0.06466794013977051, "learning_rate": 9.868466010035932e-05, "loss": 0.1867, "step": 2966 }, { "epoch": 0.5980253878702397, "grad_norm": 0.07450581341981888, "learning_rate": 9.868162175141373e-05, "loss": 0.2033, "step": 2968 }, { "epoch": 0.5984283699375378, "grad_norm": 0.06121667101979256, "learning_rate": 9.867857994420357e-05, "loss": 0.2116, "step": 2970 }, { "epoch": 0.5988313520048358, "grad_norm": 0.0859074667096138, "learning_rate": 9.867553467894494e-05, "loss": 0.1945, "step": 2972 }, { "epoch": 0.5992343340721338, "grad_norm": 0.0621977373957634, "learning_rate": 9.867248595585419e-05, "loss": 0.1979, "step": 2974 }, { "epoch": 0.5996373161394318, "grad_norm": 0.057669565081596375, "learning_rate": 9.866943377514787e-05, "loss": 0.2332, "step": 2976 }, { "epoch": 0.6000402982067298, "grad_norm": 0.05102207884192467, "learning_rate": 9.86663781370428e-05, "loss": 0.1866, "step": 2978 }, { "epoch": 0.6004432802740278, "grad_norm": 0.04888800159096718, "learning_rate": 9.866331904175608e-05, "loss": 0.2701, "step": 2980 }, { "epoch": 0.6008462623413258, "grad_norm": 0.17599685490131378, "learning_rate": 9.866025648950496e-05, "loss": 0.243, "step": 2982 }, { "epoch": 0.6012492444086238, "grad_norm": 0.04738638177514076, "learning_rate": 9.865719048050707e-05, "loss": 0.1871, "step": 2984 }, { "epoch": 0.6016522264759219, "grad_norm": 0.0663914680480957, "learning_rate": 9.865412101498019e-05, "loss": 0.1896, "step": 2986 }, { "epoch": 0.6020552085432198, "grad_norm": 0.3060331344604492, "learning_rate": 9.865104809314234e-05, "loss": 0.2408, "step": 2988 }, { "epoch": 0.6024581906105179, "grad_norm": 0.3577231168746948, "learning_rate": 9.864797171521185e-05, "loss": 0.1561, "step": 2990 }, { "epoch": 0.6028611726778158, "grad_norm": 0.12939517199993134, "learning_rate": 9.864489188140727e-05, "loss": 0.1593, "step": 2992 }, { "epoch": 0.6032641547451139, "grad_norm": 0.08543020486831665, "learning_rate": 9.864180859194734e-05, "loss": 1.2892, "step": 2994 }, { "epoch": 0.6036671368124118, "grad_norm": 0.06838931888341904, "learning_rate": 9.863872184705111e-05, "loss": 0.2024, "step": 2996 }, { "epoch": 0.6040701188797098, "grad_norm": 0.09500443190336227, "learning_rate": 9.86356316469379e-05, "loss": 0.1507, "step": 2998 }, { "epoch": 0.6044731009470079, "grad_norm": 10.331274032592773, "learning_rate": 9.863253799182718e-05, "loss": 0.2497, "step": 3000 }, { "epoch": 0.6048760830143058, "grad_norm": 0.3732292652130127, "learning_rate": 9.862944088193874e-05, "loss": 0.2482, "step": 3002 }, { "epoch": 0.6052790650816039, "grad_norm": 0.0717553049325943, "learning_rate": 9.862634031749258e-05, "loss": 0.1793, "step": 3004 }, { "epoch": 0.6056820471489018, "grad_norm": 0.06944863498210907, "learning_rate": 9.862323629870899e-05, "loss": 0.1742, "step": 3006 }, { "epoch": 0.6060850292161999, "grad_norm": 0.07637281715869904, "learning_rate": 9.862012882580845e-05, "loss": 0.1575, "step": 3008 }, { "epoch": 0.6064880112834978, "grad_norm": 0.3652530610561371, "learning_rate": 9.86170178990117e-05, "loss": 0.2212, "step": 3010 }, { "epoch": 0.6068909933507959, "grad_norm": 0.07585793733596802, "learning_rate": 9.861390351853976e-05, "loss": 0.1926, "step": 3012 }, { "epoch": 0.6072939754180939, "grad_norm": 0.06400326639413834, "learning_rate": 9.861078568461386e-05, "loss": 0.2252, "step": 3014 }, { "epoch": 0.6076969574853919, "grad_norm": 0.4003634452819824, "learning_rate": 9.86076643974555e-05, "loss": 0.1863, "step": 3016 }, { "epoch": 0.6080999395526899, "grad_norm": 0.07460381835699081, "learning_rate": 9.860453965728638e-05, "loss": 0.1905, "step": 3018 }, { "epoch": 0.6085029216199879, "grad_norm": 0.0575389601290226, "learning_rate": 9.860141146432848e-05, "loss": 0.2124, "step": 3020 }, { "epoch": 0.6089059036872859, "grad_norm": 0.07313038408756256, "learning_rate": 9.859827981880408e-05, "loss": 0.2669, "step": 3022 }, { "epoch": 0.609308885754584, "grad_norm": 0.08500310778617859, "learning_rate": 9.859514472093557e-05, "loss": 0.2018, "step": 3024 }, { "epoch": 0.6097118678218819, "grad_norm": 0.10082163661718369, "learning_rate": 9.85920061709457e-05, "loss": 0.1774, "step": 3026 }, { "epoch": 0.61011484988918, "grad_norm": 0.5841183066368103, "learning_rate": 9.858886416905741e-05, "loss": 0.2417, "step": 3028 }, { "epoch": 0.6105178319564779, "grad_norm": 1.5156642198562622, "learning_rate": 9.858571871549394e-05, "loss": 0.196, "step": 3030 }, { "epoch": 0.610920814023776, "grad_norm": 8.291611671447754, "learning_rate": 9.858256981047871e-05, "loss": 0.1798, "step": 3032 }, { "epoch": 0.6113237960910739, "grad_norm": 18.14417839050293, "learning_rate": 9.857941745423541e-05, "loss": 3.6401, "step": 3034 }, { "epoch": 0.611726778158372, "grad_norm": 286.70599365234375, "learning_rate": 9.857626164698798e-05, "loss": 8.12, "step": 3036 }, { "epoch": 0.61212976022567, "grad_norm": 0.9753457903862, "learning_rate": 9.857310238896062e-05, "loss": 1.2561, "step": 3038 }, { "epoch": 0.612532742292968, "grad_norm": 0.18690338730812073, "learning_rate": 9.856993968037775e-05, "loss": 0.2009, "step": 3040 }, { "epoch": 0.612935724360266, "grad_norm": 0.24463942646980286, "learning_rate": 9.856677352146404e-05, "loss": 0.2085, "step": 3042 }, { "epoch": 0.6133387064275639, "grad_norm": 0.10862796753644943, "learning_rate": 9.856360391244441e-05, "loss": 0.215, "step": 3044 }, { "epoch": 0.613741688494862, "grad_norm": 0.32057657837867737, "learning_rate": 9.856043085354402e-05, "loss": 0.2343, "step": 3046 }, { "epoch": 0.6141446705621599, "grad_norm": 0.07785500586032867, "learning_rate": 9.855725434498828e-05, "loss": 0.1888, "step": 3048 }, { "epoch": 0.614547652629458, "grad_norm": 0.047291629016399384, "learning_rate": 9.855407438700284e-05, "loss": 0.199, "step": 3050 }, { "epoch": 0.614950634696756, "grad_norm": 0.0839415118098259, "learning_rate": 9.855089097981362e-05, "loss": 0.2054, "step": 3052 }, { "epoch": 0.615353616764054, "grad_norm": 0.07940755039453506, "learning_rate": 9.854770412364676e-05, "loss": 0.2376, "step": 3054 }, { "epoch": 0.615756598831352, "grad_norm": 0.07128242403268814, "learning_rate": 9.854451381872862e-05, "loss": 0.2176, "step": 3056 }, { "epoch": 0.61615958089865, "grad_norm": 0.08210190385580063, "learning_rate": 9.854132006528586e-05, "loss": 0.2131, "step": 3058 }, { "epoch": 0.616562562965948, "grad_norm": 0.09783251583576202, "learning_rate": 9.853812286354536e-05, "loss": 0.2359, "step": 3060 }, { "epoch": 0.616965545033246, "grad_norm": 0.06867846846580505, "learning_rate": 9.853492221373421e-05, "loss": 0.1647, "step": 3062 }, { "epoch": 0.617368527100544, "grad_norm": 0.053430572152137756, "learning_rate": 9.853171811607983e-05, "loss": 0.1712, "step": 3064 }, { "epoch": 0.6177715091678421, "grad_norm": 0.06458867341279984, "learning_rate": 9.852851057080982e-05, "loss": 0.1941, "step": 3066 }, { "epoch": 0.61817449123514, "grad_norm": 0.052936289459466934, "learning_rate": 9.852529957815202e-05, "loss": 0.2173, "step": 3068 }, { "epoch": 0.6185774733024381, "grad_norm": 0.05379785597324371, "learning_rate": 9.852208513833454e-05, "loss": 0.1515, "step": 3070 }, { "epoch": 0.618980455369736, "grad_norm": 0.06087642163038254, "learning_rate": 9.851886725158573e-05, "loss": 0.1818, "step": 3072 }, { "epoch": 0.6193834374370341, "grad_norm": 0.09215617924928665, "learning_rate": 9.851564591813418e-05, "loss": 0.2171, "step": 3074 }, { "epoch": 0.6197864195043321, "grad_norm": 0.0784756988286972, "learning_rate": 9.851242113820873e-05, "loss": 0.1924, "step": 3076 }, { "epoch": 0.62018940157163, "grad_norm": 0.10722616314888, "learning_rate": 9.850919291203848e-05, "loss": 0.2037, "step": 3078 }, { "epoch": 0.6205923836389281, "grad_norm": 0.06373076885938644, "learning_rate": 9.850596123985274e-05, "loss": 0.1746, "step": 3080 }, { "epoch": 0.620995365706226, "grad_norm": 0.08903231471776962, "learning_rate": 9.850272612188109e-05, "loss": 0.2424, "step": 3082 }, { "epoch": 0.6213983477735241, "grad_norm": 0.05747120454907417, "learning_rate": 9.849948755835333e-05, "loss": 0.2722, "step": 3084 }, { "epoch": 0.621801329840822, "grad_norm": 0.05946004018187523, "learning_rate": 9.849624554949954e-05, "loss": 0.1631, "step": 3086 }, { "epoch": 0.6222043119081201, "grad_norm": 0.07288770377635956, "learning_rate": 9.849300009555005e-05, "loss": 0.2492, "step": 3088 }, { "epoch": 0.6226072939754181, "grad_norm": 0.056010086089372635, "learning_rate": 9.848975119673536e-05, "loss": 0.2085, "step": 3090 }, { "epoch": 0.6230102760427161, "grad_norm": 0.05710703134536743, "learning_rate": 9.848649885328631e-05, "loss": 0.1921, "step": 3092 }, { "epoch": 0.6234132581100141, "grad_norm": 0.07084643095731735, "learning_rate": 9.848324306543391e-05, "loss": 0.2246, "step": 3094 }, { "epoch": 0.6238162401773121, "grad_norm": 0.061563413590192795, "learning_rate": 9.847998383340947e-05, "loss": 0.1917, "step": 3096 }, { "epoch": 0.6242192222446101, "grad_norm": 0.044611282646656036, "learning_rate": 9.847672115744451e-05, "loss": 0.183, "step": 3098 }, { "epoch": 0.6246222043119081, "grad_norm": 0.06666868925094604, "learning_rate": 9.847345503777079e-05, "loss": 0.228, "step": 3100 }, { "epoch": 0.6250251863792061, "grad_norm": 0.08843137323856354, "learning_rate": 9.847018547462037e-05, "loss": 0.2709, "step": 3102 }, { "epoch": 0.6254281684465042, "grad_norm": 0.05686771869659424, "learning_rate": 9.846691246822548e-05, "loss": 0.1932, "step": 3104 }, { "epoch": 0.6258311505138021, "grad_norm": 0.0559655986726284, "learning_rate": 9.846363601881862e-05, "loss": 0.1875, "step": 3106 }, { "epoch": 0.6262341325811002, "grad_norm": 0.04997412487864494, "learning_rate": 9.846035612663261e-05, "loss": 0.1792, "step": 3108 }, { "epoch": 0.6266371146483981, "grad_norm": 0.07034078985452652, "learning_rate": 9.845707279190037e-05, "loss": 0.1944, "step": 3110 }, { "epoch": 0.6270400967156962, "grad_norm": 0.06097714975476265, "learning_rate": 9.845378601485517e-05, "loss": 0.1702, "step": 3112 }, { "epoch": 0.6274430787829941, "grad_norm": 0.055597711354494095, "learning_rate": 9.845049579573051e-05, "loss": 0.1629, "step": 3114 }, { "epoch": 0.6278460608502922, "grad_norm": 0.0588107630610466, "learning_rate": 9.844720213476012e-05, "loss": 0.1652, "step": 3116 }, { "epoch": 0.6282490429175902, "grad_norm": 0.05339299514889717, "learning_rate": 9.844390503217796e-05, "loss": 0.1681, "step": 3118 }, { "epoch": 0.6286520249848881, "grad_norm": 0.0750700905919075, "learning_rate": 9.844060448821827e-05, "loss": 0.1935, "step": 3120 }, { "epoch": 0.6290550070521862, "grad_norm": 0.061377957463264465, "learning_rate": 9.843730050311551e-05, "loss": 0.1798, "step": 3122 }, { "epoch": 0.6294579891194841, "grad_norm": 0.053399331867694855, "learning_rate": 9.843399307710437e-05, "loss": 0.1576, "step": 3124 }, { "epoch": 0.6298609711867822, "grad_norm": 0.08089054375886917, "learning_rate": 9.843068221041982e-05, "loss": 0.1891, "step": 3126 }, { "epoch": 0.6302639532540802, "grad_norm": 0.17065788805484772, "learning_rate": 9.842736790329707e-05, "loss": 0.2374, "step": 3128 }, { "epoch": 0.6306669353213782, "grad_norm": 0.071701280772686, "learning_rate": 9.842405015597156e-05, "loss": 0.1846, "step": 3130 }, { "epoch": 0.6310699173886762, "grad_norm": 0.06457066535949707, "learning_rate": 9.842072896867895e-05, "loss": 0.1606, "step": 3132 }, { "epoch": 0.6314728994559742, "grad_norm": 0.0577966645359993, "learning_rate": 9.84174043416552e-05, "loss": 0.1813, "step": 3134 }, { "epoch": 0.6318758815232722, "grad_norm": 0.12219101935625076, "learning_rate": 9.841407627513649e-05, "loss": 0.214, "step": 3136 }, { "epoch": 0.6322788635905702, "grad_norm": 0.16911907494068146, "learning_rate": 9.841074476935921e-05, "loss": 0.2144, "step": 3138 }, { "epoch": 0.6326818456578682, "grad_norm": 0.05600879341363907, "learning_rate": 9.840740982456005e-05, "loss": 0.1586, "step": 3140 }, { "epoch": 0.6330848277251663, "grad_norm": 0.05861913040280342, "learning_rate": 9.840407144097593e-05, "loss": 0.2022, "step": 3142 }, { "epoch": 0.6334878097924642, "grad_norm": 0.08657240867614746, "learning_rate": 9.840072961884396e-05, "loss": 0.2582, "step": 3144 }, { "epoch": 0.6338907918597623, "grad_norm": 0.064541295170784, "learning_rate": 9.839738435840157e-05, "loss": 0.1894, "step": 3146 }, { "epoch": 0.6342937739270602, "grad_norm": 0.10545245558023453, "learning_rate": 9.83940356598864e-05, "loss": 0.245, "step": 3148 }, { "epoch": 0.6346967559943583, "grad_norm": 0.05893385037779808, "learning_rate": 9.839068352353633e-05, "loss": 0.2393, "step": 3150 }, { "epoch": 0.6350997380616562, "grad_norm": 0.06363905221223831, "learning_rate": 9.838732794958949e-05, "loss": 0.204, "step": 3152 }, { "epoch": 0.6355027201289543, "grad_norm": 0.053424712270498276, "learning_rate": 9.838396893828426e-05, "loss": 0.1796, "step": 3154 }, { "epoch": 0.6359057021962523, "grad_norm": 0.04647869989275932, "learning_rate": 9.838060648985925e-05, "loss": 0.2063, "step": 3156 }, { "epoch": 0.6363086842635503, "grad_norm": 0.09858336299657822, "learning_rate": 9.837724060455333e-05, "loss": 0.2552, "step": 3158 }, { "epoch": 0.6367116663308483, "grad_norm": 0.06279218941926956, "learning_rate": 9.83738712826056e-05, "loss": 0.1997, "step": 3160 }, { "epoch": 0.6371146483981462, "grad_norm": 0.07511857897043228, "learning_rate": 9.837049852425544e-05, "loss": 0.1842, "step": 3162 }, { "epoch": 0.6375176304654443, "grad_norm": 0.0642283633351326, "learning_rate": 9.83671223297424e-05, "loss": 0.2447, "step": 3164 }, { "epoch": 0.6379206125327423, "grad_norm": 0.04851103946566582, "learning_rate": 9.836374269930635e-05, "loss": 0.1869, "step": 3166 }, { "epoch": 0.6383235946000403, "grad_norm": 0.05043719336390495, "learning_rate": 9.836035963318735e-05, "loss": 0.1988, "step": 3168 }, { "epoch": 0.6387265766673383, "grad_norm": 0.06849416345357895, "learning_rate": 9.835697313162577e-05, "loss": 0.1624, "step": 3170 }, { "epoch": 0.6391295587346363, "grad_norm": 0.049817949533462524, "learning_rate": 9.835358319486212e-05, "loss": 0.1497, "step": 3172 }, { "epoch": 0.6395325408019343, "grad_norm": 0.06779654324054718, "learning_rate": 9.835018982313729e-05, "loss": 0.2263, "step": 3174 }, { "epoch": 0.6399355228692323, "grad_norm": 0.05196770653128624, "learning_rate": 9.834679301669227e-05, "loss": 0.2165, "step": 3176 }, { "epoch": 0.6403385049365303, "grad_norm": 0.08842454105615616, "learning_rate": 9.83433927757684e-05, "loss": 0.2257, "step": 3178 }, { "epoch": 0.6407414870038284, "grad_norm": 0.05393417552113533, "learning_rate": 9.833998910060723e-05, "loss": 0.179, "step": 3180 }, { "epoch": 0.6411444690711263, "grad_norm": 0.06792977452278137, "learning_rate": 9.833658199145053e-05, "loss": 0.2053, "step": 3182 }, { "epoch": 0.6415474511384244, "grad_norm": 0.06019643694162369, "learning_rate": 9.833317144854035e-05, "loss": 0.1592, "step": 3184 }, { "epoch": 0.6419504332057223, "grad_norm": 0.06538268178701401, "learning_rate": 9.832975747211896e-05, "loss": 0.1791, "step": 3186 }, { "epoch": 0.6423534152730204, "grad_norm": 0.06463984400033951, "learning_rate": 9.832634006242891e-05, "loss": 0.2425, "step": 3188 }, { "epoch": 0.6427563973403183, "grad_norm": 0.050466809421777725, "learning_rate": 9.832291921971295e-05, "loss": 0.1764, "step": 3190 }, { "epoch": 0.6431593794076164, "grad_norm": 0.0687469020485878, "learning_rate": 9.831949494421409e-05, "loss": 0.2187, "step": 3192 }, { "epoch": 0.6435623614749144, "grad_norm": 0.06412825733423233, "learning_rate": 9.831606723617557e-05, "loss": 0.1452, "step": 3194 }, { "epoch": 0.6439653435422124, "grad_norm": 0.07165306806564331, "learning_rate": 9.831263609584091e-05, "loss": 0.1617, "step": 3196 }, { "epoch": 0.6443683256095104, "grad_norm": 0.050628188997507095, "learning_rate": 9.830920152345385e-05, "loss": 0.2079, "step": 3198 }, { "epoch": 0.6447713076768083, "grad_norm": 0.07396326214075089, "learning_rate": 9.830576351925836e-05, "loss": 0.2052, "step": 3200 }, { "epoch": 0.6451742897441064, "grad_norm": 0.05688002333045006, "learning_rate": 9.83023220834987e-05, "loss": 0.2432, "step": 3202 }, { "epoch": 0.6455772718114043, "grad_norm": 0.06849195063114166, "learning_rate": 9.829887721641931e-05, "loss": 0.2222, "step": 3204 }, { "epoch": 0.6459802538787024, "grad_norm": 0.08155640959739685, "learning_rate": 9.829542891826493e-05, "loss": 0.2307, "step": 3206 }, { "epoch": 0.6463832359460004, "grad_norm": 0.06775806099176407, "learning_rate": 9.829197718928053e-05, "loss": 0.2101, "step": 3208 }, { "epoch": 0.6467862180132984, "grad_norm": 0.05798272415995598, "learning_rate": 9.828852202971129e-05, "loss": 0.2196, "step": 3210 }, { "epoch": 0.6471892000805964, "grad_norm": 0.04944278672337532, "learning_rate": 9.828506343980269e-05, "loss": 0.2321, "step": 3212 }, { "epoch": 0.6475921821478944, "grad_norm": 0.09051971882581711, "learning_rate": 9.828160141980037e-05, "loss": 0.1878, "step": 3214 }, { "epoch": 0.6479951642151924, "grad_norm": 0.05372557416558266, "learning_rate": 9.827813596995033e-05, "loss": 0.1763, "step": 3216 }, { "epoch": 0.6483981462824905, "grad_norm": 0.05551968887448311, "learning_rate": 9.82746670904987e-05, "loss": 0.1705, "step": 3218 }, { "epoch": 0.6488011283497884, "grad_norm": 0.07168183475732803, "learning_rate": 9.827119478169194e-05, "loss": 0.2073, "step": 3220 }, { "epoch": 0.6492041104170865, "grad_norm": 0.05864137038588524, "learning_rate": 9.82677190437767e-05, "loss": 0.2129, "step": 3222 }, { "epoch": 0.6496070924843844, "grad_norm": 0.04913345351815224, "learning_rate": 9.826423987699988e-05, "loss": 0.1874, "step": 3224 }, { "epoch": 0.6500100745516825, "grad_norm": 0.07202889025211334, "learning_rate": 9.826075728160863e-05, "loss": 0.193, "step": 3226 }, { "epoch": 0.6504130566189804, "grad_norm": 0.06070972606539726, "learning_rate": 9.82572712578504e-05, "loss": 0.2112, "step": 3228 }, { "epoch": 0.6508160386862785, "grad_norm": 0.07425591349601746, "learning_rate": 9.825378180597278e-05, "loss": 0.1363, "step": 3230 }, { "epoch": 0.6512190207535765, "grad_norm": 0.05426356941461563, "learning_rate": 9.825028892622367e-05, "loss": 0.2429, "step": 3232 }, { "epoch": 0.6516220028208745, "grad_norm": 0.07585328072309494, "learning_rate": 9.824679261885122e-05, "loss": 0.185, "step": 3234 }, { "epoch": 0.6520249848881725, "grad_norm": 0.05416212975978851, "learning_rate": 9.824329288410376e-05, "loss": 0.2271, "step": 3236 }, { "epoch": 0.6524279669554705, "grad_norm": 0.07425907999277115, "learning_rate": 9.823978972222993e-05, "loss": 0.1591, "step": 3238 }, { "epoch": 0.6528309490227685, "grad_norm": 0.06274693459272385, "learning_rate": 9.823628313347859e-05, "loss": 0.2194, "step": 3240 }, { "epoch": 0.6532339310900664, "grad_norm": 0.10922153294086456, "learning_rate": 9.823277311809884e-05, "loss": 0.2269, "step": 3242 }, { "epoch": 0.6536369131573645, "grad_norm": 0.05456622317433357, "learning_rate": 9.822925967634003e-05, "loss": 0.2249, "step": 3244 }, { "epoch": 0.6540398952246625, "grad_norm": 0.07304324954748154, "learning_rate": 9.822574280845171e-05, "loss": 0.1911, "step": 3246 }, { "epoch": 0.6544428772919605, "grad_norm": 0.14694897830486298, "learning_rate": 9.822222251468378e-05, "loss": 0.2003, "step": 3248 }, { "epoch": 0.6548458593592585, "grad_norm": 0.07201841473579407, "learning_rate": 9.821869879528628e-05, "loss": 0.1586, "step": 3250 }, { "epoch": 0.6552488414265565, "grad_norm": 0.0460667610168457, "learning_rate": 9.821517165050953e-05, "loss": 0.1666, "step": 3252 }, { "epoch": 0.6556518234938545, "grad_norm": 0.07998193055391312, "learning_rate": 9.821164108060407e-05, "loss": 0.2349, "step": 3254 }, { "epoch": 0.6560548055611525, "grad_norm": 0.07132408767938614, "learning_rate": 9.820810708582077e-05, "loss": 0.1718, "step": 3256 }, { "epoch": 0.6564577876284505, "grad_norm": 0.06714514642953873, "learning_rate": 9.820456966641063e-05, "loss": 0.2025, "step": 3258 }, { "epoch": 0.6568607696957486, "grad_norm": 0.07556632906198502, "learning_rate": 9.820102882262494e-05, "loss": 0.2169, "step": 3260 }, { "epoch": 0.6572637517630465, "grad_norm": 0.05741098150610924, "learning_rate": 9.819748455471525e-05, "loss": 0.2155, "step": 3262 }, { "epoch": 0.6576667338303446, "grad_norm": 0.052905187010765076, "learning_rate": 9.819393686293334e-05, "loss": 0.1433, "step": 3264 }, { "epoch": 0.6580697158976425, "grad_norm": 0.08418918401002884, "learning_rate": 9.819038574753123e-05, "loss": 0.1992, "step": 3266 }, { "epoch": 0.6584726979649406, "grad_norm": 0.073238305747509, "learning_rate": 9.818683120876119e-05, "loss": 0.2118, "step": 3268 }, { "epoch": 0.6588756800322386, "grad_norm": 0.0671781674027443, "learning_rate": 9.818327324687572e-05, "loss": 0.186, "step": 3270 }, { "epoch": 0.6592786620995366, "grad_norm": 0.06390842795372009, "learning_rate": 9.817971186212758e-05, "loss": 0.2308, "step": 3272 }, { "epoch": 0.6596816441668346, "grad_norm": 0.04517597705125809, "learning_rate": 9.817614705476976e-05, "loss": 0.1754, "step": 3274 }, { "epoch": 0.6600846262341326, "grad_norm": 0.061243556439876556, "learning_rate": 9.81725788250555e-05, "loss": 0.2033, "step": 3276 }, { "epoch": 0.6604876083014306, "grad_norm": 0.05314614623785019, "learning_rate": 9.816900717323827e-05, "loss": 0.187, "step": 3278 }, { "epoch": 0.6608905903687285, "grad_norm": 0.05768498033285141, "learning_rate": 9.816543209957181e-05, "loss": 0.1838, "step": 3280 }, { "epoch": 0.6612935724360266, "grad_norm": 0.0638279840350151, "learning_rate": 9.816185360431009e-05, "loss": 0.1805, "step": 3282 }, { "epoch": 0.6616965545033247, "grad_norm": 0.053918592631816864, "learning_rate": 9.815827168770733e-05, "loss": 0.1993, "step": 3284 }, { "epoch": 0.6620995365706226, "grad_norm": 0.06501603126525879, "learning_rate": 9.815468635001794e-05, "loss": 0.1783, "step": 3286 }, { "epoch": 0.6625025186379206, "grad_norm": 0.055193666368722916, "learning_rate": 9.815109759149665e-05, "loss": 0.2104, "step": 3288 }, { "epoch": 0.6629055007052186, "grad_norm": 0.06583480536937714, "learning_rate": 9.814750541239838e-05, "loss": 0.2385, "step": 3290 }, { "epoch": 0.6633084827725166, "grad_norm": 0.07717015594244003, "learning_rate": 9.814390981297836e-05, "loss": 0.1724, "step": 3292 }, { "epoch": 0.6637114648398146, "grad_norm": 0.04836519435048103, "learning_rate": 9.814031079349197e-05, "loss": 0.1589, "step": 3294 }, { "epoch": 0.6641144469071126, "grad_norm": 0.06062225624918938, "learning_rate": 9.813670835419488e-05, "loss": 0.197, "step": 3296 }, { "epoch": 0.6645174289744107, "grad_norm": 0.04774460569024086, "learning_rate": 9.813310249534301e-05, "loss": 0.2113, "step": 3298 }, { "epoch": 0.6649204110417086, "grad_norm": 0.060781124979257584, "learning_rate": 9.812949321719252e-05, "loss": 0.2177, "step": 3300 }, { "epoch": 0.6653233931090067, "grad_norm": 0.053476136177778244, "learning_rate": 9.812588051999981e-05, "loss": 0.195, "step": 3302 }, { "epoch": 0.6657263751763046, "grad_norm": 0.07423517107963562, "learning_rate": 9.81222644040215e-05, "loss": 0.1965, "step": 3304 }, { "epoch": 0.6661293572436027, "grad_norm": 0.0641479343175888, "learning_rate": 9.81186448695145e-05, "loss": 0.191, "step": 3306 }, { "epoch": 0.6665323393109006, "grad_norm": 0.29882076382637024, "learning_rate": 9.811502191673591e-05, "loss": 0.2615, "step": 3308 }, { "epoch": 0.6669353213781987, "grad_norm": 0.05279373377561569, "learning_rate": 9.811139554594314e-05, "loss": 0.2141, "step": 3310 }, { "epoch": 0.6673383034454967, "grad_norm": 0.053844355046749115, "learning_rate": 9.810776575739375e-05, "loss": 0.1597, "step": 3312 }, { "epoch": 0.6677412855127947, "grad_norm": 0.05351219326257706, "learning_rate": 9.810413255134561e-05, "loss": 0.1949, "step": 3314 }, { "epoch": 0.6681442675800927, "grad_norm": 0.06659112870693207, "learning_rate": 9.810049592805684e-05, "loss": 0.1775, "step": 3316 }, { "epoch": 0.6685472496473907, "grad_norm": 0.07621248066425323, "learning_rate": 9.809685588778577e-05, "loss": 0.2378, "step": 3318 }, { "epoch": 0.6689502317146887, "grad_norm": 0.07413389533758163, "learning_rate": 9.809321243079096e-05, "loss": 0.2343, "step": 3320 }, { "epoch": 0.6693532137819868, "grad_norm": 0.053018804639577866, "learning_rate": 9.808956555733126e-05, "loss": 0.1307, "step": 3322 }, { "epoch": 0.6697561958492847, "grad_norm": 0.09446662664413452, "learning_rate": 9.808591526766573e-05, "loss": 0.2026, "step": 3324 }, { "epoch": 0.6701591779165827, "grad_norm": 0.19013406336307526, "learning_rate": 9.808226156205369e-05, "loss": 0.203, "step": 3326 }, { "epoch": 0.6705621599838807, "grad_norm": 0.06566434353590012, "learning_rate": 9.807860444075467e-05, "loss": 0.1664, "step": 3328 }, { "epoch": 0.6709651420511787, "grad_norm": 0.06202316656708717, "learning_rate": 9.807494390402849e-05, "loss": 0.1753, "step": 3330 }, { "epoch": 0.6713681241184767, "grad_norm": 0.06079009175300598, "learning_rate": 9.807127995213518e-05, "loss": 0.224, "step": 3332 }, { "epoch": 0.6717711061857747, "grad_norm": 0.056815944612026215, "learning_rate": 9.8067612585335e-05, "loss": 0.2313, "step": 3334 }, { "epoch": 0.6721740882530728, "grad_norm": 0.06725732982158661, "learning_rate": 9.806394180388854e-05, "loss": 0.1753, "step": 3336 }, { "epoch": 0.6725770703203707, "grad_norm": 0.06362208724021912, "learning_rate": 9.806026760805649e-05, "loss": 0.1315, "step": 3338 }, { "epoch": 0.6729800523876688, "grad_norm": 0.062081463634967804, "learning_rate": 9.80565899980999e-05, "loss": 0.2372, "step": 3340 }, { "epoch": 0.6733830344549667, "grad_norm": 0.06947702914476395, "learning_rate": 9.805290897428e-05, "loss": 0.153, "step": 3342 }, { "epoch": 0.6737860165222648, "grad_norm": 0.06031232699751854, "learning_rate": 9.80492245368583e-05, "loss": 0.1669, "step": 3344 }, { "epoch": 0.6741889985895627, "grad_norm": 0.07842449098825455, "learning_rate": 9.804553668609654e-05, "loss": 0.209, "step": 3346 }, { "epoch": 0.6745919806568608, "grad_norm": 0.10207051783800125, "learning_rate": 9.804184542225669e-05, "loss": 0.2374, "step": 3348 }, { "epoch": 0.6749949627241588, "grad_norm": 0.06684952229261398, "learning_rate": 9.803815074560096e-05, "loss": 0.2037, "step": 3350 }, { "epoch": 0.6753979447914568, "grad_norm": 0.049116067588329315, "learning_rate": 9.803445265639184e-05, "loss": 0.1943, "step": 3352 }, { "epoch": 0.6758009268587548, "grad_norm": 0.07274952530860901, "learning_rate": 9.803075115489203e-05, "loss": 0.2178, "step": 3354 }, { "epoch": 0.6762039089260528, "grad_norm": 0.06149638071656227, "learning_rate": 9.802704624136444e-05, "loss": 0.2173, "step": 3356 }, { "epoch": 0.6766068909933508, "grad_norm": 0.08942967653274536, "learning_rate": 9.802333791607233e-05, "loss": 0.2088, "step": 3358 }, { "epoch": 0.6770098730606487, "grad_norm": 0.08084887266159058, "learning_rate": 9.801962617927907e-05, "loss": 0.2187, "step": 3360 }, { "epoch": 0.6774128551279468, "grad_norm": 0.06932730227708817, "learning_rate": 9.801591103124837e-05, "loss": 0.2379, "step": 3362 }, { "epoch": 0.6778158371952449, "grad_norm": 0.06372372061014175, "learning_rate": 9.801219247224415e-05, "loss": 0.1857, "step": 3364 }, { "epoch": 0.6782188192625428, "grad_norm": 0.06282777339220047, "learning_rate": 9.800847050253055e-05, "loss": 0.2028, "step": 3366 }, { "epoch": 0.6786218013298408, "grad_norm": 0.05891014263033867, "learning_rate": 9.800474512237199e-05, "loss": 0.2422, "step": 3368 }, { "epoch": 0.6790247833971388, "grad_norm": 0.057885557413101196, "learning_rate": 9.80010163320331e-05, "loss": 0.2128, "step": 3370 }, { "epoch": 0.6794277654644368, "grad_norm": 0.0572814866900444, "learning_rate": 9.799728413177878e-05, "loss": 0.2079, "step": 3372 }, { "epoch": 0.6798307475317349, "grad_norm": 0.04849035665392876, "learning_rate": 9.799354852187417e-05, "loss": 0.1852, "step": 3374 }, { "epoch": 0.6802337295990328, "grad_norm": 0.04705261439085007, "learning_rate": 9.79898095025846e-05, "loss": 0.1822, "step": 3376 }, { "epoch": 0.6806367116663309, "grad_norm": 0.0639222264289856, "learning_rate": 9.798606707417573e-05, "loss": 0.2139, "step": 3378 }, { "epoch": 0.6810396937336288, "grad_norm": 0.08927220851182938, "learning_rate": 9.79823212369134e-05, "loss": 0.2152, "step": 3380 }, { "epoch": 0.6814426758009269, "grad_norm": 0.0470849983394146, "learning_rate": 9.797857199106369e-05, "loss": 0.2052, "step": 3382 }, { "epoch": 0.6818456578682248, "grad_norm": 0.05196920037269592, "learning_rate": 9.797481933689296e-05, "loss": 0.161, "step": 3384 }, { "epoch": 0.6822486399355229, "grad_norm": 0.07091446220874786, "learning_rate": 9.79710632746678e-05, "loss": 0.1761, "step": 3386 }, { "epoch": 0.6826516220028209, "grad_norm": 0.05899330973625183, "learning_rate": 9.796730380465502e-05, "loss": 0.1786, "step": 3388 }, { "epoch": 0.6830546040701189, "grad_norm": 0.12023943662643433, "learning_rate": 9.796354092712168e-05, "loss": 0.2054, "step": 3390 }, { "epoch": 0.6834575861374169, "grad_norm": 0.05540652573108673, "learning_rate": 9.795977464233513e-05, "loss": 0.1603, "step": 3392 }, { "epoch": 0.6838605682047149, "grad_norm": 0.059661611914634705, "learning_rate": 9.795600495056285e-05, "loss": 0.1768, "step": 3394 }, { "epoch": 0.6842635502720129, "grad_norm": 0.07932312786579132, "learning_rate": 9.79522318520727e-05, "loss": 0.2197, "step": 3396 }, { "epoch": 0.6846665323393109, "grad_norm": 0.06426575034856796, "learning_rate": 9.794845534713266e-05, "loss": 0.2275, "step": 3398 }, { "epoch": 0.6850695144066089, "grad_norm": 0.041860274970531464, "learning_rate": 9.794467543601106e-05, "loss": 0.1523, "step": 3400 }, { "epoch": 0.685472496473907, "grad_norm": 0.05880004167556763, "learning_rate": 9.794089211897638e-05, "loss": 0.196, "step": 3402 }, { "epoch": 0.6858754785412049, "grad_norm": 0.05922669917345047, "learning_rate": 9.79371053962974e-05, "loss": 0.2885, "step": 3404 }, { "epoch": 0.686278460608503, "grad_norm": 0.05087222531437874, "learning_rate": 9.793331526824312e-05, "loss": 0.128, "step": 3406 }, { "epoch": 0.6866814426758009, "grad_norm": 0.07046099007129669, "learning_rate": 9.792952173508277e-05, "loss": 0.2136, "step": 3408 }, { "epoch": 0.6870844247430989, "grad_norm": 0.05308796837925911, "learning_rate": 9.792572479708586e-05, "loss": 0.2458, "step": 3410 }, { "epoch": 0.6874874068103969, "grad_norm": 0.05528232827782631, "learning_rate": 9.79219244545221e-05, "loss": 0.1772, "step": 3412 }, { "epoch": 0.6878903888776949, "grad_norm": 0.04996131733059883, "learning_rate": 9.791812070766147e-05, "loss": 0.1741, "step": 3414 }, { "epoch": 0.688293370944993, "grad_norm": 0.060730304569005966, "learning_rate": 9.791431355677416e-05, "loss": 0.1764, "step": 3416 }, { "epoch": 0.6886963530122909, "grad_norm": 0.049104318022727966, "learning_rate": 9.791050300213066e-05, "loss": 0.1766, "step": 3418 }, { "epoch": 0.689099335079589, "grad_norm": 0.043497003614902496, "learning_rate": 9.790668904400165e-05, "loss": 0.2286, "step": 3420 }, { "epoch": 0.6895023171468869, "grad_norm": 0.054365627467632294, "learning_rate": 9.790287168265806e-05, "loss": 0.1956, "step": 3422 }, { "epoch": 0.689905299214185, "grad_norm": 0.07255329191684723, "learning_rate": 9.789905091837109e-05, "loss": 0.1929, "step": 3424 }, { "epoch": 0.690308281281483, "grad_norm": 0.038361676037311554, "learning_rate": 9.789522675141212e-05, "loss": 0.18, "step": 3426 }, { "epoch": 0.690711263348781, "grad_norm": 0.0500890351831913, "learning_rate": 9.789139918205285e-05, "loss": 0.1705, "step": 3428 }, { "epoch": 0.691114245416079, "grad_norm": 0.054220765829086304, "learning_rate": 9.788756821056517e-05, "loss": 0.256, "step": 3430 }, { "epoch": 0.691517227483377, "grad_norm": 0.05716811493039131, "learning_rate": 9.788373383722125e-05, "loss": 0.1494, "step": 3432 }, { "epoch": 0.691920209550675, "grad_norm": 0.057214152067899704, "learning_rate": 9.787989606229343e-05, "loss": 0.226, "step": 3434 }, { "epoch": 0.692323191617973, "grad_norm": 0.05963205546140671, "learning_rate": 9.787605488605438e-05, "loss": 0.1787, "step": 3436 }, { "epoch": 0.692726173685271, "grad_norm": 0.06281402707099915, "learning_rate": 9.787221030877696e-05, "loss": 0.1814, "step": 3438 }, { "epoch": 0.6931291557525691, "grad_norm": 0.044817693531513214, "learning_rate": 9.786836233073427e-05, "loss": 0.2141, "step": 3440 }, { "epoch": 0.693532137819867, "grad_norm": 0.07125949114561081, "learning_rate": 9.786451095219967e-05, "loss": 0.2133, "step": 3442 }, { "epoch": 0.693935119887165, "grad_norm": 0.06045832857489586, "learning_rate": 9.786065617344677e-05, "loss": 0.1693, "step": 3444 }, { "epoch": 0.694338101954463, "grad_norm": 0.05925006419420242, "learning_rate": 9.78567979947494e-05, "loss": 0.1897, "step": 3446 }, { "epoch": 0.694741084021761, "grad_norm": 0.047181349247694016, "learning_rate": 9.785293641638162e-05, "loss": 0.1675, "step": 3448 }, { "epoch": 0.695144066089059, "grad_norm": 0.045355185866355896, "learning_rate": 9.784907143861779e-05, "loss": 0.2318, "step": 3450 }, { "epoch": 0.695547048156357, "grad_norm": 0.06112281605601311, "learning_rate": 9.784520306173244e-05, "loss": 0.1841, "step": 3452 }, { "epoch": 0.6959500302236551, "grad_norm": 0.06289122998714447, "learning_rate": 9.784133128600037e-05, "loss": 0.1489, "step": 3454 }, { "epoch": 0.696353012290953, "grad_norm": 0.06532011181116104, "learning_rate": 9.783745611169665e-05, "loss": 0.1487, "step": 3456 }, { "epoch": 0.6967559943582511, "grad_norm": 0.055435534566640854, "learning_rate": 9.783357753909654e-05, "loss": 0.1732, "step": 3458 }, { "epoch": 0.697158976425549, "grad_norm": 0.06668412685394287, "learning_rate": 9.78296955684756e-05, "loss": 0.2097, "step": 3460 }, { "epoch": 0.6975619584928471, "grad_norm": 0.05773022770881653, "learning_rate": 9.782581020010956e-05, "loss": 0.2037, "step": 3462 }, { "epoch": 0.697964940560145, "grad_norm": 0.06257740408182144, "learning_rate": 9.782192143427446e-05, "loss": 0.2101, "step": 3464 }, { "epoch": 0.6983679226274431, "grad_norm": 0.07159219682216644, "learning_rate": 9.781802927124652e-05, "loss": 0.2461, "step": 3466 }, { "epoch": 0.6987709046947411, "grad_norm": 0.04788602888584137, "learning_rate": 9.781413371130228e-05, "loss": 0.1685, "step": 3468 }, { "epoch": 0.6991738867620391, "grad_norm": 0.05469825863838196, "learning_rate": 9.781023475471845e-05, "loss": 0.2121, "step": 3470 }, { "epoch": 0.6995768688293371, "grad_norm": 0.042920369654893875, "learning_rate": 9.780633240177198e-05, "loss": 0.2039, "step": 3472 }, { "epoch": 0.6999798508966351, "grad_norm": 0.043295204639434814, "learning_rate": 9.780242665274013e-05, "loss": 0.1937, "step": 3474 }, { "epoch": 0.7003828329639331, "grad_norm": 0.042053669691085815, "learning_rate": 9.779851750790033e-05, "loss": 0.182, "step": 3476 }, { "epoch": 0.7007858150312312, "grad_norm": 0.050461430102586746, "learning_rate": 9.77946049675303e-05, "loss": 0.1847, "step": 3478 }, { "epoch": 0.7011887970985291, "grad_norm": 0.05842139571905136, "learning_rate": 9.779068903190796e-05, "loss": 0.2068, "step": 3480 }, { "epoch": 0.7015917791658272, "grad_norm": 0.055488720536231995, "learning_rate": 9.77867697013115e-05, "loss": 0.2043, "step": 3482 }, { "epoch": 0.7019947612331251, "grad_norm": 0.05179367586970329, "learning_rate": 9.778284697601934e-05, "loss": 0.1916, "step": 3484 }, { "epoch": 0.7023977433004231, "grad_norm": 0.056457649916410446, "learning_rate": 9.777892085631016e-05, "loss": 0.2545, "step": 3486 }, { "epoch": 0.7028007253677211, "grad_norm": 0.05595328286290169, "learning_rate": 9.777499134246285e-05, "loss": 0.1836, "step": 3488 }, { "epoch": 0.7032037074350191, "grad_norm": 0.04369045048952103, "learning_rate": 9.777105843475655e-05, "loss": 0.1407, "step": 3490 }, { "epoch": 0.7036066895023172, "grad_norm": 0.05445627123117447, "learning_rate": 9.776712213347068e-05, "loss": 0.2406, "step": 3492 }, { "epoch": 0.7040096715696151, "grad_norm": 0.0552406869828701, "learning_rate": 9.776318243888482e-05, "loss": 0.1942, "step": 3494 }, { "epoch": 0.7044126536369132, "grad_norm": 0.04776231199502945, "learning_rate": 9.775923935127889e-05, "loss": 0.2204, "step": 3496 }, { "epoch": 0.7048156357042111, "grad_norm": 0.04784173145890236, "learning_rate": 9.775529287093296e-05, "loss": 0.243, "step": 3498 }, { "epoch": 0.7052186177715092, "grad_norm": 0.04918511211872101, "learning_rate": 9.77513429981274e-05, "loss": 0.18, "step": 3500 }, { "epoch": 0.7056215998388071, "grad_norm": 0.062354523688554764, "learning_rate": 9.774738973314281e-05, "loss": 0.2023, "step": 3502 }, { "epoch": 0.7060245819061052, "grad_norm": 0.05601181089878082, "learning_rate": 9.774343307626e-05, "loss": 0.2051, "step": 3504 }, { "epoch": 0.7064275639734032, "grad_norm": 0.07394832372665405, "learning_rate": 9.773947302776006e-05, "loss": 0.1896, "step": 3506 }, { "epoch": 0.7068305460407012, "grad_norm": 0.05391557887196541, "learning_rate": 9.77355095879243e-05, "loss": 0.2055, "step": 3508 }, { "epoch": 0.7072335281079992, "grad_norm": 0.06169082224369049, "learning_rate": 9.77315427570343e-05, "loss": 0.2209, "step": 3510 }, { "epoch": 0.7076365101752972, "grad_norm": 0.050488028675317764, "learning_rate": 9.772757253537184e-05, "loss": 0.1484, "step": 3512 }, { "epoch": 0.7080394922425952, "grad_norm": 0.04180103540420532, "learning_rate": 9.772359892321893e-05, "loss": 0.1583, "step": 3514 }, { "epoch": 0.7084424743098932, "grad_norm": 0.059814367443323135, "learning_rate": 9.771962192085789e-05, "loss": 0.1898, "step": 3516 }, { "epoch": 0.7088454563771912, "grad_norm": 0.06913917511701584, "learning_rate": 9.771564152857123e-05, "loss": 0.1652, "step": 3518 }, { "epoch": 0.7092484384444893, "grad_norm": 0.060908909887075424, "learning_rate": 9.77116577466417e-05, "loss": 0.2386, "step": 3520 }, { "epoch": 0.7096514205117872, "grad_norm": 0.03974832966923714, "learning_rate": 9.77076705753523e-05, "loss": 0.1894, "step": 3522 }, { "epoch": 0.7100544025790853, "grad_norm": 0.04386703670024872, "learning_rate": 9.770368001498629e-05, "loss": 0.201, "step": 3524 }, { "epoch": 0.7104573846463832, "grad_norm": 0.04671144112944603, "learning_rate": 9.769968606582713e-05, "loss": 0.1959, "step": 3526 }, { "epoch": 0.7108603667136812, "grad_norm": 0.044962078332901, "learning_rate": 9.769568872815856e-05, "loss": 0.2122, "step": 3528 }, { "epoch": 0.7112633487809793, "grad_norm": 0.046789754182100296, "learning_rate": 9.769168800226454e-05, "loss": 0.1655, "step": 3530 }, { "epoch": 0.7116663308482772, "grad_norm": 0.04514605551958084, "learning_rate": 9.768768388842929e-05, "loss": 0.2152, "step": 3532 }, { "epoch": 0.7120693129155753, "grad_norm": 0.044290874153375626, "learning_rate": 9.768367638693723e-05, "loss": 0.1686, "step": 3534 }, { "epoch": 0.7124722949828732, "grad_norm": 0.05953364819288254, "learning_rate": 9.767966549807306e-05, "loss": 0.1464, "step": 3536 }, { "epoch": 0.7128752770501713, "grad_norm": 0.04944094642996788, "learning_rate": 9.767565122212171e-05, "loss": 0.1621, "step": 3538 }, { "epoch": 0.7132782591174692, "grad_norm": 0.07431504875421524, "learning_rate": 9.767163355936835e-05, "loss": 0.1801, "step": 3540 }, { "epoch": 0.7136812411847673, "grad_norm": 0.06648550927639008, "learning_rate": 9.766761251009836e-05, "loss": 0.202, "step": 3542 }, { "epoch": 0.7140842232520653, "grad_norm": 0.06690148264169693, "learning_rate": 9.766358807459742e-05, "loss": 0.1898, "step": 3544 }, { "epoch": 0.7144872053193633, "grad_norm": 0.06820599734783173, "learning_rate": 9.765956025315142e-05, "loss": 0.2158, "step": 3546 }, { "epoch": 0.7148901873866613, "grad_norm": 0.05998406931757927, "learning_rate": 9.765552904604647e-05, "loss": 0.2057, "step": 3548 }, { "epoch": 0.7152931694539593, "grad_norm": 0.08947255462408066, "learning_rate": 9.765149445356894e-05, "loss": 0.2408, "step": 3550 }, { "epoch": 0.7156961515212573, "grad_norm": 0.0403655469417572, "learning_rate": 9.764745647600545e-05, "loss": 0.1495, "step": 3552 }, { "epoch": 0.7160991335885553, "grad_norm": 0.04842576012015343, "learning_rate": 9.764341511364288e-05, "loss": 0.1605, "step": 3554 }, { "epoch": 0.7165021156558533, "grad_norm": 0.06308891624212265, "learning_rate": 9.763937036676829e-05, "loss": 0.2036, "step": 3556 }, { "epoch": 0.7169050977231514, "grad_norm": 0.05665654316544533, "learning_rate": 9.7635322235669e-05, "loss": 0.2227, "step": 3558 }, { "epoch": 0.7173080797904493, "grad_norm": 0.05124920234084129, "learning_rate": 9.763127072063261e-05, "loss": 0.1838, "step": 3560 }, { "epoch": 0.7177110618577474, "grad_norm": 0.05555025488138199, "learning_rate": 9.762721582194692e-05, "loss": 0.2314, "step": 3562 }, { "epoch": 0.7181140439250453, "grad_norm": 0.06558901816606522, "learning_rate": 9.762315753989999e-05, "loss": 0.1591, "step": 3564 }, { "epoch": 0.7185170259923434, "grad_norm": 0.08271800726652145, "learning_rate": 9.76190958747801e-05, "loss": 0.2709, "step": 3566 }, { "epoch": 0.7189200080596413, "grad_norm": 0.05561373382806778, "learning_rate": 9.76150308268758e-05, "loss": 0.1889, "step": 3568 }, { "epoch": 0.7193229901269393, "grad_norm": 0.05947286635637283, "learning_rate": 9.761096239647588e-05, "loss": 0.2221, "step": 3570 }, { "epoch": 0.7197259721942374, "grad_norm": 0.05997093394398689, "learning_rate": 9.760689058386929e-05, "loss": 0.2229, "step": 3572 }, { "epoch": 0.7201289542615353, "grad_norm": 0.06581594049930573, "learning_rate": 9.760281538934536e-05, "loss": 0.2132, "step": 3574 }, { "epoch": 0.7205319363288334, "grad_norm": 0.05122917890548706, "learning_rate": 9.759873681319354e-05, "loss": 0.2357, "step": 3576 }, { "epoch": 0.7209349183961313, "grad_norm": 0.0549938790500164, "learning_rate": 9.75946548557036e-05, "loss": 0.2176, "step": 3578 }, { "epoch": 0.7213379004634294, "grad_norm": 0.038299329578876495, "learning_rate": 9.759056951716548e-05, "loss": 0.2005, "step": 3580 }, { "epoch": 0.7217408825307274, "grad_norm": 0.04204018786549568, "learning_rate": 9.758648079786941e-05, "loss": 0.1957, "step": 3582 }, { "epoch": 0.7221438645980254, "grad_norm": 0.0423753559589386, "learning_rate": 9.758238869810585e-05, "loss": 0.2046, "step": 3584 }, { "epoch": 0.7225468466653234, "grad_norm": 0.04697240889072418, "learning_rate": 9.75782932181655e-05, "loss": 0.1985, "step": 3586 }, { "epoch": 0.7229498287326214, "grad_norm": 0.054548196494579315, "learning_rate": 9.757419435833928e-05, "loss": 0.1823, "step": 3588 }, { "epoch": 0.7233528107999194, "grad_norm": 0.040596771985292435, "learning_rate": 9.757009211891839e-05, "loss": 0.184, "step": 3590 }, { "epoch": 0.7237557928672174, "grad_norm": 0.04304562136530876, "learning_rate": 9.756598650019421e-05, "loss": 0.2115, "step": 3592 }, { "epoch": 0.7241587749345154, "grad_norm": 0.04047536477446556, "learning_rate": 9.756187750245844e-05, "loss": 0.2026, "step": 3594 }, { "epoch": 0.7245617570018135, "grad_norm": 0.050050195306539536, "learning_rate": 9.755776512600295e-05, "loss": 0.2037, "step": 3596 }, { "epoch": 0.7249647390691114, "grad_norm": 0.063184455037117, "learning_rate": 9.755364937111988e-05, "loss": 0.1819, "step": 3598 }, { "epoch": 0.7253677211364095, "grad_norm": 0.05444857105612755, "learning_rate": 9.754953023810162e-05, "loss": 0.1766, "step": 3600 }, { "epoch": 0.7257707032037074, "grad_norm": 0.049810487776994705, "learning_rate": 9.754540772724077e-05, "loss": 0.1551, "step": 3602 }, { "epoch": 0.7261736852710055, "grad_norm": 0.060143712908029556, "learning_rate": 9.754128183883018e-05, "loss": 0.2045, "step": 3604 }, { "epoch": 0.7265766673383034, "grad_norm": 0.055027224123477936, "learning_rate": 9.753715257316298e-05, "loss": 0.1711, "step": 3606 }, { "epoch": 0.7269796494056014, "grad_norm": 0.054849088191986084, "learning_rate": 9.753301993053247e-05, "loss": 0.2089, "step": 3608 }, { "epoch": 0.7273826314728995, "grad_norm": 0.06147737428545952, "learning_rate": 9.752888391123224e-05, "loss": 0.2434, "step": 3610 }, { "epoch": 0.7277856135401974, "grad_norm": 0.0619664303958416, "learning_rate": 9.752474451555614e-05, "loss": 0.2034, "step": 3612 }, { "epoch": 0.7281885956074955, "grad_norm": 0.053098902106285095, "learning_rate": 9.752060174379816e-05, "loss": 0.2415, "step": 3614 }, { "epoch": 0.7285915776747934, "grad_norm": 0.05426261946558952, "learning_rate": 9.751645559625264e-05, "loss": 0.2463, "step": 3616 }, { "epoch": 0.7289945597420915, "grad_norm": 0.07284572720527649, "learning_rate": 9.751230607321411e-05, "loss": 0.2031, "step": 3618 }, { "epoch": 0.7293975418093894, "grad_norm": 0.05113440379500389, "learning_rate": 9.750815317497733e-05, "loss": 0.1986, "step": 3620 }, { "epoch": 0.7298005238766875, "grad_norm": 0.07082262635231018, "learning_rate": 9.750399690183733e-05, "loss": 0.1869, "step": 3622 }, { "epoch": 0.7302035059439855, "grad_norm": 0.04850250855088234, "learning_rate": 9.749983725408938e-05, "loss": 0.1853, "step": 3624 }, { "epoch": 0.7306064880112835, "grad_norm": 0.062024109065532684, "learning_rate": 9.749567423202893e-05, "loss": 0.203, "step": 3626 }, { "epoch": 0.7310094700785815, "grad_norm": 0.062060195952653885, "learning_rate": 9.749150783595176e-05, "loss": 0.2347, "step": 3628 }, { "epoch": 0.7314124521458795, "grad_norm": 0.04899689182639122, "learning_rate": 9.748733806615382e-05, "loss": 0.2279, "step": 3630 }, { "epoch": 0.7318154342131775, "grad_norm": 0.05050105229020119, "learning_rate": 9.748316492293132e-05, "loss": 0.2057, "step": 3632 }, { "epoch": 0.7322184162804756, "grad_norm": 0.05160733684897423, "learning_rate": 9.747898840658072e-05, "loss": 0.2122, "step": 3634 }, { "epoch": 0.7326213983477735, "grad_norm": 0.051758818328380585, "learning_rate": 9.747480851739872e-05, "loss": 0.2239, "step": 3636 }, { "epoch": 0.7330243804150716, "grad_norm": 0.07806959003210068, "learning_rate": 9.747062525568226e-05, "loss": 0.2139, "step": 3638 }, { "epoch": 0.7334273624823695, "grad_norm": 0.03898259997367859, "learning_rate": 9.746643862172849e-05, "loss": 0.2334, "step": 3640 }, { "epoch": 0.7338303445496676, "grad_norm": 0.048747796565294266, "learning_rate": 9.746224861583484e-05, "loss": 0.2342, "step": 3642 }, { "epoch": 0.7342333266169655, "grad_norm": 0.04277431219816208, "learning_rate": 9.745805523829893e-05, "loss": 0.1758, "step": 3644 }, { "epoch": 0.7346363086842636, "grad_norm": 0.05254721641540527, "learning_rate": 9.74538584894187e-05, "loss": 0.2353, "step": 3646 }, { "epoch": 0.7350392907515616, "grad_norm": 0.04728619009256363, "learning_rate": 9.744965836949225e-05, "loss": 0.193, "step": 3648 }, { "epoch": 0.7354422728188595, "grad_norm": 0.04738273471593857, "learning_rate": 9.744545487881793e-05, "loss": 0.1914, "step": 3650 }, { "epoch": 0.7358452548861576, "grad_norm": 0.047282394021749496, "learning_rate": 9.74412480176944e-05, "loss": 0.2354, "step": 3652 }, { "epoch": 0.7362482369534555, "grad_norm": 0.06668908894062042, "learning_rate": 9.743703778642047e-05, "loss": 0.2104, "step": 3654 }, { "epoch": 0.7366512190207536, "grad_norm": 0.042231086641550064, "learning_rate": 9.743282418529525e-05, "loss": 0.1355, "step": 3656 }, { "epoch": 0.7370542010880515, "grad_norm": 0.06251658499240875, "learning_rate": 9.742860721461804e-05, "loss": 0.22, "step": 3658 }, { "epoch": 0.7374571831553496, "grad_norm": 0.040848251432180405, "learning_rate": 9.742438687468843e-05, "loss": 0.1877, "step": 3660 }, { "epoch": 0.7378601652226476, "grad_norm": 0.07239284366369247, "learning_rate": 9.742016316580622e-05, "loss": 0.2361, "step": 3662 }, { "epoch": 0.7382631472899456, "grad_norm": 0.048893146216869354, "learning_rate": 9.741593608827146e-05, "loss": 0.232, "step": 3664 }, { "epoch": 0.7386661293572436, "grad_norm": 0.04481977969408035, "learning_rate": 9.741170564238444e-05, "loss": 0.181, "step": 3666 }, { "epoch": 0.7390691114245416, "grad_norm": 0.04950220510363579, "learning_rate": 9.740747182844567e-05, "loss": 0.2143, "step": 3668 }, { "epoch": 0.7394720934918396, "grad_norm": 0.04517577216029167, "learning_rate": 9.740323464675591e-05, "loss": 0.2452, "step": 3670 }, { "epoch": 0.7398750755591377, "grad_norm": 0.04096828028559685, "learning_rate": 9.739899409761617e-05, "loss": 0.2507, "step": 3672 }, { "epoch": 0.7402780576264356, "grad_norm": 0.04736701026558876, "learning_rate": 9.739475018132771e-05, "loss": 0.1706, "step": 3674 }, { "epoch": 0.7406810396937337, "grad_norm": 0.06137583777308464, "learning_rate": 9.739050289819198e-05, "loss": 0.1851, "step": 3676 }, { "epoch": 0.7410840217610316, "grad_norm": 0.05057776719331741, "learning_rate": 9.738625224851071e-05, "loss": 0.1475, "step": 3678 }, { "epoch": 0.7414870038283297, "grad_norm": 0.058802492916584015, "learning_rate": 9.738199823258587e-05, "loss": 0.1984, "step": 3680 }, { "epoch": 0.7418899858956276, "grad_norm": 0.0801457017660141, "learning_rate": 9.737774085071965e-05, "loss": 0.198, "step": 3682 }, { "epoch": 0.7422929679629257, "grad_norm": 0.04764263331890106, "learning_rate": 9.73734801032145e-05, "loss": 0.2157, "step": 3684 }, { "epoch": 0.7426959500302237, "grad_norm": 0.04858770966529846, "learning_rate": 9.736921599037307e-05, "loss": 0.1898, "step": 3686 }, { "epoch": 0.7430989320975216, "grad_norm": 0.06790611147880554, "learning_rate": 9.73649485124983e-05, "loss": 0.1888, "step": 3688 }, { "epoch": 0.7435019141648197, "grad_norm": 0.05359569564461708, "learning_rate": 9.736067766989333e-05, "loss": 0.2109, "step": 3690 }, { "epoch": 0.7439048962321176, "grad_norm": 0.045951735228300095, "learning_rate": 9.735640346286157e-05, "loss": 0.2321, "step": 3692 }, { "epoch": 0.7443078782994157, "grad_norm": 0.038241468369960785, "learning_rate": 9.735212589170664e-05, "loss": 0.1565, "step": 3694 }, { "epoch": 0.7447108603667136, "grad_norm": 0.054619934409856796, "learning_rate": 9.734784495673242e-05, "loss": 0.2008, "step": 3696 }, { "epoch": 0.7451138424340117, "grad_norm": 0.04232056066393852, "learning_rate": 9.734356065824301e-05, "loss": 0.1715, "step": 3698 }, { "epoch": 0.7455168245013097, "grad_norm": 0.05665093660354614, "learning_rate": 9.733927299654277e-05, "loss": 0.1869, "step": 3700 }, { "epoch": 0.7459198065686077, "grad_norm": 0.05509399622678757, "learning_rate": 9.733498197193627e-05, "loss": 0.2342, "step": 3702 }, { "epoch": 0.7463227886359057, "grad_norm": 0.051437657326459885, "learning_rate": 9.733068758472836e-05, "loss": 0.2195, "step": 3704 }, { "epoch": 0.7467257707032037, "grad_norm": 0.06696417927742004, "learning_rate": 9.73263898352241e-05, "loss": 0.1925, "step": 3706 }, { "epoch": 0.7471287527705017, "grad_norm": 0.05943130701780319, "learning_rate": 9.73220887237288e-05, "loss": 0.1996, "step": 3708 }, { "epoch": 0.7475317348377997, "grad_norm": 0.05695728957653046, "learning_rate": 9.731778425054801e-05, "loss": 0.2164, "step": 3710 }, { "epoch": 0.7479347169050977, "grad_norm": 0.05246898904442787, "learning_rate": 9.731347641598747e-05, "loss": 0.2263, "step": 3712 }, { "epoch": 0.7483376989723958, "grad_norm": 0.05090521275997162, "learning_rate": 9.730916522035325e-05, "loss": 0.2062, "step": 3714 }, { "epoch": 0.7487406810396937, "grad_norm": 0.0509493350982666, "learning_rate": 9.730485066395158e-05, "loss": 0.2105, "step": 3716 }, { "epoch": 0.7491436631069918, "grad_norm": 0.06465563923120499, "learning_rate": 9.730053274708898e-05, "loss": 0.2148, "step": 3718 }, { "epoch": 0.7495466451742897, "grad_norm": 0.06121086701750755, "learning_rate": 9.729621147007218e-05, "loss": 0.2228, "step": 3720 }, { "epoch": 0.7499496272415878, "grad_norm": 0.05174422636628151, "learning_rate": 9.729188683320816e-05, "loss": 0.2304, "step": 3722 }, { "epoch": 0.7503526093088858, "grad_norm": 0.052721381187438965, "learning_rate": 9.728755883680412e-05, "loss": 0.1822, "step": 3724 }, { "epoch": 0.7507555913761838, "grad_norm": 0.04520372301340103, "learning_rate": 9.728322748116754e-05, "loss": 0.1936, "step": 3726 }, { "epoch": 0.7511585734434818, "grad_norm": 0.04828318580985069, "learning_rate": 9.727889276660608e-05, "loss": 0.175, "step": 3728 }, { "epoch": 0.7515615555107797, "grad_norm": 0.04844732955098152, "learning_rate": 9.72745546934277e-05, "loss": 0.2404, "step": 3730 }, { "epoch": 0.7519645375780778, "grad_norm": 0.04914208874106407, "learning_rate": 9.727021326194057e-05, "loss": 0.199, "step": 3732 }, { "epoch": 0.7523675196453757, "grad_norm": 0.05813221260905266, "learning_rate": 9.726586847245308e-05, "loss": 0.1974, "step": 3734 }, { "epoch": 0.7527705017126738, "grad_norm": 0.07353874295949936, "learning_rate": 9.726152032527386e-05, "loss": 0.194, "step": 3736 }, { "epoch": 0.7531734837799718, "grad_norm": 0.06467559933662415, "learning_rate": 9.725716882071185e-05, "loss": 0.2251, "step": 3738 }, { "epoch": 0.7535764658472698, "grad_norm": 0.08326564729213715, "learning_rate": 9.725281395907612e-05, "loss": 0.2149, "step": 3740 }, { "epoch": 0.7539794479145678, "grad_norm": 0.06660158187150955, "learning_rate": 9.724845574067607e-05, "loss": 0.1908, "step": 3742 }, { "epoch": 0.7543824299818658, "grad_norm": 0.04767664894461632, "learning_rate": 9.724409416582129e-05, "loss": 0.1932, "step": 3744 }, { "epoch": 0.7547854120491638, "grad_norm": 0.05308603122830391, "learning_rate": 9.723972923482163e-05, "loss": 0.2752, "step": 3746 }, { "epoch": 0.7551883941164618, "grad_norm": 0.05647696554660797, "learning_rate": 9.723536094798713e-05, "loss": 0.2571, "step": 3748 }, { "epoch": 0.7555913761837598, "grad_norm": 0.06191622465848923, "learning_rate": 9.723098930562813e-05, "loss": 0.1999, "step": 3750 }, { "epoch": 0.7559943582510579, "grad_norm": 0.06864877790212631, "learning_rate": 9.72266143080552e-05, "loss": 0.1778, "step": 3752 }, { "epoch": 0.7563973403183558, "grad_norm": 0.04861884191632271, "learning_rate": 9.72222359555791e-05, "loss": 0.1615, "step": 3754 }, { "epoch": 0.7568003223856539, "grad_norm": 0.06259770691394806, "learning_rate": 9.721785424851089e-05, "loss": 0.2407, "step": 3756 }, { "epoch": 0.7572033044529518, "grad_norm": 0.07162967324256897, "learning_rate": 9.721346918716184e-05, "loss": 0.2126, "step": 3758 }, { "epoch": 0.7576062865202499, "grad_norm": 0.07644571363925934, "learning_rate": 9.720908077184341e-05, "loss": 0.2492, "step": 3760 }, { "epoch": 0.7580092685875478, "grad_norm": 0.10371597111225128, "learning_rate": 9.720468900286741e-05, "loss": 0.2252, "step": 3762 }, { "epoch": 0.7584122506548459, "grad_norm": 0.23490440845489502, "learning_rate": 9.720029388054578e-05, "loss": 0.2502, "step": 3764 }, { "epoch": 0.7588152327221439, "grad_norm": 0.05937601253390312, "learning_rate": 9.719589540519077e-05, "loss": 0.2005, "step": 3766 }, { "epoch": 0.7592182147894418, "grad_norm": 0.05609803646802902, "learning_rate": 9.719149357711483e-05, "loss": 0.2276, "step": 3768 }, { "epoch": 0.7596211968567399, "grad_norm": 0.059711702167987823, "learning_rate": 9.718708839663065e-05, "loss": 0.2541, "step": 3770 }, { "epoch": 0.7600241789240378, "grad_norm": 0.05018291249871254, "learning_rate": 9.718267986405118e-05, "loss": 0.1599, "step": 3772 }, { "epoch": 0.7604271609913359, "grad_norm": 0.05470029264688492, "learning_rate": 9.717826797968958e-05, "loss": 0.1962, "step": 3774 }, { "epoch": 0.760830143058634, "grad_norm": 0.05150442197918892, "learning_rate": 9.717385274385929e-05, "loss": 0.198, "step": 3776 }, { "epoch": 0.7612331251259319, "grad_norm": 0.06866799294948578, "learning_rate": 9.716943415687394e-05, "loss": 0.2038, "step": 3778 }, { "epoch": 0.7616361071932299, "grad_norm": 0.046754587441682816, "learning_rate": 9.716501221904741e-05, "loss": 0.1902, "step": 3780 }, { "epoch": 0.7620390892605279, "grad_norm": 0.05241142213344574, "learning_rate": 9.716058693069386e-05, "loss": 0.2104, "step": 3782 }, { "epoch": 0.7624420713278259, "grad_norm": 0.06286633759737015, "learning_rate": 9.715615829212763e-05, "loss": 0.1956, "step": 3784 }, { "epoch": 0.7628450533951239, "grad_norm": 0.09779243171215057, "learning_rate": 9.715172630366334e-05, "loss": 0.2315, "step": 3786 }, { "epoch": 0.7632480354624219, "grad_norm": 0.061123188585042953, "learning_rate": 9.71472909656158e-05, "loss": 0.1918, "step": 3788 }, { "epoch": 0.76365101752972, "grad_norm": 0.0447014644742012, "learning_rate": 9.714285227830013e-05, "loss": 0.192, "step": 3790 }, { "epoch": 0.7640539995970179, "grad_norm": 0.08232752978801727, "learning_rate": 9.71384102420316e-05, "loss": 0.2029, "step": 3792 }, { "epoch": 0.764456981664316, "grad_norm": 0.05152542516589165, "learning_rate": 9.713396485712583e-05, "loss": 0.2, "step": 3794 }, { "epoch": 0.7648599637316139, "grad_norm": 0.06779137253761292, "learning_rate": 9.712951612389855e-05, "loss": 0.2021, "step": 3796 }, { "epoch": 0.765262945798912, "grad_norm": 0.04836380109190941, "learning_rate": 9.712506404266583e-05, "loss": 0.1895, "step": 3798 }, { "epoch": 0.7656659278662099, "grad_norm": 0.062117740511894226, "learning_rate": 9.712060861374391e-05, "loss": 0.1953, "step": 3800 }, { "epoch": 0.766068909933508, "grad_norm": 0.057543814182281494, "learning_rate": 9.711614983744932e-05, "loss": 0.1926, "step": 3802 }, { "epoch": 0.766471892000806, "grad_norm": 0.07145657390356064, "learning_rate": 9.711168771409882e-05, "loss": 0.1997, "step": 3804 }, { "epoch": 0.766874874068104, "grad_norm": 0.07367728650569916, "learning_rate": 9.710722224400935e-05, "loss": 0.256, "step": 3806 }, { "epoch": 0.767277856135402, "grad_norm": 0.5376330018043518, "learning_rate": 9.710275342749813e-05, "loss": 0.2055, "step": 3808 }, { "epoch": 0.7676808382026999, "grad_norm": 0.048469942063093185, "learning_rate": 9.709828126488265e-05, "loss": 0.213, "step": 3810 }, { "epoch": 0.768083820269998, "grad_norm": 0.05275251343846321, "learning_rate": 9.709380575648061e-05, "loss": 0.2052, "step": 3812 }, { "epoch": 0.7684868023372959, "grad_norm": 0.046325862407684326, "learning_rate": 9.70893269026099e-05, "loss": 0.2063, "step": 3814 }, { "epoch": 0.768889784404594, "grad_norm": 0.04846194013953209, "learning_rate": 9.708484470358873e-05, "loss": 0.2443, "step": 3816 }, { "epoch": 0.769292766471892, "grad_norm": 0.06712636351585388, "learning_rate": 9.708035915973548e-05, "loss": 0.1973, "step": 3818 }, { "epoch": 0.76969574853919, "grad_norm": 0.04823420196771622, "learning_rate": 9.707587027136882e-05, "loss": 0.2347, "step": 3820 }, { "epoch": 0.770098730606488, "grad_norm": 0.05545537546277046, "learning_rate": 9.707137803880762e-05, "loss": 0.1621, "step": 3822 }, { "epoch": 0.770501712673786, "grad_norm": 0.06045274809002876, "learning_rate": 9.706688246237101e-05, "loss": 0.1736, "step": 3824 }, { "epoch": 0.770904694741084, "grad_norm": 0.04846609756350517, "learning_rate": 9.706238354237833e-05, "loss": 0.1557, "step": 3826 }, { "epoch": 0.7713076768083821, "grad_norm": 0.056797757744789124, "learning_rate": 9.70578812791492e-05, "loss": 0.1836, "step": 3828 }, { "epoch": 0.77171065887568, "grad_norm": 0.06821225583553314, "learning_rate": 9.705337567300343e-05, "loss": 0.2045, "step": 3830 }, { "epoch": 0.7721136409429781, "grad_norm": 0.052724190056324005, "learning_rate": 9.704886672426111e-05, "loss": 0.1744, "step": 3832 }, { "epoch": 0.772516623010276, "grad_norm": 0.12339714169502258, "learning_rate": 9.704435443324254e-05, "loss": 0.2365, "step": 3834 }, { "epoch": 0.7729196050775741, "grad_norm": 0.05186685919761658, "learning_rate": 9.703983880026827e-05, "loss": 0.1393, "step": 3836 }, { "epoch": 0.773322587144872, "grad_norm": 0.09036187082529068, "learning_rate": 9.703531982565907e-05, "loss": 0.2183, "step": 3838 }, { "epoch": 0.7737255692121701, "grad_norm": 0.041202742606401443, "learning_rate": 9.703079750973598e-05, "loss": 0.2083, "step": 3840 }, { "epoch": 0.7741285512794681, "grad_norm": 0.06593915820121765, "learning_rate": 9.702627185282026e-05, "loss": 0.2433, "step": 3842 }, { "epoch": 0.774531533346766, "grad_norm": 0.05617796629667282, "learning_rate": 9.702174285523337e-05, "loss": 0.1957, "step": 3844 }, { "epoch": 0.7749345154140641, "grad_norm": 0.05651909485459328, "learning_rate": 9.70172105172971e-05, "loss": 0.2095, "step": 3846 }, { "epoch": 0.775337497481362, "grad_norm": 0.07869057357311249, "learning_rate": 9.701267483933337e-05, "loss": 0.2107, "step": 3848 }, { "epoch": 0.7757404795486601, "grad_norm": 0.09256685525178909, "learning_rate": 9.70081358216644e-05, "loss": 0.2304, "step": 3850 }, { "epoch": 0.776143461615958, "grad_norm": 0.0674339234828949, "learning_rate": 9.700359346461265e-05, "loss": 0.2421, "step": 3852 }, { "epoch": 0.7765464436832561, "grad_norm": 0.050970036536455154, "learning_rate": 9.699904776850078e-05, "loss": 0.2318, "step": 3854 }, { "epoch": 0.7769494257505541, "grad_norm": 0.049779586493968964, "learning_rate": 9.699449873365173e-05, "loss": 0.1531, "step": 3856 }, { "epoch": 0.7773524078178521, "grad_norm": 0.04937596619129181, "learning_rate": 9.698994636038864e-05, "loss": 0.155, "step": 3858 }, { "epoch": 0.7777553898851501, "grad_norm": 0.06441762298345566, "learning_rate": 9.698539064903491e-05, "loss": 0.2013, "step": 3860 }, { "epoch": 0.7781583719524481, "grad_norm": 0.0680166631937027, "learning_rate": 9.698083159991418e-05, "loss": 0.1606, "step": 3862 }, { "epoch": 0.7785613540197461, "grad_norm": 0.06702303141355515, "learning_rate": 9.69762692133503e-05, "loss": 0.2218, "step": 3864 }, { "epoch": 0.7789643360870441, "grad_norm": 0.0592481829226017, "learning_rate": 9.697170348966738e-05, "loss": 0.2439, "step": 3866 }, { "epoch": 0.7793673181543421, "grad_norm": 0.057907577604055405, "learning_rate": 9.696713442918977e-05, "loss": 0.1622, "step": 3868 }, { "epoch": 0.7797703002216402, "grad_norm": 0.05967063456773758, "learning_rate": 9.696256203224205e-05, "loss": 0.2157, "step": 3870 }, { "epoch": 0.7801732822889381, "grad_norm": 0.061265114694833755, "learning_rate": 9.6957986299149e-05, "loss": 0.1746, "step": 3872 }, { "epoch": 0.7805762643562362, "grad_norm": 0.07470270991325378, "learning_rate": 9.695340723023574e-05, "loss": 0.1761, "step": 3874 }, { "epoch": 0.7809792464235341, "grad_norm": 0.06045358628034592, "learning_rate": 9.69488248258275e-05, "loss": 0.2112, "step": 3876 }, { "epoch": 0.7813822284908322, "grad_norm": 0.058921560645103455, "learning_rate": 9.694423908624983e-05, "loss": 0.2867, "step": 3878 }, { "epoch": 0.7817852105581302, "grad_norm": 0.057590458542108536, "learning_rate": 9.693965001182849e-05, "loss": 0.2301, "step": 3880 }, { "epoch": 0.7821881926254282, "grad_norm": 0.048990052193403244, "learning_rate": 9.693505760288948e-05, "loss": 0.2059, "step": 3882 }, { "epoch": 0.7825911746927262, "grad_norm": 0.06255292892456055, "learning_rate": 9.693046185975905e-05, "loss": 0.2433, "step": 3884 }, { "epoch": 0.7829941567600242, "grad_norm": 0.05485512688755989, "learning_rate": 9.692586278276366e-05, "loss": 0.2001, "step": 3886 }, { "epoch": 0.7833971388273222, "grad_norm": 0.061514101922512054, "learning_rate": 9.692126037223002e-05, "loss": 0.1689, "step": 3888 }, { "epoch": 0.7838001208946201, "grad_norm": 0.04852156713604927, "learning_rate": 9.691665462848508e-05, "loss": 0.2094, "step": 3890 }, { "epoch": 0.7842031029619182, "grad_norm": 0.058205705136060715, "learning_rate": 9.691204555185603e-05, "loss": 0.1868, "step": 3892 }, { "epoch": 0.7846060850292162, "grad_norm": 0.0600285641849041, "learning_rate": 9.690743314267029e-05, "loss": 0.2324, "step": 3894 }, { "epoch": 0.7850090670965142, "grad_norm": 0.05361782759428024, "learning_rate": 9.690281740125552e-05, "loss": 0.2192, "step": 3896 }, { "epoch": 0.7854120491638122, "grad_norm": 0.0479610301554203, "learning_rate": 9.689819832793961e-05, "loss": 0.2361, "step": 3898 }, { "epoch": 0.7858150312311102, "grad_norm": 0.04447110369801521, "learning_rate": 9.689357592305069e-05, "loss": 0.1594, "step": 3900 }, { "epoch": 0.7862180132984082, "grad_norm": 0.05070003494620323, "learning_rate": 9.688895018691713e-05, "loss": 0.1618, "step": 3902 }, { "epoch": 0.7866209953657062, "grad_norm": 0.12754860520362854, "learning_rate": 9.688432111986754e-05, "loss": 0.2075, "step": 3904 }, { "epoch": 0.7870239774330042, "grad_norm": 0.07254056632518768, "learning_rate": 9.687968872223077e-05, "loss": 0.2761, "step": 3906 }, { "epoch": 0.7874269595003023, "grad_norm": 0.05646049603819847, "learning_rate": 9.687505299433587e-05, "loss": 0.1883, "step": 3908 }, { "epoch": 0.7878299415676002, "grad_norm": 0.05001097545027733, "learning_rate": 9.687041393651217e-05, "loss": 0.1723, "step": 3910 }, { "epoch": 0.7882329236348983, "grad_norm": 0.06254450976848602, "learning_rate": 9.686577154908924e-05, "loss": 0.2134, "step": 3912 }, { "epoch": 0.7886359057021962, "grad_norm": 0.05184612423181534, "learning_rate": 9.686112583239684e-05, "loss": 0.177, "step": 3914 }, { "epoch": 0.7890388877694943, "grad_norm": 0.05817123129963875, "learning_rate": 9.6856476786765e-05, "loss": 0.1708, "step": 3916 }, { "epoch": 0.7894418698367922, "grad_norm": 0.04600764811038971, "learning_rate": 9.685182441252398e-05, "loss": 0.195, "step": 3918 }, { "epoch": 0.7898448519040903, "grad_norm": 0.048723481595516205, "learning_rate": 9.684716871000429e-05, "loss": 0.1915, "step": 3920 }, { "epoch": 0.7902478339713883, "grad_norm": 0.04340605065226555, "learning_rate": 9.684250967953666e-05, "loss": 0.2266, "step": 3922 }, { "epoch": 0.7906508160386863, "grad_norm": 0.06534282863140106, "learning_rate": 9.683784732145205e-05, "loss": 0.2265, "step": 3924 }, { "epoch": 0.7910537981059843, "grad_norm": 0.047532081604003906, "learning_rate": 9.683318163608166e-05, "loss": 0.1832, "step": 3926 }, { "epoch": 0.7914567801732822, "grad_norm": 0.05659693479537964, "learning_rate": 9.682851262375696e-05, "loss": 0.2089, "step": 3928 }, { "epoch": 0.7918597622405803, "grad_norm": 0.04877660050988197, "learning_rate": 9.682384028480962e-05, "loss": 0.1407, "step": 3930 }, { "epoch": 0.7922627443078784, "grad_norm": 0.060812223702669144, "learning_rate": 9.681916461957155e-05, "loss": 0.2014, "step": 3932 }, { "epoch": 0.7926657263751763, "grad_norm": 0.03950156643986702, "learning_rate": 9.681448562837489e-05, "loss": 0.1924, "step": 3934 }, { "epoch": 0.7930687084424743, "grad_norm": 0.05669216439127922, "learning_rate": 9.680980331155204e-05, "loss": 0.2227, "step": 3936 }, { "epoch": 0.7934716905097723, "grad_norm": 0.0466766394674778, "learning_rate": 9.680511766943563e-05, "loss": 0.1991, "step": 3938 }, { "epoch": 0.7938746725770703, "grad_norm": 0.05456336587667465, "learning_rate": 9.68004287023585e-05, "loss": 0.2363, "step": 3940 }, { "epoch": 0.7942776546443683, "grad_norm": 0.05615445598959923, "learning_rate": 9.679573641065378e-05, "loss": 0.1821, "step": 3942 }, { "epoch": 0.7946806367116663, "grad_norm": 0.04117673635482788, "learning_rate": 9.679104079465478e-05, "loss": 0.1608, "step": 3944 }, { "epoch": 0.7950836187789644, "grad_norm": 0.06808658689260483, "learning_rate": 9.678634185469507e-05, "loss": 0.2022, "step": 3946 }, { "epoch": 0.7954866008462623, "grad_norm": 0.03982819616794586, "learning_rate": 9.678163959110846e-05, "loss": 0.1872, "step": 3948 }, { "epoch": 0.7958895829135604, "grad_norm": 0.05367998778820038, "learning_rate": 9.677693400422898e-05, "loss": 0.2246, "step": 3950 }, { "epoch": 0.7962925649808583, "grad_norm": 0.0663178563117981, "learning_rate": 9.677222509439094e-05, "loss": 0.2514, "step": 3952 }, { "epoch": 0.7966955470481564, "grad_norm": 0.06157734617590904, "learning_rate": 9.67675128619288e-05, "loss": 0.2064, "step": 3954 }, { "epoch": 0.7970985291154543, "grad_norm": 0.04360115900635719, "learning_rate": 9.676279730717737e-05, "loss": 0.1879, "step": 3956 }, { "epoch": 0.7975015111827524, "grad_norm": 0.04340951517224312, "learning_rate": 9.675807843047159e-05, "loss": 0.1763, "step": 3958 }, { "epoch": 0.7979044932500504, "grad_norm": 0.04515016824007034, "learning_rate": 9.67533562321467e-05, "loss": 0.2203, "step": 3960 }, { "epoch": 0.7983074753173484, "grad_norm": 0.04955977573990822, "learning_rate": 9.674863071253815e-05, "loss": 0.2147, "step": 3962 }, { "epoch": 0.7987104573846464, "grad_norm": 0.0752781331539154, "learning_rate": 9.674390187198163e-05, "loss": 0.1997, "step": 3964 }, { "epoch": 0.7991134394519444, "grad_norm": 0.05286385491490364, "learning_rate": 9.67391697108131e-05, "loss": 0.2361, "step": 3966 }, { "epoch": 0.7995164215192424, "grad_norm": 0.05145072564482689, "learning_rate": 9.673443422936867e-05, "loss": 0.2219, "step": 3968 }, { "epoch": 0.7999194035865403, "grad_norm": 0.05920419469475746, "learning_rate": 9.67296954279848e-05, "loss": 0.146, "step": 3970 }, { "epoch": 0.8003223856538384, "grad_norm": 0.05192543566226959, "learning_rate": 9.672495330699808e-05, "loss": 0.2088, "step": 3972 }, { "epoch": 0.8007253677211364, "grad_norm": 0.04138815402984619, "learning_rate": 9.672020786674543e-05, "loss": 0.2041, "step": 3974 }, { "epoch": 0.8011283497884344, "grad_norm": 0.04955004155635834, "learning_rate": 9.671545910756392e-05, "loss": 0.2155, "step": 3976 }, { "epoch": 0.8015313318557324, "grad_norm": 0.04830560460686684, "learning_rate": 9.67107070297909e-05, "loss": 0.195, "step": 3978 }, { "epoch": 0.8019343139230304, "grad_norm": 0.04398579150438309, "learning_rate": 9.670595163376394e-05, "loss": 0.2121, "step": 3980 }, { "epoch": 0.8023372959903284, "grad_norm": 0.06084743142127991, "learning_rate": 9.670119291982089e-05, "loss": 0.2057, "step": 3982 }, { "epoch": 0.8027402780576265, "grad_norm": 0.05149463191628456, "learning_rate": 9.669643088829978e-05, "loss": 0.2216, "step": 3984 }, { "epoch": 0.8031432601249244, "grad_norm": 0.03941832110285759, "learning_rate": 9.66916655395389e-05, "loss": 0.1706, "step": 3986 }, { "epoch": 0.8035462421922225, "grad_norm": 0.06777830421924591, "learning_rate": 9.668689687387678e-05, "loss": 0.2356, "step": 3988 }, { "epoch": 0.8039492242595204, "grad_norm": 0.05458809807896614, "learning_rate": 9.668212489165216e-05, "loss": 0.1913, "step": 3990 }, { "epoch": 0.8043522063268185, "grad_norm": 0.06013821065425873, "learning_rate": 9.667734959320405e-05, "loss": 0.2519, "step": 3992 }, { "epoch": 0.8047551883941164, "grad_norm": 0.07752058655023575, "learning_rate": 9.667257097887167e-05, "loss": 0.2241, "step": 3994 }, { "epoch": 0.8051581704614145, "grad_norm": 0.06690877676010132, "learning_rate": 9.666778904899449e-05, "loss": 0.183, "step": 3996 }, { "epoch": 0.8055611525287125, "grad_norm": 0.050800006836652756, "learning_rate": 9.666300380391222e-05, "loss": 0.1714, "step": 3998 }, { "epoch": 0.8059641345960105, "grad_norm": 0.06313162297010422, "learning_rate": 9.665821524396476e-05, "loss": 0.2002, "step": 4000 }, { "epoch": 0.8063671166633085, "grad_norm": 0.06548028439283371, "learning_rate": 9.665342336949232e-05, "loss": 0.2249, "step": 4002 }, { "epoch": 0.8067700987306065, "grad_norm": 0.05883391574025154, "learning_rate": 9.664862818083531e-05, "loss": 0.1774, "step": 4004 }, { "epoch": 0.8071730807979045, "grad_norm": 0.06782971322536469, "learning_rate": 9.664382967833435e-05, "loss": 0.1652, "step": 4006 }, { "epoch": 0.8075760628652024, "grad_norm": 0.056911651045084, "learning_rate": 9.663902786233032e-05, "loss": 0.1955, "step": 4008 }, { "epoch": 0.8079790449325005, "grad_norm": 0.06347520649433136, "learning_rate": 9.663422273316433e-05, "loss": 0.2332, "step": 4010 }, { "epoch": 0.8083820269997986, "grad_norm": 0.040850598365068436, "learning_rate": 9.662941429117775e-05, "loss": 0.141, "step": 4012 }, { "epoch": 0.8087850090670965, "grad_norm": 0.05497581511735916, "learning_rate": 9.662460253671216e-05, "loss": 0.2611, "step": 4014 }, { "epoch": 0.8091879911343945, "grad_norm": 0.05755303055047989, "learning_rate": 9.661978747010936e-05, "loss": 0.2626, "step": 4016 }, { "epoch": 0.8095909732016925, "grad_norm": 0.04909120127558708, "learning_rate": 9.661496909171141e-05, "loss": 0.1573, "step": 4018 }, { "epoch": 0.8099939552689905, "grad_norm": 0.0518498420715332, "learning_rate": 9.661014740186063e-05, "loss": 0.2242, "step": 4020 }, { "epoch": 0.8103969373362885, "grad_norm": 0.07708277553319931, "learning_rate": 9.66053224008995e-05, "loss": 0.2089, "step": 4022 }, { "epoch": 0.8107999194035865, "grad_norm": 0.07529980689287186, "learning_rate": 9.66004940891708e-05, "loss": 0.1794, "step": 4024 }, { "epoch": 0.8112029014708846, "grad_norm": 0.04863161966204643, "learning_rate": 9.659566246701753e-05, "loss": 0.2272, "step": 4026 }, { "epoch": 0.8116058835381825, "grad_norm": 0.03779434785246849, "learning_rate": 9.659082753478292e-05, "loss": 0.1701, "step": 4028 }, { "epoch": 0.8120088656054806, "grad_norm": 0.05291910842061043, "learning_rate": 9.658598929281042e-05, "loss": 0.2102, "step": 4030 }, { "epoch": 0.8124118476727785, "grad_norm": 0.03803229704499245, "learning_rate": 9.658114774144376e-05, "loss": 0.1921, "step": 4032 }, { "epoch": 0.8128148297400766, "grad_norm": 0.0535709448158741, "learning_rate": 9.657630288102686e-05, "loss": 0.1913, "step": 4034 }, { "epoch": 0.8132178118073746, "grad_norm": 0.045826297253370285, "learning_rate": 9.657145471190388e-05, "loss": 0.226, "step": 4036 }, { "epoch": 0.8136207938746726, "grad_norm": 0.0441228449344635, "learning_rate": 9.656660323441924e-05, "loss": 0.1807, "step": 4038 }, { "epoch": 0.8140237759419706, "grad_norm": 0.04264240711927414, "learning_rate": 9.656174844891759e-05, "loss": 0.2037, "step": 4040 }, { "epoch": 0.8144267580092686, "grad_norm": 0.04649609327316284, "learning_rate": 9.655689035574378e-05, "loss": 0.2423, "step": 4042 }, { "epoch": 0.8148297400765666, "grad_norm": 0.03742203488945961, "learning_rate": 9.655202895524294e-05, "loss": 0.1589, "step": 4044 }, { "epoch": 0.8152327221438646, "grad_norm": 0.06208263710141182, "learning_rate": 9.654716424776041e-05, "loss": 0.1986, "step": 4046 }, { "epoch": 0.8156357042111626, "grad_norm": 0.04026419296860695, "learning_rate": 9.654229623364177e-05, "loss": 0.1577, "step": 4048 }, { "epoch": 0.8160386862784607, "grad_norm": 0.0645565316081047, "learning_rate": 9.653742491323286e-05, "loss": 0.1589, "step": 4050 }, { "epoch": 0.8164416683457586, "grad_norm": 0.043083306401968, "learning_rate": 9.653255028687969e-05, "loss": 0.2045, "step": 4052 }, { "epoch": 0.8168446504130566, "grad_norm": 0.054640740156173706, "learning_rate": 9.652767235492856e-05, "loss": 0.1936, "step": 4054 }, { "epoch": 0.8172476324803546, "grad_norm": 0.06548301875591278, "learning_rate": 9.652279111772603e-05, "loss": 0.1989, "step": 4056 }, { "epoch": 0.8176506145476526, "grad_norm": 0.07910330593585968, "learning_rate": 9.651790657561879e-05, "loss": 0.2104, "step": 4058 }, { "epoch": 0.8180535966149506, "grad_norm": 0.06049291044473648, "learning_rate": 9.651301872895387e-05, "loss": 0.2042, "step": 4060 }, { "epoch": 0.8184565786822486, "grad_norm": 0.05698911473155022, "learning_rate": 9.650812757807848e-05, "loss": 0.2017, "step": 4062 }, { "epoch": 0.8188595607495467, "grad_norm": 0.048121679574251175, "learning_rate": 9.650323312334008e-05, "loss": 0.1826, "step": 4064 }, { "epoch": 0.8192625428168446, "grad_norm": 0.06803172826766968, "learning_rate": 9.649833536508639e-05, "loss": 0.2246, "step": 4066 }, { "epoch": 0.8196655248841427, "grad_norm": 0.05307772755622864, "learning_rate": 9.649343430366531e-05, "loss": 0.1693, "step": 4068 }, { "epoch": 0.8200685069514406, "grad_norm": 0.04842696711421013, "learning_rate": 9.648852993942501e-05, "loss": 0.2509, "step": 4070 }, { "epoch": 0.8204714890187387, "grad_norm": 0.06046567112207413, "learning_rate": 9.64836222727139e-05, "loss": 0.202, "step": 4072 }, { "epoch": 0.8208744710860366, "grad_norm": 0.0492112897336483, "learning_rate": 9.647871130388059e-05, "loss": 0.1805, "step": 4074 }, { "epoch": 0.8212774531533347, "grad_norm": 0.039268992841243744, "learning_rate": 9.647379703327396e-05, "loss": 0.1686, "step": 4076 }, { "epoch": 0.8216804352206327, "grad_norm": 0.04768858104944229, "learning_rate": 9.646887946124313e-05, "loss": 0.2717, "step": 4078 }, { "epoch": 0.8220834172879307, "grad_norm": 0.04175850749015808, "learning_rate": 9.646395858813739e-05, "loss": 0.1466, "step": 4080 }, { "epoch": 0.8224863993552287, "grad_norm": 0.05806022137403488, "learning_rate": 9.645903441430637e-05, "loss": 0.2093, "step": 4082 }, { "epoch": 0.8228893814225267, "grad_norm": 0.23905980587005615, "learning_rate": 9.645410694009984e-05, "loss": 0.224, "step": 4084 }, { "epoch": 0.8232923634898247, "grad_norm": 0.05047454684972763, "learning_rate": 9.644917616586783e-05, "loss": 0.1972, "step": 4086 }, { "epoch": 0.8236953455571228, "grad_norm": 0.05037853121757507, "learning_rate": 9.644424209196064e-05, "loss": 0.1878, "step": 4088 }, { "epoch": 0.8240983276244207, "grad_norm": 0.05729193612933159, "learning_rate": 9.643930471872877e-05, "loss": 0.1767, "step": 4090 }, { "epoch": 0.8245013096917188, "grad_norm": 0.061119064688682556, "learning_rate": 9.643436404652295e-05, "loss": 0.1605, "step": 4092 }, { "epoch": 0.8249042917590167, "grad_norm": 0.03909744322299957, "learning_rate": 9.642942007569418e-05, "loss": 0.1822, "step": 4094 }, { "epoch": 0.8253072738263147, "grad_norm": 0.06683524698019028, "learning_rate": 9.642447280659365e-05, "loss": 0.1889, "step": 4096 }, { "epoch": 0.8257102558936127, "grad_norm": 0.06590251624584198, "learning_rate": 9.641952223957282e-05, "loss": 0.1949, "step": 4098 }, { "epoch": 0.8261132379609107, "grad_norm": 0.05025814473628998, "learning_rate": 9.641456837498338e-05, "loss": 0.1786, "step": 4100 }, { "epoch": 0.8265162200282088, "grad_norm": 0.06974563002586365, "learning_rate": 9.640961121317722e-05, "loss": 0.2245, "step": 4102 }, { "epoch": 0.8269192020955067, "grad_norm": 0.05516672879457474, "learning_rate": 9.640465075450651e-05, "loss": 0.1922, "step": 4104 }, { "epoch": 0.8273221841628048, "grad_norm": 0.05775739997625351, "learning_rate": 9.639968699932361e-05, "loss": 0.1736, "step": 4106 }, { "epoch": 0.8277251662301027, "grad_norm": 0.04958879202604294, "learning_rate": 9.639471994798117e-05, "loss": 0.2281, "step": 4108 }, { "epoch": 0.8281281482974008, "grad_norm": 0.049606986343860626, "learning_rate": 9.6389749600832e-05, "loss": 0.1881, "step": 4110 }, { "epoch": 0.8285311303646987, "grad_norm": 0.0543147549033165, "learning_rate": 9.638477595822922e-05, "loss": 0.2541, "step": 4112 }, { "epoch": 0.8289341124319968, "grad_norm": 0.048618897795677185, "learning_rate": 9.637979902052614e-05, "loss": 0.2474, "step": 4114 }, { "epoch": 0.8293370944992948, "grad_norm": 0.053704481571912766, "learning_rate": 9.63748187880763e-05, "loss": 0.1822, "step": 4116 }, { "epoch": 0.8297400765665928, "grad_norm": 0.0653117224574089, "learning_rate": 9.636983526123351e-05, "loss": 0.2378, "step": 4118 }, { "epoch": 0.8301430586338908, "grad_norm": 0.05925583839416504, "learning_rate": 9.636484844035179e-05, "loss": 0.2085, "step": 4120 }, { "epoch": 0.8305460407011888, "grad_norm": 0.05028758570551872, "learning_rate": 9.635985832578536e-05, "loss": 0.1704, "step": 4122 }, { "epoch": 0.8309490227684868, "grad_norm": 0.07257190346717834, "learning_rate": 9.635486491788875e-05, "loss": 0.2143, "step": 4124 }, { "epoch": 0.8313520048357848, "grad_norm": 0.11785920709371567, "learning_rate": 9.634986821701667e-05, "loss": 0.2413, "step": 4126 }, { "epoch": 0.8317549869030828, "grad_norm": 0.04574896767735481, "learning_rate": 9.634486822352408e-05, "loss": 0.2276, "step": 4128 }, { "epoch": 0.8321579689703809, "grad_norm": 0.05423697084188461, "learning_rate": 9.633986493776617e-05, "loss": 0.1688, "step": 4130 }, { "epoch": 0.8325609510376788, "grad_norm": 0.05051505193114281, "learning_rate": 9.633485836009836e-05, "loss": 0.2345, "step": 4132 }, { "epoch": 0.8329639331049769, "grad_norm": 0.045644182711839676, "learning_rate": 9.63298484908763e-05, "loss": 0.1883, "step": 4134 }, { "epoch": 0.8333669151722748, "grad_norm": 0.04327382519841194, "learning_rate": 9.632483533045592e-05, "loss": 0.1658, "step": 4136 }, { "epoch": 0.8337698972395728, "grad_norm": 0.05330512300133705, "learning_rate": 9.631981887919332e-05, "loss": 0.2277, "step": 4138 }, { "epoch": 0.8341728793068709, "grad_norm": 0.04282519593834877, "learning_rate": 9.631479913744486e-05, "loss": 0.144, "step": 4140 }, { "epoch": 0.8345758613741688, "grad_norm": 0.05408427491784096, "learning_rate": 9.630977610556713e-05, "loss": 0.1884, "step": 4142 }, { "epoch": 0.8349788434414669, "grad_norm": 0.05264348164200783, "learning_rate": 9.630474978391697e-05, "loss": 0.2291, "step": 4144 }, { "epoch": 0.8353818255087648, "grad_norm": 0.04659276828169823, "learning_rate": 9.629972017285144e-05, "loss": 0.2049, "step": 4146 }, { "epoch": 0.8357848075760629, "grad_norm": 0.0541483573615551, "learning_rate": 9.629468727272785e-05, "loss": 0.1918, "step": 4148 }, { "epoch": 0.8361877896433608, "grad_norm": 0.04391827434301376, "learning_rate": 9.62896510839037e-05, "loss": 0.1756, "step": 4150 }, { "epoch": 0.8365907717106589, "grad_norm": 0.05609648674726486, "learning_rate": 9.628461160673676e-05, "loss": 0.2171, "step": 4152 }, { "epoch": 0.8369937537779569, "grad_norm": 0.06236313283443451, "learning_rate": 9.627956884158505e-05, "loss": 0.1794, "step": 4154 }, { "epoch": 0.8373967358452549, "grad_norm": 0.054331183433532715, "learning_rate": 9.627452278880677e-05, "loss": 0.1556, "step": 4156 }, { "epoch": 0.8377997179125529, "grad_norm": 0.051585860550403595, "learning_rate": 9.62694734487604e-05, "loss": 0.1957, "step": 4158 }, { "epoch": 0.8382026999798509, "grad_norm": 0.05942024290561676, "learning_rate": 9.626442082180463e-05, "loss": 0.1722, "step": 4160 }, { "epoch": 0.8386056820471489, "grad_norm": 0.06458491086959839, "learning_rate": 9.625936490829842e-05, "loss": 0.1744, "step": 4162 }, { "epoch": 0.8390086641144469, "grad_norm": 0.05910806357860565, "learning_rate": 9.625430570860087e-05, "loss": 0.1826, "step": 4164 }, { "epoch": 0.8394116461817449, "grad_norm": 0.08108662813901901, "learning_rate": 9.624924322307142e-05, "loss": 0.1826, "step": 4166 }, { "epoch": 0.839814628249043, "grad_norm": 0.057688187807798386, "learning_rate": 9.62441774520697e-05, "loss": 0.2289, "step": 4168 }, { "epoch": 0.8402176103163409, "grad_norm": 0.05357905849814415, "learning_rate": 9.62391083959556e-05, "loss": 0.1976, "step": 4170 }, { "epoch": 0.840620592383639, "grad_norm": 0.07163774967193604, "learning_rate": 9.623403605508916e-05, "loss": 0.2353, "step": 4172 }, { "epoch": 0.8410235744509369, "grad_norm": 0.04970443621277809, "learning_rate": 9.622896042983075e-05, "loss": 0.2062, "step": 4174 }, { "epoch": 0.841426556518235, "grad_norm": 0.0592183880507946, "learning_rate": 9.622388152054092e-05, "loss": 0.191, "step": 4176 }, { "epoch": 0.8418295385855329, "grad_norm": 0.05800905451178551, "learning_rate": 9.621879932758045e-05, "loss": 0.2192, "step": 4178 }, { "epoch": 0.8422325206528309, "grad_norm": 0.06246621534228325, "learning_rate": 9.621371385131042e-05, "loss": 0.1652, "step": 4180 }, { "epoch": 0.842635502720129, "grad_norm": 0.06259558349847794, "learning_rate": 9.620862509209206e-05, "loss": 0.1838, "step": 4182 }, { "epoch": 0.8430384847874269, "grad_norm": 0.047461558133363724, "learning_rate": 9.620353305028687e-05, "loss": 0.2317, "step": 4184 }, { "epoch": 0.843441466854725, "grad_norm": 0.06128053739666939, "learning_rate": 9.619843772625657e-05, "loss": 0.2001, "step": 4186 }, { "epoch": 0.8438444489220229, "grad_norm": 0.0504336915910244, "learning_rate": 9.619333912036314e-05, "loss": 0.2138, "step": 4188 }, { "epoch": 0.844247430989321, "grad_norm": 0.0516376867890358, "learning_rate": 9.618823723296879e-05, "loss": 0.1869, "step": 4190 }, { "epoch": 0.844650413056619, "grad_norm": 0.05454495549201965, "learning_rate": 9.618313206443595e-05, "loss": 0.2028, "step": 4192 }, { "epoch": 0.845053395123917, "grad_norm": 0.04650101065635681, "learning_rate": 9.617802361512723e-05, "loss": 0.2244, "step": 4194 }, { "epoch": 0.845456377191215, "grad_norm": 0.05743958428502083, "learning_rate": 9.617291188540558e-05, "loss": 0.2123, "step": 4196 }, { "epoch": 0.845859359258513, "grad_norm": 0.07449749112129211, "learning_rate": 9.616779687563411e-05, "loss": 0.208, "step": 4198 }, { "epoch": 0.846262341325811, "grad_norm": 0.047410573810338974, "learning_rate": 9.616267858617617e-05, "loss": 0.2003, "step": 4200 }, { "epoch": 0.846665323393109, "grad_norm": 0.04203968122601509, "learning_rate": 9.61575570173954e-05, "loss": 0.2223, "step": 4202 }, { "epoch": 0.847068305460407, "grad_norm": 0.047013357281684875, "learning_rate": 9.61524321696556e-05, "loss": 0.1855, "step": 4204 }, { "epoch": 0.8474712875277051, "grad_norm": 0.046448126435279846, "learning_rate": 9.614730404332079e-05, "loss": 0.1873, "step": 4206 }, { "epoch": 0.847874269595003, "grad_norm": 0.045094847679138184, "learning_rate": 9.614217263875533e-05, "loss": 0.2429, "step": 4208 }, { "epoch": 0.8482772516623011, "grad_norm": 0.04861465469002724, "learning_rate": 9.613703795632372e-05, "loss": 0.2428, "step": 4210 }, { "epoch": 0.848680233729599, "grad_norm": 0.05912397801876068, "learning_rate": 9.61318999963907e-05, "loss": 0.2101, "step": 4212 }, { "epoch": 0.849083215796897, "grad_norm": 0.05289468914270401, "learning_rate": 9.61267587593213e-05, "loss": 0.2354, "step": 4214 }, { "epoch": 0.849486197864195, "grad_norm": 0.04913996905088425, "learning_rate": 9.612161424548072e-05, "loss": 0.1922, "step": 4216 }, { "epoch": 0.849889179931493, "grad_norm": 0.046018872410058975, "learning_rate": 9.611646645523442e-05, "loss": 0.1793, "step": 4218 }, { "epoch": 0.8502921619987911, "grad_norm": 0.05342131480574608, "learning_rate": 9.611131538894811e-05, "loss": 0.2079, "step": 4220 }, { "epoch": 0.850695144066089, "grad_norm": 0.05977238342165947, "learning_rate": 9.610616104698768e-05, "loss": 0.1883, "step": 4222 }, { "epoch": 0.8510981261333871, "grad_norm": 0.04549986496567726, "learning_rate": 9.610100342971932e-05, "loss": 0.224, "step": 4224 }, { "epoch": 0.851501108200685, "grad_norm": 0.048815593123435974, "learning_rate": 9.60958425375094e-05, "loss": 0.1479, "step": 4226 }, { "epoch": 0.8519040902679831, "grad_norm": 0.06040544807910919, "learning_rate": 9.609067837072454e-05, "loss": 0.2188, "step": 4228 }, { "epoch": 0.8523070723352811, "grad_norm": 0.06375422328710556, "learning_rate": 9.60855109297316e-05, "loss": 0.194, "step": 4230 }, { "epoch": 0.8527100544025791, "grad_norm": 0.06123984232544899, "learning_rate": 9.608034021489766e-05, "loss": 0.2149, "step": 4232 }, { "epoch": 0.8531130364698771, "grad_norm": 0.06358905136585236, "learning_rate": 9.607516622659007e-05, "loss": 0.1769, "step": 4234 }, { "epoch": 0.8535160185371751, "grad_norm": 0.05531733110547066, "learning_rate": 9.606998896517634e-05, "loss": 0.2195, "step": 4236 }, { "epoch": 0.8539190006044731, "grad_norm": 0.051569703966379166, "learning_rate": 9.606480843102428e-05, "loss": 0.2088, "step": 4238 }, { "epoch": 0.8543219826717711, "grad_norm": 0.0638023391366005, "learning_rate": 9.605962462450188e-05, "loss": 0.1862, "step": 4240 }, { "epoch": 0.8547249647390691, "grad_norm": 0.06016799062490463, "learning_rate": 9.605443754597742e-05, "loss": 0.2086, "step": 4242 }, { "epoch": 0.8551279468063672, "grad_norm": 0.05225469917058945, "learning_rate": 9.604924719581938e-05, "loss": 0.19, "step": 4244 }, { "epoch": 0.8555309288736651, "grad_norm": 0.04729381203651428, "learning_rate": 9.604405357439646e-05, "loss": 0.1858, "step": 4246 }, { "epoch": 0.8559339109409632, "grad_norm": 0.04221804067492485, "learning_rate": 9.603885668207762e-05, "loss": 0.1857, "step": 4248 }, { "epoch": 0.8563368930082611, "grad_norm": 0.08133088797330856, "learning_rate": 9.6033656519232e-05, "loss": 0.2085, "step": 4250 }, { "epoch": 0.8567398750755592, "grad_norm": 0.048365265130996704, "learning_rate": 9.602845308622905e-05, "loss": 0.189, "step": 4252 }, { "epoch": 0.8571428571428571, "grad_norm": 0.044929929077625275, "learning_rate": 9.602324638343843e-05, "loss": 0.1709, "step": 4254 }, { "epoch": 0.8575458392101551, "grad_norm": 0.04540219157934189, "learning_rate": 9.601803641122998e-05, "loss": 0.1952, "step": 4256 }, { "epoch": 0.8579488212774532, "grad_norm": 0.06363467872142792, "learning_rate": 9.60128231699738e-05, "loss": 0.1461, "step": 4258 }, { "epoch": 0.8583518033447511, "grad_norm": 0.04269418120384216, "learning_rate": 9.600760666004025e-05, "loss": 0.1845, "step": 4260 }, { "epoch": 0.8587547854120492, "grad_norm": 0.04981771484017372, "learning_rate": 9.60023868817999e-05, "loss": 0.2093, "step": 4262 }, { "epoch": 0.8591577674793471, "grad_norm": 0.0496847927570343, "learning_rate": 9.599716383562358e-05, "loss": 0.1933, "step": 4264 }, { "epoch": 0.8595607495466452, "grad_norm": 0.038256920874118805, "learning_rate": 9.59919375218823e-05, "loss": 0.158, "step": 4266 }, { "epoch": 0.8599637316139431, "grad_norm": 0.04268384724855423, "learning_rate": 9.59867079409473e-05, "loss": 0.2097, "step": 4268 }, { "epoch": 0.8603667136812412, "grad_norm": 0.059887178242206573, "learning_rate": 9.598147509319015e-05, "loss": 0.1843, "step": 4270 }, { "epoch": 0.8607696957485392, "grad_norm": 0.04287001118063927, "learning_rate": 9.597623897898251e-05, "loss": 0.1696, "step": 4272 }, { "epoch": 0.8611726778158372, "grad_norm": 0.056464433670043945, "learning_rate": 9.597099959869641e-05, "loss": 0.2019, "step": 4274 }, { "epoch": 0.8615756598831352, "grad_norm": 0.04815569892525673, "learning_rate": 9.596575695270402e-05, "loss": 0.1792, "step": 4276 }, { "epoch": 0.8619786419504332, "grad_norm": 0.053726229816675186, "learning_rate": 9.596051104137775e-05, "loss": 0.2035, "step": 4278 }, { "epoch": 0.8623816240177312, "grad_norm": 0.053749434649944305, "learning_rate": 9.595526186509028e-05, "loss": 0.2442, "step": 4280 }, { "epoch": 0.8627846060850293, "grad_norm": 0.052994389086961746, "learning_rate": 9.59500094242145e-05, "loss": 0.2216, "step": 4282 }, { "epoch": 0.8631875881523272, "grad_norm": 0.040021564811468124, "learning_rate": 9.594475371912355e-05, "loss": 0.1594, "step": 4284 }, { "epoch": 0.8635905702196253, "grad_norm": 0.044010285288095474, "learning_rate": 9.593949475019076e-05, "loss": 0.1902, "step": 4286 }, { "epoch": 0.8639935522869232, "grad_norm": 0.056951433420181274, "learning_rate": 9.593423251778975e-05, "loss": 0.197, "step": 4288 }, { "epoch": 0.8643965343542213, "grad_norm": 0.08449291437864304, "learning_rate": 9.59289670222943e-05, "loss": 0.2241, "step": 4290 }, { "epoch": 0.8647995164215192, "grad_norm": 0.05323825031518936, "learning_rate": 9.59236982640785e-05, "loss": 0.195, "step": 4292 }, { "epoch": 0.8652024984888173, "grad_norm": 0.05166636407375336, "learning_rate": 9.591842624351661e-05, "loss": 0.1491, "step": 4294 }, { "epoch": 0.8656054805561153, "grad_norm": 0.0477205291390419, "learning_rate": 9.591315096098316e-05, "loss": 0.1932, "step": 4296 }, { "epoch": 0.8660084626234132, "grad_norm": 0.06305918842554092, "learning_rate": 9.59078724168529e-05, "loss": 0.2263, "step": 4298 }, { "epoch": 0.8664114446907113, "grad_norm": 0.052747536450624466, "learning_rate": 9.590259061150079e-05, "loss": 0.1766, "step": 4300 }, { "epoch": 0.8668144267580092, "grad_norm": 0.048709429800510406, "learning_rate": 9.589730554530208e-05, "loss": 0.1508, "step": 4302 }, { "epoch": 0.8672174088253073, "grad_norm": 0.0500153973698616, "learning_rate": 9.589201721863214e-05, "loss": 0.1934, "step": 4304 }, { "epoch": 0.8676203908926052, "grad_norm": 0.06479570269584656, "learning_rate": 9.588672563186674e-05, "loss": 0.2535, "step": 4306 }, { "epoch": 0.8680233729599033, "grad_norm": 0.0527423657476902, "learning_rate": 9.58814307853817e-05, "loss": 0.1761, "step": 4308 }, { "epoch": 0.8684263550272013, "grad_norm": 0.07280410081148148, "learning_rate": 9.58761326795532e-05, "loss": 0.212, "step": 4310 }, { "epoch": 0.8688293370944993, "grad_norm": 0.11216331273317337, "learning_rate": 9.587083131475762e-05, "loss": 0.1958, "step": 4312 }, { "epoch": 0.8692323191617973, "grad_norm": 0.0491580106317997, "learning_rate": 9.586552669137152e-05, "loss": 0.2232, "step": 4314 }, { "epoch": 0.8696353012290953, "grad_norm": 0.0734315887093544, "learning_rate": 9.586021880977177e-05, "loss": 0.1532, "step": 4316 }, { "epoch": 0.8700382832963933, "grad_norm": 0.0685787945985794, "learning_rate": 9.585490767033543e-05, "loss": 0.1724, "step": 4318 }, { "epoch": 0.8704412653636913, "grad_norm": 0.0568709559738636, "learning_rate": 9.584959327343976e-05, "loss": 0.1806, "step": 4320 }, { "epoch": 0.8708442474309893, "grad_norm": 0.05582389608025551, "learning_rate": 9.584427561946232e-05, "loss": 0.1688, "step": 4322 }, { "epoch": 0.8712472294982874, "grad_norm": 0.07161648571491241, "learning_rate": 9.583895470878085e-05, "loss": 0.1877, "step": 4324 }, { "epoch": 0.8716502115655853, "grad_norm": 0.07797323167324066, "learning_rate": 9.583363054177335e-05, "loss": 0.1632, "step": 4326 }, { "epoch": 0.8720531936328834, "grad_norm": 0.05685307830572128, "learning_rate": 9.582830311881803e-05, "loss": 0.1465, "step": 4328 }, { "epoch": 0.8724561757001813, "grad_norm": 0.06444845348596573, "learning_rate": 9.582297244029336e-05, "loss": 0.1738, "step": 4330 }, { "epoch": 0.8728591577674794, "grad_norm": 0.05205952376127243, "learning_rate": 9.581763850657801e-05, "loss": 0.1724, "step": 4332 }, { "epoch": 0.8732621398347774, "grad_norm": 0.0661550834774971, "learning_rate": 9.581230131805088e-05, "loss": 0.1713, "step": 4334 }, { "epoch": 0.8736651219020753, "grad_norm": 0.06810151785612106, "learning_rate": 9.580696087509115e-05, "loss": 0.2472, "step": 4336 }, { "epoch": 0.8740681039693734, "grad_norm": 0.05709342285990715, "learning_rate": 9.580161717807816e-05, "loss": 0.1988, "step": 4338 }, { "epoch": 0.8744710860366713, "grad_norm": 0.06400882452726364, "learning_rate": 9.579627022739155e-05, "loss": 0.2413, "step": 4340 }, { "epoch": 0.8748740681039694, "grad_norm": 0.07204491645097733, "learning_rate": 9.579092002341112e-05, "loss": 0.204, "step": 4342 }, { "epoch": 0.8752770501712673, "grad_norm": 0.06638916581869125, "learning_rate": 9.578556656651699e-05, "loss": 0.1755, "step": 4344 }, { "epoch": 0.8756800322385654, "grad_norm": 0.04414273798465729, "learning_rate": 9.578020985708942e-05, "loss": 0.1649, "step": 4346 }, { "epoch": 0.8760830143058634, "grad_norm": 0.06172650679945946, "learning_rate": 9.577484989550896e-05, "loss": 0.2017, "step": 4348 }, { "epoch": 0.8764859963731614, "grad_norm": 0.045115333050489426, "learning_rate": 9.576948668215638e-05, "loss": 0.2568, "step": 4350 }, { "epoch": 0.8768889784404594, "grad_norm": 0.05499257892370224, "learning_rate": 9.576412021741264e-05, "loss": 0.1959, "step": 4352 }, { "epoch": 0.8772919605077574, "grad_norm": 0.049140289425849915, "learning_rate": 9.575875050165902e-05, "loss": 0.1836, "step": 4354 }, { "epoch": 0.8776949425750554, "grad_norm": 0.04703235626220703, "learning_rate": 9.575337753527692e-05, "loss": 0.2211, "step": 4356 }, { "epoch": 0.8780979246423534, "grad_norm": 0.05271073803305626, "learning_rate": 9.574800131864805e-05, "loss": 0.2134, "step": 4358 }, { "epoch": 0.8785009067096514, "grad_norm": 0.056267786771059036, "learning_rate": 9.574262185215433e-05, "loss": 0.1945, "step": 4360 }, { "epoch": 0.8789038887769495, "grad_norm": 0.05832262709736824, "learning_rate": 9.573723913617791e-05, "loss": 0.156, "step": 4362 }, { "epoch": 0.8793068708442474, "grad_norm": 0.051851604133844376, "learning_rate": 9.573185317110119e-05, "loss": 0.2601, "step": 4364 }, { "epoch": 0.8797098529115455, "grad_norm": 0.0523109994828701, "learning_rate": 9.572646395730673e-05, "loss": 0.2256, "step": 4366 }, { "epoch": 0.8801128349788434, "grad_norm": 0.05357774719595909, "learning_rate": 9.572107149517741e-05, "loss": 0.1877, "step": 4368 }, { "epoch": 0.8805158170461415, "grad_norm": 0.05208495631814003, "learning_rate": 9.571567578509629e-05, "loss": 0.1873, "step": 4370 }, { "epoch": 0.8809187991134394, "grad_norm": 0.0533597432076931, "learning_rate": 9.571027682744668e-05, "loss": 0.2484, "step": 4372 }, { "epoch": 0.8813217811807375, "grad_norm": 0.05073506757616997, "learning_rate": 9.57048746226121e-05, "loss": 0.2035, "step": 4374 }, { "epoch": 0.8817247632480355, "grad_norm": 0.05859789997339249, "learning_rate": 9.569946917097631e-05, "loss": 0.181, "step": 4376 }, { "epoch": 0.8821277453153334, "grad_norm": 0.06273086369037628, "learning_rate": 9.569406047292334e-05, "loss": 0.1764, "step": 4378 }, { "epoch": 0.8825307273826315, "grad_norm": 0.05160843953490257, "learning_rate": 9.568864852883739e-05, "loss": 0.2122, "step": 4380 }, { "epoch": 0.8829337094499294, "grad_norm": 0.05211114138364792, "learning_rate": 9.56832333391029e-05, "loss": 0.241, "step": 4382 }, { "epoch": 0.8833366915172275, "grad_norm": 0.05442271754145622, "learning_rate": 9.567781490410456e-05, "loss": 0.1976, "step": 4384 }, { "epoch": 0.8837396735845255, "grad_norm": 0.051495131105184555, "learning_rate": 9.567239322422734e-05, "loss": 0.1859, "step": 4386 }, { "epoch": 0.8841426556518235, "grad_norm": 0.06409385055303574, "learning_rate": 9.566696829985633e-05, "loss": 0.1892, "step": 4388 }, { "epoch": 0.8845456377191215, "grad_norm": 0.05642905831336975, "learning_rate": 9.566154013137691e-05, "loss": 0.1252, "step": 4390 }, { "epoch": 0.8849486197864195, "grad_norm": 0.07028786838054657, "learning_rate": 9.565610871917472e-05, "loss": 0.1937, "step": 4392 }, { "epoch": 0.8853516018537175, "grad_norm": 0.048712924122810364, "learning_rate": 9.565067406363556e-05, "loss": 0.2038, "step": 4394 }, { "epoch": 0.8857545839210155, "grad_norm": 0.07108557969331741, "learning_rate": 9.564523616514556e-05, "loss": 0.2014, "step": 4396 }, { "epoch": 0.8861575659883135, "grad_norm": 0.05257798731327057, "learning_rate": 9.563979502409096e-05, "loss": 0.1502, "step": 4398 }, { "epoch": 0.8865605480556116, "grad_norm": 0.07098128646612167, "learning_rate": 9.563435064085832e-05, "loss": 0.2139, "step": 4400 }, { "epoch": 0.8869635301229095, "grad_norm": 0.050052594393491745, "learning_rate": 9.562890301583438e-05, "loss": 0.1656, "step": 4402 }, { "epoch": 0.8873665121902076, "grad_norm": 0.05565377324819565, "learning_rate": 9.562345214940616e-05, "loss": 0.2131, "step": 4404 }, { "epoch": 0.8877694942575055, "grad_norm": 0.06834172457456589, "learning_rate": 9.561799804196083e-05, "loss": 0.1691, "step": 4406 }, { "epoch": 0.8881724763248036, "grad_norm": 0.05031581595540047, "learning_rate": 9.56125406938859e-05, "loss": 0.193, "step": 4408 }, { "epoch": 0.8885754583921015, "grad_norm": 0.04691687971353531, "learning_rate": 9.560708010556902e-05, "loss": 0.1703, "step": 4410 }, { "epoch": 0.8889784404593996, "grad_norm": 0.038260504603385925, "learning_rate": 9.560161627739813e-05, "loss": 0.1592, "step": 4412 }, { "epoch": 0.8893814225266976, "grad_norm": 0.10631363838911057, "learning_rate": 9.559614920976131e-05, "loss": 0.2233, "step": 4414 }, { "epoch": 0.8897844045939955, "grad_norm": 0.05809139087796211, "learning_rate": 9.559067890304698e-05, "loss": 0.2693, "step": 4416 }, { "epoch": 0.8901873866612936, "grad_norm": 0.03824347257614136, "learning_rate": 9.558520535764375e-05, "loss": 0.1971, "step": 4418 }, { "epoch": 0.8905903687285915, "grad_norm": 0.061272189021110535, "learning_rate": 9.557972857394042e-05, "loss": 0.2159, "step": 4420 }, { "epoch": 0.8909933507958896, "grad_norm": 0.04731274023652077, "learning_rate": 9.557424855232608e-05, "loss": 0.2196, "step": 4422 }, { "epoch": 0.8913963328631875, "grad_norm": 0.058554090559482574, "learning_rate": 9.556876529318999e-05, "loss": 0.1422, "step": 4424 }, { "epoch": 0.8917993149304856, "grad_norm": 0.05500726401805878, "learning_rate": 9.55632787969217e-05, "loss": 0.2067, "step": 4426 }, { "epoch": 0.8922022969977836, "grad_norm": 0.04614582657814026, "learning_rate": 9.555778906391095e-05, "loss": 0.2027, "step": 4428 }, { "epoch": 0.8926052790650816, "grad_norm": 0.042596716433763504, "learning_rate": 9.555229609454772e-05, "loss": 0.1479, "step": 4430 }, { "epoch": 0.8930082611323796, "grad_norm": 0.08022289723157883, "learning_rate": 9.554679988922222e-05, "loss": 0.2432, "step": 4432 }, { "epoch": 0.8934112431996776, "grad_norm": 0.05268188938498497, "learning_rate": 9.554130044832492e-05, "loss": 0.1994, "step": 4434 }, { "epoch": 0.8938142252669756, "grad_norm": 0.049188628792762756, "learning_rate": 9.553579777224644e-05, "loss": 0.1807, "step": 4436 }, { "epoch": 0.8942172073342737, "grad_norm": 0.043887216597795486, "learning_rate": 9.553029186137775e-05, "loss": 0.1264, "step": 4438 }, { "epoch": 0.8946201894015716, "grad_norm": 0.11060819029808044, "learning_rate": 9.552478271610989e-05, "loss": 0.217, "step": 4440 }, { "epoch": 0.8950231714688697, "grad_norm": 0.05788939818739891, "learning_rate": 9.55192703368343e-05, "loss": 0.1859, "step": 4442 }, { "epoch": 0.8954261535361676, "grad_norm": 0.059715982526540756, "learning_rate": 9.551375472394255e-05, "loss": 0.175, "step": 4444 }, { "epoch": 0.8958291356034657, "grad_norm": 0.04088887572288513, "learning_rate": 9.550823587782645e-05, "loss": 0.1929, "step": 4446 }, { "epoch": 0.8962321176707636, "grad_norm": 0.05613197386264801, "learning_rate": 9.550271379887805e-05, "loss": 0.1827, "step": 4448 }, { "epoch": 0.8966350997380617, "grad_norm": 0.04531438648700714, "learning_rate": 9.549718848748962e-05, "loss": 0.183, "step": 4450 }, { "epoch": 0.8970380818053597, "grad_norm": 0.06092282757163048, "learning_rate": 9.54916599440537e-05, "loss": 0.1447, "step": 4452 }, { "epoch": 0.8974410638726577, "grad_norm": 0.03984326496720314, "learning_rate": 9.548612816896298e-05, "loss": 0.1448, "step": 4454 }, { "epoch": 0.8978440459399557, "grad_norm": 0.05323481187224388, "learning_rate": 9.548059316261049e-05, "loss": 0.1924, "step": 4456 }, { "epoch": 0.8982470280072536, "grad_norm": 0.06252434849739075, "learning_rate": 9.54750549253894e-05, "loss": 0.2178, "step": 4458 }, { "epoch": 0.8986500100745517, "grad_norm": 0.06720856577157974, "learning_rate": 9.546951345769311e-05, "loss": 0.1585, "step": 4460 }, { "epoch": 0.8990529921418496, "grad_norm": 0.060730621218681335, "learning_rate": 9.546396875991532e-05, "loss": 0.2401, "step": 4462 }, { "epoch": 0.8994559742091477, "grad_norm": 0.07229790836572647, "learning_rate": 9.54584208324499e-05, "loss": 0.1737, "step": 4464 }, { "epoch": 0.8998589562764457, "grad_norm": 0.05511532351374626, "learning_rate": 9.545286967569095e-05, "loss": 0.1808, "step": 4466 }, { "epoch": 0.9002619383437437, "grad_norm": 0.056663528084754944, "learning_rate": 9.544731529003283e-05, "loss": 0.1716, "step": 4468 }, { "epoch": 0.9006649204110417, "grad_norm": 0.04725079610943794, "learning_rate": 9.544175767587012e-05, "loss": 0.2016, "step": 4470 }, { "epoch": 0.9010679024783397, "grad_norm": 0.05176713317632675, "learning_rate": 9.543619683359762e-05, "loss": 0.186, "step": 4472 }, { "epoch": 0.9014708845456377, "grad_norm": 0.05347689613699913, "learning_rate": 9.543063276361037e-05, "loss": 0.1651, "step": 4474 }, { "epoch": 0.9018738666129357, "grad_norm": 0.053653594106435776, "learning_rate": 9.54250654663036e-05, "loss": 0.1924, "step": 4476 }, { "epoch": 0.9022768486802337, "grad_norm": 0.06456420570611954, "learning_rate": 9.541949494207286e-05, "loss": 0.1796, "step": 4478 }, { "epoch": 0.9026798307475318, "grad_norm": 0.05221540853381157, "learning_rate": 9.54139211913138e-05, "loss": 0.2247, "step": 4480 }, { "epoch": 0.9030828128148297, "grad_norm": 0.049423567950725555, "learning_rate": 9.540834421442243e-05, "loss": 0.2272, "step": 4482 }, { "epoch": 0.9034857948821278, "grad_norm": 0.051466915756464005, "learning_rate": 9.54027640117949e-05, "loss": 0.1817, "step": 4484 }, { "epoch": 0.9038887769494257, "grad_norm": 0.06747356057167053, "learning_rate": 9.539718058382763e-05, "loss": 0.2122, "step": 4486 }, { "epoch": 0.9042917590167238, "grad_norm": 0.04708021134138107, "learning_rate": 9.539159393091726e-05, "loss": 0.2082, "step": 4488 }, { "epoch": 0.9046947410840218, "grad_norm": 0.051016103476285934, "learning_rate": 9.538600405346064e-05, "loss": 0.2112, "step": 4490 }, { "epoch": 0.9050977231513198, "grad_norm": 0.05513007566332817, "learning_rate": 9.538041095185486e-05, "loss": 0.1696, "step": 4492 }, { "epoch": 0.9055007052186178, "grad_norm": 0.04744619131088257, "learning_rate": 9.537481462649729e-05, "loss": 0.1648, "step": 4494 }, { "epoch": 0.9059036872859157, "grad_norm": 0.04048705846071243, "learning_rate": 9.536921507778543e-05, "loss": 0.1534, "step": 4496 }, { "epoch": 0.9063066693532138, "grad_norm": 0.05609700828790665, "learning_rate": 9.53636123061171e-05, "loss": 0.1872, "step": 4498 }, { "epoch": 0.9067096514205117, "grad_norm": 0.061345573514699936, "learning_rate": 9.535800631189032e-05, "loss": 0.2813, "step": 4500 }, { "epoch": 0.9071126334878098, "grad_norm": 0.059606775641441345, "learning_rate": 9.535239709550328e-05, "loss": 0.2559, "step": 4502 }, { "epoch": 0.9075156155551078, "grad_norm": 0.06905293464660645, "learning_rate": 9.534678465735449e-05, "loss": 0.1876, "step": 4504 }, { "epoch": 0.9079185976224058, "grad_norm": 0.07575200498104095, "learning_rate": 9.534116899784265e-05, "loss": 0.2222, "step": 4506 }, { "epoch": 0.9083215796897038, "grad_norm": 0.053717587143182755, "learning_rate": 9.533555011736667e-05, "loss": 0.1471, "step": 4508 }, { "epoch": 0.9087245617570018, "grad_norm": 0.04818922281265259, "learning_rate": 9.532992801632571e-05, "loss": 0.2059, "step": 4510 }, { "epoch": 0.9091275438242998, "grad_norm": 0.07846032083034515, "learning_rate": 9.532430269511916e-05, "loss": 0.2092, "step": 4512 }, { "epoch": 0.9095305258915978, "grad_norm": 0.055181194096803665, "learning_rate": 9.531867415414664e-05, "loss": 0.2307, "step": 4514 }, { "epoch": 0.9099335079588958, "grad_norm": 0.0974973514676094, "learning_rate": 9.531304239380797e-05, "loss": 0.2617, "step": 4516 }, { "epoch": 0.9103364900261939, "grad_norm": 0.053954772651195526, "learning_rate": 9.530740741450323e-05, "loss": 0.2005, "step": 4518 }, { "epoch": 0.9107394720934918, "grad_norm": 0.04149053618311882, "learning_rate": 9.530176921663275e-05, "loss": 0.2076, "step": 4520 }, { "epoch": 0.9111424541607899, "grad_norm": 0.04992946609854698, "learning_rate": 9.529612780059703e-05, "loss": 0.1643, "step": 4522 }, { "epoch": 0.9115454362280878, "grad_norm": 0.0528724230825901, "learning_rate": 9.529048316679682e-05, "loss": 0.2377, "step": 4524 }, { "epoch": 0.9119484182953859, "grad_norm": 0.043224893510341644, "learning_rate": 9.528483531563313e-05, "loss": 0.2093, "step": 4526 }, { "epoch": 0.9123514003626838, "grad_norm": 0.04171156883239746, "learning_rate": 9.527918424750715e-05, "loss": 0.1763, "step": 4528 }, { "epoch": 0.9127543824299819, "grad_norm": 0.03696437180042267, "learning_rate": 9.527352996282033e-05, "loss": 0.2211, "step": 4530 }, { "epoch": 0.9131573644972799, "grad_norm": 0.051768265664577484, "learning_rate": 9.526787246197436e-05, "loss": 0.1906, "step": 4532 }, { "epoch": 0.9135603465645779, "grad_norm": 0.044482771307229996, "learning_rate": 9.526221174537111e-05, "loss": 0.1931, "step": 4534 }, { "epoch": 0.9139633286318759, "grad_norm": 0.05503168702125549, "learning_rate": 9.525654781341274e-05, "loss": 0.2304, "step": 4536 }, { "epoch": 0.9143663106991738, "grad_norm": 0.046392567455768585, "learning_rate": 9.525088066650158e-05, "loss": 0.2202, "step": 4538 }, { "epoch": 0.9147692927664719, "grad_norm": 0.07175581157207489, "learning_rate": 9.524521030504023e-05, "loss": 0.218, "step": 4540 }, { "epoch": 0.91517227483377, "grad_norm": 0.050982605665922165, "learning_rate": 9.523953672943152e-05, "loss": 0.226, "step": 4542 }, { "epoch": 0.9155752569010679, "grad_norm": 0.05275282263755798, "learning_rate": 9.523385994007843e-05, "loss": 0.1566, "step": 4544 }, { "epoch": 0.9159782389683659, "grad_norm": 0.04982639476656914, "learning_rate": 9.522817993738429e-05, "loss": 0.1909, "step": 4546 }, { "epoch": 0.9163812210356639, "grad_norm": 0.0570659339427948, "learning_rate": 9.522249672175259e-05, "loss": 0.1553, "step": 4548 }, { "epoch": 0.9167842031029619, "grad_norm": 0.04874482378363609, "learning_rate": 9.521681029358702e-05, "loss": 0.1659, "step": 4550 }, { "epoch": 0.9171871851702599, "grad_norm": 0.06070302054286003, "learning_rate": 9.521112065329159e-05, "loss": 0.1794, "step": 4552 }, { "epoch": 0.9175901672375579, "grad_norm": 0.057424820959568024, "learning_rate": 9.520542780127044e-05, "loss": 0.182, "step": 4554 }, { "epoch": 0.917993149304856, "grad_norm": 0.043012555688619614, "learning_rate": 9.519973173792798e-05, "loss": 0.1809, "step": 4556 }, { "epoch": 0.9183961313721539, "grad_norm": 0.07553418725728989, "learning_rate": 9.519403246366888e-05, "loss": 0.1845, "step": 4558 }, { "epoch": 0.918799113439452, "grad_norm": 0.051994435489177704, "learning_rate": 9.518832997889798e-05, "loss": 0.213, "step": 4560 }, { "epoch": 0.9192020955067499, "grad_norm": 0.05973916873335838, "learning_rate": 9.51826242840204e-05, "loss": 0.2248, "step": 4562 }, { "epoch": 0.919605077574048, "grad_norm": 0.0520995669066906, "learning_rate": 9.517691537944145e-05, "loss": 0.2318, "step": 4564 }, { "epoch": 0.9200080596413459, "grad_norm": 0.050771716982126236, "learning_rate": 9.517120326556666e-05, "loss": 0.2165, "step": 4566 }, { "epoch": 0.920411041708644, "grad_norm": 0.055689260363578796, "learning_rate": 9.516548794280185e-05, "loss": 0.1839, "step": 4568 }, { "epoch": 0.920814023775942, "grad_norm": 0.05374159663915634, "learning_rate": 9.5159769411553e-05, "loss": 0.175, "step": 4570 }, { "epoch": 0.92121700584324, "grad_norm": 0.07564659416675568, "learning_rate": 9.515404767222636e-05, "loss": 0.2277, "step": 4572 }, { "epoch": 0.921619987910538, "grad_norm": 0.05168713629245758, "learning_rate": 9.514832272522838e-05, "loss": 0.2263, "step": 4574 }, { "epoch": 0.922022969977836, "grad_norm": 0.05899692326784134, "learning_rate": 9.514259457096578e-05, "loss": 0.2301, "step": 4576 }, { "epoch": 0.922425952045134, "grad_norm": 0.040742505341768265, "learning_rate": 9.513686320984543e-05, "loss": 0.1854, "step": 4578 }, { "epoch": 0.9228289341124319, "grad_norm": 0.04289477691054344, "learning_rate": 9.513112864227451e-05, "loss": 0.1922, "step": 4580 }, { "epoch": 0.92323191617973, "grad_norm": 0.057433657348155975, "learning_rate": 9.512539086866038e-05, "loss": 0.1583, "step": 4582 }, { "epoch": 0.923634898247028, "grad_norm": 0.09156624227762222, "learning_rate": 9.511964988941067e-05, "loss": 0.1966, "step": 4584 }, { "epoch": 0.924037880314326, "grad_norm": 0.04218217730522156, "learning_rate": 9.511390570493317e-05, "loss": 0.1794, "step": 4586 }, { "epoch": 0.924440862381624, "grad_norm": 0.04996601492166519, "learning_rate": 9.510815831563596e-05, "loss": 0.2086, "step": 4588 }, { "epoch": 0.924843844448922, "grad_norm": 0.21532565355300903, "learning_rate": 9.510240772192733e-05, "loss": 0.2615, "step": 4590 }, { "epoch": 0.92524682651622, "grad_norm": 0.06121666356921196, "learning_rate": 9.509665392421579e-05, "loss": 0.233, "step": 4592 }, { "epoch": 0.9256498085835181, "grad_norm": 0.04677826538681984, "learning_rate": 9.509089692291006e-05, "loss": 0.1455, "step": 4594 }, { "epoch": 0.926052790650816, "grad_norm": 0.043626219034194946, "learning_rate": 9.508513671841914e-05, "loss": 0.1787, "step": 4596 }, { "epoch": 0.9264557727181141, "grad_norm": 0.04962493106722832, "learning_rate": 9.507937331115222e-05, "loss": 0.193, "step": 4598 }, { "epoch": 0.926858754785412, "grad_norm": 0.05599873512983322, "learning_rate": 9.50736067015187e-05, "loss": 0.1929, "step": 4600 }, { "epoch": 0.9272617368527101, "grad_norm": 0.0480925627052784, "learning_rate": 9.506783688992824e-05, "loss": 0.1696, "step": 4602 }, { "epoch": 0.927664718920008, "grad_norm": 0.05178828909993172, "learning_rate": 9.506206387679073e-05, "loss": 0.1668, "step": 4604 }, { "epoch": 0.9280677009873061, "grad_norm": 0.04958764463663101, "learning_rate": 9.505628766251628e-05, "loss": 0.1871, "step": 4606 }, { "epoch": 0.9284706830546041, "grad_norm": 0.05447972193360329, "learning_rate": 9.50505082475152e-05, "loss": 0.2132, "step": 4608 }, { "epoch": 0.9288736651219021, "grad_norm": 0.05043847858905792, "learning_rate": 9.504472563219805e-05, "loss": 0.2384, "step": 4610 }, { "epoch": 0.9292766471892001, "grad_norm": 0.053794488310813904, "learning_rate": 9.503893981697565e-05, "loss": 0.2195, "step": 4612 }, { "epoch": 0.929679629256498, "grad_norm": 0.044235896319150925, "learning_rate": 9.503315080225897e-05, "loss": 0.1764, "step": 4614 }, { "epoch": 0.9300826113237961, "grad_norm": 0.052948713302612305, "learning_rate": 9.50273585884593e-05, "loss": 0.1739, "step": 4616 }, { "epoch": 0.930485593391094, "grad_norm": 0.054978448897600174, "learning_rate": 9.502156317598807e-05, "loss": 0.2081, "step": 4618 }, { "epoch": 0.9308885754583921, "grad_norm": 0.0541086308658123, "learning_rate": 9.501576456525701e-05, "loss": 0.2139, "step": 4620 }, { "epoch": 0.9312915575256902, "grad_norm": 0.06622260063886642, "learning_rate": 9.500996275667802e-05, "loss": 0.1911, "step": 4622 }, { "epoch": 0.9316945395929881, "grad_norm": 0.05048837512731552, "learning_rate": 9.500415775066324e-05, "loss": 0.2073, "step": 4624 }, { "epoch": 0.9320975216602861, "grad_norm": 0.04079214856028557, "learning_rate": 9.49983495476251e-05, "loss": 0.1657, "step": 4626 }, { "epoch": 0.9325005037275841, "grad_norm": 0.054248470813035965, "learning_rate": 9.499253814797615e-05, "loss": 0.2275, "step": 4628 }, { "epoch": 0.9329034857948821, "grad_norm": 0.05754832178354263, "learning_rate": 9.498672355212925e-05, "loss": 0.1923, "step": 4630 }, { "epoch": 0.9333064678621801, "grad_norm": 0.05790744349360466, "learning_rate": 9.498090576049745e-05, "loss": 0.2118, "step": 4632 }, { "epoch": 0.9337094499294781, "grad_norm": 0.08360709995031357, "learning_rate": 9.497508477349406e-05, "loss": 0.2253, "step": 4634 }, { "epoch": 0.9341124319967762, "grad_norm": 0.11953411996364594, "learning_rate": 9.496926059153254e-05, "loss": 0.1517, "step": 4636 }, { "epoch": 0.9345154140640741, "grad_norm": 0.048100925981998444, "learning_rate": 9.49634332150267e-05, "loss": 0.2065, "step": 4638 }, { "epoch": 0.9349183961313722, "grad_norm": 0.04823688790202141, "learning_rate": 9.495760264439046e-05, "loss": 0.1974, "step": 4640 }, { "epoch": 0.9353213781986701, "grad_norm": 0.05790715292096138, "learning_rate": 9.495176888003803e-05, "loss": 0.2179, "step": 4642 }, { "epoch": 0.9357243602659682, "grad_norm": 0.05937306210398674, "learning_rate": 9.494593192238382e-05, "loss": 0.1792, "step": 4644 }, { "epoch": 0.9361273423332662, "grad_norm": 0.06410137563943863, "learning_rate": 9.494009177184248e-05, "loss": 0.1481, "step": 4646 }, { "epoch": 0.9365303244005642, "grad_norm": 0.03937869146466255, "learning_rate": 9.493424842882892e-05, "loss": 0.1937, "step": 4648 }, { "epoch": 0.9369333064678622, "grad_norm": 0.05343402177095413, "learning_rate": 9.492840189375819e-05, "loss": 0.2059, "step": 4650 }, { "epoch": 0.9373362885351602, "grad_norm": 0.05615449324250221, "learning_rate": 9.492255216704564e-05, "loss": 0.2398, "step": 4652 }, { "epoch": 0.9377392706024582, "grad_norm": 0.07987558841705322, "learning_rate": 9.491669924910684e-05, "loss": 0.2611, "step": 4654 }, { "epoch": 0.9381422526697561, "grad_norm": 0.042240921407938004, "learning_rate": 9.491084314035756e-05, "loss": 0.1426, "step": 4656 }, { "epoch": 0.9385452347370542, "grad_norm": 0.05875542387366295, "learning_rate": 9.49049838412138e-05, "loss": 0.2371, "step": 4658 }, { "epoch": 0.9389482168043523, "grad_norm": 0.05499307066202164, "learning_rate": 9.48991213520918e-05, "loss": 0.1818, "step": 4660 }, { "epoch": 0.9393511988716502, "grad_norm": 0.060871563851833344, "learning_rate": 9.489325567340804e-05, "loss": 0.1856, "step": 4662 }, { "epoch": 0.9397541809389482, "grad_norm": 0.10326692461967468, "learning_rate": 9.488738680557919e-05, "loss": 0.2386, "step": 4664 }, { "epoch": 0.9401571630062462, "grad_norm": 0.04238074645400047, "learning_rate": 9.488151474902215e-05, "loss": 0.1813, "step": 4666 }, { "epoch": 0.9405601450735442, "grad_norm": 0.07186947017908096, "learning_rate": 9.487563950415409e-05, "loss": 0.1766, "step": 4668 }, { "epoch": 0.9409631271408422, "grad_norm": 0.04443906992673874, "learning_rate": 9.486976107139237e-05, "loss": 0.171, "step": 4670 }, { "epoch": 0.9413661092081402, "grad_norm": 0.06593028455972672, "learning_rate": 9.486387945115458e-05, "loss": 0.1799, "step": 4672 }, { "epoch": 0.9417690912754383, "grad_norm": 0.09184325486421585, "learning_rate": 9.485799464385854e-05, "loss": 0.2211, "step": 4674 }, { "epoch": 0.9421720733427362, "grad_norm": 0.05752957612276077, "learning_rate": 9.48521066499223e-05, "loss": 0.2312, "step": 4676 }, { "epoch": 0.9425750554100343, "grad_norm": 0.06228434666991234, "learning_rate": 9.484621546976415e-05, "loss": 0.1777, "step": 4678 }, { "epoch": 0.9429780374773322, "grad_norm": 0.06209741532802582, "learning_rate": 9.484032110380256e-05, "loss": 0.2325, "step": 4680 }, { "epoch": 0.9433810195446303, "grad_norm": 0.04441828653216362, "learning_rate": 9.483442355245626e-05, "loss": 0.1211, "step": 4682 }, { "epoch": 0.9437840016119282, "grad_norm": 0.039253611117601395, "learning_rate": 9.482852281614423e-05, "loss": 0.1761, "step": 4684 }, { "epoch": 0.9441869836792263, "grad_norm": 0.04427387937903404, "learning_rate": 9.482261889528563e-05, "loss": 0.1874, "step": 4686 }, { "epoch": 0.9445899657465243, "grad_norm": 0.056583307683467865, "learning_rate": 9.481671179029985e-05, "loss": 0.2145, "step": 4688 }, { "epoch": 0.9449929478138223, "grad_norm": 0.04767727851867676, "learning_rate": 9.481080150160656e-05, "loss": 0.1776, "step": 4690 }, { "epoch": 0.9453959298811203, "grad_norm": 0.04897291958332062, "learning_rate": 9.480488802962559e-05, "loss": 0.1925, "step": 4692 }, { "epoch": 0.9457989119484183, "grad_norm": 0.06027200073003769, "learning_rate": 9.479897137477702e-05, "loss": 0.2359, "step": 4694 }, { "epoch": 0.9462018940157163, "grad_norm": 0.04494505375623703, "learning_rate": 9.479305153748116e-05, "loss": 0.1743, "step": 4696 }, { "epoch": 0.9466048760830144, "grad_norm": 0.060913268476724625, "learning_rate": 9.478712851815858e-05, "loss": 0.1648, "step": 4698 }, { "epoch": 0.9470078581503123, "grad_norm": 0.04992394894361496, "learning_rate": 9.478120231723001e-05, "loss": 0.1343, "step": 4700 }, { "epoch": 0.9474108402176104, "grad_norm": 0.04095543920993805, "learning_rate": 9.477527293511644e-05, "loss": 0.1663, "step": 4702 }, { "epoch": 0.9478138222849083, "grad_norm": 0.066753089427948, "learning_rate": 9.476934037223909e-05, "loss": 0.2191, "step": 4704 }, { "epoch": 0.9482168043522063, "grad_norm": 0.06055450811982155, "learning_rate": 9.47634046290194e-05, "loss": 0.209, "step": 4706 }, { "epoch": 0.9486197864195043, "grad_norm": 0.045478709042072296, "learning_rate": 9.475746570587903e-05, "loss": 0.2269, "step": 4708 }, { "epoch": 0.9490227684868023, "grad_norm": 0.06353277713060379, "learning_rate": 9.475152360323987e-05, "loss": 0.2201, "step": 4710 }, { "epoch": 0.9494257505541004, "grad_norm": 0.05817262828350067, "learning_rate": 9.474557832152405e-05, "loss": 0.2127, "step": 4712 }, { "epoch": 0.9498287326213983, "grad_norm": 0.05550335347652435, "learning_rate": 9.47396298611539e-05, "loss": 0.2479, "step": 4714 }, { "epoch": 0.9502317146886964, "grad_norm": 0.06964296102523804, "learning_rate": 9.473367822255202e-05, "loss": 0.2014, "step": 4716 }, { "epoch": 0.9506346967559943, "grad_norm": 0.14811141788959503, "learning_rate": 9.472772340614115e-05, "loss": 0.2747, "step": 4718 }, { "epoch": 0.9510376788232924, "grad_norm": 0.05050405487418175, "learning_rate": 9.472176541234435e-05, "loss": 0.2006, "step": 4720 }, { "epoch": 0.9514406608905903, "grad_norm": 0.038916632533073425, "learning_rate": 9.471580424158486e-05, "loss": 0.1612, "step": 4722 }, { "epoch": 0.9518436429578884, "grad_norm": 0.05323829501867294, "learning_rate": 9.470983989428615e-05, "loss": 0.1914, "step": 4724 }, { "epoch": 0.9522466250251864, "grad_norm": 0.06356607377529144, "learning_rate": 9.47038723708719e-05, "loss": 0.2505, "step": 4726 }, { "epoch": 0.9526496070924844, "grad_norm": 0.04368700832128525, "learning_rate": 9.469790167176606e-05, "loss": 0.1307, "step": 4728 }, { "epoch": 0.9530525891597824, "grad_norm": 0.050323549658060074, "learning_rate": 9.469192779739278e-05, "loss": 0.1773, "step": 4730 }, { "epoch": 0.9534555712270804, "grad_norm": 0.08620080351829529, "learning_rate": 9.468595074817641e-05, "loss": 0.2042, "step": 4732 }, { "epoch": 0.9538585532943784, "grad_norm": 0.04472964629530907, "learning_rate": 9.467997052454157e-05, "loss": 0.1965, "step": 4734 }, { "epoch": 0.9542615353616765, "grad_norm": 0.04990183562040329, "learning_rate": 9.467398712691308e-05, "loss": 0.1418, "step": 4736 }, { "epoch": 0.9546645174289744, "grad_norm": 0.05777883529663086, "learning_rate": 9.466800055571599e-05, "loss": 0.1987, "step": 4738 }, { "epoch": 0.9550674994962725, "grad_norm": 0.04438330978155136, "learning_rate": 9.466201081137557e-05, "loss": 0.2193, "step": 4740 }, { "epoch": 0.9554704815635704, "grad_norm": 0.07162509113550186, "learning_rate": 9.465601789431733e-05, "loss": 0.1679, "step": 4742 }, { "epoch": 0.9558734636308684, "grad_norm": 0.07432812452316284, "learning_rate": 9.465002180496701e-05, "loss": 0.2137, "step": 4744 }, { "epoch": 0.9562764456981664, "grad_norm": 0.045975614339113235, "learning_rate": 9.464402254375053e-05, "loss": 0.174, "step": 4746 }, { "epoch": 0.9566794277654644, "grad_norm": 0.04282781109213829, "learning_rate": 9.463802011109409e-05, "loss": 0.2672, "step": 4748 }, { "epoch": 0.9570824098327625, "grad_norm": 0.06693075597286224, "learning_rate": 9.46320145074241e-05, "loss": 0.1993, "step": 4750 }, { "epoch": 0.9574853919000604, "grad_norm": 0.07140269875526428, "learning_rate": 9.462600573316715e-05, "loss": 0.182, "step": 4752 }, { "epoch": 0.9578883739673585, "grad_norm": 0.0505669005215168, "learning_rate": 9.461999378875015e-05, "loss": 0.1665, "step": 4754 }, { "epoch": 0.9582913560346564, "grad_norm": 0.057108256965875626, "learning_rate": 9.461397867460014e-05, "loss": 0.1331, "step": 4756 }, { "epoch": 0.9586943381019545, "grad_norm": 0.05740061402320862, "learning_rate": 9.460796039114443e-05, "loss": 0.1687, "step": 4758 }, { "epoch": 0.9590973201692524, "grad_norm": 0.0820176899433136, "learning_rate": 9.460193893881057e-05, "loss": 0.2129, "step": 4760 }, { "epoch": 0.9595003022365505, "grad_norm": 0.06103077530860901, "learning_rate": 9.459591431802628e-05, "loss": 0.1989, "step": 4762 }, { "epoch": 0.9599032843038485, "grad_norm": 0.05346281826496124, "learning_rate": 9.458988652921957e-05, "loss": 0.2003, "step": 4764 }, { "epoch": 0.9603062663711465, "grad_norm": 0.0755985677242279, "learning_rate": 9.458385557281862e-05, "loss": 0.1845, "step": 4766 }, { "epoch": 0.9607092484384445, "grad_norm": 0.053794801235198975, "learning_rate": 9.457782144925188e-05, "loss": 0.2163, "step": 4768 }, { "epoch": 0.9611122305057425, "grad_norm": 0.04686738923192024, "learning_rate": 9.4571784158948e-05, "loss": 0.2205, "step": 4770 }, { "epoch": 0.9615152125730405, "grad_norm": 0.050479158759117126, "learning_rate": 9.456574370233584e-05, "loss": 0.1789, "step": 4772 }, { "epoch": 0.9619181946403385, "grad_norm": 0.0576460175216198, "learning_rate": 9.455970007984453e-05, "loss": 0.1383, "step": 4774 }, { "epoch": 0.9623211767076365, "grad_norm": 0.06098479777574539, "learning_rate": 9.45536532919034e-05, "loss": 0.1946, "step": 4776 }, { "epoch": 0.9627241587749346, "grad_norm": 0.057289186865091324, "learning_rate": 9.454760333894197e-05, "loss": 0.2384, "step": 4778 }, { "epoch": 0.9631271408422325, "grad_norm": 0.05714469403028488, "learning_rate": 9.454155022139006e-05, "loss": 0.2139, "step": 4780 }, { "epoch": 0.9635301229095306, "grad_norm": 0.06614606082439423, "learning_rate": 9.453549393967764e-05, "loss": 0.2204, "step": 4782 }, { "epoch": 0.9639331049768285, "grad_norm": 0.050378940999507904, "learning_rate": 9.452943449423497e-05, "loss": 0.2274, "step": 4784 }, { "epoch": 0.9643360870441265, "grad_norm": 0.06546289473772049, "learning_rate": 9.452337188549248e-05, "loss": 0.245, "step": 4786 }, { "epoch": 0.9647390691114246, "grad_norm": 0.04873489961028099, "learning_rate": 9.451730611388086e-05, "loss": 0.2005, "step": 4788 }, { "epoch": 0.9651420511787225, "grad_norm": 0.04178796708583832, "learning_rate": 9.451123717983101e-05, "loss": 0.1826, "step": 4790 }, { "epoch": 0.9655450332460206, "grad_norm": 0.05364158749580383, "learning_rate": 9.450516508377405e-05, "loss": 0.1845, "step": 4792 }, { "epoch": 0.9659480153133185, "grad_norm": 0.05494103580713272, "learning_rate": 9.449908982614133e-05, "loss": 0.2375, "step": 4794 }, { "epoch": 0.9663509973806166, "grad_norm": 0.04489286616444588, "learning_rate": 9.449301140736446e-05, "loss": 0.2198, "step": 4796 }, { "epoch": 0.9667539794479145, "grad_norm": 0.07947932928800583, "learning_rate": 9.44869298278752e-05, "loss": 0.2555, "step": 4798 }, { "epoch": 0.9671569615152126, "grad_norm": 0.05839437618851662, "learning_rate": 9.448084508810559e-05, "loss": 0.2151, "step": 4800 }, { "epoch": 0.9675599435825106, "grad_norm": 0.04776003211736679, "learning_rate": 9.447475718848788e-05, "loss": 0.183, "step": 4802 }, { "epoch": 0.9679629256498086, "grad_norm": 0.04061594605445862, "learning_rate": 9.446866612945455e-05, "loss": 0.1549, "step": 4804 }, { "epoch": 0.9683659077171066, "grad_norm": 0.042147375643253326, "learning_rate": 9.44625719114383e-05, "loss": 0.1591, "step": 4806 }, { "epoch": 0.9687688897844046, "grad_norm": 0.05506439134478569, "learning_rate": 9.445647453487204e-05, "loss": 0.2761, "step": 4808 }, { "epoch": 0.9691718718517026, "grad_norm": 0.04046489670872688, "learning_rate": 9.445037400018892e-05, "loss": 0.1516, "step": 4810 }, { "epoch": 0.9695748539190006, "grad_norm": 0.04889726638793945, "learning_rate": 9.444427030782234e-05, "loss": 0.147, "step": 4812 }, { "epoch": 0.9699778359862986, "grad_norm": 0.049571387469768524, "learning_rate": 9.443816345820587e-05, "loss": 0.2026, "step": 4814 }, { "epoch": 0.9703808180535967, "grad_norm": 0.06491173058748245, "learning_rate": 9.443205345177333e-05, "loss": 0.1986, "step": 4816 }, { "epoch": 0.9707838001208946, "grad_norm": 0.04497947171330452, "learning_rate": 9.442594028895877e-05, "loss": 0.2304, "step": 4818 }, { "epoch": 0.9711867821881927, "grad_norm": 0.045850589871406555, "learning_rate": 9.441982397019647e-05, "loss": 0.194, "step": 4820 }, { "epoch": 0.9715897642554906, "grad_norm": 0.0547361746430397, "learning_rate": 9.44137044959209e-05, "loss": 0.1766, "step": 4822 }, { "epoch": 0.9719927463227886, "grad_norm": 0.06350360810756683, "learning_rate": 9.44075818665668e-05, "loss": 0.2033, "step": 4824 }, { "epoch": 0.9723957283900866, "grad_norm": 0.05815676599740982, "learning_rate": 9.44014560825691e-05, "loss": 0.2162, "step": 4826 }, { "epoch": 0.9727987104573846, "grad_norm": 0.047649532556533813, "learning_rate": 9.439532714436297e-05, "loss": 0.2091, "step": 4828 }, { "epoch": 0.9732016925246827, "grad_norm": 0.05103228986263275, "learning_rate": 9.43891950523838e-05, "loss": 0.1958, "step": 4830 }, { "epoch": 0.9736046745919806, "grad_norm": 0.05040536820888519, "learning_rate": 9.438305980706721e-05, "loss": 0.2203, "step": 4832 }, { "epoch": 0.9740076566592787, "grad_norm": 0.03978874534368515, "learning_rate": 9.437692140884902e-05, "loss": 0.1667, "step": 4834 }, { "epoch": 0.9744106387265766, "grad_norm": 0.03923666477203369, "learning_rate": 9.437077985816532e-05, "loss": 0.1951, "step": 4836 }, { "epoch": 0.9748136207938747, "grad_norm": 0.047468025237321854, "learning_rate": 9.436463515545237e-05, "loss": 0.1986, "step": 4838 }, { "epoch": 0.9752166028611727, "grad_norm": 0.04159865155816078, "learning_rate": 9.435848730114668e-05, "loss": 0.2168, "step": 4840 }, { "epoch": 0.9756195849284707, "grad_norm": 0.04819134995341301, "learning_rate": 9.4352336295685e-05, "loss": 0.1822, "step": 4842 }, { "epoch": 0.9760225669957687, "grad_norm": 0.042101290076971054, "learning_rate": 9.434618213950428e-05, "loss": 0.152, "step": 4844 }, { "epoch": 0.9764255490630667, "grad_norm": 0.060625553131103516, "learning_rate": 9.434002483304172e-05, "loss": 0.2029, "step": 4846 }, { "epoch": 0.9768285311303647, "grad_norm": 0.04648581147193909, "learning_rate": 9.433386437673468e-05, "loss": 0.1906, "step": 4848 }, { "epoch": 0.9772315131976627, "grad_norm": 0.045150671154260635, "learning_rate": 9.432770077102084e-05, "loss": 0.1537, "step": 4850 }, { "epoch": 0.9776344952649607, "grad_norm": 0.0670241042971611, "learning_rate": 9.4321534016338e-05, "loss": 0.2136, "step": 4852 }, { "epoch": 0.9780374773322588, "grad_norm": 0.04682036116719246, "learning_rate": 9.431536411312429e-05, "loss": 0.1993, "step": 4854 }, { "epoch": 0.9784404593995567, "grad_norm": 0.05474329739809036, "learning_rate": 9.430919106181799e-05, "loss": 0.195, "step": 4856 }, { "epoch": 0.9788434414668548, "grad_norm": 0.09272141754627228, "learning_rate": 9.43030148628576e-05, "loss": 0.2943, "step": 4858 }, { "epoch": 0.9792464235341527, "grad_norm": 0.044193752110004425, "learning_rate": 9.429683551668189e-05, "loss": 0.1576, "step": 4860 }, { "epoch": 0.9796494056014508, "grad_norm": 0.04668412357568741, "learning_rate": 9.429065302372984e-05, "loss": 0.203, "step": 4862 }, { "epoch": 0.9800523876687487, "grad_norm": 0.049144770950078964, "learning_rate": 9.42844673844406e-05, "loss": 0.2408, "step": 4864 }, { "epoch": 0.9804553697360467, "grad_norm": 0.05974644795060158, "learning_rate": 9.427827859925366e-05, "loss": 0.2047, "step": 4866 }, { "epoch": 0.9808583518033448, "grad_norm": 0.059690799564123154, "learning_rate": 9.427208666860859e-05, "loss": 0.232, "step": 4868 }, { "epoch": 0.9812613338706427, "grad_norm": 0.05260715261101723, "learning_rate": 9.42658915929453e-05, "loss": 0.1786, "step": 4870 }, { "epoch": 0.9816643159379408, "grad_norm": 0.0499386303126812, "learning_rate": 9.425969337270386e-05, "loss": 0.2155, "step": 4872 }, { "epoch": 0.9820672980052387, "grad_norm": 0.03995073586702347, "learning_rate": 9.425349200832459e-05, "loss": 0.1629, "step": 4874 }, { "epoch": 0.9824702800725368, "grad_norm": 0.04229956492781639, "learning_rate": 9.424728750024802e-05, "loss": 0.1997, "step": 4876 }, { "epoch": 0.9828732621398347, "grad_norm": 0.04232575744390488, "learning_rate": 9.424107984891491e-05, "loss": 0.2051, "step": 4878 }, { "epoch": 0.9832762442071328, "grad_norm": 0.05002473667263985, "learning_rate": 9.423486905476624e-05, "loss": 0.1571, "step": 4880 }, { "epoch": 0.9836792262744308, "grad_norm": 0.039196085184812546, "learning_rate": 9.422865511824322e-05, "loss": 0.2116, "step": 4882 }, { "epoch": 0.9840822083417288, "grad_norm": 0.0501602478325367, "learning_rate": 9.422243803978726e-05, "loss": 0.1995, "step": 4884 }, { "epoch": 0.9844851904090268, "grad_norm": 0.0668732151389122, "learning_rate": 9.421621781984004e-05, "loss": 0.2035, "step": 4886 }, { "epoch": 0.9848881724763248, "grad_norm": 0.0779421254992485, "learning_rate": 9.42099944588434e-05, "loss": 0.2211, "step": 4888 }, { "epoch": 0.9852911545436228, "grad_norm": 0.052955832332372665, "learning_rate": 9.420376795723947e-05, "loss": 0.2195, "step": 4890 }, { "epoch": 0.9856941366109209, "grad_norm": 0.05393822491168976, "learning_rate": 9.419753831547056e-05, "loss": 0.2528, "step": 4892 }, { "epoch": 0.9860971186782188, "grad_norm": 0.05363127589225769, "learning_rate": 9.419130553397921e-05, "loss": 0.2517, "step": 4894 }, { "epoch": 0.9865001007455169, "grad_norm": 0.05999777838587761, "learning_rate": 9.418506961320819e-05, "loss": 0.191, "step": 4896 }, { "epoch": 0.9869030828128148, "grad_norm": 0.05673222243785858, "learning_rate": 9.417883055360048e-05, "loss": 0.2017, "step": 4898 }, { "epoch": 0.9873060648801129, "grad_norm": 0.04215675964951515, "learning_rate": 9.417258835559931e-05, "loss": 0.2031, "step": 4900 }, { "epoch": 0.9877090469474108, "grad_norm": 0.03936833515763283, "learning_rate": 9.41663430196481e-05, "loss": 0.2285, "step": 4902 }, { "epoch": 0.9881120290147088, "grad_norm": 0.05112620070576668, "learning_rate": 9.416009454619053e-05, "loss": 0.2145, "step": 4904 }, { "epoch": 0.9885150110820069, "grad_norm": 0.052898604422807693, "learning_rate": 9.415384293567045e-05, "loss": 0.1976, "step": 4906 }, { "epoch": 0.9889179931493048, "grad_norm": 0.04791862890124321, "learning_rate": 9.414758818853198e-05, "loss": 0.2208, "step": 4908 }, { "epoch": 0.9893209752166029, "grad_norm": 0.057992760092020035, "learning_rate": 9.414133030521946e-05, "loss": 0.1953, "step": 4910 }, { "epoch": 0.9897239572839008, "grad_norm": 0.045697104185819626, "learning_rate": 9.413506928617744e-05, "loss": 0.1606, "step": 4912 }, { "epoch": 0.9901269393511989, "grad_norm": 0.05711643025279045, "learning_rate": 9.412880513185065e-05, "loss": 0.1584, "step": 4914 }, { "epoch": 0.9905299214184968, "grad_norm": 0.06118881329894066, "learning_rate": 9.412253784268414e-05, "loss": 0.1672, "step": 4916 }, { "epoch": 0.9909329034857949, "grad_norm": 0.048959724605083466, "learning_rate": 9.411626741912309e-05, "loss": 0.2152, "step": 4918 }, { "epoch": 0.9913358855530929, "grad_norm": 0.06793203949928284, "learning_rate": 9.410999386161297e-05, "loss": 0.1952, "step": 4920 }, { "epoch": 0.9917388676203909, "grad_norm": 0.04434814676642418, "learning_rate": 9.410371717059943e-05, "loss": 0.2083, "step": 4922 }, { "epoch": 0.9921418496876889, "grad_norm": 0.05386332795023918, "learning_rate": 9.409743734652834e-05, "loss": 0.1835, "step": 4924 }, { "epoch": 0.9925448317549869, "grad_norm": 0.05638045817613602, "learning_rate": 9.409115438984584e-05, "loss": 0.1961, "step": 4926 }, { "epoch": 0.9929478138222849, "grad_norm": 0.08104156702756882, "learning_rate": 9.408486830099824e-05, "loss": 0.266, "step": 4928 }, { "epoch": 0.9933507958895829, "grad_norm": 0.054531458765268326, "learning_rate": 9.40785790804321e-05, "loss": 0.2087, "step": 4930 }, { "epoch": 0.9937537779568809, "grad_norm": 0.051535993814468384, "learning_rate": 9.40722867285942e-05, "loss": 0.2245, "step": 4932 }, { "epoch": 0.994156760024179, "grad_norm": 0.06591679900884628, "learning_rate": 9.406599124593152e-05, "loss": 0.1696, "step": 4934 }, { "epoch": 0.9945597420914769, "grad_norm": 0.04971354827284813, "learning_rate": 9.405969263289131e-05, "loss": 0.1651, "step": 4936 }, { "epoch": 0.994962724158775, "grad_norm": 0.06742224842309952, "learning_rate": 9.405339088992099e-05, "loss": 0.2121, "step": 4938 }, { "epoch": 0.9953657062260729, "grad_norm": 0.061622168868780136, "learning_rate": 9.404708601746823e-05, "loss": 0.2156, "step": 4940 }, { "epoch": 0.995768688293371, "grad_norm": 0.04887613281607628, "learning_rate": 9.404077801598093e-05, "loss": 0.1869, "step": 4942 }, { "epoch": 0.996171670360669, "grad_norm": 0.051833122968673706, "learning_rate": 9.403446688590719e-05, "loss": 0.1906, "step": 4944 }, { "epoch": 0.996574652427967, "grad_norm": 0.04879339411854744, "learning_rate": 9.402815262769536e-05, "loss": 0.1643, "step": 4946 }, { "epoch": 0.996977634495265, "grad_norm": 0.045783065259456635, "learning_rate": 9.402183524179395e-05, "loss": 0.2048, "step": 4948 }, { "epoch": 0.9973806165625629, "grad_norm": 0.048180919140577316, "learning_rate": 9.401551472865179e-05, "loss": 0.1594, "step": 4950 }, { "epoch": 0.997783598629861, "grad_norm": 0.04826292395591736, "learning_rate": 9.400919108871783e-05, "loss": 0.2254, "step": 4952 }, { "epoch": 0.9981865806971589, "grad_norm": 0.0434853732585907, "learning_rate": 9.400286432244135e-05, "loss": 0.2108, "step": 4954 }, { "epoch": 0.998589562764457, "grad_norm": 0.04653545469045639, "learning_rate": 9.399653443027175e-05, "loss": 0.2503, "step": 4956 }, { "epoch": 0.998992544831755, "grad_norm": 0.05442295968532562, "learning_rate": 9.399020141265871e-05, "loss": 0.2082, "step": 4958 }, { "epoch": 0.999395526899053, "grad_norm": 0.05237448215484619, "learning_rate": 9.39838652700521e-05, "loss": 0.1517, "step": 4960 }, { "epoch": 0.999798508966351, "grad_norm": 0.04914592206478119, "learning_rate": 9.397752600290205e-05, "loss": 0.1774, "step": 4962 }, { "epoch": 1.000201491033649, "grad_norm": 0.03976700082421303, "learning_rate": 9.397118361165889e-05, "loss": 0.1439, "step": 4964 }, { "epoch": 1.000604473100947, "grad_norm": 0.048117585480213165, "learning_rate": 9.396483809677316e-05, "loss": 0.2467, "step": 4966 }, { "epoch": 1.001007455168245, "grad_norm": 0.04981888830661774, "learning_rate": 9.395848945869564e-05, "loss": 0.2025, "step": 4968 }, { "epoch": 1.001410437235543, "grad_norm": 0.04174968972802162, "learning_rate": 9.395213769787734e-05, "loss": 0.2232, "step": 4970 }, { "epoch": 1.001813419302841, "grad_norm": 0.05808541178703308, "learning_rate": 9.394578281476946e-05, "loss": 0.1699, "step": 4972 }, { "epoch": 1.0022164013701391, "grad_norm": 0.059681493788957596, "learning_rate": 9.393942480982345e-05, "loss": 0.2036, "step": 4974 }, { "epoch": 1.002619383437437, "grad_norm": 0.04494740068912506, "learning_rate": 9.393306368349099e-05, "loss": 0.1592, "step": 4976 }, { "epoch": 1.003022365504735, "grad_norm": 0.048827823251485825, "learning_rate": 9.392669943622391e-05, "loss": 0.2018, "step": 4978 }, { "epoch": 1.003425347572033, "grad_norm": 0.05513651296496391, "learning_rate": 9.39203320684744e-05, "loss": 0.1701, "step": 4980 }, { "epoch": 1.003828329639331, "grad_norm": 0.0650024265050888, "learning_rate": 9.39139615806947e-05, "loss": 0.2187, "step": 4982 }, { "epoch": 1.004231311706629, "grad_norm": 0.05801888927817345, "learning_rate": 9.390758797333742e-05, "loss": 0.2212, "step": 4984 }, { "epoch": 1.004634293773927, "grad_norm": 0.05577477440237999, "learning_rate": 9.39012112468553e-05, "loss": 0.2248, "step": 4986 }, { "epoch": 1.0050372758412252, "grad_norm": 0.053429532796144485, "learning_rate": 9.389483140170134e-05, "loss": 0.2302, "step": 4988 }, { "epoch": 1.005440257908523, "grad_norm": 0.04849427193403244, "learning_rate": 9.388844843832878e-05, "loss": 0.1972, "step": 4990 }, { "epoch": 1.005843239975821, "grad_norm": 0.04170486330986023, "learning_rate": 9.388206235719102e-05, "loss": 0.2255, "step": 4992 }, { "epoch": 1.006246222043119, "grad_norm": 0.0438525527715683, "learning_rate": 9.387567315874171e-05, "loss": 0.1649, "step": 4994 }, { "epoch": 1.0066492041104171, "grad_norm": 0.041522156447172165, "learning_rate": 9.386928084343478e-05, "loss": 0.1447, "step": 4996 }, { "epoch": 1.007052186177715, "grad_norm": 0.05541053041815758, "learning_rate": 9.386288541172428e-05, "loss": 0.231, "step": 4998 }, { "epoch": 1.007455168245013, "grad_norm": 0.047914352267980576, "learning_rate": 9.385648686406454e-05, "loss": 0.1826, "step": 5000 }, { "epoch": 1.0078581503123112, "grad_norm": 0.05127349868416786, "learning_rate": 9.385008520091012e-05, "loss": 0.1679, "step": 5002 }, { "epoch": 1.0082611323796091, "grad_norm": 0.06173473596572876, "learning_rate": 9.384368042271577e-05, "loss": 0.1756, "step": 5004 }, { "epoch": 1.008664114446907, "grad_norm": 0.053565483540296555, "learning_rate": 9.383727252993649e-05, "loss": 0.1678, "step": 5006 }, { "epoch": 1.009067096514205, "grad_norm": 0.06874987483024597, "learning_rate": 9.383086152302747e-05, "loss": 0.1762, "step": 5008 }, { "epoch": 1.0094700785815032, "grad_norm": 0.06190743297338486, "learning_rate": 9.382444740244415e-05, "loss": 0.2323, "step": 5010 }, { "epoch": 1.0098730606488011, "grad_norm": 0.052252788096666336, "learning_rate": 9.381803016864216e-05, "loss": 0.1899, "step": 5012 }, { "epoch": 1.010276042716099, "grad_norm": 0.05020655691623688, "learning_rate": 9.38116098220774e-05, "loss": 0.1517, "step": 5014 }, { "epoch": 1.0106790247833972, "grad_norm": 0.05819331109523773, "learning_rate": 9.380518636320594e-05, "loss": 0.1582, "step": 5016 }, { "epoch": 1.0110820068506952, "grad_norm": 0.04920068383216858, "learning_rate": 9.37987597924841e-05, "loss": 0.1482, "step": 5018 }, { "epoch": 1.011484988917993, "grad_norm": 0.05984297767281532, "learning_rate": 9.37923301103684e-05, "loss": 0.2166, "step": 5020 }, { "epoch": 1.0118879709852913, "grad_norm": 0.03330834209918976, "learning_rate": 9.378589731731561e-05, "loss": 0.1439, "step": 5022 }, { "epoch": 1.0122909530525892, "grad_norm": 0.05390724912285805, "learning_rate": 9.37794614137827e-05, "loss": 0.1796, "step": 5024 }, { "epoch": 1.0126939351198871, "grad_norm": 0.04868139326572418, "learning_rate": 9.377302240022687e-05, "loss": 0.1869, "step": 5026 }, { "epoch": 1.013096917187185, "grad_norm": 0.04992436245083809, "learning_rate": 9.376658027710552e-05, "loss": 0.2336, "step": 5028 }, { "epoch": 1.0134998992544832, "grad_norm": 0.0564495250582695, "learning_rate": 9.37601350448763e-05, "loss": 0.165, "step": 5030 }, { "epoch": 1.0139028813217812, "grad_norm": 0.06212518364191055, "learning_rate": 9.375368670399709e-05, "loss": 0.1943, "step": 5032 }, { "epoch": 1.0143058633890791, "grad_norm": 0.06465470790863037, "learning_rate": 9.374723525492594e-05, "loss": 0.1914, "step": 5034 }, { "epoch": 1.0147088454563773, "grad_norm": 0.0705813467502594, "learning_rate": 9.374078069812116e-05, "loss": 0.1924, "step": 5036 }, { "epoch": 1.0151118275236752, "grad_norm": 0.06170513108372688, "learning_rate": 9.373432303404128e-05, "loss": 0.2444, "step": 5038 }, { "epoch": 1.0155148095909732, "grad_norm": 0.04834654927253723, "learning_rate": 9.372786226314503e-05, "loss": 0.1862, "step": 5040 }, { "epoch": 1.0159177916582711, "grad_norm": 0.04867038130760193, "learning_rate": 9.372139838589138e-05, "loss": 0.252, "step": 5042 }, { "epoch": 1.0163207737255693, "grad_norm": 0.03534555062651634, "learning_rate": 9.37149314027395e-05, "loss": 0.136, "step": 5044 }, { "epoch": 1.0167237557928672, "grad_norm": 0.06922519952058792, "learning_rate": 9.37084613141488e-05, "loss": 0.176, "step": 5046 }, { "epoch": 1.0171267378601652, "grad_norm": 0.044819701462984085, "learning_rate": 9.370198812057893e-05, "loss": 0.1751, "step": 5048 }, { "epoch": 1.0175297199274633, "grad_norm": 0.049208372831344604, "learning_rate": 9.36955118224897e-05, "loss": 0.2234, "step": 5050 }, { "epoch": 1.0179327019947613, "grad_norm": 0.048055894672870636, "learning_rate": 9.368903242034121e-05, "loss": 0.1297, "step": 5052 }, { "epoch": 1.0183356840620592, "grad_norm": 0.04360407590866089, "learning_rate": 9.368254991459371e-05, "loss": 0.1929, "step": 5054 }, { "epoch": 1.0187386661293572, "grad_norm": 0.06082189083099365, "learning_rate": 9.367606430570772e-05, "loss": 0.1798, "step": 5056 }, { "epoch": 1.0191416481966553, "grad_norm": 0.062363721430301666, "learning_rate": 9.366957559414399e-05, "loss": 0.2117, "step": 5058 }, { "epoch": 1.0195446302639533, "grad_norm": 0.07241443544626236, "learning_rate": 9.366308378036344e-05, "loss": 0.1657, "step": 5060 }, { "epoch": 1.0199476123312512, "grad_norm": 0.05443087965250015, "learning_rate": 9.365658886482725e-05, "loss": 0.1419, "step": 5062 }, { "epoch": 1.0203505943985494, "grad_norm": 0.05649973824620247, "learning_rate": 9.365009084799678e-05, "loss": 0.1858, "step": 5064 }, { "epoch": 1.0207535764658473, "grad_norm": 0.05326389893889427, "learning_rate": 9.36435897303337e-05, "loss": 0.1853, "step": 5066 }, { "epoch": 1.0211565585331452, "grad_norm": 0.05806842818856239, "learning_rate": 9.363708551229978e-05, "loss": 0.2173, "step": 5068 }, { "epoch": 1.0215595406004432, "grad_norm": 0.0407080352306366, "learning_rate": 9.36305781943571e-05, "loss": 0.1853, "step": 5070 }, { "epoch": 1.0219625226677413, "grad_norm": 0.07433430105447769, "learning_rate": 9.362406777696793e-05, "loss": 0.2404, "step": 5072 }, { "epoch": 1.0223655047350393, "grad_norm": 0.04541980102658272, "learning_rate": 9.361755426059473e-05, "loss": 0.1299, "step": 5074 }, { "epoch": 1.0227684868023372, "grad_norm": 0.057482391595840454, "learning_rate": 9.361103764570025e-05, "loss": 0.1706, "step": 5076 }, { "epoch": 1.0231714688696354, "grad_norm": 0.06770235300064087, "learning_rate": 9.36045179327474e-05, "loss": 0.2241, "step": 5078 }, { "epoch": 1.0235744509369333, "grad_norm": 0.058020222932100296, "learning_rate": 9.359799512219932e-05, "loss": 0.1901, "step": 5080 }, { "epoch": 1.0239774330042313, "grad_norm": 0.0436263270676136, "learning_rate": 9.35914692145194e-05, "loss": 0.1821, "step": 5082 }, { "epoch": 1.0243804150715292, "grad_norm": 0.0542580746114254, "learning_rate": 9.358494021017121e-05, "loss": 0.2041, "step": 5084 }, { "epoch": 1.0247833971388274, "grad_norm": 0.05703788995742798, "learning_rate": 9.35784081096186e-05, "loss": 0.2051, "step": 5086 }, { "epoch": 1.0251863792061253, "grad_norm": 0.035361260175704956, "learning_rate": 9.357187291332554e-05, "loss": 0.1488, "step": 5088 }, { "epoch": 1.0255893612734233, "grad_norm": 0.05394968390464783, "learning_rate": 9.356533462175632e-05, "loss": 0.1925, "step": 5090 }, { "epoch": 1.0259923433407214, "grad_norm": 0.056257717311382294, "learning_rate": 9.35587932353754e-05, "loss": 0.2336, "step": 5092 }, { "epoch": 1.0263953254080194, "grad_norm": 0.05867587774991989, "learning_rate": 9.355224875464748e-05, "loss": 0.2295, "step": 5094 }, { "epoch": 1.0267983074753173, "grad_norm": 0.05022086203098297, "learning_rate": 9.354570118003745e-05, "loss": 0.2216, "step": 5096 }, { "epoch": 1.0272012895426152, "grad_norm": 0.045713771134614944, "learning_rate": 9.353915051201046e-05, "loss": 0.1787, "step": 5098 }, { "epoch": 1.0276042716099134, "grad_norm": 0.043041035532951355, "learning_rate": 9.353259675103185e-05, "loss": 0.1466, "step": 5100 }, { "epoch": 1.0280072536772114, "grad_norm": 0.053802333772182465, "learning_rate": 9.352603989756717e-05, "loss": 0.1919, "step": 5102 }, { "epoch": 1.0284102357445093, "grad_norm": 0.049435701221227646, "learning_rate": 9.351947995208224e-05, "loss": 0.19, "step": 5104 }, { "epoch": 1.0288132178118075, "grad_norm": 0.049988001585006714, "learning_rate": 9.351291691504305e-05, "loss": 0.2154, "step": 5106 }, { "epoch": 1.0292161998791054, "grad_norm": 0.054042112082242966, "learning_rate": 9.350635078691583e-05, "loss": 0.153, "step": 5108 }, { "epoch": 1.0296191819464033, "grad_norm": 0.04441095143556595, "learning_rate": 9.349978156816702e-05, "loss": 0.1965, "step": 5110 }, { "epoch": 1.0300221640137015, "grad_norm": 0.04497204348444939, "learning_rate": 9.34932092592633e-05, "loss": 0.2281, "step": 5112 }, { "epoch": 1.0304251460809994, "grad_norm": 0.0403619110584259, "learning_rate": 9.348663386067156e-05, "loss": 0.1855, "step": 5114 }, { "epoch": 1.0308281281482974, "grad_norm": 0.05032897740602493, "learning_rate": 9.348005537285889e-05, "loss": 0.2216, "step": 5116 }, { "epoch": 1.0312311102155953, "grad_norm": 0.058560241013765335, "learning_rate": 9.347347379629262e-05, "loss": 0.2346, "step": 5118 }, { "epoch": 1.0316340922828935, "grad_norm": 0.04787188768386841, "learning_rate": 9.346688913144031e-05, "loss": 0.1653, "step": 5120 }, { "epoch": 1.0320370743501914, "grad_norm": 0.07317479699850082, "learning_rate": 9.34603013787697e-05, "loss": 0.1305, "step": 5122 }, { "epoch": 1.0324400564174894, "grad_norm": 0.07495363801717758, "learning_rate": 9.345371053874878e-05, "loss": 0.1945, "step": 5124 }, { "epoch": 1.0328430384847875, "grad_norm": 0.05415000021457672, "learning_rate": 9.344711661184575e-05, "loss": 0.1667, "step": 5126 }, { "epoch": 1.0332460205520855, "grad_norm": 0.05984153226017952, "learning_rate": 9.344051959852907e-05, "loss": 0.2495, "step": 5128 }, { "epoch": 1.0336490026193834, "grad_norm": 0.055079031735658646, "learning_rate": 9.343391949926732e-05, "loss": 0.2101, "step": 5130 }, { "epoch": 1.0340519846866814, "grad_norm": 0.05712849646806717, "learning_rate": 9.342731631452942e-05, "loss": 0.2127, "step": 5132 }, { "epoch": 1.0344549667539795, "grad_norm": 0.04696709290146828, "learning_rate": 9.342071004478439e-05, "loss": 0.2077, "step": 5134 }, { "epoch": 1.0348579488212775, "grad_norm": 0.05382629111409187, "learning_rate": 9.341410069050159e-05, "loss": 0.2282, "step": 5136 }, { "epoch": 1.0352609308885754, "grad_norm": 0.05978304147720337, "learning_rate": 9.340748825215047e-05, "loss": 0.1724, "step": 5138 }, { "epoch": 1.0356639129558736, "grad_norm": 0.0483553521335125, "learning_rate": 9.340087273020084e-05, "loss": 0.1775, "step": 5140 }, { "epoch": 1.0360668950231715, "grad_norm": 0.06312581151723862, "learning_rate": 9.339425412512259e-05, "loss": 0.195, "step": 5142 }, { "epoch": 1.0364698770904694, "grad_norm": 0.049008119851350784, "learning_rate": 9.338763243738595e-05, "loss": 0.1825, "step": 5144 }, { "epoch": 1.0368728591577674, "grad_norm": 0.12464253604412079, "learning_rate": 9.338100766746129e-05, "loss": 0.1478, "step": 5146 }, { "epoch": 1.0372758412250656, "grad_norm": 0.04872802644968033, "learning_rate": 9.337437981581921e-05, "loss": 0.2437, "step": 5148 }, { "epoch": 1.0376788232923635, "grad_norm": 0.06108350679278374, "learning_rate": 9.336774888293056e-05, "loss": 0.1857, "step": 5150 }, { "epoch": 1.0380818053596614, "grad_norm": 0.06063728407025337, "learning_rate": 9.336111486926639e-05, "loss": 0.1936, "step": 5152 }, { "epoch": 1.0384847874269596, "grad_norm": 0.06168040260672569, "learning_rate": 9.335447777529795e-05, "loss": 0.205, "step": 5154 }, { "epoch": 1.0388877694942575, "grad_norm": 0.058670446276664734, "learning_rate": 9.334783760149677e-05, "loss": 0.1868, "step": 5156 }, { "epoch": 1.0392907515615555, "grad_norm": 0.0711679607629776, "learning_rate": 9.334119434833452e-05, "loss": 0.2227, "step": 5158 }, { "epoch": 1.0396937336288534, "grad_norm": 0.05397256091237068, "learning_rate": 9.333454801628313e-05, "loss": 0.1852, "step": 5160 }, { "epoch": 1.0400967156961516, "grad_norm": 0.06917382031679153, "learning_rate": 9.332789860581475e-05, "loss": 0.2189, "step": 5162 }, { "epoch": 1.0404996977634495, "grad_norm": 0.05291181057691574, "learning_rate": 9.332124611740176e-05, "loss": 0.2203, "step": 5164 }, { "epoch": 1.0409026798307475, "grad_norm": 0.046576354652643204, "learning_rate": 9.331459055151673e-05, "loss": 0.16, "step": 5166 }, { "epoch": 1.0413056618980456, "grad_norm": 0.04400373622775078, "learning_rate": 9.330793190863244e-05, "loss": 0.1689, "step": 5168 }, { "epoch": 1.0417086439653436, "grad_norm": 0.055641304701566696, "learning_rate": 9.330127018922194e-05, "loss": 0.1461, "step": 5170 }, { "epoch": 1.0421116260326415, "grad_norm": 0.05043255165219307, "learning_rate": 9.329460539375844e-05, "loss": 0.2048, "step": 5172 }, { "epoch": 1.0425146080999395, "grad_norm": 0.2573598325252533, "learning_rate": 9.328793752271543e-05, "loss": 0.1971, "step": 5174 }, { "epoch": 1.0429175901672376, "grad_norm": 0.06544458121061325, "learning_rate": 9.328126657656657e-05, "loss": 0.2092, "step": 5176 }, { "epoch": 1.0433205722345356, "grad_norm": 0.07152343541383743, "learning_rate": 9.327459255578574e-05, "loss": 0.2437, "step": 5178 }, { "epoch": 1.0437235543018335, "grad_norm": 0.07631577551364899, "learning_rate": 9.326791546084706e-05, "loss": 0.199, "step": 5180 }, { "epoch": 1.0441265363691317, "grad_norm": 0.060583699494600296, "learning_rate": 9.326123529222489e-05, "loss": 0.1864, "step": 5182 }, { "epoch": 1.0445295184364296, "grad_norm": 0.04643869027495384, "learning_rate": 9.325455205039372e-05, "loss": 0.1652, "step": 5184 }, { "epoch": 1.0449325005037275, "grad_norm": 0.2666691839694977, "learning_rate": 9.324786573582836e-05, "loss": 0.1951, "step": 5186 }, { "epoch": 1.0453354825710255, "grad_norm": 0.060796648263931274, "learning_rate": 9.324117634900378e-05, "loss": 0.224, "step": 5188 }, { "epoch": 1.0457384646383237, "grad_norm": 0.05381575971841812, "learning_rate": 9.323448389039517e-05, "loss": 0.2816, "step": 5190 }, { "epoch": 1.0461414467056216, "grad_norm": 0.06347894668579102, "learning_rate": 9.322778836047798e-05, "loss": 0.2011, "step": 5192 }, { "epoch": 1.0465444287729195, "grad_norm": 0.047166090458631516, "learning_rate": 9.322108975972786e-05, "loss": 0.2097, "step": 5194 }, { "epoch": 1.0469474108402177, "grad_norm": 0.047421008348464966, "learning_rate": 9.321438808862061e-05, "loss": 0.2342, "step": 5196 }, { "epoch": 1.0473503929075156, "grad_norm": 0.0566796138882637, "learning_rate": 9.320768334763236e-05, "loss": 0.2047, "step": 5198 }, { "epoch": 1.0477533749748136, "grad_norm": 0.05123184621334076, "learning_rate": 9.320097553723938e-05, "loss": 0.2139, "step": 5200 }, { "epoch": 1.0481563570421115, "grad_norm": 0.04648276045918465, "learning_rate": 9.319426465791821e-05, "loss": 0.2232, "step": 5202 }, { "epoch": 1.0485593391094097, "grad_norm": 0.061085715889930725, "learning_rate": 9.318755071014554e-05, "loss": 0.1368, "step": 5204 }, { "epoch": 1.0489623211767076, "grad_norm": 0.06661339849233627, "learning_rate": 9.318083369439833e-05, "loss": 0.1924, "step": 5206 }, { "epoch": 1.0493653032440056, "grad_norm": 0.043311674147844315, "learning_rate": 9.317411361115376e-05, "loss": 0.196, "step": 5208 }, { "epoch": 1.0497682853113037, "grad_norm": 0.07116144895553589, "learning_rate": 9.31673904608892e-05, "loss": 0.1658, "step": 5210 }, { "epoch": 1.0501712673786017, "grad_norm": 0.054282158613204956, "learning_rate": 9.316066424408225e-05, "loss": 0.1705, "step": 5212 }, { "epoch": 1.0505742494458996, "grad_norm": 0.057639311999082565, "learning_rate": 9.315393496121075e-05, "loss": 0.2219, "step": 5214 }, { "epoch": 1.0509772315131976, "grad_norm": 0.06315790861845016, "learning_rate": 9.314720261275273e-05, "loss": 0.2211, "step": 5216 }, { "epoch": 1.0513802135804957, "grad_norm": 0.07036170363426208, "learning_rate": 9.314046719918644e-05, "loss": 0.1876, "step": 5218 }, { "epoch": 1.0517831956477937, "grad_norm": 0.24687032401561737, "learning_rate": 9.313372872099033e-05, "loss": 0.1613, "step": 5220 }, { "epoch": 1.0521861777150916, "grad_norm": 0.05008988082408905, "learning_rate": 9.312698717864314e-05, "loss": 0.2066, "step": 5222 }, { "epoch": 1.0525891597823898, "grad_norm": 0.06380771845579147, "learning_rate": 9.312024257262373e-05, "loss": 0.1926, "step": 5224 }, { "epoch": 1.0529921418496877, "grad_norm": 0.06707505881786346, "learning_rate": 9.311349490341126e-05, "loss": 0.229, "step": 5226 }, { "epoch": 1.0533951239169856, "grad_norm": 0.1017342135310173, "learning_rate": 9.310674417148507e-05, "loss": 0.2125, "step": 5228 }, { "epoch": 1.0537981059842838, "grad_norm": 0.09565824270248413, "learning_rate": 9.30999903773247e-05, "loss": 0.2331, "step": 5230 }, { "epoch": 1.0542010880515817, "grad_norm": 0.05370059609413147, "learning_rate": 9.309323352140996e-05, "loss": 0.1902, "step": 5232 }, { "epoch": 1.0546040701188797, "grad_norm": 0.07205013185739517, "learning_rate": 9.30864736042208e-05, "loss": 0.2382, "step": 5234 }, { "epoch": 1.0550070521861776, "grad_norm": 0.046149007976055145, "learning_rate": 9.307971062623748e-05, "loss": 0.1579, "step": 5236 }, { "epoch": 1.0554100342534758, "grad_norm": 0.07567006349563599, "learning_rate": 9.307294458794041e-05, "loss": 0.1881, "step": 5238 }, { "epoch": 1.0558130163207737, "grad_norm": 0.06721212714910507, "learning_rate": 9.306617548981024e-05, "loss": 0.1988, "step": 5240 }, { "epoch": 1.0562159983880717, "grad_norm": 0.0702115148305893, "learning_rate": 9.305940333232784e-05, "loss": 0.1687, "step": 5242 }, { "epoch": 1.0566189804553698, "grad_norm": 0.05544662848114967, "learning_rate": 9.305262811597429e-05, "loss": 0.1663, "step": 5244 }, { "epoch": 1.0570219625226678, "grad_norm": 0.0851738229393959, "learning_rate": 9.304584984123089e-05, "loss": 0.2534, "step": 5246 }, { "epoch": 1.0574249445899657, "grad_norm": 0.042979102581739426, "learning_rate": 9.303906850857917e-05, "loss": 0.145, "step": 5248 }, { "epoch": 1.0578279266572637, "grad_norm": 0.07365533709526062, "learning_rate": 9.303228411850085e-05, "loss": 0.1765, "step": 5250 }, { "epoch": 1.0582309087245618, "grad_norm": 0.07229702174663544, "learning_rate": 9.302549667147787e-05, "loss": 0.1969, "step": 5252 }, { "epoch": 1.0586338907918598, "grad_norm": 0.11042525619268417, "learning_rate": 9.301870616799242e-05, "loss": 0.2076, "step": 5254 }, { "epoch": 1.0590368728591577, "grad_norm": 0.06097995117306709, "learning_rate": 9.301191260852688e-05, "loss": 0.1772, "step": 5256 }, { "epoch": 1.0594398549264559, "grad_norm": 0.08126639574766159, "learning_rate": 9.300511599356387e-05, "loss": 0.2103, "step": 5258 }, { "epoch": 1.0598428369937538, "grad_norm": 0.06578926742076874, "learning_rate": 9.29983163235862e-05, "loss": 0.2008, "step": 5260 }, { "epoch": 1.0602458190610518, "grad_norm": 0.05863117054104805, "learning_rate": 9.299151359907689e-05, "loss": 0.2324, "step": 5262 }, { "epoch": 1.0606488011283497, "grad_norm": 0.06247061491012573, "learning_rate": 9.29847078205192e-05, "loss": 0.217, "step": 5264 }, { "epoch": 1.0610517831956479, "grad_norm": 0.07186681032180786, "learning_rate": 9.297789898839662e-05, "loss": 0.2127, "step": 5266 }, { "epoch": 1.0614547652629458, "grad_norm": 0.0571020282804966, "learning_rate": 9.297108710319285e-05, "loss": 0.1943, "step": 5268 }, { "epoch": 1.0618577473302437, "grad_norm": 0.054367225617170334, "learning_rate": 9.296427216539175e-05, "loss": 0.2208, "step": 5270 }, { "epoch": 1.062260729397542, "grad_norm": 0.05774039775133133, "learning_rate": 9.295745417547747e-05, "loss": 0.2079, "step": 5272 }, { "epoch": 1.0626637114648398, "grad_norm": 0.05585348606109619, "learning_rate": 9.295063313393435e-05, "loss": 0.2041, "step": 5274 }, { "epoch": 1.0630666935321378, "grad_norm": 0.056385934352874756, "learning_rate": 9.294380904124693e-05, "loss": 0.1697, "step": 5276 }, { "epoch": 1.0634696755994357, "grad_norm": 0.059455644339323044, "learning_rate": 9.293698189790002e-05, "loss": 0.2062, "step": 5278 }, { "epoch": 1.063872657666734, "grad_norm": 0.05323608219623566, "learning_rate": 9.293015170437856e-05, "loss": 0.1897, "step": 5280 }, { "epoch": 1.0642756397340318, "grad_norm": 0.04466480761766434, "learning_rate": 9.292331846116779e-05, "loss": 0.2029, "step": 5282 }, { "epoch": 1.0646786218013298, "grad_norm": 0.05269274860620499, "learning_rate": 9.29164821687531e-05, "loss": 0.2499, "step": 5284 }, { "epoch": 1.065081603868628, "grad_norm": 0.07230303436517715, "learning_rate": 9.290964282762018e-05, "loss": 0.1982, "step": 5286 }, { "epoch": 1.0654845859359259, "grad_norm": 0.03910969942808151, "learning_rate": 9.290280043825486e-05, "loss": 0.1533, "step": 5288 }, { "epoch": 1.0658875680032238, "grad_norm": 0.06969325244426727, "learning_rate": 9.289595500114319e-05, "loss": 0.1881, "step": 5290 }, { "epoch": 1.0662905500705218, "grad_norm": 0.04106011241674423, "learning_rate": 9.288910651677149e-05, "loss": 0.1504, "step": 5292 }, { "epoch": 1.06669353213782, "grad_norm": 0.06589590013027191, "learning_rate": 9.288225498562624e-05, "loss": 0.2022, "step": 5294 }, { "epoch": 1.0670965142051179, "grad_norm": 0.07294216752052307, "learning_rate": 9.287540040819418e-05, "loss": 0.2199, "step": 5296 }, { "epoch": 1.0674994962724158, "grad_norm": 0.04226839542388916, "learning_rate": 9.286854278496226e-05, "loss": 0.1784, "step": 5298 }, { "epoch": 1.067902478339714, "grad_norm": 0.06024034321308136, "learning_rate": 9.286168211641762e-05, "loss": 0.253, "step": 5300 }, { "epoch": 1.068305460407012, "grad_norm": 0.052907638251781464, "learning_rate": 9.28548184030476e-05, "loss": 0.1703, "step": 5302 }, { "epoch": 1.0687084424743098, "grad_norm": 0.06218164786696434, "learning_rate": 9.284795164533984e-05, "loss": 0.2072, "step": 5304 }, { "epoch": 1.069111424541608, "grad_norm": 0.05378426983952522, "learning_rate": 9.284108184378212e-05, "loss": 0.1843, "step": 5306 }, { "epoch": 1.069514406608906, "grad_norm": 0.04734671860933304, "learning_rate": 9.283420899886245e-05, "loss": 0.1888, "step": 5308 }, { "epoch": 1.069917388676204, "grad_norm": 0.04951245337724686, "learning_rate": 9.282733311106908e-05, "loss": 0.228, "step": 5310 }, { "epoch": 1.0703203707435018, "grad_norm": 0.06134570389986038, "learning_rate": 9.282045418089047e-05, "loss": 0.1987, "step": 5312 }, { "epoch": 1.0707233528108, "grad_norm": 0.06411506235599518, "learning_rate": 9.281357220881526e-05, "loss": 0.214, "step": 5314 }, { "epoch": 1.071126334878098, "grad_norm": 0.058260899037122726, "learning_rate": 9.280668719533236e-05, "loss": 0.2182, "step": 5316 }, { "epoch": 1.0715293169453959, "grad_norm": 0.04684171453118324, "learning_rate": 9.279979914093084e-05, "loss": 0.1945, "step": 5318 }, { "epoch": 1.071932299012694, "grad_norm": 0.0659032016992569, "learning_rate": 9.279290804610005e-05, "loss": 0.2218, "step": 5320 }, { "epoch": 1.072335281079992, "grad_norm": 0.061241116374731064, "learning_rate": 9.278601391132953e-05, "loss": 0.2016, "step": 5322 }, { "epoch": 1.07273826314729, "grad_norm": 0.056910622864961624, "learning_rate": 9.2779116737109e-05, "loss": 0.2354, "step": 5324 }, { "epoch": 1.0731412452145879, "grad_norm": 0.04516435042023659, "learning_rate": 9.277221652392841e-05, "loss": 0.2081, "step": 5326 }, { "epoch": 1.073544227281886, "grad_norm": 0.047263868153095245, "learning_rate": 9.276531327227798e-05, "loss": 0.2268, "step": 5328 }, { "epoch": 1.073947209349184, "grad_norm": 0.07561453431844711, "learning_rate": 9.275840698264808e-05, "loss": 0.2418, "step": 5330 }, { "epoch": 1.074350191416482, "grad_norm": 0.04668330028653145, "learning_rate": 9.275149765552933e-05, "loss": 0.2266, "step": 5332 }, { "epoch": 1.07475317348378, "grad_norm": 0.042202576994895935, "learning_rate": 9.274458529141256e-05, "loss": 0.1451, "step": 5334 }, { "epoch": 1.075156155551078, "grad_norm": 0.05034751817584038, "learning_rate": 9.273766989078883e-05, "loss": 0.2515, "step": 5336 }, { "epoch": 1.075559137618376, "grad_norm": 0.054490312933921814, "learning_rate": 9.273075145414935e-05, "loss": 0.1836, "step": 5338 }, { "epoch": 1.075962119685674, "grad_norm": 0.0491647832095623, "learning_rate": 9.272382998198563e-05, "loss": 0.1985, "step": 5340 }, { "epoch": 1.076365101752972, "grad_norm": 0.06226971000432968, "learning_rate": 9.271690547478937e-05, "loss": 0.2063, "step": 5342 }, { "epoch": 1.07676808382027, "grad_norm": 0.043259039521217346, "learning_rate": 9.270997793305245e-05, "loss": 0.169, "step": 5344 }, { "epoch": 1.077171065887568, "grad_norm": 0.05381862819194794, "learning_rate": 9.2703047357267e-05, "loss": 0.1795, "step": 5346 }, { "epoch": 1.077574047954866, "grad_norm": 0.05488457530736923, "learning_rate": 9.269611374792537e-05, "loss": 0.1761, "step": 5348 }, { "epoch": 1.077977030022164, "grad_norm": 0.0528683140873909, "learning_rate": 9.26891771055201e-05, "loss": 0.1615, "step": 5350 }, { "epoch": 1.078380012089462, "grad_norm": 0.05571812763810158, "learning_rate": 9.268223743054394e-05, "loss": 0.1655, "step": 5352 }, { "epoch": 1.07878299415676, "grad_norm": 0.0493588000535965, "learning_rate": 9.267529472348992e-05, "loss": 0.155, "step": 5354 }, { "epoch": 1.079185976224058, "grad_norm": 0.05553653463721275, "learning_rate": 9.266834898485119e-05, "loss": 0.2077, "step": 5356 }, { "epoch": 1.079588958291356, "grad_norm": 0.07048819959163666, "learning_rate": 9.26614002151212e-05, "loss": 0.1946, "step": 5358 }, { "epoch": 1.079991940358654, "grad_norm": 0.04235006496310234, "learning_rate": 9.265444841479356e-05, "loss": 0.1742, "step": 5360 }, { "epoch": 1.0803949224259521, "grad_norm": 0.044974058866500854, "learning_rate": 9.264749358436213e-05, "loss": 0.1744, "step": 5362 }, { "epoch": 1.08079790449325, "grad_norm": 0.05903393775224686, "learning_rate": 9.264053572432094e-05, "loss": 0.2158, "step": 5364 }, { "epoch": 1.081200886560548, "grad_norm": 0.06140587851405144, "learning_rate": 9.263357483516431e-05, "loss": 0.1826, "step": 5366 }, { "epoch": 1.081603868627846, "grad_norm": 0.044442079961299896, "learning_rate": 9.262661091738668e-05, "loss": 0.1985, "step": 5368 }, { "epoch": 1.0820068506951441, "grad_norm": 0.04410601034760475, "learning_rate": 9.261964397148279e-05, "loss": 0.1989, "step": 5370 }, { "epoch": 1.082409832762442, "grad_norm": 0.05235345661640167, "learning_rate": 9.261267399794757e-05, "loss": 0.1553, "step": 5372 }, { "epoch": 1.08281281482974, "grad_norm": 0.042015720158815384, "learning_rate": 9.260570099727612e-05, "loss": 0.1862, "step": 5374 }, { "epoch": 1.0832157968970382, "grad_norm": 0.05750071257352829, "learning_rate": 9.259872496996382e-05, "loss": 0.1873, "step": 5376 }, { "epoch": 1.0836187789643361, "grad_norm": 0.07005587220191956, "learning_rate": 9.259174591650621e-05, "loss": 0.2258, "step": 5378 }, { "epoch": 1.084021761031634, "grad_norm": 0.06074369698762894, "learning_rate": 9.258476383739909e-05, "loss": 0.1777, "step": 5380 }, { "epoch": 1.084424743098932, "grad_norm": 0.048150282353162766, "learning_rate": 9.257777873313847e-05, "loss": 0.1437, "step": 5382 }, { "epoch": 1.0848277251662302, "grad_norm": 0.04304511845111847, "learning_rate": 9.257079060422051e-05, "loss": 0.1864, "step": 5384 }, { "epoch": 1.085230707233528, "grad_norm": 0.042921341955661774, "learning_rate": 9.256379945114168e-05, "loss": 0.1959, "step": 5386 }, { "epoch": 1.085633689300826, "grad_norm": 0.06277068704366684, "learning_rate": 9.255680527439862e-05, "loss": 0.2267, "step": 5388 }, { "epoch": 1.0860366713681242, "grad_norm": 0.04990841820836067, "learning_rate": 9.254980807448818e-05, "loss": 0.209, "step": 5390 }, { "epoch": 1.0864396534354221, "grad_norm": 0.061241425573825836, "learning_rate": 9.25428078519074e-05, "loss": 0.1987, "step": 5392 }, { "epoch": 1.08684263550272, "grad_norm": 0.08670193701982498, "learning_rate": 9.25358046071536e-05, "loss": 0.2124, "step": 5394 }, { "epoch": 1.087245617570018, "grad_norm": 0.06139161437749863, "learning_rate": 9.252879834072425e-05, "loss": 0.1906, "step": 5396 }, { "epoch": 1.0876485996373162, "grad_norm": 0.05512767285108566, "learning_rate": 9.25217890531171e-05, "loss": 0.1975, "step": 5398 }, { "epoch": 1.0880515817046141, "grad_norm": 0.05780330300331116, "learning_rate": 9.251477674483005e-05, "loss": 0.1792, "step": 5400 }, { "epoch": 1.088454563771912, "grad_norm": 0.044492777436971664, "learning_rate": 9.250776141636126e-05, "loss": 0.1552, "step": 5402 }, { "epoch": 1.0888575458392102, "grad_norm": 0.044563908129930496, "learning_rate": 9.250074306820907e-05, "loss": 0.1773, "step": 5404 }, { "epoch": 1.0892605279065082, "grad_norm": 0.042871829122304916, "learning_rate": 9.249372170087208e-05, "loss": 0.2186, "step": 5406 }, { "epoch": 1.0896635099738061, "grad_norm": 0.05457884073257446, "learning_rate": 9.248669731484903e-05, "loss": 0.2273, "step": 5408 }, { "epoch": 1.090066492041104, "grad_norm": 0.04954477399587631, "learning_rate": 9.247966991063897e-05, "loss": 0.2203, "step": 5410 }, { "epoch": 1.0904694741084022, "grad_norm": 0.060218147933483124, "learning_rate": 9.24726394887411e-05, "loss": 0.2143, "step": 5412 }, { "epoch": 1.0908724561757002, "grad_norm": 0.05448036640882492, "learning_rate": 9.246560604965483e-05, "loss": 0.1576, "step": 5414 }, { "epoch": 1.091275438242998, "grad_norm": 0.06038915738463402, "learning_rate": 9.245856959387984e-05, "loss": 0.1761, "step": 5416 }, { "epoch": 1.0916784203102963, "grad_norm": 0.06423316150903702, "learning_rate": 9.245153012191594e-05, "loss": 0.2227, "step": 5418 }, { "epoch": 1.0920814023775942, "grad_norm": 0.04629231616854668, "learning_rate": 9.244448763426325e-05, "loss": 0.1706, "step": 5420 }, { "epoch": 1.0924843844448922, "grad_norm": 0.046612486243247986, "learning_rate": 9.243744213142203e-05, "loss": 0.1885, "step": 5422 }, { "epoch": 1.09288736651219, "grad_norm": 0.04658891260623932, "learning_rate": 9.24303936138928e-05, "loss": 0.2422, "step": 5424 }, { "epoch": 1.0932903485794883, "grad_norm": 0.049364447593688965, "learning_rate": 9.242334208217627e-05, "loss": 0.1901, "step": 5426 }, { "epoch": 1.0936933306467862, "grad_norm": 0.08126164227724075, "learning_rate": 9.241628753677335e-05, "loss": 0.2406, "step": 5428 }, { "epoch": 1.0940963127140841, "grad_norm": 0.044822290539741516, "learning_rate": 9.240922997818519e-05, "loss": 0.2249, "step": 5430 }, { "epoch": 1.0944992947813823, "grad_norm": 0.0389249213039875, "learning_rate": 9.240216940691318e-05, "loss": 0.178, "step": 5432 }, { "epoch": 1.0949022768486802, "grad_norm": 0.043713074177503586, "learning_rate": 9.239510582345885e-05, "loss": 0.1621, "step": 5434 }, { "epoch": 1.0953052589159782, "grad_norm": 0.0456581637263298, "learning_rate": 9.238803922832402e-05, "loss": 0.2192, "step": 5436 }, { "epoch": 1.0957082409832761, "grad_norm": 0.057899102568626404, "learning_rate": 9.238096962201066e-05, "loss": 0.2148, "step": 5438 }, { "epoch": 1.0961112230505743, "grad_norm": 0.06534866988658905, "learning_rate": 9.237389700502099e-05, "loss": 0.2245, "step": 5440 }, { "epoch": 1.0965142051178722, "grad_norm": 0.05663755163550377, "learning_rate": 9.236682137785746e-05, "loss": 0.2097, "step": 5442 }, { "epoch": 1.0969171871851702, "grad_norm": 0.07649657875299454, "learning_rate": 9.23597427410227e-05, "loss": 0.2231, "step": 5444 }, { "epoch": 1.0973201692524683, "grad_norm": 0.056587040424346924, "learning_rate": 9.235266109501955e-05, "loss": 0.2006, "step": 5446 }, { "epoch": 1.0977231513197663, "grad_norm": 0.04581359773874283, "learning_rate": 9.234557644035108e-05, "loss": 0.1974, "step": 5448 }, { "epoch": 1.0981261333870642, "grad_norm": 0.04404517635703087, "learning_rate": 9.233848877752058e-05, "loss": 0.1691, "step": 5450 }, { "epoch": 1.0985291154543624, "grad_norm": 0.03833284601569176, "learning_rate": 9.233139810703156e-05, "loss": 0.1831, "step": 5452 }, { "epoch": 1.0989320975216603, "grad_norm": 0.04854941368103027, "learning_rate": 9.232430442938771e-05, "loss": 0.2112, "step": 5454 }, { "epoch": 1.0993350795889583, "grad_norm": 0.051708050072193146, "learning_rate": 9.231720774509297e-05, "loss": 0.1781, "step": 5456 }, { "epoch": 1.0997380616562562, "grad_norm": 0.038904573768377304, "learning_rate": 9.231010805465145e-05, "loss": 0.1538, "step": 5458 }, { "epoch": 1.1001410437235544, "grad_norm": 0.03842492401599884, "learning_rate": 9.230300535856755e-05, "loss": 0.2175, "step": 5460 }, { "epoch": 1.1005440257908523, "grad_norm": 0.058032989501953125, "learning_rate": 9.229589965734577e-05, "loss": 0.2119, "step": 5462 }, { "epoch": 1.1009470078581503, "grad_norm": 0.04689335078001022, "learning_rate": 9.228879095149094e-05, "loss": 0.1421, "step": 5464 }, { "epoch": 1.1013499899254484, "grad_norm": 0.056845102459192276, "learning_rate": 9.228167924150803e-05, "loss": 0.2294, "step": 5466 }, { "epoch": 1.1017529719927464, "grad_norm": 0.054818570613861084, "learning_rate": 9.227456452790224e-05, "loss": 0.2249, "step": 5468 }, { "epoch": 1.1021559540600443, "grad_norm": 0.09339629113674164, "learning_rate": 9.2267446811179e-05, "loss": 0.1741, "step": 5470 }, { "epoch": 1.1025589361273422, "grad_norm": 0.06134527549147606, "learning_rate": 9.226032609184394e-05, "loss": 0.1906, "step": 5472 }, { "epoch": 1.1029619181946404, "grad_norm": 0.056619711220264435, "learning_rate": 9.225320237040289e-05, "loss": 0.1955, "step": 5474 }, { "epoch": 1.1033649002619383, "grad_norm": 0.041108060628175735, "learning_rate": 9.224607564736192e-05, "loss": 0.1855, "step": 5476 }, { "epoch": 1.1037678823292363, "grad_norm": 0.04002273827791214, "learning_rate": 9.22389459232273e-05, "loss": 0.1777, "step": 5478 }, { "epoch": 1.1041708643965344, "grad_norm": 0.0521574467420578, "learning_rate": 9.223181319850551e-05, "loss": 0.1726, "step": 5480 }, { "epoch": 1.1045738464638324, "grad_norm": 0.07421046495437622, "learning_rate": 9.222467747370325e-05, "loss": 0.2221, "step": 5482 }, { "epoch": 1.1049768285311303, "grad_norm": 0.05241503566503525, "learning_rate": 9.221753874932743e-05, "loss": 0.1346, "step": 5484 }, { "epoch": 1.1053798105984283, "grad_norm": 0.05384815111756325, "learning_rate": 9.221039702588519e-05, "loss": 0.1861, "step": 5486 }, { "epoch": 1.1057827926657264, "grad_norm": 0.047820959240198135, "learning_rate": 9.220325230388382e-05, "loss": 0.2723, "step": 5488 }, { "epoch": 1.1061857747330244, "grad_norm": 0.04170846566557884, "learning_rate": 9.219610458383092e-05, "loss": 0.157, "step": 5490 }, { "epoch": 1.1065887568003223, "grad_norm": 0.044703803956508636, "learning_rate": 9.218895386623424e-05, "loss": 0.2218, "step": 5492 }, { "epoch": 1.1069917388676205, "grad_norm": 0.0474412739276886, "learning_rate": 9.218180015160173e-05, "loss": 0.1925, "step": 5494 }, { "epoch": 1.1073947209349184, "grad_norm": 0.06012911722064018, "learning_rate": 9.217464344044162e-05, "loss": 0.1965, "step": 5496 }, { "epoch": 1.1077977030022164, "grad_norm": 0.05040358379483223, "learning_rate": 9.216748373326227e-05, "loss": 0.2436, "step": 5498 }, { "epoch": 1.1082006850695145, "grad_norm": 0.0459962859749794, "learning_rate": 9.216032103057232e-05, "loss": 0.1476, "step": 5500 }, { "epoch": 1.1086036671368125, "grad_norm": 0.050243549048900604, "learning_rate": 9.215315533288057e-05, "loss": 0.2147, "step": 5502 }, { "epoch": 1.1090066492041104, "grad_norm": 0.03991679847240448, "learning_rate": 9.214598664069611e-05, "loss": 0.1953, "step": 5504 }, { "epoch": 1.1094096312714083, "grad_norm": 0.05045531690120697, "learning_rate": 9.213881495452815e-05, "loss": 0.2051, "step": 5506 }, { "epoch": 1.1098126133387065, "grad_norm": 0.057895347476005554, "learning_rate": 9.213164027488617e-05, "loss": 0.179, "step": 5508 }, { "epoch": 1.1102155954060045, "grad_norm": 0.047821614891290665, "learning_rate": 9.212446260227982e-05, "loss": 0.2077, "step": 5510 }, { "epoch": 1.1106185774733024, "grad_norm": 0.051236581057310104, "learning_rate": 9.211728193721904e-05, "loss": 0.1889, "step": 5512 }, { "epoch": 1.1110215595406006, "grad_norm": 0.062080029398202896, "learning_rate": 9.211009828021391e-05, "loss": 0.2335, "step": 5514 }, { "epoch": 1.1114245416078985, "grad_norm": 0.04640135169029236, "learning_rate": 9.210291163177474e-05, "loss": 0.2103, "step": 5516 }, { "epoch": 1.1118275236751964, "grad_norm": 0.043231479823589325, "learning_rate": 9.209572199241206e-05, "loss": 0.1743, "step": 5518 }, { "epoch": 1.1122305057424944, "grad_norm": 0.07690000534057617, "learning_rate": 9.20885293626366e-05, "loss": 0.2403, "step": 5520 }, { "epoch": 1.1126334878097925, "grad_norm": 0.033921536058187485, "learning_rate": 9.208133374295934e-05, "loss": 0.1443, "step": 5522 }, { "epoch": 1.1130364698770905, "grad_norm": 0.054961346089839935, "learning_rate": 9.207413513389141e-05, "loss": 0.1989, "step": 5524 }, { "epoch": 1.1134394519443884, "grad_norm": 0.04538585990667343, "learning_rate": 9.206693353594422e-05, "loss": 0.1754, "step": 5526 }, { "epoch": 1.1138424340116866, "grad_norm": 0.0442589595913887, "learning_rate": 9.205972894962936e-05, "loss": 0.2155, "step": 5528 }, { "epoch": 1.1142454160789845, "grad_norm": 0.052050501108169556, "learning_rate": 9.205252137545861e-05, "loss": 0.2069, "step": 5530 }, { "epoch": 1.1146483981462825, "grad_norm": 0.062034666538238525, "learning_rate": 9.204531081394399e-05, "loss": 0.2195, "step": 5532 }, { "epoch": 1.1150513802135804, "grad_norm": 0.04442548006772995, "learning_rate": 9.203809726559773e-05, "loss": 0.1445, "step": 5534 }, { "epoch": 1.1154543622808786, "grad_norm": 0.04459670931100845, "learning_rate": 9.203088073093227e-05, "loss": 0.1902, "step": 5536 }, { "epoch": 1.1158573443481765, "grad_norm": 0.041819144040346146, "learning_rate": 9.202366121046027e-05, "loss": 0.2043, "step": 5538 }, { "epoch": 1.1162603264154745, "grad_norm": 0.03972846269607544, "learning_rate": 9.201643870469458e-05, "loss": 0.18, "step": 5540 }, { "epoch": 1.1166633084827726, "grad_norm": 0.05215775966644287, "learning_rate": 9.200921321414829e-05, "loss": 0.1896, "step": 5542 }, { "epoch": 1.1170662905500706, "grad_norm": 0.05705489218235016, "learning_rate": 9.200198473933466e-05, "loss": 0.2474, "step": 5544 }, { "epoch": 1.1174692726173685, "grad_norm": 0.0615994967520237, "learning_rate": 9.19947532807672e-05, "loss": 0.2095, "step": 5546 }, { "epoch": 1.1178722546846664, "grad_norm": 0.048454973846673965, "learning_rate": 9.198751883895967e-05, "loss": 0.1614, "step": 5548 }, { "epoch": 1.1182752367519646, "grad_norm": 0.07016980648040771, "learning_rate": 9.198028141442591e-05, "loss": 0.1869, "step": 5550 }, { "epoch": 1.1186782188192625, "grad_norm": 0.04643561318516731, "learning_rate": 9.19730410076801e-05, "loss": 0.2311, "step": 5552 }, { "epoch": 1.1190812008865605, "grad_norm": 0.0610547810792923, "learning_rate": 9.19657976192366e-05, "loss": 0.2155, "step": 5554 }, { "epoch": 1.1194841829538587, "grad_norm": 0.04994925484061241, "learning_rate": 9.195855124960995e-05, "loss": 0.249, "step": 5556 }, { "epoch": 1.1198871650211566, "grad_norm": 0.055203359574079514, "learning_rate": 9.19513018993149e-05, "loss": 0.2251, "step": 5558 }, { "epoch": 1.1202901470884545, "grad_norm": 0.05273060500621796, "learning_rate": 9.194404956886648e-05, "loss": 0.1437, "step": 5560 }, { "epoch": 1.1206931291557525, "grad_norm": 0.04343268647789955, "learning_rate": 9.193679425877983e-05, "loss": 0.1886, "step": 5562 }, { "epoch": 1.1210961112230506, "grad_norm": 0.04952103644609451, "learning_rate": 9.192953596957041e-05, "loss": 0.1578, "step": 5564 }, { "epoch": 1.1214990932903486, "grad_norm": 0.06240951642394066, "learning_rate": 9.192227470175381e-05, "loss": 0.1606, "step": 5566 }, { "epoch": 1.1219020753576465, "grad_norm": 0.04541534557938576, "learning_rate": 9.191501045584586e-05, "loss": 0.2096, "step": 5568 }, { "epoch": 1.1223050574249447, "grad_norm": 0.04394442215561867, "learning_rate": 9.190774323236258e-05, "loss": 0.2232, "step": 5570 }, { "epoch": 1.1227080394922426, "grad_norm": 0.06251212954521179, "learning_rate": 9.190047303182025e-05, "loss": 0.1937, "step": 5572 }, { "epoch": 1.1231110215595406, "grad_norm": 0.04978261888027191, "learning_rate": 9.189319985473532e-05, "loss": 0.2086, "step": 5574 }, { "epoch": 1.1235140036268385, "grad_norm": 0.042393967509269714, "learning_rate": 9.18859237016245e-05, "loss": 0.225, "step": 5576 }, { "epoch": 1.1239169856941367, "grad_norm": 0.04710579290986061, "learning_rate": 9.187864457300461e-05, "loss": 0.2149, "step": 5578 }, { "epoch": 1.1243199677614346, "grad_norm": 0.05299391224980354, "learning_rate": 9.187136246939281e-05, "loss": 0.221, "step": 5580 }, { "epoch": 1.1247229498287326, "grad_norm": 0.08004529774188995, "learning_rate": 9.186407739130638e-05, "loss": 0.1848, "step": 5582 }, { "epoch": 1.1251259318960307, "grad_norm": 0.05207962915301323, "learning_rate": 9.185678933926284e-05, "loss": 0.2194, "step": 5584 }, { "epoch": 1.1255289139633287, "grad_norm": 0.05663863569498062, "learning_rate": 9.184949831377992e-05, "loss": 0.1864, "step": 5586 }, { "epoch": 1.1259318960306266, "grad_norm": 0.06725175678730011, "learning_rate": 9.184220431537558e-05, "loss": 0.1973, "step": 5588 }, { "epoch": 1.1263348780979245, "grad_norm": 0.050569623708724976, "learning_rate": 9.183490734456794e-05, "loss": 0.1651, "step": 5590 }, { "epoch": 1.1267378601652227, "grad_norm": 0.05040304362773895, "learning_rate": 9.182760740187542e-05, "loss": 0.2196, "step": 5592 }, { "epoch": 1.1271408422325206, "grad_norm": 0.0837257131934166, "learning_rate": 9.182030448781654e-05, "loss": 0.2125, "step": 5594 }, { "epoch": 1.1275438242998186, "grad_norm": 0.056179650127887726, "learning_rate": 9.181299860291011e-05, "loss": 0.1813, "step": 5596 }, { "epoch": 1.1279468063671167, "grad_norm": 0.059334564954042435, "learning_rate": 9.180568974767513e-05, "loss": 0.167, "step": 5598 }, { "epoch": 1.1283497884344147, "grad_norm": 0.050333570688962936, "learning_rate": 9.179837792263082e-05, "loss": 0.2414, "step": 5600 }, { "epoch": 1.1287527705017126, "grad_norm": 0.05855239927768707, "learning_rate": 9.179106312829659e-05, "loss": 0.2002, "step": 5602 }, { "epoch": 1.1291557525690106, "grad_norm": 0.04838457331061363, "learning_rate": 9.178374536519206e-05, "loss": 0.2037, "step": 5604 }, { "epoch": 1.1295587346363087, "grad_norm": 0.04486094042658806, "learning_rate": 9.177642463383708e-05, "loss": 0.1879, "step": 5606 }, { "epoch": 1.1299617167036067, "grad_norm": 0.044857099652290344, "learning_rate": 9.176910093475172e-05, "loss": 0.1604, "step": 5608 }, { "epoch": 1.1303646987709046, "grad_norm": 0.04508034139871597, "learning_rate": 9.176177426845623e-05, "loss": 0.1911, "step": 5610 }, { "epoch": 1.1307676808382028, "grad_norm": 0.0480637326836586, "learning_rate": 9.175444463547108e-05, "loss": 0.2079, "step": 5612 }, { "epoch": 1.1311706629055007, "grad_norm": 0.0715208500623703, "learning_rate": 9.174711203631694e-05, "loss": 0.1998, "step": 5614 }, { "epoch": 1.1315736449727987, "grad_norm": 0.04499208182096481, "learning_rate": 9.173977647151475e-05, "loss": 0.1715, "step": 5616 }, { "epoch": 1.1319766270400966, "grad_norm": 0.0567949116230011, "learning_rate": 9.173243794158557e-05, "loss": 0.1727, "step": 5618 }, { "epoch": 1.1323796091073948, "grad_norm": 0.04465009644627571, "learning_rate": 9.172509644705077e-05, "loss": 0.2048, "step": 5620 }, { "epoch": 1.1327825911746927, "grad_norm": 0.05172666534781456, "learning_rate": 9.171775198843183e-05, "loss": 0.1939, "step": 5622 }, { "epoch": 1.1331855732419907, "grad_norm": 0.049424417316913605, "learning_rate": 9.17104045662505e-05, "loss": 0.1683, "step": 5624 }, { "epoch": 1.1335885553092888, "grad_norm": 0.04293535649776459, "learning_rate": 9.170305418102874e-05, "loss": 0.2179, "step": 5626 }, { "epoch": 1.1339915373765868, "grad_norm": 0.054131362587213516, "learning_rate": 9.169570083328871e-05, "loss": 0.175, "step": 5628 }, { "epoch": 1.1343945194438847, "grad_norm": 0.054785408079624176, "learning_rate": 9.168834452355277e-05, "loss": 0.218, "step": 5630 }, { "epoch": 1.1347975015111826, "grad_norm": 0.050389211624860764, "learning_rate": 9.168098525234351e-05, "loss": 0.2058, "step": 5632 }, { "epoch": 1.1352004835784808, "grad_norm": 0.043904174119234085, "learning_rate": 9.167362302018372e-05, "loss": 0.1675, "step": 5634 }, { "epoch": 1.1356034656457787, "grad_norm": 0.044593095779418945, "learning_rate": 9.166625782759639e-05, "loss": 0.1556, "step": 5636 }, { "epoch": 1.1360064477130767, "grad_norm": 0.07273683696985245, "learning_rate": 9.165888967510474e-05, "loss": 0.2114, "step": 5638 }, { "epoch": 1.1364094297803748, "grad_norm": 0.05269932374358177, "learning_rate": 9.16515185632322e-05, "loss": 0.2006, "step": 5640 }, { "epoch": 1.1368124118476728, "grad_norm": 0.0487261600792408, "learning_rate": 9.164414449250239e-05, "loss": 0.2053, "step": 5642 }, { "epoch": 1.1372153939149707, "grad_norm": 0.057701513171195984, "learning_rate": 9.163676746343914e-05, "loss": 0.2576, "step": 5644 }, { "epoch": 1.1376183759822687, "grad_norm": 0.056653860956430435, "learning_rate": 9.162938747656652e-05, "loss": 0.2186, "step": 5646 }, { "epoch": 1.1380213580495668, "grad_norm": 0.04430731385946274, "learning_rate": 9.162200453240882e-05, "loss": 0.1844, "step": 5648 }, { "epoch": 1.1384243401168648, "grad_norm": 0.03643381595611572, "learning_rate": 9.161461863149046e-05, "loss": 0.1805, "step": 5650 }, { "epoch": 1.1388273221841627, "grad_norm": 0.05398820340633392, "learning_rate": 9.160722977433613e-05, "loss": 0.2303, "step": 5652 }, { "epoch": 1.1392303042514609, "grad_norm": 0.07164819538593292, "learning_rate": 9.159983796147078e-05, "loss": 0.2121, "step": 5654 }, { "epoch": 1.1396332863187588, "grad_norm": 0.046827998012304306, "learning_rate": 9.159244319341944e-05, "loss": 0.2502, "step": 5656 }, { "epoch": 1.1400362683860568, "grad_norm": 0.04478400945663452, "learning_rate": 9.158504547070745e-05, "loss": 0.2045, "step": 5658 }, { "epoch": 1.1404392504533547, "grad_norm": 0.041052401065826416, "learning_rate": 9.157764479386035e-05, "loss": 0.1818, "step": 5660 }, { "epoch": 1.1408422325206529, "grad_norm": 0.06413646787405014, "learning_rate": 9.157024116340384e-05, "loss": 0.2311, "step": 5662 }, { "epoch": 1.1412452145879508, "grad_norm": 0.052323296666145325, "learning_rate": 9.15628345798639e-05, "loss": 0.1733, "step": 5664 }, { "epoch": 1.1416481966552487, "grad_norm": 0.08062373101711273, "learning_rate": 9.155542504376664e-05, "loss": 0.1951, "step": 5666 }, { "epoch": 1.142051178722547, "grad_norm": 0.04671994969248772, "learning_rate": 9.154801255563845e-05, "loss": 0.1823, "step": 5668 }, { "epoch": 1.1424541607898449, "grad_norm": 0.04927929863333702, "learning_rate": 9.154059711600591e-05, "loss": 0.184, "step": 5670 }, { "epoch": 1.1428571428571428, "grad_norm": 0.04393012821674347, "learning_rate": 9.153317872539578e-05, "loss": 0.1605, "step": 5672 }, { "epoch": 1.143260124924441, "grad_norm": 0.05194476246833801, "learning_rate": 9.152575738433505e-05, "loss": 0.2001, "step": 5674 }, { "epoch": 1.143663106991739, "grad_norm": 0.05223282799124718, "learning_rate": 9.151833309335092e-05, "loss": 0.1982, "step": 5676 }, { "epoch": 1.1440660890590368, "grad_norm": 0.08013258129358292, "learning_rate": 9.151090585297082e-05, "loss": 0.2021, "step": 5678 }, { "epoch": 1.144469071126335, "grad_norm": 0.03999164327979088, "learning_rate": 9.150347566372234e-05, "loss": 0.1648, "step": 5680 }, { "epoch": 1.144872053193633, "grad_norm": 0.056218814104795456, "learning_rate": 9.149604252613332e-05, "loss": 0.1562, "step": 5682 }, { "epoch": 1.1452750352609309, "grad_norm": 0.04334927350282669, "learning_rate": 9.148860644073182e-05, "loss": 0.2053, "step": 5684 }, { "epoch": 1.1456780173282288, "grad_norm": 0.06221528723835945, "learning_rate": 9.148116740804606e-05, "loss": 0.2721, "step": 5686 }, { "epoch": 1.146080999395527, "grad_norm": 0.04984398931264877, "learning_rate": 9.147372542860451e-05, "loss": 0.2652, "step": 5688 }, { "epoch": 1.146483981462825, "grad_norm": 0.04859272390604019, "learning_rate": 9.146628050293584e-05, "loss": 0.1944, "step": 5690 }, { "epoch": 1.1468869635301229, "grad_norm": 0.0628964751958847, "learning_rate": 9.145883263156891e-05, "loss": 0.1631, "step": 5692 }, { "epoch": 1.147289945597421, "grad_norm": 0.05849664658308029, "learning_rate": 9.145138181503281e-05, "loss": 0.1846, "step": 5694 }, { "epoch": 1.147692927664719, "grad_norm": 0.04668128490447998, "learning_rate": 9.144392805385684e-05, "loss": 0.2517, "step": 5696 }, { "epoch": 1.148095909732017, "grad_norm": 0.047532372176647186, "learning_rate": 9.14364713485705e-05, "loss": 0.2122, "step": 5698 }, { "epoch": 1.1484988917993149, "grad_norm": 0.04797205701470375, "learning_rate": 9.14290116997035e-05, "loss": 0.2055, "step": 5700 }, { "epoch": 1.148901873866613, "grad_norm": 0.04008246585726738, "learning_rate": 9.142154910778578e-05, "loss": 0.2234, "step": 5702 }, { "epoch": 1.149304855933911, "grad_norm": 0.04848853126168251, "learning_rate": 9.141408357334744e-05, "loss": 0.2334, "step": 5704 }, { "epoch": 1.149707838001209, "grad_norm": 0.09612215310335159, "learning_rate": 9.140661509691885e-05, "loss": 0.1858, "step": 5706 }, { "epoch": 1.150110820068507, "grad_norm": 0.048193544149398804, "learning_rate": 9.139914367903053e-05, "loss": 0.183, "step": 5708 }, { "epoch": 1.150513802135805, "grad_norm": 0.04777916520833969, "learning_rate": 9.139166932021326e-05, "loss": 0.1848, "step": 5710 }, { "epoch": 1.150916784203103, "grad_norm": 0.03997446224093437, "learning_rate": 9.1384192020998e-05, "loss": 0.1549, "step": 5712 }, { "epoch": 1.151319766270401, "grad_norm": 0.053762342780828476, "learning_rate": 9.137671178191592e-05, "loss": 0.1862, "step": 5714 }, { "epoch": 1.151722748337699, "grad_norm": 0.03450224921107292, "learning_rate": 9.13692286034984e-05, "loss": 0.154, "step": 5716 }, { "epoch": 1.152125730404997, "grad_norm": 0.05368155613541603, "learning_rate": 9.136174248627703e-05, "loss": 0.2018, "step": 5718 }, { "epoch": 1.152528712472295, "grad_norm": 0.043950121849775314, "learning_rate": 9.135425343078364e-05, "loss": 0.1567, "step": 5720 }, { "epoch": 1.152931694539593, "grad_norm": 0.06447681039571762, "learning_rate": 9.134676143755022e-05, "loss": 0.1946, "step": 5722 }, { "epoch": 1.153334676606891, "grad_norm": 0.051290884613990784, "learning_rate": 9.133926650710898e-05, "loss": 0.2268, "step": 5724 }, { "epoch": 1.153737658674189, "grad_norm": 0.060127224773168564, "learning_rate": 9.133176863999238e-05, "loss": 0.2142, "step": 5726 }, { "epoch": 1.154140640741487, "grad_norm": 0.040788453072309494, "learning_rate": 9.132426783673303e-05, "loss": 0.1696, "step": 5728 }, { "epoch": 1.154543622808785, "grad_norm": 0.05721386522054672, "learning_rate": 9.131676409786379e-05, "loss": 0.2706, "step": 5730 }, { "epoch": 1.154946604876083, "grad_norm": 0.04448957368731499, "learning_rate": 9.130925742391767e-05, "loss": 0.1767, "step": 5732 }, { "epoch": 1.155349586943381, "grad_norm": 0.03977445885539055, "learning_rate": 9.1301747815428e-05, "loss": 0.1781, "step": 5734 }, { "epoch": 1.1557525690106791, "grad_norm": 0.044697824865579605, "learning_rate": 9.12942352729282e-05, "loss": 0.1693, "step": 5736 }, { "epoch": 1.156155551077977, "grad_norm": 0.06158357486128807, "learning_rate": 9.128671979695198e-05, "loss": 0.1666, "step": 5738 }, { "epoch": 1.156558533145275, "grad_norm": 0.05893006548285484, "learning_rate": 9.127920138803321e-05, "loss": 0.1851, "step": 5740 }, { "epoch": 1.156961515212573, "grad_norm": 0.04412064328789711, "learning_rate": 9.127168004670599e-05, "loss": 0.1425, "step": 5742 }, { "epoch": 1.1573644972798711, "grad_norm": 0.04011744633316994, "learning_rate": 9.126415577350461e-05, "loss": 0.2082, "step": 5744 }, { "epoch": 1.157767479347169, "grad_norm": 0.05388535559177399, "learning_rate": 9.125662856896362e-05, "loss": 0.1851, "step": 5746 }, { "epoch": 1.158170461414467, "grad_norm": 0.049892496317625046, "learning_rate": 9.12490984336177e-05, "loss": 0.2151, "step": 5748 }, { "epoch": 1.1585734434817652, "grad_norm": 0.046070702373981476, "learning_rate": 9.12415653680018e-05, "loss": 0.1925, "step": 5750 }, { "epoch": 1.158976425549063, "grad_norm": 0.04378350451588631, "learning_rate": 9.123402937265104e-05, "loss": 0.2271, "step": 5752 }, { "epoch": 1.159379407616361, "grad_norm": 0.04043138027191162, "learning_rate": 9.12264904481008e-05, "loss": 0.1627, "step": 5754 }, { "epoch": 1.159782389683659, "grad_norm": 0.06803450733423233, "learning_rate": 9.12189485948866e-05, "loss": 0.2098, "step": 5756 }, { "epoch": 1.1601853717509572, "grad_norm": 0.0573253408074379, "learning_rate": 9.12114038135442e-05, "loss": 0.1875, "step": 5758 }, { "epoch": 1.160588353818255, "grad_norm": 0.049632567912340164, "learning_rate": 9.12038561046096e-05, "loss": 0.2009, "step": 5760 }, { "epoch": 1.160991335885553, "grad_norm": 0.04781503975391388, "learning_rate": 9.119630546861895e-05, "loss": 0.2163, "step": 5762 }, { "epoch": 1.1613943179528512, "grad_norm": 0.04843660071492195, "learning_rate": 9.118875190610865e-05, "loss": 0.1457, "step": 5764 }, { "epoch": 1.1617973000201491, "grad_norm": 0.06017361208796501, "learning_rate": 9.118119541761527e-05, "loss": 0.1663, "step": 5766 }, { "epoch": 1.162200282087447, "grad_norm": 0.057248305529356, "learning_rate": 9.117363600367566e-05, "loss": 0.2276, "step": 5768 }, { "epoch": 1.162603264154745, "grad_norm": 0.07495728135108948, "learning_rate": 9.116607366482676e-05, "loss": 0.1746, "step": 5770 }, { "epoch": 1.1630062462220432, "grad_norm": 0.061165809631347656, "learning_rate": 9.115850840160583e-05, "loss": 0.2502, "step": 5772 }, { "epoch": 1.1634092282893411, "grad_norm": 0.042846087366342545, "learning_rate": 9.11509402145503e-05, "loss": 0.2142, "step": 5774 }, { "epoch": 1.163812210356639, "grad_norm": 0.06059703230857849, "learning_rate": 9.114336910419779e-05, "loss": 0.2013, "step": 5776 }, { "epoch": 1.1642151924239372, "grad_norm": 0.045994676649570465, "learning_rate": 9.113579507108612e-05, "loss": 0.2065, "step": 5778 }, { "epoch": 1.1646181744912352, "grad_norm": 0.059071313589811325, "learning_rate": 9.112821811575336e-05, "loss": 0.1969, "step": 5780 }, { "epoch": 1.165021156558533, "grad_norm": 0.050841737538576126, "learning_rate": 9.112063823873776e-05, "loss": 0.1471, "step": 5782 }, { "epoch": 1.165424138625831, "grad_norm": 0.056296203285455704, "learning_rate": 9.11130554405778e-05, "loss": 0.2354, "step": 5784 }, { "epoch": 1.1658271206931292, "grad_norm": 0.040048208087682724, "learning_rate": 9.110546972181211e-05, "loss": 0.2454, "step": 5786 }, { "epoch": 1.1662301027604272, "grad_norm": 0.046090155839920044, "learning_rate": 9.109788108297959e-05, "loss": 0.1594, "step": 5788 }, { "epoch": 1.166633084827725, "grad_norm": 0.05453578010201454, "learning_rate": 9.109028952461934e-05, "loss": 0.2359, "step": 5790 }, { "epoch": 1.1670360668950233, "grad_norm": 0.039822839200496674, "learning_rate": 9.108269504727063e-05, "loss": 0.1897, "step": 5792 }, { "epoch": 1.1674390489623212, "grad_norm": 0.054770614951848984, "learning_rate": 9.107509765147294e-05, "loss": 0.1906, "step": 5794 }, { "epoch": 1.1678420310296191, "grad_norm": 0.0513848178088665, "learning_rate": 9.106749733776605e-05, "loss": 0.1986, "step": 5796 }, { "epoch": 1.168245013096917, "grad_norm": 0.04624282568693161, "learning_rate": 9.10598941066898e-05, "loss": 0.2392, "step": 5798 }, { "epoch": 1.1686479951642152, "grad_norm": 0.05323868989944458, "learning_rate": 9.105228795878434e-05, "loss": 0.2303, "step": 5800 }, { "epoch": 1.1690509772315132, "grad_norm": 0.05079076439142227, "learning_rate": 9.104467889458999e-05, "loss": 0.192, "step": 5802 }, { "epoch": 1.1694539592988111, "grad_norm": 0.04578608646988869, "learning_rate": 9.10370669146473e-05, "loss": 0.209, "step": 5804 }, { "epoch": 1.1698569413661093, "grad_norm": 0.04013809934258461, "learning_rate": 9.102945201949701e-05, "loss": 0.1483, "step": 5806 }, { "epoch": 1.1702599234334072, "grad_norm": 0.05745330825448036, "learning_rate": 9.102183420968006e-05, "loss": 0.1694, "step": 5808 }, { "epoch": 1.1706629055007052, "grad_norm": 0.045204054564237595, "learning_rate": 9.101421348573763e-05, "loss": 0.2133, "step": 5810 }, { "epoch": 1.1710658875680031, "grad_norm": 0.0566680021584034, "learning_rate": 9.100658984821105e-05, "loss": 0.199, "step": 5812 }, { "epoch": 1.1714688696353013, "grad_norm": 0.052215367555618286, "learning_rate": 9.09989632976419e-05, "loss": 0.1555, "step": 5814 }, { "epoch": 1.1718718517025992, "grad_norm": 0.051735419780015945, "learning_rate": 9.099133383457196e-05, "loss": 0.2188, "step": 5816 }, { "epoch": 1.1722748337698972, "grad_norm": 0.05079289525747299, "learning_rate": 9.098370145954325e-05, "loss": 0.2182, "step": 5818 }, { "epoch": 1.1726778158371953, "grad_norm": 0.04231395944952965, "learning_rate": 9.097606617309792e-05, "loss": 0.1908, "step": 5820 }, { "epoch": 1.1730807979044933, "grad_norm": 0.05421389639377594, "learning_rate": 9.096842797577838e-05, "loss": 0.2239, "step": 5822 }, { "epoch": 1.1734837799717912, "grad_norm": 0.06685738265514374, "learning_rate": 9.096078686812724e-05, "loss": 0.1902, "step": 5824 }, { "epoch": 1.1738867620390891, "grad_norm": 0.07613111287355423, "learning_rate": 9.095314285068729e-05, "loss": 0.1891, "step": 5826 }, { "epoch": 1.1742897441063873, "grad_norm": 0.04930075258016586, "learning_rate": 9.094549592400156e-05, "loss": 0.2344, "step": 5828 }, { "epoch": 1.1746927261736853, "grad_norm": 0.054598815739154816, "learning_rate": 9.093784608861332e-05, "loss": 0.1918, "step": 5830 }, { "epoch": 1.1750957082409832, "grad_norm": 0.05315352603793144, "learning_rate": 9.093019334506594e-05, "loss": 0.189, "step": 5832 }, { "epoch": 1.1754986903082814, "grad_norm": 0.0629630908370018, "learning_rate": 9.092253769390308e-05, "loss": 0.2278, "step": 5834 }, { "epoch": 1.1759016723755793, "grad_norm": 0.04600967466831207, "learning_rate": 9.09148791356686e-05, "loss": 0.1794, "step": 5836 }, { "epoch": 1.1763046544428772, "grad_norm": 0.049229636788368225, "learning_rate": 9.090721767090654e-05, "loss": 0.2453, "step": 5838 }, { "epoch": 1.1767076365101752, "grad_norm": 0.04404792562127113, "learning_rate": 9.089955330016115e-05, "loss": 0.2238, "step": 5840 }, { "epoch": 1.1771106185774733, "grad_norm": 0.061983443796634674, "learning_rate": 9.089188602397692e-05, "loss": 0.2076, "step": 5842 }, { "epoch": 1.1775136006447713, "grad_norm": 0.06978388875722885, "learning_rate": 9.088421584289848e-05, "loss": 0.1874, "step": 5844 }, { "epoch": 1.1779165827120692, "grad_norm": 0.04500797390937805, "learning_rate": 9.087654275747074e-05, "loss": 0.2087, "step": 5846 }, { "epoch": 1.1783195647793674, "grad_norm": 0.05657447502017021, "learning_rate": 9.086886676823878e-05, "loss": 0.1595, "step": 5848 }, { "epoch": 1.1787225468466653, "grad_norm": 0.0397280678153038, "learning_rate": 9.086118787574787e-05, "loss": 0.1718, "step": 5850 }, { "epoch": 1.1791255289139633, "grad_norm": 0.0470387302339077, "learning_rate": 9.085350608054354e-05, "loss": 0.2014, "step": 5852 }, { "epoch": 1.1795285109812612, "grad_norm": 0.0703502893447876, "learning_rate": 9.084582138317146e-05, "loss": 0.1792, "step": 5854 }, { "epoch": 1.1799314930485594, "grad_norm": 0.03431640937924385, "learning_rate": 9.083813378417756e-05, "loss": 0.1386, "step": 5856 }, { "epoch": 1.1803344751158573, "grad_norm": 0.05815456807613373, "learning_rate": 9.083044328410794e-05, "loss": 0.1662, "step": 5858 }, { "epoch": 1.1807374571831553, "grad_norm": 0.05745692178606987, "learning_rate": 9.082274988350894e-05, "loss": 0.2128, "step": 5860 }, { "epoch": 1.1811404392504534, "grad_norm": 0.049616165459156036, "learning_rate": 9.081505358292707e-05, "loss": 0.1829, "step": 5862 }, { "epoch": 1.1815434213177514, "grad_norm": 0.06410976499319077, "learning_rate": 9.080735438290906e-05, "loss": 0.2012, "step": 5864 }, { "epoch": 1.1819464033850493, "grad_norm": 0.06344123184680939, "learning_rate": 9.079965228400187e-05, "loss": 0.1708, "step": 5866 }, { "epoch": 1.1823493854523472, "grad_norm": 0.04813029244542122, "learning_rate": 9.079194728675261e-05, "loss": 0.191, "step": 5868 }, { "epoch": 1.1827523675196454, "grad_norm": 0.06375063210725784, "learning_rate": 9.078423939170868e-05, "loss": 0.2086, "step": 5870 }, { "epoch": 1.1831553495869434, "grad_norm": 0.06699992716312408, "learning_rate": 9.077652859941759e-05, "loss": 0.2428, "step": 5872 }, { "epoch": 1.1835583316542413, "grad_norm": 0.08780429512262344, "learning_rate": 9.076881491042711e-05, "loss": 0.2224, "step": 5874 }, { "epoch": 1.1839613137215395, "grad_norm": 0.05629020184278488, "learning_rate": 9.076109832528523e-05, "loss": 0.1999, "step": 5876 }, { "epoch": 1.1843642957888374, "grad_norm": 0.0460069477558136, "learning_rate": 9.075337884454012e-05, "loss": 0.1994, "step": 5878 }, { "epoch": 1.1847672778561353, "grad_norm": 0.06257259100675583, "learning_rate": 9.074565646874014e-05, "loss": 0.2424, "step": 5880 }, { "epoch": 1.1851702599234335, "grad_norm": 0.04899228736758232, "learning_rate": 9.07379311984339e-05, "loss": 0.2154, "step": 5882 }, { "epoch": 1.1855732419907314, "grad_norm": 0.048573628067970276, "learning_rate": 9.073020303417017e-05, "loss": 0.164, "step": 5884 }, { "epoch": 1.1859762240580294, "grad_norm": 0.05294910818338394, "learning_rate": 9.072247197649795e-05, "loss": 0.2525, "step": 5886 }, { "epoch": 1.1863792061253275, "grad_norm": 0.05634959414601326, "learning_rate": 9.071473802596646e-05, "loss": 0.1868, "step": 5888 }, { "epoch": 1.1867821881926255, "grad_norm": 0.05087854340672493, "learning_rate": 9.07070011831251e-05, "loss": 0.2566, "step": 5890 }, { "epoch": 1.1871851702599234, "grad_norm": 0.04623530060052872, "learning_rate": 9.069926144852346e-05, "loss": 0.2095, "step": 5892 }, { "epoch": 1.1875881523272214, "grad_norm": 0.0693693682551384, "learning_rate": 9.069151882271139e-05, "loss": 0.2475, "step": 5894 }, { "epoch": 1.1879911343945195, "grad_norm": 0.04673139005899429, "learning_rate": 9.068377330623887e-05, "loss": 0.2004, "step": 5896 }, { "epoch": 1.1883941164618175, "grad_norm": 0.056024160236120224, "learning_rate": 9.067602489965619e-05, "loss": 0.1873, "step": 5898 }, { "epoch": 1.1887970985291154, "grad_norm": 0.04089471325278282, "learning_rate": 9.066827360351373e-05, "loss": 0.1676, "step": 5900 }, { "epoch": 1.1892000805964136, "grad_norm": 0.0679912269115448, "learning_rate": 9.066051941836218e-05, "loss": 0.189, "step": 5902 }, { "epoch": 1.1896030626637115, "grad_norm": 0.04771299287676811, "learning_rate": 9.065276234475233e-05, "loss": 0.1804, "step": 5904 }, { "epoch": 1.1900060447310095, "grad_norm": 0.04200606793165207, "learning_rate": 9.064500238323528e-05, "loss": 0.1495, "step": 5906 }, { "epoch": 1.1904090267983074, "grad_norm": 0.05432324483990669, "learning_rate": 9.063723953436225e-05, "loss": 0.1868, "step": 5908 }, { "epoch": 1.1908120088656056, "grad_norm": 0.05804635211825371, "learning_rate": 9.062947379868472e-05, "loss": 0.216, "step": 5910 }, { "epoch": 1.1912149909329035, "grad_norm": 0.04404330998659134, "learning_rate": 9.062170517675434e-05, "loss": 0.2023, "step": 5912 }, { "epoch": 1.1916179730002014, "grad_norm": 0.06136954203248024, "learning_rate": 9.061393366912298e-05, "loss": 0.2247, "step": 5914 }, { "epoch": 1.1920209550674996, "grad_norm": 0.054422929883003235, "learning_rate": 9.060615927634275e-05, "loss": 0.1607, "step": 5916 }, { "epoch": 1.1924239371347976, "grad_norm": 0.06473580747842789, "learning_rate": 9.059838199896588e-05, "loss": 0.2242, "step": 5918 }, { "epoch": 1.1928269192020955, "grad_norm": 0.05070899799466133, "learning_rate": 9.05906018375449e-05, "loss": 0.1601, "step": 5920 }, { "epoch": 1.1932299012693934, "grad_norm": 0.04901570454239845, "learning_rate": 9.058281879263247e-05, "loss": 0.1697, "step": 5922 }, { "epoch": 1.1936328833366916, "grad_norm": 0.04864586517214775, "learning_rate": 9.05750328647815e-05, "loss": 0.1868, "step": 5924 }, { "epoch": 1.1940358654039895, "grad_norm": 0.044245872646570206, "learning_rate": 9.056724405454509e-05, "loss": 0.201, "step": 5926 }, { "epoch": 1.1944388474712875, "grad_norm": 0.050298575311899185, "learning_rate": 9.055945236247654e-05, "loss": 0.1613, "step": 5928 }, { "epoch": 1.1948418295385856, "grad_norm": 0.054503366351127625, "learning_rate": 9.055165778912934e-05, "loss": 0.2154, "step": 5930 }, { "epoch": 1.1952448116058836, "grad_norm": 0.04412766918540001, "learning_rate": 9.054386033505724e-05, "loss": 0.1519, "step": 5932 }, { "epoch": 1.1956477936731815, "grad_norm": 0.03437204286456108, "learning_rate": 9.053606000081413e-05, "loss": 0.1507, "step": 5934 }, { "epoch": 1.1960507757404795, "grad_norm": 0.05740992724895477, "learning_rate": 9.052825678695417e-05, "loss": 0.1893, "step": 5936 }, { "epoch": 1.1964537578077776, "grad_norm": 0.06025834009051323, "learning_rate": 9.052045069403165e-05, "loss": 0.2163, "step": 5938 }, { "epoch": 1.1968567398750756, "grad_norm": 0.06567951291799545, "learning_rate": 9.05126417226011e-05, "loss": 0.182, "step": 5940 }, { "epoch": 1.1972597219423735, "grad_norm": 0.04909198358654976, "learning_rate": 9.050482987321729e-05, "loss": 0.1967, "step": 5942 }, { "epoch": 1.1976627040096717, "grad_norm": 0.0788033977150917, "learning_rate": 9.049701514643514e-05, "loss": 0.2236, "step": 5944 }, { "epoch": 1.1980656860769696, "grad_norm": 0.05334840714931488, "learning_rate": 9.04891975428098e-05, "loss": 0.1899, "step": 5946 }, { "epoch": 1.1984686681442676, "grad_norm": 0.05208491533994675, "learning_rate": 9.048137706289662e-05, "loss": 0.1837, "step": 5948 }, { "epoch": 1.1988716502115655, "grad_norm": 0.05720973014831543, "learning_rate": 9.047355370725115e-05, "loss": 0.2488, "step": 5950 }, { "epoch": 1.1992746322788637, "grad_norm": 0.04644571989774704, "learning_rate": 9.046572747642916e-05, "loss": 0.2018, "step": 5952 }, { "epoch": 1.1996776143461616, "grad_norm": 0.04885132610797882, "learning_rate": 9.045789837098659e-05, "loss": 0.217, "step": 5954 }, { "epoch": 1.2000805964134595, "grad_norm": 0.053192708641290665, "learning_rate": 9.045006639147964e-05, "loss": 0.1962, "step": 5956 }, { "epoch": 1.2004835784807577, "grad_norm": 0.04022448509931564, "learning_rate": 9.044223153846466e-05, "loss": 0.1872, "step": 5958 }, { "epoch": 1.2008865605480556, "grad_norm": 0.046723004430532455, "learning_rate": 9.043439381249823e-05, "loss": 0.2085, "step": 5960 }, { "epoch": 1.2012895426153536, "grad_norm": 0.058833975344896317, "learning_rate": 9.042655321413712e-05, "loss": 0.214, "step": 5962 }, { "epoch": 1.2016925246826515, "grad_norm": 0.05478130653500557, "learning_rate": 9.041870974393832e-05, "loss": 0.2313, "step": 5964 }, { "epoch": 1.2020955067499497, "grad_norm": 0.0757334753870964, "learning_rate": 9.041086340245904e-05, "loss": 0.1936, "step": 5966 }, { "epoch": 1.2024984888172476, "grad_norm": 0.04867379739880562, "learning_rate": 9.040301419025663e-05, "loss": 0.1978, "step": 5968 }, { "epoch": 1.2029014708845456, "grad_norm": 0.04826593026518822, "learning_rate": 9.039516210788872e-05, "loss": 0.1717, "step": 5970 }, { "epoch": 1.2033044529518437, "grad_norm": 0.05275225266814232, "learning_rate": 9.038730715591308e-05, "loss": 0.1761, "step": 5972 }, { "epoch": 1.2037074350191417, "grad_norm": 0.05469043180346489, "learning_rate": 9.037944933488776e-05, "loss": 0.2534, "step": 5974 }, { "epoch": 1.2041104170864396, "grad_norm": 0.040808357298374176, "learning_rate": 9.03715886453709e-05, "loss": 0.1671, "step": 5976 }, { "epoch": 1.2045133991537376, "grad_norm": 0.050247691571712494, "learning_rate": 9.036372508792097e-05, "loss": 0.1579, "step": 5978 }, { "epoch": 1.2049163812210357, "grad_norm": 0.05738181248307228, "learning_rate": 9.035585866309656e-05, "loss": 0.1743, "step": 5980 }, { "epoch": 1.2053193632883337, "grad_norm": 0.04823550209403038, "learning_rate": 9.034798937145649e-05, "loss": 0.1769, "step": 5982 }, { "epoch": 1.2057223453556316, "grad_norm": 0.045579053461551666, "learning_rate": 9.034011721355977e-05, "loss": 0.1475, "step": 5984 }, { "epoch": 1.2061253274229298, "grad_norm": 0.055633220821619034, "learning_rate": 9.033224218996565e-05, "loss": 0.2113, "step": 5986 }, { "epoch": 1.2065283094902277, "grad_norm": 0.05092507600784302, "learning_rate": 9.032436430123355e-05, "loss": 0.2388, "step": 5988 }, { "epoch": 1.2069312915575257, "grad_norm": 0.07090190052986145, "learning_rate": 9.031648354792309e-05, "loss": 0.1895, "step": 5990 }, { "epoch": 1.2073342736248236, "grad_norm": 0.03843579441308975, "learning_rate": 9.030859993059413e-05, "loss": 0.156, "step": 5992 }, { "epoch": 1.2077372556921218, "grad_norm": 0.04407760500907898, "learning_rate": 9.030071344980668e-05, "loss": 0.1831, "step": 5994 }, { "epoch": 1.2081402377594197, "grad_norm": 0.049290142953395844, "learning_rate": 9.0292824106121e-05, "loss": 0.2357, "step": 5996 }, { "epoch": 1.2085432198267176, "grad_norm": 0.05240347981452942, "learning_rate": 9.028493190009754e-05, "loss": 0.2431, "step": 5998 }, { "epoch": 1.2089462018940158, "grad_norm": 0.044982656836509705, "learning_rate": 9.027703683229694e-05, "loss": 0.1933, "step": 6000 }, { "epoch": 1.2093491839613137, "grad_norm": 0.08840004354715347, "learning_rate": 9.026913890328004e-05, "loss": 0.1292, "step": 6002 }, { "epoch": 1.2097521660286117, "grad_norm": 0.04240783676505089, "learning_rate": 9.026123811360794e-05, "loss": 0.2281, "step": 6004 }, { "epoch": 1.2101551480959096, "grad_norm": 0.05913504585623741, "learning_rate": 9.025333446384187e-05, "loss": 0.2185, "step": 6006 }, { "epoch": 1.2105581301632078, "grad_norm": 0.04633234441280365, "learning_rate": 9.024542795454328e-05, "loss": 0.1623, "step": 6008 }, { "epoch": 1.2109611122305057, "grad_norm": 0.03760692849755287, "learning_rate": 9.023751858627387e-05, "loss": 0.1773, "step": 6010 }, { "epoch": 1.2113640942978037, "grad_norm": 0.06671061366796494, "learning_rate": 9.022960635959548e-05, "loss": 0.1771, "step": 6012 }, { "epoch": 1.2117670763651018, "grad_norm": 0.06102992966771126, "learning_rate": 9.022169127507019e-05, "loss": 0.2618, "step": 6014 }, { "epoch": 1.2121700584323998, "grad_norm": 0.05439795181155205, "learning_rate": 9.021377333326027e-05, "loss": 0.1914, "step": 6016 }, { "epoch": 1.2125730404996977, "grad_norm": 0.039903391152620316, "learning_rate": 9.020585253472822e-05, "loss": 0.1595, "step": 6018 }, { "epoch": 1.2129760225669957, "grad_norm": 0.056906476616859436, "learning_rate": 9.019792888003671e-05, "loss": 0.2301, "step": 6020 }, { "epoch": 1.2133790046342938, "grad_norm": 0.05966992676258087, "learning_rate": 9.019000236974859e-05, "loss": 0.1885, "step": 6022 }, { "epoch": 1.2137819867015918, "grad_norm": 0.04310747981071472, "learning_rate": 9.0182073004427e-05, "loss": 0.1629, "step": 6024 }, { "epoch": 1.2141849687688897, "grad_norm": 0.05637119710445404, "learning_rate": 9.01741407846352e-05, "loss": 0.1677, "step": 6026 }, { "epoch": 1.2145879508361879, "grad_norm": 0.06048206612467766, "learning_rate": 9.01662057109367e-05, "loss": 0.1987, "step": 6028 }, { "epoch": 1.2149909329034858, "grad_norm": 0.06276671588420868, "learning_rate": 9.015826778389517e-05, "loss": 0.1893, "step": 6030 }, { "epoch": 1.2153939149707838, "grad_norm": 0.05327733978629112, "learning_rate": 9.015032700407452e-05, "loss": 0.2143, "step": 6032 }, { "epoch": 1.2157968970380817, "grad_norm": 0.06122863292694092, "learning_rate": 9.014238337203885e-05, "loss": 0.2751, "step": 6034 }, { "epoch": 1.2161998791053799, "grad_norm": 0.043489329516887665, "learning_rate": 9.013443688835246e-05, "loss": 0.1712, "step": 6036 }, { "epoch": 1.2166028611726778, "grad_norm": 0.04354552924633026, "learning_rate": 9.012648755357986e-05, "loss": 0.2167, "step": 6038 }, { "epoch": 1.2170058432399757, "grad_norm": 0.0550360232591629, "learning_rate": 9.011853536828576e-05, "loss": 0.1874, "step": 6040 }, { "epoch": 1.217408825307274, "grad_norm": 0.06048135086894035, "learning_rate": 9.011058033303508e-05, "loss": 0.1952, "step": 6042 }, { "epoch": 1.2178118073745718, "grad_norm": 0.05317499861121178, "learning_rate": 9.010262244839292e-05, "loss": 0.1439, "step": 6044 }, { "epoch": 1.2182147894418698, "grad_norm": 0.04616203531622887, "learning_rate": 9.009466171492458e-05, "loss": 0.1396, "step": 6046 }, { "epoch": 1.2186177715091677, "grad_norm": 0.059253908693790436, "learning_rate": 9.008669813319559e-05, "loss": 0.2175, "step": 6048 }, { "epoch": 1.2190207535764659, "grad_norm": 0.047273196280002594, "learning_rate": 9.00787317037717e-05, "loss": 0.1592, "step": 6050 }, { "epoch": 1.2194237356437638, "grad_norm": 0.050518494099378586, "learning_rate": 9.007076242721878e-05, "loss": 0.2352, "step": 6052 }, { "epoch": 1.2198267177110618, "grad_norm": 0.0745004266500473, "learning_rate": 9.006279030410298e-05, "loss": 0.2386, "step": 6054 }, { "epoch": 1.22022969977836, "grad_norm": 0.06451984494924545, "learning_rate": 9.005481533499065e-05, "loss": 0.2335, "step": 6056 }, { "epoch": 1.2206326818456579, "grad_norm": 0.06747965514659882, "learning_rate": 9.004683752044828e-05, "loss": 0.268, "step": 6058 }, { "epoch": 1.2210356639129558, "grad_norm": 0.05588728189468384, "learning_rate": 9.003885686104262e-05, "loss": 0.2193, "step": 6060 }, { "epoch": 1.2214386459802538, "grad_norm": 0.05268767848610878, "learning_rate": 9.00308733573406e-05, "loss": 0.2084, "step": 6062 }, { "epoch": 1.221841628047552, "grad_norm": 0.04871091991662979, "learning_rate": 9.002288700990937e-05, "loss": 0.1493, "step": 6064 }, { "epoch": 1.2222446101148499, "grad_norm": 0.06996570527553558, "learning_rate": 9.001489781931624e-05, "loss": 0.2083, "step": 6066 }, { "epoch": 1.2226475921821478, "grad_norm": 0.04460399970412254, "learning_rate": 9.000690578612877e-05, "loss": 0.1949, "step": 6068 }, { "epoch": 1.223050574249446, "grad_norm": 0.05490953475236893, "learning_rate": 8.999891091091469e-05, "loss": 0.2582, "step": 6070 }, { "epoch": 1.223453556316744, "grad_norm": 0.0709412470459938, "learning_rate": 8.999091319424196e-05, "loss": 0.159, "step": 6072 }, { "epoch": 1.2238565383840418, "grad_norm": 0.0431116484105587, "learning_rate": 8.998291263667869e-05, "loss": 0.1924, "step": 6074 }, { "epoch": 1.22425952045134, "grad_norm": 0.03946799412369728, "learning_rate": 8.997490923879327e-05, "loss": 0.1695, "step": 6076 }, { "epoch": 1.224662502518638, "grad_norm": 0.05423908308148384, "learning_rate": 8.996690300115422e-05, "loss": 0.2, "step": 6078 }, { "epoch": 1.225065484585936, "grad_norm": 0.044834546744823456, "learning_rate": 8.99588939243303e-05, "loss": 0.2225, "step": 6080 }, { "epoch": 1.2254684666532338, "grad_norm": 0.05637017637491226, "learning_rate": 8.995088200889046e-05, "loss": 0.2222, "step": 6082 }, { "epoch": 1.225871448720532, "grad_norm": 0.0475136823952198, "learning_rate": 8.994286725540384e-05, "loss": 0.1824, "step": 6084 }, { "epoch": 1.22627443078783, "grad_norm": 0.045652661472558975, "learning_rate": 8.993484966443984e-05, "loss": 0.2257, "step": 6086 }, { "epoch": 1.2266774128551279, "grad_norm": 0.052571386098861694, "learning_rate": 8.992682923656797e-05, "loss": 0.204, "step": 6088 }, { "epoch": 1.227080394922426, "grad_norm": 0.056858647614717484, "learning_rate": 8.9918805972358e-05, "loss": 0.2115, "step": 6090 }, { "epoch": 1.227483376989724, "grad_norm": 0.05734502896666527, "learning_rate": 8.991077987237989e-05, "loss": 0.2268, "step": 6092 }, { "epoch": 1.227886359057022, "grad_norm": 0.04611526057124138, "learning_rate": 8.990275093720381e-05, "loss": 0.1692, "step": 6094 }, { "epoch": 1.22828934112432, "grad_norm": 0.03183615952730179, "learning_rate": 8.989471916740013e-05, "loss": 0.1486, "step": 6096 }, { "epoch": 1.228692323191618, "grad_norm": 0.048749230802059174, "learning_rate": 8.988668456353939e-05, "loss": 0.2033, "step": 6098 }, { "epoch": 1.229095305258916, "grad_norm": 0.04780949279665947, "learning_rate": 8.987864712619238e-05, "loss": 0.2068, "step": 6100 }, { "epoch": 1.229498287326214, "grad_norm": 0.044503167271614075, "learning_rate": 8.987060685593006e-05, "loss": 0.2014, "step": 6102 }, { "epoch": 1.229901269393512, "grad_norm": 0.044010862708091736, "learning_rate": 8.986256375332355e-05, "loss": 0.1874, "step": 6104 }, { "epoch": 1.23030425146081, "grad_norm": 0.049143869429826736, "learning_rate": 8.98545178189443e-05, "loss": 0.1811, "step": 6106 }, { "epoch": 1.230707233528108, "grad_norm": 0.05140808969736099, "learning_rate": 8.984646905336384e-05, "loss": 0.1646, "step": 6108 }, { "epoch": 1.2311102155954061, "grad_norm": 0.0606461800634861, "learning_rate": 8.983841745715393e-05, "loss": 0.2466, "step": 6110 }, { "epoch": 1.231513197662704, "grad_norm": 0.046162523329257965, "learning_rate": 8.983036303088656e-05, "loss": 0.2057, "step": 6112 }, { "epoch": 1.231916179730002, "grad_norm": 0.04632925987243652, "learning_rate": 8.982230577513391e-05, "loss": 0.2076, "step": 6114 }, { "epoch": 1.2323191617973, "grad_norm": 0.04459146410226822, "learning_rate": 8.981424569046834e-05, "loss": 0.2181, "step": 6116 }, { "epoch": 1.232722143864598, "grad_norm": 0.05720078572630882, "learning_rate": 8.980618277746242e-05, "loss": 0.2284, "step": 6118 }, { "epoch": 1.233125125931896, "grad_norm": 0.0529584176838398, "learning_rate": 8.979811703668894e-05, "loss": 0.2177, "step": 6120 }, { "epoch": 1.233528107999194, "grad_norm": 0.0538485161960125, "learning_rate": 8.979004846872088e-05, "loss": 0.2025, "step": 6122 }, { "epoch": 1.2339310900664922, "grad_norm": 0.06908518821001053, "learning_rate": 8.97819770741314e-05, "loss": 0.1802, "step": 6124 }, { "epoch": 1.23433407213379, "grad_norm": 0.0569969080388546, "learning_rate": 8.977390285349391e-05, "loss": 0.1983, "step": 6126 }, { "epoch": 1.234737054201088, "grad_norm": 0.07259988784790039, "learning_rate": 8.976582580738195e-05, "loss": 0.2202, "step": 6128 }, { "epoch": 1.235140036268386, "grad_norm": 0.04598740115761757, "learning_rate": 8.975774593636933e-05, "loss": 0.1815, "step": 6130 }, { "epoch": 1.2355430183356841, "grad_norm": 0.051774248480796814, "learning_rate": 8.974966324103002e-05, "loss": 0.2285, "step": 6132 }, { "epoch": 1.235946000402982, "grad_norm": 0.05057157576084137, "learning_rate": 8.974157772193821e-05, "loss": 0.2287, "step": 6134 }, { "epoch": 1.23634898247028, "grad_norm": 0.06012137979269028, "learning_rate": 8.973348937966826e-05, "loss": 0.2513, "step": 6136 }, { "epoch": 1.2367519645375782, "grad_norm": 0.04695823788642883, "learning_rate": 8.972539821479478e-05, "loss": 0.1839, "step": 6138 }, { "epoch": 1.2371549466048761, "grad_norm": 0.0549938790500164, "learning_rate": 8.971730422789255e-05, "loss": 0.2404, "step": 6140 }, { "epoch": 1.237557928672174, "grad_norm": 0.04319198429584503, "learning_rate": 8.970920741953652e-05, "loss": 0.2047, "step": 6142 }, { "epoch": 1.237960910739472, "grad_norm": 0.04579996317625046, "learning_rate": 8.970110779030193e-05, "loss": 0.1764, "step": 6144 }, { "epoch": 1.2383638928067702, "grad_norm": 0.04213905707001686, "learning_rate": 8.969300534076412e-05, "loss": 0.2204, "step": 6146 }, { "epoch": 1.2387668748740681, "grad_norm": 0.04355452582240105, "learning_rate": 8.96849000714987e-05, "loss": 0.1718, "step": 6148 }, { "epoch": 1.239169856941366, "grad_norm": 0.041120558977127075, "learning_rate": 8.967679198308144e-05, "loss": 0.1902, "step": 6150 }, { "epoch": 1.2395728390086642, "grad_norm": 0.04531228542327881, "learning_rate": 8.966868107608832e-05, "loss": 0.1738, "step": 6152 }, { "epoch": 1.2399758210759622, "grad_norm": 0.04133576154708862, "learning_rate": 8.966056735109555e-05, "loss": 0.2365, "step": 6154 }, { "epoch": 1.24037880314326, "grad_norm": 0.05090603604912758, "learning_rate": 8.96524508086795e-05, "loss": 0.1798, "step": 6156 }, { "epoch": 1.240781785210558, "grad_norm": 0.044529419392347336, "learning_rate": 8.964433144941675e-05, "loss": 0.2278, "step": 6158 }, { "epoch": 1.2411847672778562, "grad_norm": 0.04503866657614708, "learning_rate": 8.963620927388412e-05, "loss": 0.1504, "step": 6160 }, { "epoch": 1.2415877493451541, "grad_norm": 0.04498367756605148, "learning_rate": 8.962808428265855e-05, "loss": 0.1835, "step": 6162 }, { "epoch": 1.241990731412452, "grad_norm": 0.06172780320048332, "learning_rate": 8.961995647631724e-05, "loss": 0.1891, "step": 6164 }, { "epoch": 1.2423937134797503, "grad_norm": 0.0528985895216465, "learning_rate": 8.961182585543762e-05, "loss": 0.2107, "step": 6166 }, { "epoch": 1.2427966955470482, "grad_norm": 0.03874331712722778, "learning_rate": 8.960369242059721e-05, "loss": 0.2512, "step": 6168 }, { "epoch": 1.2431996776143461, "grad_norm": 0.06559088826179504, "learning_rate": 8.959555617237383e-05, "loss": 0.2369, "step": 6170 }, { "epoch": 1.243602659681644, "grad_norm": 0.0783800482749939, "learning_rate": 8.958741711134548e-05, "loss": 0.2538, "step": 6172 }, { "epoch": 1.2440056417489422, "grad_norm": 0.04903315007686615, "learning_rate": 8.957927523809033e-05, "loss": 0.1955, "step": 6174 }, { "epoch": 1.2444086238162402, "grad_norm": 0.0630233958363533, "learning_rate": 8.957113055318674e-05, "loss": 0.2164, "step": 6176 }, { "epoch": 1.2448116058835381, "grad_norm": 0.09295551478862762, "learning_rate": 8.956298305721333e-05, "loss": 0.2149, "step": 6178 }, { "epoch": 1.2452145879508363, "grad_norm": 0.05847977474331856, "learning_rate": 8.95548327507489e-05, "loss": 0.2074, "step": 6180 }, { "epoch": 1.2456175700181342, "grad_norm": 0.04421735554933548, "learning_rate": 8.954667963437238e-05, "loss": 0.1598, "step": 6182 }, { "epoch": 1.2460205520854322, "grad_norm": 0.046964842826128006, "learning_rate": 8.953852370866299e-05, "loss": 0.158, "step": 6184 }, { "epoch": 1.24642353415273, "grad_norm": 0.03633161261677742, "learning_rate": 8.95303649742001e-05, "loss": 0.1394, "step": 6186 }, { "epoch": 1.2468265162200283, "grad_norm": 0.061852145940065384, "learning_rate": 8.952220343156332e-05, "loss": 0.1814, "step": 6188 }, { "epoch": 1.2472294982873262, "grad_norm": 0.045274149626493454, "learning_rate": 8.951403908133242e-05, "loss": 0.2286, "step": 6190 }, { "epoch": 1.2476324803546242, "grad_norm": 0.05303020775318146, "learning_rate": 8.950587192408737e-05, "loss": 0.2134, "step": 6192 }, { "epoch": 1.2480354624219223, "grad_norm": 0.05261223763227463, "learning_rate": 8.949770196040834e-05, "loss": 0.1796, "step": 6194 }, { "epoch": 1.2484384444892203, "grad_norm": 0.051110610365867615, "learning_rate": 8.948952919087575e-05, "loss": 0.1984, "step": 6196 }, { "epoch": 1.2488414265565182, "grad_norm": 0.05076858028769493, "learning_rate": 8.948135361607016e-05, "loss": 0.2459, "step": 6198 }, { "epoch": 1.2492444086238161, "grad_norm": 0.06023627519607544, "learning_rate": 8.947317523657235e-05, "loss": 0.1624, "step": 6200 }, { "epoch": 1.2496473906911143, "grad_norm": 0.0513908714056015, "learning_rate": 8.946499405296328e-05, "loss": 0.209, "step": 6202 }, { "epoch": 1.2500503727584122, "grad_norm": 0.04183659330010414, "learning_rate": 8.945681006582419e-05, "loss": 0.2022, "step": 6204 }, { "epoch": 1.2504533548257102, "grad_norm": 0.068643718957901, "learning_rate": 8.944862327573638e-05, "loss": 0.1795, "step": 6206 }, { "epoch": 1.2508563368930083, "grad_norm": 0.05124150961637497, "learning_rate": 8.944043368328145e-05, "loss": 0.1887, "step": 6208 }, { "epoch": 1.2512593189603063, "grad_norm": 0.048066116869449615, "learning_rate": 8.943224128904122e-05, "loss": 0.1641, "step": 6210 }, { "epoch": 1.2516623010276042, "grad_norm": 0.06339994817972183, "learning_rate": 8.942404609359761e-05, "loss": 0.1863, "step": 6212 }, { "epoch": 1.2520652830949022, "grad_norm": 0.04368380457162857, "learning_rate": 8.941584809753283e-05, "loss": 0.1979, "step": 6214 }, { "epoch": 1.2524682651622003, "grad_norm": 0.05009367689490318, "learning_rate": 8.940764730142922e-05, "loss": 0.2354, "step": 6216 }, { "epoch": 1.2528712472294983, "grad_norm": 0.058022335171699524, "learning_rate": 8.939944370586938e-05, "loss": 0.2433, "step": 6218 }, { "epoch": 1.2532742292967962, "grad_norm": 0.06911885738372803, "learning_rate": 8.939123731143606e-05, "loss": 0.1599, "step": 6220 }, { "epoch": 1.2536772113640944, "grad_norm": 0.04500617831945419, "learning_rate": 8.938302811871225e-05, "loss": 0.1972, "step": 6222 }, { "epoch": 1.2540801934313923, "grad_norm": 0.04577852040529251, "learning_rate": 8.93748161282811e-05, "loss": 0.1829, "step": 6224 }, { "epoch": 1.2544831754986903, "grad_norm": 0.05215632542967796, "learning_rate": 8.936660134072599e-05, "loss": 0.1789, "step": 6226 }, { "epoch": 1.2548861575659882, "grad_norm": 0.046846963465213776, "learning_rate": 8.935838375663047e-05, "loss": 0.1722, "step": 6228 }, { "epoch": 1.2552891396332864, "grad_norm": 0.06160841882228851, "learning_rate": 8.935016337657831e-05, "loss": 0.1877, "step": 6230 }, { "epoch": 1.2556921217005843, "grad_norm": 0.03491106256842613, "learning_rate": 8.934194020115349e-05, "loss": 0.1862, "step": 6232 }, { "epoch": 1.2560951037678822, "grad_norm": 0.046751853078603745, "learning_rate": 8.933371423094014e-05, "loss": 0.2105, "step": 6234 }, { "epoch": 1.2564980858351804, "grad_norm": 0.056983157992362976, "learning_rate": 8.932548546652264e-05, "loss": 0.1568, "step": 6236 }, { "epoch": 1.2569010679024784, "grad_norm": 0.04728730395436287, "learning_rate": 8.931725390848556e-05, "loss": 0.2299, "step": 6238 }, { "epoch": 1.2573040499697763, "grad_norm": 0.06326276808977127, "learning_rate": 8.930901955741363e-05, "loss": 0.1922, "step": 6240 }, { "epoch": 1.2577070320370742, "grad_norm": 0.03530817851424217, "learning_rate": 8.93007824138918e-05, "loss": 0.1629, "step": 6242 }, { "epoch": 1.2581100141043724, "grad_norm": 0.055923838168382645, "learning_rate": 8.929254247850526e-05, "loss": 0.1664, "step": 6244 }, { "epoch": 1.2585129961716703, "grad_norm": 0.06713879108428955, "learning_rate": 8.928429975183934e-05, "loss": 0.1592, "step": 6246 }, { "epoch": 1.2589159782389685, "grad_norm": 0.04550889879465103, "learning_rate": 8.927605423447958e-05, "loss": 0.1682, "step": 6248 }, { "epoch": 1.2593189603062664, "grad_norm": 0.05290335789322853, "learning_rate": 8.926780592701176e-05, "loss": 0.2486, "step": 6250 }, { "epoch": 1.2597219423735644, "grad_norm": 0.04227093979716301, "learning_rate": 8.925955483002178e-05, "loss": 0.1991, "step": 6252 }, { "epoch": 1.2601249244408623, "grad_norm": 0.07582451403141022, "learning_rate": 8.925130094409582e-05, "loss": 0.1773, "step": 6254 }, { "epoch": 1.2605279065081603, "grad_norm": 0.061997395008802414, "learning_rate": 8.924304426982022e-05, "loss": 0.2172, "step": 6256 }, { "epoch": 1.2609308885754584, "grad_norm": 0.07101260870695114, "learning_rate": 8.923478480778151e-05, "loss": 0.1832, "step": 6258 }, { "epoch": 1.2613338706427564, "grad_norm": 0.038211528211832047, "learning_rate": 8.922652255856645e-05, "loss": 0.1695, "step": 6260 }, { "epoch": 1.2617368527100545, "grad_norm": 0.05801432207226753, "learning_rate": 8.921825752276194e-05, "loss": 0.2165, "step": 6262 }, { "epoch": 1.2621398347773525, "grad_norm": 0.04629329964518547, "learning_rate": 8.920998970095515e-05, "loss": 0.1839, "step": 6264 }, { "epoch": 1.2625428168446504, "grad_norm": 0.05066737160086632, "learning_rate": 8.920171909373339e-05, "loss": 0.2055, "step": 6266 }, { "epoch": 1.2629457989119484, "grad_norm": 0.04259856417775154, "learning_rate": 8.91934457016842e-05, "loss": 0.1638, "step": 6268 }, { "epoch": 1.2633487809792463, "grad_norm": 0.061476513743400574, "learning_rate": 8.918516952539532e-05, "loss": 0.1923, "step": 6270 }, { "epoch": 1.2637517630465445, "grad_norm": 0.049006011337041855, "learning_rate": 8.917689056545463e-05, "loss": 0.1802, "step": 6272 }, { "epoch": 1.2641547451138424, "grad_norm": 0.05190061405301094, "learning_rate": 8.916860882245032e-05, "loss": 0.1723, "step": 6274 }, { "epoch": 1.2645577271811406, "grad_norm": 0.06619302183389664, "learning_rate": 8.916032429697069e-05, "loss": 0.152, "step": 6276 }, { "epoch": 1.2649607092484385, "grad_norm": 0.046162430197000504, "learning_rate": 8.915203698960423e-05, "loss": 0.2159, "step": 6278 }, { "epoch": 1.2653636913157364, "grad_norm": 0.05554254725575447, "learning_rate": 8.914374690093967e-05, "loss": 0.1643, "step": 6280 }, { "epoch": 1.2657666733830344, "grad_norm": 0.05046551302075386, "learning_rate": 8.913545403156596e-05, "loss": 0.1734, "step": 6282 }, { "epoch": 1.2661696554503323, "grad_norm": 0.050692591816186905, "learning_rate": 8.912715838207215e-05, "loss": 0.1975, "step": 6284 }, { "epoch": 1.2665726375176305, "grad_norm": 0.048872966319322586, "learning_rate": 8.911885995304761e-05, "loss": 0.2695, "step": 6286 }, { "epoch": 1.2669756195849284, "grad_norm": 0.06595724821090698, "learning_rate": 8.911055874508181e-05, "loss": 0.2077, "step": 6288 }, { "epoch": 1.2673786016522266, "grad_norm": 0.042650818824768066, "learning_rate": 8.910225475876446e-05, "loss": 0.2249, "step": 6290 }, { "epoch": 1.2677815837195245, "grad_norm": 0.05222772806882858, "learning_rate": 8.909394799468547e-05, "loss": 0.2154, "step": 6292 }, { "epoch": 1.2681845657868225, "grad_norm": 0.04049530252814293, "learning_rate": 8.908563845343494e-05, "loss": 0.1806, "step": 6294 }, { "epoch": 1.2685875478541204, "grad_norm": 0.052481092512607574, "learning_rate": 8.907732613560316e-05, "loss": 0.2266, "step": 6296 }, { "epoch": 1.2689905299214184, "grad_norm": 0.032652221620082855, "learning_rate": 8.906901104178062e-05, "loss": 0.1711, "step": 6298 }, { "epoch": 1.2693935119887165, "grad_norm": 0.04992491006851196, "learning_rate": 8.906069317255801e-05, "loss": 0.192, "step": 6300 }, { "epoch": 1.2697964940560145, "grad_norm": 0.04392782226204872, "learning_rate": 8.905237252852624e-05, "loss": 0.1647, "step": 6302 }, { "epoch": 1.2701994761233126, "grad_norm": 0.04683556407690048, "learning_rate": 8.904404911027638e-05, "loss": 0.1929, "step": 6304 }, { "epoch": 1.2706024581906106, "grad_norm": 0.048300039023160934, "learning_rate": 8.903572291839971e-05, "loss": 0.1956, "step": 6306 }, { "epoch": 1.2710054402579085, "grad_norm": 0.058879271149635315, "learning_rate": 8.902739395348771e-05, "loss": 0.1853, "step": 6308 }, { "epoch": 1.2714084223252065, "grad_norm": 0.05005578324198723, "learning_rate": 8.901906221613206e-05, "loss": 0.217, "step": 6310 }, { "epoch": 1.2718114043925044, "grad_norm": 0.06488041579723358, "learning_rate": 8.901072770692464e-05, "loss": 0.2339, "step": 6312 }, { "epoch": 1.2722143864598026, "grad_norm": 0.05080641061067581, "learning_rate": 8.900239042645751e-05, "loss": 0.1918, "step": 6314 }, { "epoch": 1.2726173685271005, "grad_norm": 0.059111639857292175, "learning_rate": 8.899405037532294e-05, "loss": 0.2268, "step": 6316 }, { "epoch": 1.2730203505943987, "grad_norm": 0.05126844719052315, "learning_rate": 8.898570755411338e-05, "loss": 0.1732, "step": 6318 }, { "epoch": 1.2734233326616966, "grad_norm": 0.05296769365668297, "learning_rate": 8.897736196342151e-05, "loss": 0.2006, "step": 6320 }, { "epoch": 1.2738263147289945, "grad_norm": 0.054364196956157684, "learning_rate": 8.896901360384018e-05, "loss": 0.2214, "step": 6322 }, { "epoch": 1.2742292967962925, "grad_norm": 0.04188178852200508, "learning_rate": 8.896066247596245e-05, "loss": 0.2247, "step": 6324 }, { "epoch": 1.2746322788635907, "grad_norm": 0.048656053841114044, "learning_rate": 8.895230858038157e-05, "loss": 0.1718, "step": 6326 }, { "epoch": 1.2750352609308886, "grad_norm": 0.07030941545963287, "learning_rate": 8.894395191769099e-05, "loss": 0.2284, "step": 6328 }, { "epoch": 1.2754382429981865, "grad_norm": 0.04568159952759743, "learning_rate": 8.893559248848431e-05, "loss": 0.1851, "step": 6330 }, { "epoch": 1.2758412250654847, "grad_norm": 0.04896444454789162, "learning_rate": 8.892723029335544e-05, "loss": 0.1895, "step": 6332 }, { "epoch": 1.2762442071327826, "grad_norm": 0.06214359775185585, "learning_rate": 8.891886533289839e-05, "loss": 0.14, "step": 6334 }, { "epoch": 1.2766471892000806, "grad_norm": 0.062105149030685425, "learning_rate": 8.891049760770737e-05, "loss": 0.2003, "step": 6336 }, { "epoch": 1.2770501712673785, "grad_norm": 0.06353707611560822, "learning_rate": 8.890212711837684e-05, "loss": 0.1847, "step": 6338 }, { "epoch": 1.2774531533346767, "grad_norm": 0.05937555432319641, "learning_rate": 8.88937538655014e-05, "loss": 0.2148, "step": 6340 }, { "epoch": 1.2778561354019746, "grad_norm": 0.055293429642915726, "learning_rate": 8.88853778496759e-05, "loss": 0.2006, "step": 6342 }, { "epoch": 1.2782591174692726, "grad_norm": 0.06335525959730148, "learning_rate": 8.887699907149534e-05, "loss": 0.2237, "step": 6344 }, { "epoch": 1.2786620995365707, "grad_norm": 0.05069897323846817, "learning_rate": 8.886861753155495e-05, "loss": 0.1777, "step": 6346 }, { "epoch": 1.2790650816038687, "grad_norm": 0.04641510918736458, "learning_rate": 8.886023323045012e-05, "loss": 0.1996, "step": 6348 }, { "epoch": 1.2794680636711666, "grad_norm": 0.0559643991291523, "learning_rate": 8.885184616877647e-05, "loss": 0.2127, "step": 6350 }, { "epoch": 1.2798710457384646, "grad_norm": 0.055252932012081146, "learning_rate": 8.88434563471298e-05, "loss": 0.2204, "step": 6352 }, { "epoch": 1.2802740278057627, "grad_norm": 0.0576675646007061, "learning_rate": 8.883506376610612e-05, "loss": 0.2209, "step": 6354 }, { "epoch": 1.2806770098730607, "grad_norm": 0.06650307774543762, "learning_rate": 8.882666842630162e-05, "loss": 0.2201, "step": 6356 }, { "epoch": 1.2810799919403586, "grad_norm": 0.060477275401353836, "learning_rate": 8.881827032831268e-05, "loss": 0.1611, "step": 6358 }, { "epoch": 1.2814829740076568, "grad_norm": 0.0532105453312397, "learning_rate": 8.880986947273591e-05, "loss": 0.1582, "step": 6360 }, { "epoch": 1.2818859560749547, "grad_norm": 0.04423583298921585, "learning_rate": 8.880146586016806e-05, "loss": 0.1612, "step": 6362 }, { "epoch": 1.2822889381422526, "grad_norm": 0.06925471872091293, "learning_rate": 8.879305949120613e-05, "loss": 0.2171, "step": 6364 }, { "epoch": 1.2826919202095506, "grad_norm": 0.03289978951215744, "learning_rate": 8.878465036644732e-05, "loss": 0.1585, "step": 6366 }, { "epoch": 1.2830949022768487, "grad_norm": 0.06422306597232819, "learning_rate": 8.877623848648894e-05, "loss": 0.2094, "step": 6368 }, { "epoch": 1.2834978843441467, "grad_norm": 0.06235311180353165, "learning_rate": 8.876782385192861e-05, "loss": 0.226, "step": 6370 }, { "epoch": 1.2839008664114446, "grad_norm": 0.062130313366651535, "learning_rate": 8.875940646336409e-05, "loss": 0.1998, "step": 6372 }, { "epoch": 1.2843038484787428, "grad_norm": 0.05406568944454193, "learning_rate": 8.87509863213933e-05, "loss": 0.2353, "step": 6374 }, { "epoch": 1.2847068305460407, "grad_norm": 0.0408284068107605, "learning_rate": 8.874256342661442e-05, "loss": 0.2246, "step": 6376 }, { "epoch": 1.2851098126133387, "grad_norm": 0.03897031396627426, "learning_rate": 8.873413777962578e-05, "loss": 0.1423, "step": 6378 }, { "epoch": 1.2855127946806366, "grad_norm": 0.05661426857113838, "learning_rate": 8.872570938102595e-05, "loss": 0.2576, "step": 6380 }, { "epoch": 1.2859157767479348, "grad_norm": 0.04744990915060043, "learning_rate": 8.871727823141367e-05, "loss": 0.2002, "step": 6382 }, { "epoch": 1.2863187588152327, "grad_norm": 0.045208241790533066, "learning_rate": 8.870884433138785e-05, "loss": 0.2259, "step": 6384 }, { "epoch": 1.2867217408825307, "grad_norm": 0.04680623859167099, "learning_rate": 8.870040768154763e-05, "loss": 0.1799, "step": 6386 }, { "epoch": 1.2871247229498288, "grad_norm": 0.04088175669312477, "learning_rate": 8.869196828249235e-05, "loss": 0.1982, "step": 6388 }, { "epoch": 1.2875277050171268, "grad_norm": 0.05093217268586159, "learning_rate": 8.868352613482153e-05, "loss": 0.2254, "step": 6390 }, { "epoch": 1.2879306870844247, "grad_norm": 0.0655287578701973, "learning_rate": 8.867508123913486e-05, "loss": 0.1938, "step": 6392 }, { "epoch": 1.2883336691517226, "grad_norm": 0.054968371987342834, "learning_rate": 8.866663359603228e-05, "loss": 0.2205, "step": 6394 }, { "epoch": 1.2887366512190208, "grad_norm": 0.04670187085866928, "learning_rate": 8.865818320611388e-05, "loss": 0.1916, "step": 6396 }, { "epoch": 1.2891396332863188, "grad_norm": 0.03799133747816086, "learning_rate": 8.864973006997999e-05, "loss": 0.1747, "step": 6398 }, { "epoch": 1.2895426153536167, "grad_norm": 0.0437266044318676, "learning_rate": 8.864127418823107e-05, "loss": 0.1568, "step": 6400 }, { "epoch": 1.2899455974209149, "grad_norm": 0.05466373264789581, "learning_rate": 8.863281556146783e-05, "loss": 0.2209, "step": 6402 }, { "epoch": 1.2903485794882128, "grad_norm": 0.04808984696865082, "learning_rate": 8.862435419029116e-05, "loss": 0.2492, "step": 6404 }, { "epoch": 1.2907515615555107, "grad_norm": 0.06633257120847702, "learning_rate": 8.861589007530214e-05, "loss": 0.1691, "step": 6406 }, { "epoch": 1.2911545436228087, "grad_norm": 0.038611918687820435, "learning_rate": 8.860742321710204e-05, "loss": 0.1695, "step": 6408 }, { "epoch": 1.2915575256901068, "grad_norm": 0.054731372743844986, "learning_rate": 8.859895361629233e-05, "loss": 0.172, "step": 6410 }, { "epoch": 1.2919605077574048, "grad_norm": 0.04989850893616676, "learning_rate": 8.859048127347472e-05, "loss": 0.1281, "step": 6412 }, { "epoch": 1.2923634898247027, "grad_norm": 0.052453186362981796, "learning_rate": 8.8582006189251e-05, "loss": 0.1744, "step": 6414 }, { "epoch": 1.292766471892001, "grad_norm": 0.03481290861964226, "learning_rate": 8.857352836422328e-05, "loss": 0.1577, "step": 6416 }, { "epoch": 1.2931694539592988, "grad_norm": 0.03524111956357956, "learning_rate": 8.856504779899378e-05, "loss": 0.1481, "step": 6418 }, { "epoch": 1.2935724360265968, "grad_norm": 0.04402356222271919, "learning_rate": 8.855656449416498e-05, "loss": 0.1864, "step": 6420 }, { "epoch": 1.2939754180938947, "grad_norm": 0.06525306403636932, "learning_rate": 8.854807845033949e-05, "loss": 0.201, "step": 6422 }, { "epoch": 1.2943784001611929, "grad_norm": 0.04306092485785484, "learning_rate": 8.853958966812015e-05, "loss": 0.1546, "step": 6424 }, { "epoch": 1.2947813822284908, "grad_norm": 0.045840419828891754, "learning_rate": 8.853109814811e-05, "loss": 0.2124, "step": 6426 }, { "epoch": 1.2951843642957888, "grad_norm": 0.04774376004934311, "learning_rate": 8.852260389091227e-05, "loss": 0.1978, "step": 6428 }, { "epoch": 1.295587346363087, "grad_norm": 0.06106032803654671, "learning_rate": 8.851410689713036e-05, "loss": 0.2433, "step": 6430 }, { "epoch": 1.2959903284303849, "grad_norm": 0.05506075546145439, "learning_rate": 8.850560716736789e-05, "loss": 0.2102, "step": 6432 }, { "epoch": 1.2963933104976828, "grad_norm": 0.047731101512908936, "learning_rate": 8.849710470222865e-05, "loss": 0.1948, "step": 6434 }, { "epoch": 1.2967962925649807, "grad_norm": 0.058562930673360825, "learning_rate": 8.848859950231668e-05, "loss": 0.2239, "step": 6436 }, { "epoch": 1.297199274632279, "grad_norm": 0.03972695767879486, "learning_rate": 8.848009156823615e-05, "loss": 0.1916, "step": 6438 }, { "epoch": 1.2976022566995769, "grad_norm": 0.04205062612891197, "learning_rate": 8.847158090059145e-05, "loss": 0.165, "step": 6440 }, { "epoch": 1.2980052387668748, "grad_norm": 0.054686374962329865, "learning_rate": 8.846306749998719e-05, "loss": 0.1919, "step": 6442 }, { "epoch": 1.298408220834173, "grad_norm": 0.0542772077023983, "learning_rate": 8.845455136702809e-05, "loss": 0.225, "step": 6444 }, { "epoch": 1.298811202901471, "grad_norm": 0.04445016384124756, "learning_rate": 8.844603250231918e-05, "loss": 0.2461, "step": 6446 }, { "epoch": 1.2992141849687688, "grad_norm": 0.04408948868513107, "learning_rate": 8.843751090646562e-05, "loss": 0.1767, "step": 6448 }, { "epoch": 1.2996171670360668, "grad_norm": 0.041693881154060364, "learning_rate": 8.842898658007274e-05, "loss": 0.1658, "step": 6450 }, { "epoch": 1.300020149103365, "grad_norm": 0.04052754119038582, "learning_rate": 8.842045952374612e-05, "loss": 0.1824, "step": 6452 }, { "epoch": 1.3004231311706629, "grad_norm": 0.05492265522480011, "learning_rate": 8.841192973809149e-05, "loss": 0.2019, "step": 6454 }, { "epoch": 1.300826113237961, "grad_norm": 0.06872859597206116, "learning_rate": 8.84033972237148e-05, "loss": 0.2278, "step": 6456 }, { "epoch": 1.301229095305259, "grad_norm": 0.048062894493341446, "learning_rate": 8.83948619812222e-05, "loss": 0.2154, "step": 6458 }, { "epoch": 1.301632077372557, "grad_norm": 0.045196451246738434, "learning_rate": 8.838632401122e-05, "loss": 0.1845, "step": 6460 }, { "epoch": 1.3020350594398549, "grad_norm": 0.04608435556292534, "learning_rate": 8.837778331431475e-05, "loss": 0.2171, "step": 6462 }, { "epoch": 1.3024380415071528, "grad_norm": 0.04621405154466629, "learning_rate": 8.836923989111313e-05, "loss": 0.1573, "step": 6464 }, { "epoch": 1.302841023574451, "grad_norm": 0.04945302754640579, "learning_rate": 8.836069374222206e-05, "loss": 0.1985, "step": 6466 }, { "epoch": 1.303244005641749, "grad_norm": 0.04604468122124672, "learning_rate": 8.835214486824869e-05, "loss": 0.2051, "step": 6468 }, { "epoch": 1.303646987709047, "grad_norm": 0.05772722512483597, "learning_rate": 8.834359326980026e-05, "loss": 0.1724, "step": 6470 }, { "epoch": 1.304049969776345, "grad_norm": 0.045948565006256104, "learning_rate": 8.833503894748429e-05, "loss": 0.2082, "step": 6472 }, { "epoch": 1.304452951843643, "grad_norm": 0.06069548428058624, "learning_rate": 8.832648190190847e-05, "loss": 0.2108, "step": 6474 }, { "epoch": 1.304855933910941, "grad_norm": 0.05096980184316635, "learning_rate": 8.831792213368065e-05, "loss": 0.2311, "step": 6476 }, { "epoch": 1.3052589159782388, "grad_norm": 0.054760102182626724, "learning_rate": 8.830935964340894e-05, "loss": 0.2228, "step": 6478 }, { "epoch": 1.305661898045537, "grad_norm": 0.046445854008197784, "learning_rate": 8.830079443170158e-05, "loss": 0.2161, "step": 6480 }, { "epoch": 1.306064880112835, "grad_norm": 0.06579455733299255, "learning_rate": 8.829222649916704e-05, "loss": 0.2356, "step": 6482 }, { "epoch": 1.306467862180133, "grad_norm": 0.05317157134413719, "learning_rate": 8.828365584641396e-05, "loss": 0.1861, "step": 6484 }, { "epoch": 1.306870844247431, "grad_norm": 0.045328740030527115, "learning_rate": 8.82750824740512e-05, "loss": 0.1509, "step": 6486 }, { "epoch": 1.307273826314729, "grad_norm": 0.03549671545624733, "learning_rate": 8.826650638268781e-05, "loss": 0.1512, "step": 6488 }, { "epoch": 1.307676808382027, "grad_norm": 0.045670825988054276, "learning_rate": 8.825792757293299e-05, "loss": 0.2287, "step": 6490 }, { "epoch": 1.3080797904493249, "grad_norm": 0.11401376128196716, "learning_rate": 8.824934604539617e-05, "loss": 0.1898, "step": 6492 }, { "epoch": 1.308482772516623, "grad_norm": 0.07366832345724106, "learning_rate": 8.8240761800687e-05, "loss": 0.1657, "step": 6494 }, { "epoch": 1.308885754583921, "grad_norm": 0.06504294276237488, "learning_rate": 8.823217483941524e-05, "loss": 0.2702, "step": 6496 }, { "epoch": 1.3092887366512191, "grad_norm": 0.052060194313526154, "learning_rate": 8.822358516219093e-05, "loss": 0.2161, "step": 6498 }, { "epoch": 1.309691718718517, "grad_norm": 0.046481553465127945, "learning_rate": 8.821499276962429e-05, "loss": 0.2609, "step": 6500 }, { "epoch": 1.310094700785815, "grad_norm": 0.06982485949993134, "learning_rate": 8.820639766232565e-05, "loss": 0.2059, "step": 6502 }, { "epoch": 1.310497682853113, "grad_norm": 0.05630074441432953, "learning_rate": 8.819779984090562e-05, "loss": 0.207, "step": 6504 }, { "epoch": 1.310900664920411, "grad_norm": 0.05257268622517586, "learning_rate": 8.8189199305975e-05, "loss": 0.179, "step": 6506 }, { "epoch": 1.311303646987709, "grad_norm": 0.0464656688272953, "learning_rate": 8.818059605814472e-05, "loss": 0.1982, "step": 6508 }, { "epoch": 1.311706629055007, "grad_norm": 0.05108407139778137, "learning_rate": 8.817199009802595e-05, "loss": 0.2116, "step": 6510 }, { "epoch": 1.3121096111223052, "grad_norm": 0.0434158593416214, "learning_rate": 8.816338142623007e-05, "loss": 0.1984, "step": 6512 }, { "epoch": 1.3125125931896031, "grad_norm": 0.04521339014172554, "learning_rate": 8.815477004336858e-05, "loss": 0.2046, "step": 6514 }, { "epoch": 1.312915575256901, "grad_norm": 0.05215785652399063, "learning_rate": 8.814615595005328e-05, "loss": 0.1816, "step": 6516 }, { "epoch": 1.313318557324199, "grad_norm": 0.050764963030815125, "learning_rate": 8.813753914689605e-05, "loss": 0.1906, "step": 6518 }, { "epoch": 1.313721539391497, "grad_norm": 0.05948259308934212, "learning_rate": 8.812891963450903e-05, "loss": 0.194, "step": 6520 }, { "epoch": 1.314124521458795, "grad_norm": 0.062436092644929886, "learning_rate": 8.812029741350454e-05, "loss": 0.2094, "step": 6522 }, { "epoch": 1.314527503526093, "grad_norm": 0.045266736298799515, "learning_rate": 8.811167248449508e-05, "loss": 0.198, "step": 6524 }, { "epoch": 1.3149304855933912, "grad_norm": 0.042582929134368896, "learning_rate": 8.810304484809336e-05, "loss": 0.1983, "step": 6526 }, { "epoch": 1.3153334676606891, "grad_norm": 0.05839109048247337, "learning_rate": 8.809441450491227e-05, "loss": 0.2705, "step": 6528 }, { "epoch": 1.315736449727987, "grad_norm": 0.05570756644010544, "learning_rate": 8.80857814555649e-05, "loss": 0.1926, "step": 6530 }, { "epoch": 1.316139431795285, "grad_norm": 0.033902619034051895, "learning_rate": 8.807714570066454e-05, "loss": 0.1553, "step": 6532 }, { "epoch": 1.3165424138625832, "grad_norm": 0.05374903604388237, "learning_rate": 8.806850724082462e-05, "loss": 0.1889, "step": 6534 }, { "epoch": 1.3169453959298811, "grad_norm": 0.049184996634721756, "learning_rate": 8.805986607665884e-05, "loss": 0.1544, "step": 6536 }, { "epoch": 1.317348377997179, "grad_norm": 0.03358803689479828, "learning_rate": 8.805122220878104e-05, "loss": 0.1384, "step": 6538 }, { "epoch": 1.3177513600644772, "grad_norm": 0.04204018786549568, "learning_rate": 8.804257563780525e-05, "loss": 0.1581, "step": 6540 }, { "epoch": 1.3181543421317752, "grad_norm": 0.04981936141848564, "learning_rate": 8.803392636434575e-05, "loss": 0.2046, "step": 6542 }, { "epoch": 1.3185573241990731, "grad_norm": 0.04282686114311218, "learning_rate": 8.802527438901693e-05, "loss": 0.1827, "step": 6544 }, { "epoch": 1.318960306266371, "grad_norm": 0.052251186221838, "learning_rate": 8.801661971243345e-05, "loss": 0.154, "step": 6546 }, { "epoch": 1.3193632883336692, "grad_norm": 0.0645936131477356, "learning_rate": 8.80079623352101e-05, "loss": 0.2281, "step": 6548 }, { "epoch": 1.3197662704009672, "grad_norm": 0.04686029627919197, "learning_rate": 8.799930225796187e-05, "loss": 0.2311, "step": 6550 }, { "epoch": 1.320169252468265, "grad_norm": 0.05446161329746246, "learning_rate": 8.7990639481304e-05, "loss": 0.2203, "step": 6552 }, { "epoch": 1.3205722345355633, "grad_norm": 0.043119318783283234, "learning_rate": 8.798197400585185e-05, "loss": 0.1367, "step": 6554 }, { "epoch": 1.3209752166028612, "grad_norm": 0.03402850776910782, "learning_rate": 8.7973305832221e-05, "loss": 0.1273, "step": 6556 }, { "epoch": 1.3213781986701592, "grad_norm": 0.05417346581816673, "learning_rate": 8.796463496102725e-05, "loss": 0.2275, "step": 6558 }, { "epoch": 1.321781180737457, "grad_norm": 0.04006451368331909, "learning_rate": 8.795596139288655e-05, "loss": 0.2158, "step": 6560 }, { "epoch": 1.3221841628047553, "grad_norm": 0.04239325597882271, "learning_rate": 8.794728512841504e-05, "loss": 0.1763, "step": 6562 }, { "epoch": 1.3225871448720532, "grad_norm": 0.05095091834664345, "learning_rate": 8.79386061682291e-05, "loss": 0.2323, "step": 6564 }, { "epoch": 1.3229901269393511, "grad_norm": 0.05127684772014618, "learning_rate": 8.792992451294522e-05, "loss": 0.2092, "step": 6566 }, { "epoch": 1.3233931090066493, "grad_norm": 0.053673263639211655, "learning_rate": 8.79212401631802e-05, "loss": 0.1998, "step": 6568 }, { "epoch": 1.3237960910739472, "grad_norm": 0.05202171579003334, "learning_rate": 8.79125531195509e-05, "loss": 0.2041, "step": 6570 }, { "epoch": 1.3241990731412452, "grad_norm": 0.05307517573237419, "learning_rate": 8.790386338267447e-05, "loss": 0.182, "step": 6572 }, { "epoch": 1.3246020552085431, "grad_norm": 0.04658079519867897, "learning_rate": 8.789517095316819e-05, "loss": 0.2143, "step": 6574 }, { "epoch": 1.3250050372758413, "grad_norm": 0.04600522294640541, "learning_rate": 8.788647583164959e-05, "loss": 0.1783, "step": 6576 }, { "epoch": 1.3254080193431392, "grad_norm": 0.045908063650131226, "learning_rate": 8.787777801873632e-05, "loss": 0.1593, "step": 6578 }, { "epoch": 1.3258110014104372, "grad_norm": 0.04135711118578911, "learning_rate": 8.786907751504628e-05, "loss": 0.1651, "step": 6580 }, { "epoch": 1.3262139834777353, "grad_norm": 0.07556430250406265, "learning_rate": 8.786037432119754e-05, "loss": 0.1893, "step": 6582 }, { "epoch": 1.3266169655450333, "grad_norm": 0.04877334460616112, "learning_rate": 8.785166843780837e-05, "loss": 0.1789, "step": 6584 }, { "epoch": 1.3270199476123312, "grad_norm": 0.055644772946834564, "learning_rate": 8.784295986549717e-05, "loss": 0.2269, "step": 6586 }, { "epoch": 1.3274229296796292, "grad_norm": 0.04751408472657204, "learning_rate": 8.783424860488266e-05, "loss": 0.216, "step": 6588 }, { "epoch": 1.3278259117469273, "grad_norm": 0.05544084310531616, "learning_rate": 8.782553465658363e-05, "loss": 0.1849, "step": 6590 }, { "epoch": 1.3282288938142253, "grad_norm": 0.07215588539838791, "learning_rate": 8.781681802121911e-05, "loss": 0.2223, "step": 6592 }, { "epoch": 1.3286318758815232, "grad_norm": 0.04705362394452095, "learning_rate": 8.780809869940829e-05, "loss": 0.1941, "step": 6594 }, { "epoch": 1.3290348579488214, "grad_norm": 0.04929559677839279, "learning_rate": 8.779937669177064e-05, "loss": 0.2029, "step": 6596 }, { "epoch": 1.3294378400161193, "grad_norm": 0.05044730007648468, "learning_rate": 8.77906519989257e-05, "loss": 0.1913, "step": 6598 }, { "epoch": 1.3298408220834173, "grad_norm": 0.09016617387533188, "learning_rate": 8.778192462149328e-05, "loss": 0.1968, "step": 6600 }, { "epoch": 1.3302438041507152, "grad_norm": 0.07022813707590103, "learning_rate": 8.777319456009337e-05, "loss": 0.2, "step": 6602 }, { "epoch": 1.3306467862180134, "grad_norm": 0.0559733621776104, "learning_rate": 8.776446181534612e-05, "loss": 0.1923, "step": 6604 }, { "epoch": 1.3310497682853113, "grad_norm": 0.06447982043027878, "learning_rate": 8.775572638787189e-05, "loss": 0.1711, "step": 6606 }, { "epoch": 1.3314527503526092, "grad_norm": 0.06927803158760071, "learning_rate": 8.774698827829126e-05, "loss": 0.2384, "step": 6608 }, { "epoch": 1.3318557324199074, "grad_norm": 0.062186673283576965, "learning_rate": 8.773824748722492e-05, "loss": 0.2065, "step": 6610 }, { "epoch": 1.3322587144872053, "grad_norm": 0.06363217532634735, "learning_rate": 8.772950401529386e-05, "loss": 0.1999, "step": 6612 }, { "epoch": 1.3326616965545033, "grad_norm": 0.05283069983124733, "learning_rate": 8.772075786311916e-05, "loss": 0.2548, "step": 6614 }, { "epoch": 1.3330646786218012, "grad_norm": 0.045274145901203156, "learning_rate": 8.771200903132215e-05, "loss": 0.1845, "step": 6616 }, { "epoch": 1.3334676606890994, "grad_norm": 0.05644421651959419, "learning_rate": 8.770325752052432e-05, "loss": 0.1792, "step": 6618 }, { "epoch": 1.3338706427563973, "grad_norm": 0.04432598128914833, "learning_rate": 8.769450333134739e-05, "loss": 0.1934, "step": 6620 }, { "epoch": 1.3342736248236953, "grad_norm": 0.05254184454679489, "learning_rate": 8.768574646441323e-05, "loss": 0.1675, "step": 6622 }, { "epoch": 1.3346766068909934, "grad_norm": 0.05594053491950035, "learning_rate": 8.767698692034389e-05, "loss": 0.2387, "step": 6624 }, { "epoch": 1.3350795889582914, "grad_norm": 0.07795904576778412, "learning_rate": 8.766822469976167e-05, "loss": 0.2225, "step": 6626 }, { "epoch": 1.3354825710255893, "grad_norm": 0.05772830918431282, "learning_rate": 8.7659459803289e-05, "loss": 0.1746, "step": 6628 }, { "epoch": 1.3358855530928873, "grad_norm": 0.04830396547913551, "learning_rate": 8.765069223154853e-05, "loss": 0.1767, "step": 6630 }, { "epoch": 1.3362885351601854, "grad_norm": 0.04581188037991524, "learning_rate": 8.764192198516313e-05, "loss": 0.1916, "step": 6632 }, { "epoch": 1.3366915172274834, "grad_norm": 0.06372931599617004, "learning_rate": 8.763314906475574e-05, "loss": 0.1885, "step": 6634 }, { "epoch": 1.3370944992947813, "grad_norm": 0.047626253217458725, "learning_rate": 8.762437347094965e-05, "loss": 0.1751, "step": 6636 }, { "epoch": 1.3374974813620795, "grad_norm": 0.03912975639104843, "learning_rate": 8.761559520436826e-05, "loss": 0.1827, "step": 6638 }, { "epoch": 1.3379004634293774, "grad_norm": 0.07140173017978668, "learning_rate": 8.760681426563512e-05, "loss": 0.2485, "step": 6640 }, { "epoch": 1.3383034454966753, "grad_norm": 0.050327908247709274, "learning_rate": 8.759803065537404e-05, "loss": 0.1755, "step": 6642 }, { "epoch": 1.3387064275639733, "grad_norm": 0.047158148139715195, "learning_rate": 8.758924437420898e-05, "loss": 0.1858, "step": 6644 }, { "epoch": 1.3391094096312715, "grad_norm": 0.037591077387332916, "learning_rate": 8.758045542276414e-05, "loss": 0.1511, "step": 6646 }, { "epoch": 1.3395123916985694, "grad_norm": 0.045494113117456436, "learning_rate": 8.757166380166384e-05, "loss": 0.2192, "step": 6648 }, { "epoch": 1.3399153737658676, "grad_norm": 0.058958739042282104, "learning_rate": 8.756286951153263e-05, "loss": 0.2174, "step": 6650 }, { "epoch": 1.3403183558331655, "grad_norm": 0.07330820709466934, "learning_rate": 8.755407255299524e-05, "loss": 0.2206, "step": 6652 }, { "epoch": 1.3407213379004634, "grad_norm": 0.12760120630264282, "learning_rate": 8.75452729266766e-05, "loss": 0.2324, "step": 6654 }, { "epoch": 1.3411243199677614, "grad_norm": 0.08199845999479294, "learning_rate": 8.75364706332018e-05, "loss": 0.1753, "step": 6656 }, { "epoch": 1.3415273020350593, "grad_norm": 0.0407002717256546, "learning_rate": 8.752766567319616e-05, "loss": 0.1774, "step": 6658 }, { "epoch": 1.3419302841023575, "grad_norm": 0.043844059109687805, "learning_rate": 8.751885804728519e-05, "loss": 0.1779, "step": 6660 }, { "epoch": 1.3423332661696554, "grad_norm": 0.06086337938904762, "learning_rate": 8.751004775609452e-05, "loss": 0.1774, "step": 6662 }, { "epoch": 1.3427362482369536, "grad_norm": 0.048990312963724136, "learning_rate": 8.750123480025007e-05, "loss": 0.1705, "step": 6664 }, { "epoch": 1.3431392303042515, "grad_norm": 0.03980618715286255, "learning_rate": 8.749241918037788e-05, "loss": 0.1758, "step": 6666 }, { "epoch": 1.3435422123715495, "grad_norm": 0.051659561693668365, "learning_rate": 8.748360089710416e-05, "loss": 0.196, "step": 6668 }, { "epoch": 1.3439451944388474, "grad_norm": 0.05354244261980057, "learning_rate": 8.74747799510554e-05, "loss": 0.121, "step": 6670 }, { "epoch": 1.3443481765061454, "grad_norm": 0.055109504610300064, "learning_rate": 8.74659563428582e-05, "loss": 0.2251, "step": 6672 }, { "epoch": 1.3447511585734435, "grad_norm": 0.043078888207674026, "learning_rate": 8.745713007313937e-05, "loss": 0.169, "step": 6674 }, { "epoch": 1.3451541406407415, "grad_norm": 0.05727505311369896, "learning_rate": 8.744830114252592e-05, "loss": 0.2085, "step": 6676 }, { "epoch": 1.3455571227080396, "grad_norm": 0.043431010097265244, "learning_rate": 8.743946955164506e-05, "loss": 0.1945, "step": 6678 }, { "epoch": 1.3459601047753376, "grad_norm": 0.04095043987035751, "learning_rate": 8.743063530112416e-05, "loss": 0.2139, "step": 6680 }, { "epoch": 1.3463630868426355, "grad_norm": 0.05373978987336159, "learning_rate": 8.742179839159077e-05, "loss": 0.1522, "step": 6682 }, { "epoch": 1.3467660689099334, "grad_norm": 0.05662880837917328, "learning_rate": 8.741295882367269e-05, "loss": 0.1871, "step": 6684 }, { "epoch": 1.3471690509772314, "grad_norm": 0.04732615128159523, "learning_rate": 8.740411659799785e-05, "loss": 0.2025, "step": 6686 }, { "epoch": 1.3475720330445295, "grad_norm": 0.043686844408512115, "learning_rate": 8.739527171519437e-05, "loss": 0.2166, "step": 6688 }, { "epoch": 1.3479750151118275, "grad_norm": 0.039230071008205414, "learning_rate": 8.73864241758906e-05, "loss": 0.1561, "step": 6690 }, { "epoch": 1.3483779971791257, "grad_norm": 0.04516879841685295, "learning_rate": 8.737757398071505e-05, "loss": 0.2342, "step": 6692 }, { "epoch": 1.3487809792464236, "grad_norm": 0.03462446853518486, "learning_rate": 8.736872113029642e-05, "loss": 0.1728, "step": 6694 }, { "epoch": 1.3491839613137215, "grad_norm": 0.045789361000061035, "learning_rate": 8.735986562526361e-05, "loss": 0.1634, "step": 6696 }, { "epoch": 1.3495869433810195, "grad_norm": 0.045005571097135544, "learning_rate": 8.735100746624568e-05, "loss": 0.1449, "step": 6698 }, { "epoch": 1.3499899254483174, "grad_norm": 0.0720299556851387, "learning_rate": 8.734214665387193e-05, "loss": 0.1818, "step": 6700 }, { "epoch": 1.3503929075156156, "grad_norm": 0.04918527230620384, "learning_rate": 8.733328318877179e-05, "loss": 0.1818, "step": 6702 }, { "epoch": 1.3507958895829135, "grad_norm": 0.04074351117014885, "learning_rate": 8.73244170715749e-05, "loss": 0.1968, "step": 6704 }, { "epoch": 1.3511988716502117, "grad_norm": 0.055268727242946625, "learning_rate": 8.731554830291114e-05, "loss": 0.2063, "step": 6706 }, { "epoch": 1.3516018537175096, "grad_norm": 0.037626102566719055, "learning_rate": 8.73066768834105e-05, "loss": 0.1903, "step": 6708 }, { "epoch": 1.3520048357848076, "grad_norm": 0.05037368834018707, "learning_rate": 8.72978028137032e-05, "loss": 0.1652, "step": 6710 }, { "epoch": 1.3524078178521055, "grad_norm": 0.07378649711608887, "learning_rate": 8.728892609441964e-05, "loss": 0.1998, "step": 6712 }, { "epoch": 1.3528107999194035, "grad_norm": 0.040671806782484055, "learning_rate": 8.728004672619039e-05, "loss": 0.1678, "step": 6714 }, { "epoch": 1.3532137819867016, "grad_norm": 0.04682549834251404, "learning_rate": 8.727116470964624e-05, "loss": 0.2063, "step": 6716 }, { "epoch": 1.3536167640539996, "grad_norm": 0.06274406611919403, "learning_rate": 8.726228004541818e-05, "loss": 0.1976, "step": 6718 }, { "epoch": 1.3540197461212977, "grad_norm": 0.049354761838912964, "learning_rate": 8.725339273413731e-05, "loss": 0.2151, "step": 6720 }, { "epoch": 1.3544227281885957, "grad_norm": 0.06009558215737343, "learning_rate": 8.724450277643501e-05, "loss": 0.2396, "step": 6722 }, { "epoch": 1.3548257102558936, "grad_norm": 0.042581889778375626, "learning_rate": 8.72356101729428e-05, "loss": 0.2058, "step": 6724 }, { "epoch": 1.3552286923231915, "grad_norm": 0.05875520780682564, "learning_rate": 8.72267149242924e-05, "loss": 0.2039, "step": 6726 }, { "epoch": 1.3556316743904897, "grad_norm": 0.04886787384748459, "learning_rate": 8.721781703111568e-05, "loss": 0.1559, "step": 6728 }, { "epoch": 1.3560346564577876, "grad_norm": 0.04805205762386322, "learning_rate": 8.72089164940448e-05, "loss": 0.1894, "step": 6730 }, { "epoch": 1.3564376385250856, "grad_norm": 0.0584242157638073, "learning_rate": 8.720001331371197e-05, "loss": 0.1765, "step": 6732 }, { "epoch": 1.3568406205923838, "grad_norm": 0.043926917016506195, "learning_rate": 8.719110749074969e-05, "loss": 0.1696, "step": 6734 }, { "epoch": 1.3572436026596817, "grad_norm": 0.07441367954015732, "learning_rate": 8.71821990257906e-05, "loss": 0.2257, "step": 6736 }, { "epoch": 1.3576465847269796, "grad_norm": 0.052764154970645905, "learning_rate": 8.717328791946758e-05, "loss": 0.161, "step": 6738 }, { "epoch": 1.3580495667942776, "grad_norm": 0.05789724364876747, "learning_rate": 8.716437417241363e-05, "loss": 0.1846, "step": 6740 }, { "epoch": 1.3584525488615757, "grad_norm": 0.05074714124202728, "learning_rate": 8.715545778526197e-05, "loss": 0.1703, "step": 6742 }, { "epoch": 1.3588555309288737, "grad_norm": 0.057830099016427994, "learning_rate": 8.714653875864601e-05, "loss": 0.1658, "step": 6744 }, { "epoch": 1.3592585129961716, "grad_norm": 0.06288470327854156, "learning_rate": 8.713761709319934e-05, "loss": 0.1938, "step": 6746 }, { "epoch": 1.3596614950634698, "grad_norm": 0.0460391566157341, "learning_rate": 8.712869278955575e-05, "loss": 0.2288, "step": 6748 }, { "epoch": 1.3600644771307677, "grad_norm": 0.04402383044362068, "learning_rate": 8.71197658483492e-05, "loss": 0.182, "step": 6750 }, { "epoch": 1.3604674591980657, "grad_norm": 0.04419035091996193, "learning_rate": 8.711083627021386e-05, "loss": 0.169, "step": 6752 }, { "epoch": 1.3608704412653636, "grad_norm": 0.040202751755714417, "learning_rate": 8.710190405578404e-05, "loss": 0.1829, "step": 6754 }, { "epoch": 1.3612734233326618, "grad_norm": 0.04582332819700241, "learning_rate": 8.709296920569432e-05, "loss": 0.2035, "step": 6756 }, { "epoch": 1.3616764053999597, "grad_norm": 0.06750451028347015, "learning_rate": 8.708403172057936e-05, "loss": 0.2098, "step": 6758 }, { "epoch": 1.3620793874672577, "grad_norm": 0.05921706184744835, "learning_rate": 8.707509160107411e-05, "loss": 0.1745, "step": 6760 }, { "epoch": 1.3624823695345558, "grad_norm": 0.05832752212882042, "learning_rate": 8.706614884781363e-05, "loss": 0.1917, "step": 6762 }, { "epoch": 1.3628853516018538, "grad_norm": 0.03713718429207802, "learning_rate": 8.705720346143325e-05, "loss": 0.1709, "step": 6764 }, { "epoch": 1.3632883336691517, "grad_norm": 0.057797905057668686, "learning_rate": 8.704825544256837e-05, "loss": 0.2044, "step": 6766 }, { "epoch": 1.3636913157364496, "grad_norm": 0.04862739145755768, "learning_rate": 8.703930479185467e-05, "loss": 0.217, "step": 6768 }, { "epoch": 1.3640942978037478, "grad_norm": 0.05247446522116661, "learning_rate": 8.703035150992802e-05, "loss": 0.2, "step": 6770 }, { "epoch": 1.3644972798710457, "grad_norm": 0.05208409205079079, "learning_rate": 8.70213955974244e-05, "loss": 0.1663, "step": 6772 }, { "epoch": 1.3649002619383437, "grad_norm": 0.0481448620557785, "learning_rate": 8.701243705498003e-05, "loss": 0.1961, "step": 6774 }, { "epoch": 1.3653032440056418, "grad_norm": 0.044661637395620346, "learning_rate": 8.700347588323135e-05, "loss": 0.1508, "step": 6776 }, { "epoch": 1.3657062260729398, "grad_norm": 0.06829683482646942, "learning_rate": 8.69945120828149e-05, "loss": 0.1964, "step": 6778 }, { "epoch": 1.3661092081402377, "grad_norm": 0.06598050892353058, "learning_rate": 8.69855456543675e-05, "loss": 0.1932, "step": 6780 }, { "epoch": 1.3665121902075357, "grad_norm": 0.04563366621732712, "learning_rate": 8.697657659852608e-05, "loss": 0.2021, "step": 6782 }, { "epoch": 1.3669151722748338, "grad_norm": 0.049588385969400406, "learning_rate": 8.696760491592778e-05, "loss": 0.2342, "step": 6784 }, { "epoch": 1.3673181543421318, "grad_norm": 0.06191490218043327, "learning_rate": 8.695863060720995e-05, "loss": 0.2053, "step": 6786 }, { "epoch": 1.3677211364094297, "grad_norm": 0.04240760579705238, "learning_rate": 8.694965367301013e-05, "loss": 0.2053, "step": 6788 }, { "epoch": 1.3681241184767279, "grad_norm": 0.0616329126060009, "learning_rate": 8.694067411396599e-05, "loss": 0.1861, "step": 6790 }, { "epoch": 1.3685271005440258, "grad_norm": 0.04297064244747162, "learning_rate": 8.693169193071543e-05, "loss": 0.1687, "step": 6792 }, { "epoch": 1.3689300826113238, "grad_norm": 0.06308623403310776, "learning_rate": 8.692270712389654e-05, "loss": 0.2375, "step": 6794 }, { "epoch": 1.3693330646786217, "grad_norm": 0.04142848029732704, "learning_rate": 8.691371969414759e-05, "loss": 0.1678, "step": 6796 }, { "epoch": 1.3697360467459199, "grad_norm": 0.055353887379169464, "learning_rate": 8.690472964210703e-05, "loss": 0.1774, "step": 6798 }, { "epoch": 1.3701390288132178, "grad_norm": 0.05628889426589012, "learning_rate": 8.689573696841351e-05, "loss": 0.1804, "step": 6800 }, { "epoch": 1.3705420108805157, "grad_norm": 0.0353449322283268, "learning_rate": 8.688674167370583e-05, "loss": 0.1727, "step": 6802 }, { "epoch": 1.370944992947814, "grad_norm": 0.060777902603149414, "learning_rate": 8.687774375862301e-05, "loss": 0.1996, "step": 6804 }, { "epoch": 1.3713479750151119, "grad_norm": 0.05757424980401993, "learning_rate": 8.686874322380425e-05, "loss": 0.2303, "step": 6806 }, { "epoch": 1.3717509570824098, "grad_norm": 0.05264467000961304, "learning_rate": 8.685974006988893e-05, "loss": 0.2153, "step": 6808 }, { "epoch": 1.3721539391497077, "grad_norm": 0.05513720214366913, "learning_rate": 8.685073429751663e-05, "loss": 0.2005, "step": 6810 }, { "epoch": 1.372556921217006, "grad_norm": 0.06795477867126465, "learning_rate": 8.68417259073271e-05, "loss": 0.1603, "step": 6812 }, { "epoch": 1.3729599032843038, "grad_norm": 0.04166838526725769, "learning_rate": 8.683271489996029e-05, "loss": 0.2005, "step": 6814 }, { "epoch": 1.3733628853516018, "grad_norm": 0.05827517807483673, "learning_rate": 8.68237012760563e-05, "loss": 0.2218, "step": 6816 }, { "epoch": 1.3737658674189, "grad_norm": 0.07584039121866226, "learning_rate": 8.681468503625548e-05, "loss": 0.2158, "step": 6818 }, { "epoch": 1.3741688494861979, "grad_norm": 0.05099467188119888, "learning_rate": 8.680566618119829e-05, "loss": 0.221, "step": 6820 }, { "epoch": 1.3745718315534958, "grad_norm": 0.06616433709859848, "learning_rate": 8.679664471152546e-05, "loss": 0.1717, "step": 6822 }, { "epoch": 1.3749748136207938, "grad_norm": 0.048993200063705444, "learning_rate": 8.678762062787782e-05, "loss": 0.1877, "step": 6824 }, { "epoch": 1.375377795688092, "grad_norm": 0.059043314307928085, "learning_rate": 8.677859393089646e-05, "loss": 0.2321, "step": 6826 }, { "epoch": 1.3757807777553899, "grad_norm": 0.06745105981826782, "learning_rate": 8.676956462122259e-05, "loss": 0.1757, "step": 6828 }, { "epoch": 1.3761837598226878, "grad_norm": 0.03870021179318428, "learning_rate": 8.676053269949766e-05, "loss": 0.1822, "step": 6830 }, { "epoch": 1.376586741889986, "grad_norm": 0.03666188195347786, "learning_rate": 8.675149816636327e-05, "loss": 0.1847, "step": 6832 }, { "epoch": 1.376989723957284, "grad_norm": 0.04749156907200813, "learning_rate": 8.674246102246125e-05, "loss": 0.2007, "step": 6834 }, { "epoch": 1.3773927060245819, "grad_norm": 0.044996339827775955, "learning_rate": 8.673342126843353e-05, "loss": 0.2072, "step": 6836 }, { "epoch": 1.3777956880918798, "grad_norm": 0.047479066997766495, "learning_rate": 8.672437890492234e-05, "loss": 0.1748, "step": 6838 }, { "epoch": 1.378198670159178, "grad_norm": 0.049724042415618896, "learning_rate": 8.671533393256998e-05, "loss": 0.1375, "step": 6840 }, { "epoch": 1.378601652226476, "grad_norm": 0.05517043545842171, "learning_rate": 8.670628635201901e-05, "loss": 0.1694, "step": 6842 }, { "epoch": 1.3790046342937738, "grad_norm": 0.04162873700261116, "learning_rate": 8.669723616391217e-05, "loss": 0.1945, "step": 6844 }, { "epoch": 1.379407616361072, "grad_norm": 0.05656104534864426, "learning_rate": 8.668818336889237e-05, "loss": 0.1662, "step": 6846 }, { "epoch": 1.37981059842837, "grad_norm": 0.04506509006023407, "learning_rate": 8.667912796760269e-05, "loss": 0.2062, "step": 6848 }, { "epoch": 1.380213580495668, "grad_norm": 0.06399524211883545, "learning_rate": 8.667006996068642e-05, "loss": 0.2252, "step": 6850 }, { "epoch": 1.3806165625629658, "grad_norm": 0.0532291941344738, "learning_rate": 8.666100934878702e-05, "loss": 0.1534, "step": 6852 }, { "epoch": 1.381019544630264, "grad_norm": 0.06636221706867218, "learning_rate": 8.665194613254814e-05, "loss": 0.2373, "step": 6854 }, { "epoch": 1.381422526697562, "grad_norm": 0.051087286323308945, "learning_rate": 8.664288031261365e-05, "loss": 0.2099, "step": 6856 }, { "epoch": 1.38182550876486, "grad_norm": 0.07028748840093613, "learning_rate": 8.663381188962753e-05, "loss": 0.1864, "step": 6858 }, { "epoch": 1.382228490832158, "grad_norm": 0.0513707660138607, "learning_rate": 8.6624740864234e-05, "loss": 0.2527, "step": 6860 }, { "epoch": 1.382631472899456, "grad_norm": 0.041053254157304764, "learning_rate": 8.661566723707745e-05, "loss": 0.1516, "step": 6862 }, { "epoch": 1.383034454966754, "grad_norm": 0.029478134587407112, "learning_rate": 8.660659100880246e-05, "loss": 0.1596, "step": 6864 }, { "epoch": 1.3834374370340519, "grad_norm": 0.05876408517360687, "learning_rate": 8.659751218005379e-05, "loss": 0.1817, "step": 6866 }, { "epoch": 1.38384041910135, "grad_norm": 0.06829272210597992, "learning_rate": 8.658843075147636e-05, "loss": 0.1947, "step": 6868 }, { "epoch": 1.384243401168648, "grad_norm": 0.049879446625709534, "learning_rate": 8.657934672371534e-05, "loss": 0.2194, "step": 6870 }, { "epoch": 1.3846463832359461, "grad_norm": 0.03202884644269943, "learning_rate": 8.657026009741605e-05, "loss": 0.1768, "step": 6872 }, { "epoch": 1.385049365303244, "grad_norm": 0.06864320486783981, "learning_rate": 8.656117087322395e-05, "loss": 0.2288, "step": 6874 }, { "epoch": 1.385452347370542, "grad_norm": 0.05002441629767418, "learning_rate": 8.655207905178474e-05, "loss": 0.2283, "step": 6876 }, { "epoch": 1.38585532943784, "grad_norm": 0.06078348681330681, "learning_rate": 8.654298463374429e-05, "loss": 0.1693, "step": 6878 }, { "epoch": 1.386258311505138, "grad_norm": 0.050742197781801224, "learning_rate": 8.653388761974865e-05, "loss": 0.1947, "step": 6880 }, { "epoch": 1.386661293572436, "grad_norm": 0.03963219001889229, "learning_rate": 8.652478801044407e-05, "loss": 0.161, "step": 6882 }, { "epoch": 1.387064275639734, "grad_norm": 0.04068451747298241, "learning_rate": 8.651568580647698e-05, "loss": 0.1831, "step": 6884 }, { "epoch": 1.3874672577070322, "grad_norm": 0.05580208823084831, "learning_rate": 8.650658100849394e-05, "loss": 0.1938, "step": 6886 }, { "epoch": 1.38787023977433, "grad_norm": 0.06083298847079277, "learning_rate": 8.649747361714178e-05, "loss": 0.2187, "step": 6888 }, { "epoch": 1.388273221841628, "grad_norm": 0.06060599535703659, "learning_rate": 8.648836363306745e-05, "loss": 0.186, "step": 6890 }, { "epoch": 1.388676203908926, "grad_norm": 0.05033830180764198, "learning_rate": 8.647925105691814e-05, "loss": 0.1962, "step": 6892 }, { "epoch": 1.389079185976224, "grad_norm": 0.052239105105400085, "learning_rate": 8.647013588934117e-05, "loss": 0.2469, "step": 6894 }, { "epoch": 1.389482168043522, "grad_norm": 0.06745372712612152, "learning_rate": 8.646101813098407e-05, "loss": 0.2453, "step": 6896 }, { "epoch": 1.38988515011082, "grad_norm": 0.05417153611779213, "learning_rate": 8.645189778249456e-05, "loss": 0.1875, "step": 6898 }, { "epoch": 1.3902881321781182, "grad_norm": 0.05408632755279541, "learning_rate": 8.644277484452052e-05, "loss": 0.1902, "step": 6900 }, { "epoch": 1.3906911142454161, "grad_norm": 0.05546940863132477, "learning_rate": 8.643364931771004e-05, "loss": 0.2093, "step": 6902 }, { "epoch": 1.391094096312714, "grad_norm": 0.041996292769908905, "learning_rate": 8.642452120271137e-05, "loss": 0.2097, "step": 6904 }, { "epoch": 1.391497078380012, "grad_norm": 0.03974250331521034, "learning_rate": 8.641539050017297e-05, "loss": 0.1665, "step": 6906 }, { "epoch": 1.39190006044731, "grad_norm": 0.04208201915025711, "learning_rate": 8.640625721074347e-05, "loss": 0.1463, "step": 6908 }, { "epoch": 1.3923030425146081, "grad_norm": 0.07863224297761917, "learning_rate": 8.639712133507169e-05, "loss": 0.2162, "step": 6910 }, { "epoch": 1.392706024581906, "grad_norm": 0.042710717767477036, "learning_rate": 8.63879828738066e-05, "loss": 0.1739, "step": 6912 }, { "epoch": 1.3931090066492042, "grad_norm": 0.06020447984337807, "learning_rate": 8.637884182759741e-05, "loss": 0.2377, "step": 6914 }, { "epoch": 1.3935119887165022, "grad_norm": 0.05448845028877258, "learning_rate": 8.636969819709348e-05, "loss": 0.1784, "step": 6916 }, { "epoch": 1.3939149707838001, "grad_norm": 0.04206893965601921, "learning_rate": 8.636055198294434e-05, "loss": 0.2199, "step": 6918 }, { "epoch": 1.394317952851098, "grad_norm": 0.06374957412481308, "learning_rate": 8.635140318579976e-05, "loss": 0.1851, "step": 6920 }, { "epoch": 1.394720934918396, "grad_norm": 0.04802941903471947, "learning_rate": 8.634225180630962e-05, "loss": 0.1895, "step": 6922 }, { "epoch": 1.3951239169856942, "grad_norm": 0.062166936695575714, "learning_rate": 8.633309784512403e-05, "loss": 0.245, "step": 6924 }, { "epoch": 1.395526899052992, "grad_norm": 0.0570203997194767, "learning_rate": 8.632394130289328e-05, "loss": 0.2336, "step": 6926 }, { "epoch": 1.3959298811202903, "grad_norm": 0.046736180782318115, "learning_rate": 8.631478218026782e-05, "loss": 0.2514, "step": 6928 }, { "epoch": 1.3963328631875882, "grad_norm": 0.0648045614361763, "learning_rate": 8.630562047789833e-05, "loss": 0.1536, "step": 6930 }, { "epoch": 1.3967358452548861, "grad_norm": 0.06284670531749725, "learning_rate": 8.629645619643561e-05, "loss": 0.2086, "step": 6932 }, { "epoch": 1.397138827322184, "grad_norm": 0.05843097344040871, "learning_rate": 8.62872893365307e-05, "loss": 0.2045, "step": 6934 }, { "epoch": 1.3975418093894822, "grad_norm": 0.03313927724957466, "learning_rate": 8.627811989883479e-05, "loss": 0.1621, "step": 6936 }, { "epoch": 1.3979447914567802, "grad_norm": 0.05084552243351936, "learning_rate": 8.626894788399925e-05, "loss": 0.245, "step": 6938 }, { "epoch": 1.3983477735240781, "grad_norm": 0.0713992714881897, "learning_rate": 8.625977329267565e-05, "loss": 0.2607, "step": 6940 }, { "epoch": 1.3987507555913763, "grad_norm": 0.0683627724647522, "learning_rate": 8.625059612551575e-05, "loss": 0.1904, "step": 6942 }, { "epoch": 1.3991537376586742, "grad_norm": 0.03592758998274803, "learning_rate": 8.624141638317149e-05, "loss": 0.203, "step": 6944 }, { "epoch": 1.3995567197259722, "grad_norm": 0.04698161408305168, "learning_rate": 8.623223406629495e-05, "loss": 0.1839, "step": 6946 }, { "epoch": 1.3999597017932701, "grad_norm": 0.05979125574231148, "learning_rate": 8.622304917553846e-05, "loss": 0.1984, "step": 6948 }, { "epoch": 1.4003626838605683, "grad_norm": 0.042240679264068604, "learning_rate": 8.621386171155448e-05, "loss": 0.186, "step": 6950 }, { "epoch": 1.4007656659278662, "grad_norm": 0.06413422524929047, "learning_rate": 8.620467167499568e-05, "loss": 0.1845, "step": 6952 }, { "epoch": 1.4011686479951642, "grad_norm": 0.0588700994849205, "learning_rate": 8.61954790665149e-05, "loss": 0.1922, "step": 6954 }, { "epoch": 1.4015716300624623, "grad_norm": 0.06049403175711632, "learning_rate": 8.61862838867652e-05, "loss": 0.2095, "step": 6956 }, { "epoch": 1.4019746121297603, "grad_norm": 0.049626272171735764, "learning_rate": 8.617708613639973e-05, "loss": 0.179, "step": 6958 }, { "epoch": 1.4023775941970582, "grad_norm": 0.04943282529711723, "learning_rate": 8.616788581607193e-05, "loss": 0.1887, "step": 6960 }, { "epoch": 1.4027805762643561, "grad_norm": 0.043147701770067215, "learning_rate": 8.615868292643536e-05, "loss": 0.1697, "step": 6962 }, { "epoch": 1.4031835583316543, "grad_norm": 0.04488087072968483, "learning_rate": 8.614947746814379e-05, "loss": 0.1726, "step": 6964 }, { "epoch": 1.4035865403989523, "grad_norm": 0.05846525728702545, "learning_rate": 8.614026944185117e-05, "loss": 0.1389, "step": 6966 }, { "epoch": 1.4039895224662502, "grad_norm": 0.03812554106116295, "learning_rate": 8.613105884821157e-05, "loss": 0.1742, "step": 6968 }, { "epoch": 1.4043925045335484, "grad_norm": 0.06095854192972183, "learning_rate": 8.612184568787936e-05, "loss": 0.1818, "step": 6970 }, { "epoch": 1.4047954866008463, "grad_norm": 0.05667596682906151, "learning_rate": 8.611262996150899e-05, "loss": 0.2194, "step": 6972 }, { "epoch": 1.4051984686681442, "grad_norm": 0.06258885562419891, "learning_rate": 8.610341166975513e-05, "loss": 0.2332, "step": 6974 }, { "epoch": 1.4056014507354422, "grad_norm": 0.05343182757496834, "learning_rate": 8.609419081327266e-05, "loss": 0.1721, "step": 6976 }, { "epoch": 1.4060044328027403, "grad_norm": 0.0530022569000721, "learning_rate": 8.608496739271659e-05, "loss": 0.21, "step": 6978 }, { "epoch": 1.4064074148700383, "grad_norm": 0.05478819087147713, "learning_rate": 8.607574140874214e-05, "loss": 0.1335, "step": 6980 }, { "epoch": 1.4068103969373362, "grad_norm": 0.047832515090703964, "learning_rate": 8.606651286200474e-05, "loss": 0.1664, "step": 6982 }, { "epoch": 1.4072133790046344, "grad_norm": 0.030942801386117935, "learning_rate": 8.605728175315993e-05, "loss": 0.1981, "step": 6984 }, { "epoch": 1.4076163610719323, "grad_norm": 0.047290656715631485, "learning_rate": 8.604804808286348e-05, "loss": 0.195, "step": 6986 }, { "epoch": 1.4080193431392303, "grad_norm": 0.061841338872909546, "learning_rate": 8.603881185177136e-05, "loss": 0.202, "step": 6988 }, { "epoch": 1.4084223252065282, "grad_norm": 0.054422467947006226, "learning_rate": 8.602957306053968e-05, "loss": 0.2143, "step": 6990 }, { "epoch": 1.4088253072738264, "grad_norm": 0.04802738130092621, "learning_rate": 8.602033170982475e-05, "loss": 0.1735, "step": 6992 }, { "epoch": 1.4092282893411243, "grad_norm": 0.07405896484851837, "learning_rate": 8.601108780028306e-05, "loss": 0.2378, "step": 6994 }, { "epoch": 1.4096312714084223, "grad_norm": 0.07153638452291489, "learning_rate": 8.600184133257127e-05, "loss": 0.2177, "step": 6996 }, { "epoch": 1.4100342534757204, "grad_norm": 0.11758548021316528, "learning_rate": 8.599259230734626e-05, "loss": 0.2042, "step": 6998 }, { "epoch": 1.4104372355430184, "grad_norm": 0.050041794776916504, "learning_rate": 8.598334072526507e-05, "loss": 0.1478, "step": 7000 }, { "epoch": 1.4108402176103163, "grad_norm": 0.044110897928476334, "learning_rate": 8.597408658698488e-05, "loss": 0.1432, "step": 7002 }, { "epoch": 1.4112431996776142, "grad_norm": 0.06034010276198387, "learning_rate": 8.596482989316312e-05, "loss": 0.1976, "step": 7004 }, { "epoch": 1.4116461817449124, "grad_norm": 0.07794831693172455, "learning_rate": 8.595557064445736e-05, "loss": 0.2376, "step": 7006 }, { "epoch": 1.4120491638122104, "grad_norm": 0.07392967492341995, "learning_rate": 8.594630884152537e-05, "loss": 0.2335, "step": 7008 }, { "epoch": 1.4124521458795083, "grad_norm": 0.06214665621519089, "learning_rate": 8.593704448502507e-05, "loss": 0.1827, "step": 7010 }, { "epoch": 1.4128551279468065, "grad_norm": 0.05099937692284584, "learning_rate": 8.59277775756146e-05, "loss": 0.1647, "step": 7012 }, { "epoch": 1.4132581100141044, "grad_norm": 0.08212552219629288, "learning_rate": 8.591850811395231e-05, "loss": 0.2181, "step": 7014 }, { "epoch": 1.4136610920814023, "grad_norm": 0.11238207668066025, "learning_rate": 8.59092361006966e-05, "loss": 0.2202, "step": 7016 }, { "epoch": 1.4140640741487003, "grad_norm": 0.0554063580930233, "learning_rate": 8.589996153650622e-05, "loss": 0.1892, "step": 7018 }, { "epoch": 1.4144670562159984, "grad_norm": 0.045140497386455536, "learning_rate": 8.589068442203996e-05, "loss": 0.1798, "step": 7020 }, { "epoch": 1.4148700382832964, "grad_norm": 0.04684115946292877, "learning_rate": 8.58814047579569e-05, "loss": 0.181, "step": 7022 }, { "epoch": 1.4152730203505943, "grad_norm": 0.06891334801912308, "learning_rate": 8.587212254491621e-05, "loss": 0.2023, "step": 7024 }, { "epoch": 1.4156760024178925, "grad_norm": 0.0646648108959198, "learning_rate": 8.58628377835773e-05, "loss": 0.1956, "step": 7026 }, { "epoch": 1.4160789844851904, "grad_norm": 0.06795363128185272, "learning_rate": 8.585355047459976e-05, "loss": 0.1768, "step": 7028 }, { "epoch": 1.4164819665524884, "grad_norm": 0.062433384358882904, "learning_rate": 8.584426061864335e-05, "loss": 0.2723, "step": 7030 }, { "epoch": 1.4168849486197863, "grad_norm": 0.030506597831845284, "learning_rate": 8.583496821636797e-05, "loss": 0.1273, "step": 7032 }, { "epoch": 1.4172879306870845, "grad_norm": 0.04489409551024437, "learning_rate": 8.582567326843376e-05, "loss": 0.151, "step": 7034 }, { "epoch": 1.4176909127543824, "grad_norm": 0.043156519532203674, "learning_rate": 8.581637577550101e-05, "loss": 0.222, "step": 7036 }, { "epoch": 1.4180938948216804, "grad_norm": 0.05898886173963547, "learning_rate": 8.580707573823021e-05, "loss": 0.2044, "step": 7038 }, { "epoch": 1.4184968768889785, "grad_norm": 0.05552120506763458, "learning_rate": 8.579777315728202e-05, "loss": 0.192, "step": 7040 }, { "epoch": 1.4188998589562765, "grad_norm": 0.07700909674167633, "learning_rate": 8.578846803331726e-05, "loss": 0.1817, "step": 7042 }, { "epoch": 1.4193028410235744, "grad_norm": 0.05442452058196068, "learning_rate": 8.577916036699698e-05, "loss": 0.1612, "step": 7044 }, { "epoch": 1.4197058230908723, "grad_norm": 0.03962863236665726, "learning_rate": 8.576985015898237e-05, "loss": 0.1251, "step": 7046 }, { "epoch": 1.4201088051581705, "grad_norm": 0.07024955004453659, "learning_rate": 8.57605374099348e-05, "loss": 0.2465, "step": 7048 }, { "epoch": 1.4205117872254684, "grad_norm": 0.06219443678855896, "learning_rate": 8.575122212051585e-05, "loss": 0.173, "step": 7050 }, { "epoch": 1.4209147692927664, "grad_norm": 0.05334271490573883, "learning_rate": 8.574190429138726e-05, "loss": 0.1995, "step": 7052 }, { "epoch": 1.4213177513600646, "grad_norm": 0.05093487352132797, "learning_rate": 8.573258392321093e-05, "loss": 0.1886, "step": 7054 }, { "epoch": 1.4217207334273625, "grad_norm": 0.07703381776809692, "learning_rate": 8.5723261016649e-05, "loss": 0.196, "step": 7056 }, { "epoch": 1.4221237154946604, "grad_norm": 0.05135168135166168, "learning_rate": 8.571393557236373e-05, "loss": 0.1924, "step": 7058 }, { "epoch": 1.4225266975619584, "grad_norm": 0.05865864083170891, "learning_rate": 8.570460759101761e-05, "loss": 0.2021, "step": 7060 }, { "epoch": 1.4229296796292565, "grad_norm": 0.05785017088055611, "learning_rate": 8.569527707327325e-05, "loss": 0.1633, "step": 7062 }, { "epoch": 1.4233326616965545, "grad_norm": 0.05694718286395073, "learning_rate": 8.56859440197935e-05, "loss": 0.1682, "step": 7064 }, { "epoch": 1.4237356437638526, "grad_norm": 0.0642189010977745, "learning_rate": 8.567660843124135e-05, "loss": 0.1794, "step": 7066 }, { "epoch": 1.4241386258311506, "grad_norm": 0.05178860202431679, "learning_rate": 8.566727030828001e-05, "loss": 0.1915, "step": 7068 }, { "epoch": 1.4245416078984485, "grad_norm": 0.06810785830020905, "learning_rate": 8.565792965157281e-05, "loss": 0.2163, "step": 7070 }, { "epoch": 1.4249445899657465, "grad_norm": 0.05047709494829178, "learning_rate": 8.564858646178333e-05, "loss": 0.2125, "step": 7072 }, { "epoch": 1.4253475720330444, "grad_norm": 0.057600557804107666, "learning_rate": 8.563924073957527e-05, "loss": 0.1998, "step": 7074 }, { "epoch": 1.4257505541003426, "grad_norm": 0.04869558662176132, "learning_rate": 8.562989248561256e-05, "loss": 0.1805, "step": 7076 }, { "epoch": 1.4261535361676405, "grad_norm": 0.06351305544376373, "learning_rate": 8.562054170055924e-05, "loss": 0.1748, "step": 7078 }, { "epoch": 1.4265565182349387, "grad_norm": 0.046501412987709045, "learning_rate": 8.561118838507962e-05, "loss": 0.2176, "step": 7080 }, { "epoch": 1.4269595003022366, "grad_norm": 0.057603869587183, "learning_rate": 8.560183253983813e-05, "loss": 0.2286, "step": 7082 }, { "epoch": 1.4273624823695346, "grad_norm": 0.07590744644403458, "learning_rate": 8.55924741654994e-05, "loss": 0.201, "step": 7084 }, { "epoch": 1.4277654644368325, "grad_norm": 0.049121540039777756, "learning_rate": 8.558311326272821e-05, "loss": 0.1838, "step": 7086 }, { "epoch": 1.4281684465041304, "grad_norm": 0.04728762432932854, "learning_rate": 8.557374983218957e-05, "loss": 0.1613, "step": 7088 }, { "epoch": 1.4285714285714286, "grad_norm": 0.05571918934583664, "learning_rate": 8.556438387454864e-05, "loss": 0.231, "step": 7090 }, { "epoch": 1.4289744106387265, "grad_norm": 0.04902351647615433, "learning_rate": 8.555501539047075e-05, "loss": 0.2, "step": 7092 }, { "epoch": 1.4293773927060247, "grad_norm": 0.05943556874990463, "learning_rate": 8.554564438062142e-05, "loss": 0.1962, "step": 7094 }, { "epoch": 1.4297803747733226, "grad_norm": 0.04188118875026703, "learning_rate": 8.553627084566637e-05, "loss": 0.1585, "step": 7096 }, { "epoch": 1.4301833568406206, "grad_norm": 0.058290161192417145, "learning_rate": 8.552689478627147e-05, "loss": 0.1897, "step": 7098 }, { "epoch": 1.4305863389079185, "grad_norm": 0.05757759138941765, "learning_rate": 8.551751620310279e-05, "loss": 0.2302, "step": 7100 }, { "epoch": 1.4309893209752165, "grad_norm": 0.06541424989700317, "learning_rate": 8.550813509682654e-05, "loss": 0.1784, "step": 7102 }, { "epoch": 1.4313923030425146, "grad_norm": 0.05995183438062668, "learning_rate": 8.549875146810918e-05, "loss": 0.2322, "step": 7104 }, { "epoch": 1.4317952851098126, "grad_norm": 0.050467684864997864, "learning_rate": 8.548936531761727e-05, "loss": 0.1946, "step": 7106 }, { "epoch": 1.4321982671771107, "grad_norm": 0.05367492884397507, "learning_rate": 8.547997664601763e-05, "loss": 0.1872, "step": 7108 }, { "epoch": 1.4326012492444087, "grad_norm": 0.07544504106044769, "learning_rate": 8.547058545397717e-05, "loss": 0.2195, "step": 7110 }, { "epoch": 1.4330042313117066, "grad_norm": 0.07416324317455292, "learning_rate": 8.546119174216305e-05, "loss": 0.1982, "step": 7112 }, { "epoch": 1.4334072133790046, "grad_norm": 0.10676012933254242, "learning_rate": 8.545179551124258e-05, "loss": 0.1391, "step": 7114 }, { "epoch": 1.4338101954463025, "grad_norm": 0.04345235228538513, "learning_rate": 8.544239676188326e-05, "loss": 0.1398, "step": 7116 }, { "epoch": 1.4342131775136007, "grad_norm": 0.039523664861917496, "learning_rate": 8.543299549475274e-05, "loss": 0.1978, "step": 7118 }, { "epoch": 1.4346161595808986, "grad_norm": 0.0516582652926445, "learning_rate": 8.54235917105189e-05, "loss": 0.2099, "step": 7120 }, { "epoch": 1.4350191416481968, "grad_norm": 0.0456339530646801, "learning_rate": 8.541418540984975e-05, "loss": 0.2016, "step": 7122 }, { "epoch": 1.4354221237154947, "grad_norm": 0.05590183287858963, "learning_rate": 8.54047765934135e-05, "loss": 0.1898, "step": 7124 }, { "epoch": 1.4358251057827927, "grad_norm": 0.05954501032829285, "learning_rate": 8.539536526187857e-05, "loss": 0.198, "step": 7126 }, { "epoch": 1.4362280878500906, "grad_norm": 0.09433047473430634, "learning_rate": 8.538595141591348e-05, "loss": 0.2157, "step": 7128 }, { "epoch": 1.4366310699173888, "grad_norm": 0.06842294335365295, "learning_rate": 8.5376535056187e-05, "loss": 0.2662, "step": 7130 }, { "epoch": 1.4370340519846867, "grad_norm": 0.0586603581905365, "learning_rate": 8.536711618336802e-05, "loss": 0.1726, "step": 7132 }, { "epoch": 1.4374370340519846, "grad_norm": 0.04986822232604027, "learning_rate": 8.535769479812569e-05, "loss": 0.1851, "step": 7134 }, { "epoch": 1.4378400161192828, "grad_norm": 0.05392155423760414, "learning_rate": 8.534827090112927e-05, "loss": 0.1937, "step": 7136 }, { "epoch": 1.4382429981865807, "grad_norm": 0.033999454230070114, "learning_rate": 8.53388444930482e-05, "loss": 0.1801, "step": 7138 }, { "epoch": 1.4386459802538787, "grad_norm": 0.04176968336105347, "learning_rate": 8.532941557455214e-05, "loss": 0.1373, "step": 7140 }, { "epoch": 1.4390489623211766, "grad_norm": 0.05936324968934059, "learning_rate": 8.53199841463109e-05, "loss": 0.1359, "step": 7142 }, { "epoch": 1.4394519443884748, "grad_norm": 0.04897318407893181, "learning_rate": 8.531055020899448e-05, "loss": 0.1773, "step": 7144 }, { "epoch": 1.4398549264557727, "grad_norm": 0.05380289629101753, "learning_rate": 8.530111376327304e-05, "loss": 0.1658, "step": 7146 }, { "epoch": 1.4402579085230707, "grad_norm": 0.056560710072517395, "learning_rate": 8.529167480981693e-05, "loss": 0.1902, "step": 7148 }, { "epoch": 1.4406608905903688, "grad_norm": 0.05610078573226929, "learning_rate": 8.528223334929669e-05, "loss": 0.1944, "step": 7150 }, { "epoch": 1.4410638726576668, "grad_norm": 0.11309154331684113, "learning_rate": 8.5272789382383e-05, "loss": 0.2735, "step": 7152 }, { "epoch": 1.4414668547249647, "grad_norm": 0.05021926388144493, "learning_rate": 8.52633429097468e-05, "loss": 0.195, "step": 7154 }, { "epoch": 1.4418698367922627, "grad_norm": 0.07295999675989151, "learning_rate": 8.525389393205906e-05, "loss": 0.2276, "step": 7156 }, { "epoch": 1.4422728188595608, "grad_norm": 0.07204556465148926, "learning_rate": 8.524444244999113e-05, "loss": 0.1838, "step": 7158 }, { "epoch": 1.4426758009268588, "grad_norm": 0.04845462366938591, "learning_rate": 8.523498846421435e-05, "loss": 0.1711, "step": 7160 }, { "epoch": 1.4430787829941567, "grad_norm": 0.049779243767261505, "learning_rate": 8.522553197540033e-05, "loss": 0.195, "step": 7162 }, { "epoch": 1.4434817650614549, "grad_norm": 0.06310156732797623, "learning_rate": 8.521607298422087e-05, "loss": 0.1704, "step": 7164 }, { "epoch": 1.4438847471287528, "grad_norm": 0.04404526203870773, "learning_rate": 8.52066114913479e-05, "loss": 0.1512, "step": 7166 }, { "epoch": 1.4442877291960508, "grad_norm": 0.05154913291335106, "learning_rate": 8.519714749745356e-05, "loss": 0.2179, "step": 7168 }, { "epoch": 1.4446907112633487, "grad_norm": 0.04693300276994705, "learning_rate": 8.518768100321013e-05, "loss": 0.1985, "step": 7170 }, { "epoch": 1.4450936933306469, "grad_norm": 0.05438321828842163, "learning_rate": 8.517821200929013e-05, "loss": 0.2356, "step": 7172 }, { "epoch": 1.4454966753979448, "grad_norm": 0.05669800192117691, "learning_rate": 8.516874051636621e-05, "loss": 0.2373, "step": 7174 }, { "epoch": 1.4458996574652427, "grad_norm": 0.051683925092220306, "learning_rate": 8.51592665251112e-05, "loss": 0.2001, "step": 7176 }, { "epoch": 1.446302639532541, "grad_norm": 0.03920593857765198, "learning_rate": 8.514979003619814e-05, "loss": 0.1738, "step": 7178 }, { "epoch": 1.4467056215998388, "grad_norm": 0.0427895151078701, "learning_rate": 8.51403110503002e-05, "loss": 0.162, "step": 7180 }, { "epoch": 1.4471086036671368, "grad_norm": 0.06468407064676285, "learning_rate": 8.513082956809075e-05, "loss": 0.2271, "step": 7182 }, { "epoch": 1.4475115857344347, "grad_norm": 0.039048973470926285, "learning_rate": 8.512134559024337e-05, "loss": 0.1726, "step": 7184 }, { "epoch": 1.4479145678017329, "grad_norm": 0.04582072049379349, "learning_rate": 8.511185911743176e-05, "loss": 0.2126, "step": 7186 }, { "epoch": 1.4483175498690308, "grad_norm": 0.04817645624279976, "learning_rate": 8.510237015032982e-05, "loss": 0.218, "step": 7188 }, { "epoch": 1.4487205319363288, "grad_norm": 0.055014822632074356, "learning_rate": 8.509287868961166e-05, "loss": 0.1736, "step": 7190 }, { "epoch": 1.449123514003627, "grad_norm": 0.04446453973650932, "learning_rate": 8.508338473595152e-05, "loss": 0.2389, "step": 7192 }, { "epoch": 1.4495264960709249, "grad_norm": 0.04078453779220581, "learning_rate": 8.507388829002383e-05, "loss": 0.2143, "step": 7194 }, { "epoch": 1.4499294781382228, "grad_norm": 0.04532262682914734, "learning_rate": 8.50643893525032e-05, "loss": 0.1713, "step": 7196 }, { "epoch": 1.4503324602055208, "grad_norm": 0.05296036973595619, "learning_rate": 8.505488792406444e-05, "loss": 0.1575, "step": 7198 }, { "epoch": 1.450735442272819, "grad_norm": 0.06389618664979935, "learning_rate": 8.504538400538252e-05, "loss": 0.2189, "step": 7200 }, { "epoch": 1.4511384243401169, "grad_norm": 0.04667343944311142, "learning_rate": 8.503587759713253e-05, "loss": 0.2219, "step": 7202 }, { "epoch": 1.4515414064074148, "grad_norm": 0.06371932476758957, "learning_rate": 8.502636869998986e-05, "loss": 0.1854, "step": 7204 }, { "epoch": 1.451944388474713, "grad_norm": 0.04117533937096596, "learning_rate": 8.501685731462995e-05, "loss": 0.21, "step": 7206 }, { "epoch": 1.452347370542011, "grad_norm": 0.043234214186668396, "learning_rate": 8.500734344172849e-05, "loss": 0.2015, "step": 7208 }, { "epoch": 1.4527503526093088, "grad_norm": 0.05427609011530876, "learning_rate": 8.499782708196136e-05, "loss": 0.1649, "step": 7210 }, { "epoch": 1.4531533346766068, "grad_norm": 0.05856523662805557, "learning_rate": 8.498830823600457e-05, "loss": 0.1776, "step": 7212 }, { "epoch": 1.453556316743905, "grad_norm": 0.061823341995477676, "learning_rate": 8.49787869045343e-05, "loss": 0.178, "step": 7214 }, { "epoch": 1.453959298811203, "grad_norm": 0.06578311324119568, "learning_rate": 8.496926308822696e-05, "loss": 0.2282, "step": 7216 }, { "epoch": 1.4543622808785008, "grad_norm": 0.040528710931539536, "learning_rate": 8.49597367877591e-05, "loss": 0.2129, "step": 7218 }, { "epoch": 1.454765262945799, "grad_norm": 0.05041544884443283, "learning_rate": 8.495020800380742e-05, "loss": 0.1878, "step": 7220 }, { "epoch": 1.455168245013097, "grad_norm": 0.05043159797787666, "learning_rate": 8.494067673704888e-05, "loss": 0.2189, "step": 7222 }, { "epoch": 1.4555712270803949, "grad_norm": 0.04681938886642456, "learning_rate": 8.493114298816055e-05, "loss": 0.199, "step": 7224 }, { "epoch": 1.4559742091476928, "grad_norm": 0.055137768387794495, "learning_rate": 8.492160675781967e-05, "loss": 0.1672, "step": 7226 }, { "epoch": 1.456377191214991, "grad_norm": 0.04934654012322426, "learning_rate": 8.49120680467037e-05, "loss": 0.2264, "step": 7228 }, { "epoch": 1.456780173282289, "grad_norm": 0.0343373566865921, "learning_rate": 8.490252685549026e-05, "loss": 0.1736, "step": 7230 }, { "epoch": 1.4571831553495869, "grad_norm": 0.051517315208911896, "learning_rate": 8.489298318485712e-05, "loss": 0.1566, "step": 7232 }, { "epoch": 1.457586137416885, "grad_norm": 0.05607752874493599, "learning_rate": 8.488343703548226e-05, "loss": 0.1562, "step": 7234 }, { "epoch": 1.457989119484183, "grad_norm": 0.049427617341279984, "learning_rate": 8.487388840804383e-05, "loss": 0.2388, "step": 7236 }, { "epoch": 1.458392101551481, "grad_norm": 0.05366994068026543, "learning_rate": 8.486433730322012e-05, "loss": 0.1983, "step": 7238 }, { "epoch": 1.4587950836187789, "grad_norm": 0.05468441918492317, "learning_rate": 8.485478372168966e-05, "loss": 0.2348, "step": 7240 }, { "epoch": 1.459198065686077, "grad_norm": 0.05302935093641281, "learning_rate": 8.48452276641311e-05, "loss": 0.2082, "step": 7242 }, { "epoch": 1.459601047753375, "grad_norm": 0.04598622024059296, "learning_rate": 8.48356691312233e-05, "loss": 0.1921, "step": 7244 }, { "epoch": 1.460004029820673, "grad_norm": 0.06511973589658737, "learning_rate": 8.482610812364527e-05, "loss": 0.1838, "step": 7246 }, { "epoch": 1.460407011887971, "grad_norm": 0.060067228972911835, "learning_rate": 8.481654464207623e-05, "loss": 0.16, "step": 7248 }, { "epoch": 1.460809993955269, "grad_norm": 0.04335297271609306, "learning_rate": 8.480697868719551e-05, "loss": 0.1295, "step": 7250 }, { "epoch": 1.461212976022567, "grad_norm": 0.056406330317258835, "learning_rate": 8.47974102596827e-05, "loss": 0.1607, "step": 7252 }, { "epoch": 1.4616159580898649, "grad_norm": 0.06859233975410461, "learning_rate": 8.478783936021753e-05, "loss": 0.208, "step": 7254 }, { "epoch": 1.462018940157163, "grad_norm": 0.0544903390109539, "learning_rate": 8.477826598947989e-05, "loss": 0.1753, "step": 7256 }, { "epoch": 1.462421922224461, "grad_norm": 0.06715361773967743, "learning_rate": 8.476869014814984e-05, "loss": 0.1487, "step": 7258 }, { "epoch": 1.4628249042917592, "grad_norm": 0.06288928538560867, "learning_rate": 8.475911183690765e-05, "loss": 0.1728, "step": 7260 }, { "epoch": 1.463227886359057, "grad_norm": 0.0655706375837326, "learning_rate": 8.474953105643374e-05, "loss": 0.2087, "step": 7262 }, { "epoch": 1.463630868426355, "grad_norm": 0.045362550765275955, "learning_rate": 8.473994780740873e-05, "loss": 0.162, "step": 7264 }, { "epoch": 1.464033850493653, "grad_norm": 0.05193858593702316, "learning_rate": 8.473036209051337e-05, "loss": 0.2173, "step": 7266 }, { "epoch": 1.464436832560951, "grad_norm": 0.0538729652762413, "learning_rate": 8.472077390642864e-05, "loss": 0.1997, "step": 7268 }, { "epoch": 1.464839814628249, "grad_norm": 0.04861455410718918, "learning_rate": 8.471118325583565e-05, "loss": 0.2214, "step": 7270 }, { "epoch": 1.465242796695547, "grad_norm": 0.04979060962796211, "learning_rate": 8.470159013941572e-05, "loss": 0.1745, "step": 7272 }, { "epoch": 1.4656457787628452, "grad_norm": 0.04076359421014786, "learning_rate": 8.469199455785032e-05, "loss": 0.182, "step": 7274 }, { "epoch": 1.4660487608301431, "grad_norm": 0.03544747456908226, "learning_rate": 8.46823965118211e-05, "loss": 0.1535, "step": 7276 }, { "epoch": 1.466451742897441, "grad_norm": 0.0503484271466732, "learning_rate": 8.467279600200993e-05, "loss": 0.2295, "step": 7278 }, { "epoch": 1.466854724964739, "grad_norm": 0.061403173953294754, "learning_rate": 8.466319302909875e-05, "loss": 0.1619, "step": 7280 }, { "epoch": 1.467257707032037, "grad_norm": 0.048420753329992294, "learning_rate": 8.465358759376979e-05, "loss": 0.1717, "step": 7282 }, { "epoch": 1.4676606890993351, "grad_norm": 0.04460752010345459, "learning_rate": 8.464397969670538e-05, "loss": 0.1453, "step": 7284 }, { "epoch": 1.468063671166633, "grad_norm": 0.05327368900179863, "learning_rate": 8.463436933858806e-05, "loss": 0.2189, "step": 7286 }, { "epoch": 1.4684666532339312, "grad_norm": 0.0458202101290226, "learning_rate": 8.462475652010053e-05, "loss": 0.1892, "step": 7288 }, { "epoch": 1.4688696353012292, "grad_norm": 0.047448720782995224, "learning_rate": 8.461514124192567e-05, "loss": 0.2031, "step": 7290 }, { "epoch": 1.469272617368527, "grad_norm": 0.05255912244319916, "learning_rate": 8.460552350474654e-05, "loss": 0.1929, "step": 7292 }, { "epoch": 1.469675599435825, "grad_norm": 0.05827522650361061, "learning_rate": 8.459590330924636e-05, "loss": 0.2482, "step": 7294 }, { "epoch": 1.470078581503123, "grad_norm": 0.08329940587282181, "learning_rate": 8.458628065610853e-05, "loss": 0.2222, "step": 7296 }, { "epoch": 1.4704815635704211, "grad_norm": 0.0485442690551281, "learning_rate": 8.457665554601667e-05, "loss": 0.1998, "step": 7298 }, { "epoch": 1.470884545637719, "grad_norm": 0.05297599732875824, "learning_rate": 8.456702797965446e-05, "loss": 0.2179, "step": 7300 }, { "epoch": 1.4712875277050173, "grad_norm": 0.05084371566772461, "learning_rate": 8.455739795770588e-05, "loss": 0.2063, "step": 7302 }, { "epoch": 1.4716905097723152, "grad_norm": 0.044608235359191895, "learning_rate": 8.454776548085499e-05, "loss": 0.1736, "step": 7304 }, { "epoch": 1.4720934918396131, "grad_norm": 0.04484618082642555, "learning_rate": 8.453813054978612e-05, "loss": 0.2138, "step": 7306 }, { "epoch": 1.472496473906911, "grad_norm": 0.04494086280465126, "learning_rate": 8.452849316518367e-05, "loss": 0.1769, "step": 7308 }, { "epoch": 1.472899455974209, "grad_norm": 0.04433842748403549, "learning_rate": 8.451885332773231e-05, "loss": 0.196, "step": 7310 }, { "epoch": 1.4733024380415072, "grad_norm": 0.06002422794699669, "learning_rate": 8.450921103811679e-05, "loss": 0.2003, "step": 7312 }, { "epoch": 1.4737054201088051, "grad_norm": 0.0462716743350029, "learning_rate": 8.449956629702214e-05, "loss": 0.207, "step": 7314 }, { "epoch": 1.4741084021761033, "grad_norm": 0.06718819588422775, "learning_rate": 8.448991910513344e-05, "loss": 0.1779, "step": 7316 }, { "epoch": 1.4745113842434012, "grad_norm": 0.04861169680953026, "learning_rate": 8.448026946313607e-05, "loss": 0.1674, "step": 7318 }, { "epoch": 1.4749143663106992, "grad_norm": 0.04479601979255676, "learning_rate": 8.447061737171549e-05, "loss": 0.1838, "step": 7320 }, { "epoch": 1.475317348377997, "grad_norm": 0.05224357545375824, "learning_rate": 8.446096283155736e-05, "loss": 0.2221, "step": 7322 }, { "epoch": 1.475720330445295, "grad_norm": 0.03974277153611183, "learning_rate": 8.445130584334758e-05, "loss": 0.1639, "step": 7324 }, { "epoch": 1.4761233125125932, "grad_norm": 0.05455762520432472, "learning_rate": 8.44416464077721e-05, "loss": 0.2433, "step": 7326 }, { "epoch": 1.4765262945798912, "grad_norm": 0.04934287071228027, "learning_rate": 8.443198452551715e-05, "loss": 0.1937, "step": 7328 }, { "epoch": 1.4769292766471893, "grad_norm": 0.041016895323991776, "learning_rate": 8.442232019726909e-05, "loss": 0.1981, "step": 7330 }, { "epoch": 1.4773322587144873, "grad_norm": 0.037635620683431625, "learning_rate": 8.441265342371445e-05, "loss": 0.1531, "step": 7332 }, { "epoch": 1.4777352407817852, "grad_norm": 0.06261298060417175, "learning_rate": 8.440298420553995e-05, "loss": 0.1733, "step": 7334 }, { "epoch": 1.4781382228490831, "grad_norm": 0.05150596424937248, "learning_rate": 8.439331254343246e-05, "loss": 0.1854, "step": 7336 }, { "epoch": 1.4785412049163813, "grad_norm": 0.04876361042261124, "learning_rate": 8.438363843807906e-05, "loss": 0.1419, "step": 7338 }, { "epoch": 1.4789441869836792, "grad_norm": 0.060993921011686325, "learning_rate": 8.437396189016698e-05, "loss": 0.1656, "step": 7340 }, { "epoch": 1.4793471690509772, "grad_norm": 0.04993096739053726, "learning_rate": 8.43642829003836e-05, "loss": 0.2243, "step": 7342 }, { "epoch": 1.4797501511182753, "grad_norm": 0.0681760385632515, "learning_rate": 8.435460146941653e-05, "loss": 0.2071, "step": 7344 }, { "epoch": 1.4801531331855733, "grad_norm": 0.04877663403749466, "learning_rate": 8.434491759795353e-05, "loss": 0.2088, "step": 7346 }, { "epoch": 1.4805561152528712, "grad_norm": 0.04369872435927391, "learning_rate": 8.43352312866825e-05, "loss": 0.2126, "step": 7348 }, { "epoch": 1.4809590973201692, "grad_norm": 0.0733921229839325, "learning_rate": 8.432554253629154e-05, "loss": 0.1975, "step": 7350 }, { "epoch": 1.4813620793874673, "grad_norm": 0.04548421502113342, "learning_rate": 8.431585134746894e-05, "loss": 0.1712, "step": 7352 }, { "epoch": 1.4817650614547653, "grad_norm": 0.052208177745342255, "learning_rate": 8.430615772090314e-05, "loss": 0.1925, "step": 7354 }, { "epoch": 1.4821680435220632, "grad_norm": 0.05872802808880806, "learning_rate": 8.429646165728275e-05, "loss": 0.1987, "step": 7356 }, { "epoch": 1.4825710255893614, "grad_norm": 0.053144607692956924, "learning_rate": 8.42867631572966e-05, "loss": 0.2145, "step": 7358 }, { "epoch": 1.4829740076566593, "grad_norm": 0.036344993859529495, "learning_rate": 8.427706222163361e-05, "loss": 0.1606, "step": 7360 }, { "epoch": 1.4833769897239573, "grad_norm": 0.045181769877672195, "learning_rate": 8.426735885098293e-05, "loss": 0.1919, "step": 7362 }, { "epoch": 1.4837799717912552, "grad_norm": 0.049181703478097916, "learning_rate": 8.42576530460339e-05, "loss": 0.2088, "step": 7364 }, { "epoch": 1.4841829538585534, "grad_norm": 0.052120842039585114, "learning_rate": 8.424794480747597e-05, "loss": 0.1863, "step": 7366 }, { "epoch": 1.4845859359258513, "grad_norm": 0.04675266519188881, "learning_rate": 8.423823413599883e-05, "loss": 0.1956, "step": 7368 }, { "epoch": 1.4849889179931492, "grad_norm": 0.04355587065219879, "learning_rate": 8.422852103229228e-05, "loss": 0.2108, "step": 7370 }, { "epoch": 1.4853919000604474, "grad_norm": 0.05784691497683525, "learning_rate": 8.421880549704634e-05, "loss": 0.2315, "step": 7372 }, { "epoch": 1.4857948821277454, "grad_norm": 0.03782470524311066, "learning_rate": 8.420908753095118e-05, "loss": 0.1504, "step": 7374 }, { "epoch": 1.4861978641950433, "grad_norm": 0.04663468152284622, "learning_rate": 8.419936713469714e-05, "loss": 0.2133, "step": 7376 }, { "epoch": 1.4866008462623412, "grad_norm": 0.05277375504374504, "learning_rate": 8.418964430897477e-05, "loss": 0.1913, "step": 7378 }, { "epoch": 1.4870038283296394, "grad_norm": 0.04334205016493797, "learning_rate": 8.417991905447473e-05, "loss": 0.1672, "step": 7380 }, { "epoch": 1.4874068103969373, "grad_norm": 0.05344159156084061, "learning_rate": 8.417019137188792e-05, "loss": 0.225, "step": 7382 }, { "epoch": 1.4878097924642353, "grad_norm": 0.045284245163202286, "learning_rate": 8.416046126190536e-05, "loss": 0.1946, "step": 7384 }, { "epoch": 1.4882127745315334, "grad_norm": 0.047893647104501724, "learning_rate": 8.415072872521826e-05, "loss": 0.1857, "step": 7386 }, { "epoch": 1.4886157565988314, "grad_norm": 0.041971355676651, "learning_rate": 8.4140993762518e-05, "loss": 0.1719, "step": 7388 }, { "epoch": 1.4890187386661293, "grad_norm": 0.06434407830238342, "learning_rate": 8.413125637449615e-05, "loss": 0.1602, "step": 7390 }, { "epoch": 1.4894217207334273, "grad_norm": 0.07025641947984695, "learning_rate": 8.412151656184444e-05, "loss": 0.2188, "step": 7392 }, { "epoch": 1.4898247028007254, "grad_norm": 0.0681864395737648, "learning_rate": 8.411177432525475e-05, "loss": 0.2231, "step": 7394 }, { "epoch": 1.4902276848680234, "grad_norm": 0.055996041744947433, "learning_rate": 8.410202966541917e-05, "loss": 0.1936, "step": 7396 }, { "epoch": 1.4906306669353213, "grad_norm": 0.03989433869719505, "learning_rate": 8.409228258302994e-05, "loss": 0.1496, "step": 7398 }, { "epoch": 1.4910336490026195, "grad_norm": 0.062053125351667404, "learning_rate": 8.408253307877947e-05, "loss": 0.2498, "step": 7400 }, { "epoch": 1.4914366310699174, "grad_norm": 0.04868367314338684, "learning_rate": 8.407278115336037e-05, "loss": 0.2198, "step": 7402 }, { "epoch": 1.4918396131372154, "grad_norm": 0.05877026170492172, "learning_rate": 8.406302680746538e-05, "loss": 0.2521, "step": 7404 }, { "epoch": 1.4922425952045133, "grad_norm": 0.04489947110414505, "learning_rate": 8.405327004178745e-05, "loss": 0.2219, "step": 7406 }, { "epoch": 1.4926455772718115, "grad_norm": 0.05375202000141144, "learning_rate": 8.404351085701967e-05, "loss": 0.2286, "step": 7408 }, { "epoch": 1.4930485593391094, "grad_norm": 0.04449663311243057, "learning_rate": 8.403374925385532e-05, "loss": 0.1823, "step": 7410 }, { "epoch": 1.4934515414064073, "grad_norm": 0.04015576094388962, "learning_rate": 8.402398523298786e-05, "loss": 0.1567, "step": 7412 }, { "epoch": 1.4938545234737055, "grad_norm": 0.047074656933546066, "learning_rate": 8.40142187951109e-05, "loss": 0.1762, "step": 7414 }, { "epoch": 1.4942575055410035, "grad_norm": 0.06970192492008209, "learning_rate": 8.400444994091823e-05, "loss": 0.1826, "step": 7416 }, { "epoch": 1.4946604876083014, "grad_norm": 0.04595296084880829, "learning_rate": 8.399467867110382e-05, "loss": 0.2068, "step": 7418 }, { "epoch": 1.4950634696755993, "grad_norm": 0.05146399140357971, "learning_rate": 8.398490498636181e-05, "loss": 0.1942, "step": 7420 }, { "epoch": 1.4954664517428975, "grad_norm": 0.05432479828596115, "learning_rate": 8.39751288873865e-05, "loss": 0.2029, "step": 7422 }, { "epoch": 1.4958694338101954, "grad_norm": 0.04735802486538887, "learning_rate": 8.396535037487236e-05, "loss": 0.1889, "step": 7424 }, { "epoch": 1.4962724158774934, "grad_norm": 0.053364839404821396, "learning_rate": 8.395556944951406e-05, "loss": 0.1904, "step": 7426 }, { "epoch": 1.4966753979447915, "grad_norm": 0.04619657248258591, "learning_rate": 8.394578611200639e-05, "loss": 0.17, "step": 7428 }, { "epoch": 1.4970783800120895, "grad_norm": 0.04902162775397301, "learning_rate": 8.393600036304438e-05, "loss": 0.2198, "step": 7430 }, { "epoch": 1.4974813620793874, "grad_norm": 0.0800173282623291, "learning_rate": 8.392621220332317e-05, "loss": 0.2009, "step": 7432 }, { "epoch": 1.4978843441466854, "grad_norm": 0.0478094182908535, "learning_rate": 8.391642163353812e-05, "loss": 0.2335, "step": 7434 }, { "epoch": 1.4982873262139835, "grad_norm": 0.05949089676141739, "learning_rate": 8.39066286543847e-05, "loss": 0.209, "step": 7436 }, { "epoch": 1.4986903082812815, "grad_norm": 0.043826907873153687, "learning_rate": 8.389683326655862e-05, "loss": 0.1955, "step": 7438 }, { "epoch": 1.4990932903485794, "grad_norm": 0.05305158719420433, "learning_rate": 8.388703547075569e-05, "loss": 0.224, "step": 7440 }, { "epoch": 1.4994962724158776, "grad_norm": 0.04061686992645264, "learning_rate": 8.387723526767197e-05, "loss": 0.1754, "step": 7442 }, { "epoch": 1.4998992544831755, "grad_norm": 0.05251790210604668, "learning_rate": 8.386743265800364e-05, "loss": 0.1544, "step": 7444 }, { "epoch": 1.5003022365504735, "grad_norm": 0.0496913306415081, "learning_rate": 8.385762764244704e-05, "loss": 0.179, "step": 7446 }, { "epoch": 1.5007052186177714, "grad_norm": 0.03873209282755852, "learning_rate": 8.384782022169875e-05, "loss": 0.2258, "step": 7448 }, { "epoch": 1.5011082006850696, "grad_norm": 0.050427861511707306, "learning_rate": 8.383801039645542e-05, "loss": 0.2088, "step": 7450 }, { "epoch": 1.5015111827523675, "grad_norm": 0.08442942798137665, "learning_rate": 8.382819816741394e-05, "loss": 0.2172, "step": 7452 }, { "epoch": 1.5019141648196657, "grad_norm": 0.058537282049655914, "learning_rate": 8.381838353527139e-05, "loss": 0.1713, "step": 7454 }, { "epoch": 1.5023171468869636, "grad_norm": 0.050120122730731964, "learning_rate": 8.380856650072493e-05, "loss": 0.1936, "step": 7456 }, { "epoch": 1.5027201289542615, "grad_norm": 0.06258103996515274, "learning_rate": 8.3798747064472e-05, "loss": 0.2222, "step": 7458 }, { "epoch": 1.5031231110215595, "grad_norm": 0.060956165194511414, "learning_rate": 8.378892522721012e-05, "loss": 0.2083, "step": 7460 }, { "epoch": 1.5035260930888574, "grad_norm": 0.04213668778538704, "learning_rate": 8.377910098963702e-05, "loss": 0.1859, "step": 7462 }, { "epoch": 1.5039290751561556, "grad_norm": 0.04656725376844406, "learning_rate": 8.37692743524506e-05, "loss": 0.1791, "step": 7464 }, { "epoch": 1.5043320572234535, "grad_norm": 0.056995708495378494, "learning_rate": 8.375944531634896e-05, "loss": 0.21, "step": 7466 }, { "epoch": 1.5047350392907517, "grad_norm": 0.0539989173412323, "learning_rate": 8.37496138820303e-05, "loss": 0.1635, "step": 7468 }, { "epoch": 1.5051380213580496, "grad_norm": 0.039313316345214844, "learning_rate": 8.373978005019306e-05, "loss": 0.211, "step": 7470 }, { "epoch": 1.5055410034253476, "grad_norm": 0.05129539594054222, "learning_rate": 8.372994382153579e-05, "loss": 0.1601, "step": 7472 }, { "epoch": 1.5059439854926455, "grad_norm": 0.058646250516176224, "learning_rate": 8.372010519675726e-05, "loss": 0.173, "step": 7474 }, { "epoch": 1.5063469675599435, "grad_norm": 0.04545162245631218, "learning_rate": 8.371026417655639e-05, "loss": 0.1635, "step": 7476 }, { "epoch": 1.5067499496272416, "grad_norm": 0.048731524497270584, "learning_rate": 8.370042076163224e-05, "loss": 0.2135, "step": 7478 }, { "epoch": 1.5071529316945396, "grad_norm": 0.06594579666852951, "learning_rate": 8.369057495268413e-05, "loss": 0.1718, "step": 7480 }, { "epoch": 1.5075559137618377, "grad_norm": 0.05998741090297699, "learning_rate": 8.368072675041144e-05, "loss": 0.1815, "step": 7482 }, { "epoch": 1.5079588958291357, "grad_norm": 0.06421762704849243, "learning_rate": 8.367087615551377e-05, "loss": 0.1649, "step": 7484 }, { "epoch": 1.5083618778964336, "grad_norm": 0.05000369995832443, "learning_rate": 8.366102316869094e-05, "loss": 0.193, "step": 7486 }, { "epoch": 1.5087648599637316, "grad_norm": 0.0470392070710659, "learning_rate": 8.365116779064283e-05, "loss": 0.2088, "step": 7488 }, { "epoch": 1.5091678420310295, "grad_norm": 0.056860171258449554, "learning_rate": 8.364131002206959e-05, "loss": 0.2176, "step": 7490 }, { "epoch": 1.5095708240983277, "grad_norm": 0.045696549117565155, "learning_rate": 8.363144986367146e-05, "loss": 0.1933, "step": 7492 }, { "epoch": 1.5099738061656256, "grad_norm": 0.04788528010249138, "learning_rate": 8.362158731614895e-05, "loss": 0.1648, "step": 7494 }, { "epoch": 1.5103767882329238, "grad_norm": 0.05643216148018837, "learning_rate": 8.361172238020264e-05, "loss": 0.1823, "step": 7496 }, { "epoch": 1.5107797703002217, "grad_norm": 0.04428640007972717, "learning_rate": 8.360185505653332e-05, "loss": 0.2071, "step": 7498 }, { "epoch": 1.5111827523675196, "grad_norm": 0.0673820972442627, "learning_rate": 8.359198534584197e-05, "loss": 0.1674, "step": 7500 }, { "epoch": 1.5115857344348176, "grad_norm": 0.07432591915130615, "learning_rate": 8.358211324882968e-05, "loss": 0.2074, "step": 7502 }, { "epoch": 1.5119887165021155, "grad_norm": 0.05492899566888809, "learning_rate": 8.357223876619778e-05, "loss": 0.2054, "step": 7504 }, { "epoch": 1.5123916985694137, "grad_norm": 0.04310237616300583, "learning_rate": 8.356236189864772e-05, "loss": 0.1929, "step": 7506 }, { "epoch": 1.5127946806367116, "grad_norm": 0.051562558859586716, "learning_rate": 8.355248264688116e-05, "loss": 0.1941, "step": 7508 }, { "epoch": 1.5131976627040098, "grad_norm": 0.06979775428771973, "learning_rate": 8.35426010115999e-05, "loss": 0.2007, "step": 7510 }, { "epoch": 1.5136006447713077, "grad_norm": 0.049285538494586945, "learning_rate": 8.35327169935059e-05, "loss": 0.1474, "step": 7512 }, { "epoch": 1.5140036268386057, "grad_norm": 0.06848005950450897, "learning_rate": 8.352283059330131e-05, "loss": 0.2531, "step": 7514 }, { "epoch": 1.5144066089059036, "grad_norm": 0.05815625190734863, "learning_rate": 8.351294181168843e-05, "loss": 0.2505, "step": 7516 }, { "epoch": 1.5148095909732016, "grad_norm": 0.08098626136779785, "learning_rate": 8.350305064936978e-05, "loss": 0.2006, "step": 7518 }, { "epoch": 1.5152125730404997, "grad_norm": 0.04606293886899948, "learning_rate": 8.349315710704799e-05, "loss": 0.218, "step": 7520 }, { "epoch": 1.5156155551077977, "grad_norm": 0.04414965957403183, "learning_rate": 8.348326118542588e-05, "loss": 0.2248, "step": 7522 }, { "epoch": 1.5160185371750958, "grad_norm": 0.049683474004268646, "learning_rate": 8.347336288520644e-05, "loss": 0.2149, "step": 7524 }, { "epoch": 1.5164215192423938, "grad_norm": 0.05648723617196083, "learning_rate": 8.346346220709284e-05, "loss": 0.2169, "step": 7526 }, { "epoch": 1.5168245013096917, "grad_norm": 0.05552398040890694, "learning_rate": 8.34535591517884e-05, "loss": 0.1927, "step": 7528 }, { "epoch": 1.5172274833769896, "grad_norm": 0.03558554872870445, "learning_rate": 8.344365371999661e-05, "loss": 0.2295, "step": 7530 }, { "epoch": 1.5176304654442876, "grad_norm": 0.04220812767744064, "learning_rate": 8.343374591242117e-05, "loss": 0.1748, "step": 7532 }, { "epoch": 1.5180334475115858, "grad_norm": 0.046235982328653336, "learning_rate": 8.342383572976586e-05, "loss": 0.1707, "step": 7534 }, { "epoch": 1.5184364295788837, "grad_norm": 0.04958495497703552, "learning_rate": 8.341392317273473e-05, "loss": 0.195, "step": 7536 }, { "epoch": 1.5188394116461819, "grad_norm": 0.04524749517440796, "learning_rate": 8.340400824203194e-05, "loss": 0.1947, "step": 7538 }, { "epoch": 1.5192423937134798, "grad_norm": 0.050707731395959854, "learning_rate": 8.339409093836182e-05, "loss": 0.1692, "step": 7540 }, { "epoch": 1.5196453757807777, "grad_norm": 0.05605167895555496, "learning_rate": 8.338417126242888e-05, "loss": 0.185, "step": 7542 }, { "epoch": 1.5200483578480757, "grad_norm": 0.04622466117143631, "learning_rate": 8.337424921493781e-05, "loss": 0.1966, "step": 7544 }, { "epoch": 1.5204513399153736, "grad_norm": 0.04225487262010574, "learning_rate": 8.336432479659344e-05, "loss": 0.1821, "step": 7546 }, { "epoch": 1.5208543219826718, "grad_norm": 0.04641425609588623, "learning_rate": 8.33543980081008e-05, "loss": 0.2138, "step": 7548 }, { "epoch": 1.5212573040499697, "grad_norm": 0.10159898549318314, "learning_rate": 8.334446885016507e-05, "loss": 0.2255, "step": 7550 }, { "epoch": 1.521660286117268, "grad_norm": 0.05289870873093605, "learning_rate": 8.333453732349161e-05, "loss": 0.1797, "step": 7552 }, { "epoch": 1.5220632681845658, "grad_norm": 0.04265284165740013, "learning_rate": 8.33246034287859e-05, "loss": 0.1829, "step": 7554 }, { "epoch": 1.5224662502518638, "grad_norm": 0.05362769961357117, "learning_rate": 8.33146671667537e-05, "loss": 0.1789, "step": 7556 }, { "epoch": 1.5228692323191617, "grad_norm": 0.06503970921039581, "learning_rate": 8.330472853810078e-05, "loss": 0.2332, "step": 7558 }, { "epoch": 1.5232722143864597, "grad_norm": 0.04482598602771759, "learning_rate": 8.329478754353324e-05, "loss": 0.1456, "step": 7560 }, { "epoch": 1.5236751964537578, "grad_norm": 0.046206843107938766, "learning_rate": 8.328484418375721e-05, "loss": 0.1937, "step": 7562 }, { "epoch": 1.5240781785210558, "grad_norm": 0.05199309438467026, "learning_rate": 8.327489845947911e-05, "loss": 0.1884, "step": 7564 }, { "epoch": 1.524481160588354, "grad_norm": 0.036253806203603745, "learning_rate": 8.326495037140543e-05, "loss": 0.149, "step": 7566 }, { "epoch": 1.5248841426556519, "grad_norm": 0.04791543632745743, "learning_rate": 8.325499992024286e-05, "loss": 0.1886, "step": 7568 }, { "epoch": 1.5252871247229498, "grad_norm": 0.04419293627142906, "learning_rate": 8.32450471066983e-05, "loss": 0.1618, "step": 7570 }, { "epoch": 1.5256901067902477, "grad_norm": 0.0419553704559803, "learning_rate": 8.323509193147876e-05, "loss": 0.1759, "step": 7572 }, { "epoch": 1.5260930888575457, "grad_norm": 0.04536983743309975, "learning_rate": 8.322513439529142e-05, "loss": 0.1887, "step": 7574 }, { "epoch": 1.5264960709248439, "grad_norm": 0.06128111481666565, "learning_rate": 8.321517449884369e-05, "loss": 0.2233, "step": 7576 }, { "epoch": 1.526899052992142, "grad_norm": 0.04495418816804886, "learning_rate": 8.320521224284308e-05, "loss": 0.1392, "step": 7578 }, { "epoch": 1.52730203505944, "grad_norm": 0.05643144249916077, "learning_rate": 8.319524762799728e-05, "loss": 0.248, "step": 7580 }, { "epoch": 1.527705017126738, "grad_norm": 0.044090867042541504, "learning_rate": 8.318528065501419e-05, "loss": 0.179, "step": 7582 }, { "epoch": 1.5281079991940358, "grad_norm": 0.050828419625759125, "learning_rate": 8.317531132460183e-05, "loss": 0.2008, "step": 7584 }, { "epoch": 1.5285109812613338, "grad_norm": 0.05592074245214462, "learning_rate": 8.316533963746841e-05, "loss": 0.1993, "step": 7586 }, { "epoch": 1.5289139633286317, "grad_norm": 0.08189481496810913, "learning_rate": 8.315536559432231e-05, "loss": 0.2118, "step": 7588 }, { "epoch": 1.5293169453959299, "grad_norm": 0.06498509645462036, "learning_rate": 8.314538919587205e-05, "loss": 0.1956, "step": 7590 }, { "epoch": 1.529719927463228, "grad_norm": 0.048073191195726395, "learning_rate": 8.313541044282636e-05, "loss": 0.1888, "step": 7592 }, { "epoch": 1.530122909530526, "grad_norm": 0.04223538562655449, "learning_rate": 8.31254293358941e-05, "loss": 0.2121, "step": 7594 }, { "epoch": 1.530525891597824, "grad_norm": 0.04270782321691513, "learning_rate": 8.311544587578431e-05, "loss": 0.1862, "step": 7596 }, { "epoch": 1.5309288736651219, "grad_norm": 0.04466244578361511, "learning_rate": 8.310546006320623e-05, "loss": 0.2138, "step": 7598 }, { "epoch": 1.5313318557324198, "grad_norm": 0.045204807072877884, "learning_rate": 8.309547189886917e-05, "loss": 0.1847, "step": 7600 }, { "epoch": 1.5317348377997178, "grad_norm": 0.03572971746325493, "learning_rate": 8.308548138348274e-05, "loss": 0.1684, "step": 7602 }, { "epoch": 1.532137819867016, "grad_norm": 0.05988677218556404, "learning_rate": 8.307548851775663e-05, "loss": 0.1984, "step": 7604 }, { "epoch": 1.532540801934314, "grad_norm": 0.04439981281757355, "learning_rate": 8.30654933024007e-05, "loss": 0.1932, "step": 7606 }, { "epoch": 1.532943784001612, "grad_norm": 0.042620521038770676, "learning_rate": 8.305549573812501e-05, "loss": 0.1615, "step": 7608 }, { "epoch": 1.53334676606891, "grad_norm": 0.04275134205818176, "learning_rate": 8.304549582563977e-05, "loss": 0.1826, "step": 7610 }, { "epoch": 1.533749748136208, "grad_norm": 0.04868233948945999, "learning_rate": 8.303549356565535e-05, "loss": 0.2086, "step": 7612 }, { "epoch": 1.5341527302035058, "grad_norm": 0.04241720587015152, "learning_rate": 8.302548895888232e-05, "loss": 0.2049, "step": 7614 }, { "epoch": 1.5345557122708038, "grad_norm": 0.040512893348932266, "learning_rate": 8.301548200603134e-05, "loss": 0.2209, "step": 7616 }, { "epoch": 1.534958694338102, "grad_norm": 0.05638391524553299, "learning_rate": 8.300547270781333e-05, "loss": 0.214, "step": 7618 }, { "epoch": 1.5353616764054, "grad_norm": 0.034663956612348557, "learning_rate": 8.299546106493933e-05, "loss": 0.2244, "step": 7620 }, { "epoch": 1.535764658472698, "grad_norm": 0.0562388077378273, "learning_rate": 8.298544707812054e-05, "loss": 0.1999, "step": 7622 }, { "epoch": 1.536167640539996, "grad_norm": 0.03829832375049591, "learning_rate": 8.297543074806834e-05, "loss": 0.2152, "step": 7624 }, { "epoch": 1.536570622607294, "grad_norm": 0.05909931659698486, "learning_rate": 8.296541207549428e-05, "loss": 0.1943, "step": 7626 }, { "epoch": 1.5369736046745919, "grad_norm": 0.049946270883083344, "learning_rate": 8.295539106111007e-05, "loss": 0.1866, "step": 7628 }, { "epoch": 1.53737658674189, "grad_norm": 0.048141710460186005, "learning_rate": 8.294536770562757e-05, "loss": 0.1953, "step": 7630 }, { "epoch": 1.537779568809188, "grad_norm": 0.05740709975361824, "learning_rate": 8.293534200975886e-05, "loss": 0.1717, "step": 7632 }, { "epoch": 1.5381825508764861, "grad_norm": 0.049014464020729065, "learning_rate": 8.29253139742161e-05, "loss": 0.1926, "step": 7634 }, { "epoch": 1.538585532943784, "grad_norm": 0.05644163116812706, "learning_rate": 8.29152835997117e-05, "loss": 0.1373, "step": 7636 }, { "epoch": 1.538988515011082, "grad_norm": 0.06402353942394257, "learning_rate": 8.29052508869582e-05, "loss": 0.1903, "step": 7638 }, { "epoch": 1.53939149707838, "grad_norm": 0.04729839786887169, "learning_rate": 8.289521583666829e-05, "loss": 0.196, "step": 7640 }, { "epoch": 1.539794479145678, "grad_norm": 0.0404207780957222, "learning_rate": 8.288517844955487e-05, "loss": 0.1727, "step": 7642 }, { "epoch": 1.540197461212976, "grad_norm": 0.060075223445892334, "learning_rate": 8.287513872633094e-05, "loss": 0.2051, "step": 7644 }, { "epoch": 1.540600443280274, "grad_norm": 0.06109379976987839, "learning_rate": 8.286509666770977e-05, "loss": 0.2241, "step": 7646 }, { "epoch": 1.5410034253475722, "grad_norm": 0.04271155223250389, "learning_rate": 8.285505227440466e-05, "loss": 0.2364, "step": 7648 }, { "epoch": 1.5414064074148701, "grad_norm": 0.05317099019885063, "learning_rate": 8.28450055471292e-05, "loss": 0.209, "step": 7650 }, { "epoch": 1.541809389482168, "grad_norm": 0.04707867652177811, "learning_rate": 8.283495648659705e-05, "loss": 0.1701, "step": 7652 }, { "epoch": 1.542212371549466, "grad_norm": 0.04844583570957184, "learning_rate": 8.282490509352212e-05, "loss": 0.2099, "step": 7654 }, { "epoch": 1.542615353616764, "grad_norm": 0.05662320926785469, "learning_rate": 8.281485136861842e-05, "loss": 0.2517, "step": 7656 }, { "epoch": 1.543018335684062, "grad_norm": 0.05386023968458176, "learning_rate": 8.280479531260018e-05, "loss": 0.1859, "step": 7658 }, { "epoch": 1.54342131775136, "grad_norm": 0.03194073960185051, "learning_rate": 8.279473692618172e-05, "loss": 0.1292, "step": 7660 }, { "epoch": 1.5438242998186582, "grad_norm": 0.05544869229197502, "learning_rate": 8.27846762100776e-05, "loss": 0.2069, "step": 7662 }, { "epoch": 1.5442272818859561, "grad_norm": 0.05823779106140137, "learning_rate": 8.277461316500253e-05, "loss": 0.2354, "step": 7664 }, { "epoch": 1.544630263953254, "grad_norm": 0.05206362158060074, "learning_rate": 8.276454779167133e-05, "loss": 0.2107, "step": 7666 }, { "epoch": 1.545033246020552, "grad_norm": 0.04913756251335144, "learning_rate": 8.275448009079907e-05, "loss": 0.2, "step": 7668 }, { "epoch": 1.54543622808785, "grad_norm": 0.04635776951909065, "learning_rate": 8.274441006310091e-05, "loss": 0.2175, "step": 7670 }, { "epoch": 1.5458392101551481, "grad_norm": 0.058479227125644684, "learning_rate": 8.273433770929225e-05, "loss": 0.2039, "step": 7672 }, { "epoch": 1.546242192222446, "grad_norm": 0.042080748826265335, "learning_rate": 8.272426303008858e-05, "loss": 0.2154, "step": 7674 }, { "epoch": 1.5466451742897442, "grad_norm": 0.0478924922645092, "learning_rate": 8.27141860262056e-05, "loss": 0.1844, "step": 7676 }, { "epoch": 1.5470481563570422, "grad_norm": 0.048292964696884155, "learning_rate": 8.270410669835917e-05, "loss": 0.2174, "step": 7678 }, { "epoch": 1.5474511384243401, "grad_norm": 0.05052988976240158, "learning_rate": 8.269402504726529e-05, "loss": 0.1876, "step": 7680 }, { "epoch": 1.547854120491638, "grad_norm": 0.04319053515791893, "learning_rate": 8.268394107364017e-05, "loss": 0.1592, "step": 7682 }, { "epoch": 1.548257102558936, "grad_norm": 0.05351201072335243, "learning_rate": 8.267385477820014e-05, "loss": 0.2157, "step": 7684 }, { "epoch": 1.5486600846262342, "grad_norm": 0.04633256420493126, "learning_rate": 8.266376616166172e-05, "loss": 0.248, "step": 7686 }, { "epoch": 1.549063066693532, "grad_norm": 0.052991047501564026, "learning_rate": 8.26536752247416e-05, "loss": 0.2038, "step": 7688 }, { "epoch": 1.5494660487608303, "grad_norm": 0.07514969259500504, "learning_rate": 8.26435819681566e-05, "loss": 0.2306, "step": 7690 }, { "epoch": 1.5498690308281282, "grad_norm": 0.041682012379169464, "learning_rate": 8.263348639262373e-05, "loss": 0.2014, "step": 7692 }, { "epoch": 1.5502720128954262, "grad_norm": 0.04360057786107063, "learning_rate": 8.26233884988602e-05, "loss": 0.1914, "step": 7694 }, { "epoch": 1.550674994962724, "grad_norm": 0.029803669080138206, "learning_rate": 8.261328828758333e-05, "loss": 0.1599, "step": 7696 }, { "epoch": 1.551077977030022, "grad_norm": 0.046820033341646194, "learning_rate": 8.260318575951059e-05, "loss": 0.1899, "step": 7698 }, { "epoch": 1.5514809590973202, "grad_norm": 0.06097986549139023, "learning_rate": 8.259308091535969e-05, "loss": 0.2214, "step": 7700 }, { "epoch": 1.5518839411646181, "grad_norm": 0.0608573816716671, "learning_rate": 8.258297375584845e-05, "loss": 0.1848, "step": 7702 }, { "epoch": 1.5522869232319163, "grad_norm": 0.050830453634262085, "learning_rate": 8.257286428169486e-05, "loss": 0.1871, "step": 7704 }, { "epoch": 1.5526899052992142, "grad_norm": 0.056992385536432266, "learning_rate": 8.256275249361707e-05, "loss": 0.1706, "step": 7706 }, { "epoch": 1.5530928873665122, "grad_norm": 0.0413503423333168, "learning_rate": 8.255263839233345e-05, "loss": 0.1977, "step": 7708 }, { "epoch": 1.5534958694338101, "grad_norm": 0.04261191189289093, "learning_rate": 8.254252197856242e-05, "loss": 0.2117, "step": 7710 }, { "epoch": 1.553898851501108, "grad_norm": 0.03962111100554466, "learning_rate": 8.253240325302272e-05, "loss": 0.1648, "step": 7712 }, { "epoch": 1.5543018335684062, "grad_norm": 0.04593179374933243, "learning_rate": 8.252228221643308e-05, "loss": 0.1734, "step": 7714 }, { "epoch": 1.5547048156357042, "grad_norm": 0.056593868881464005, "learning_rate": 8.251215886951253e-05, "loss": 0.2084, "step": 7716 }, { "epoch": 1.5551077977030023, "grad_norm": 0.06353504210710526, "learning_rate": 8.250203321298022e-05, "loss": 0.2008, "step": 7718 }, { "epoch": 1.5555107797703003, "grad_norm": 0.044093504548072815, "learning_rate": 8.249190524755546e-05, "loss": 0.1992, "step": 7720 }, { "epoch": 1.5559137618375982, "grad_norm": 0.058922551572322845, "learning_rate": 8.24817749739577e-05, "loss": 0.1842, "step": 7722 }, { "epoch": 1.5563167439048962, "grad_norm": 0.03649704158306122, "learning_rate": 8.247164239290659e-05, "loss": 0.1879, "step": 7724 }, { "epoch": 1.556719725972194, "grad_norm": 0.047487691044807434, "learning_rate": 8.246150750512193e-05, "loss": 0.135, "step": 7726 }, { "epoch": 1.5571227080394923, "grad_norm": 0.04881270229816437, "learning_rate": 8.24513703113237e-05, "loss": 0.2149, "step": 7728 }, { "epoch": 1.5575256901067902, "grad_norm": 0.0515984371304512, "learning_rate": 8.244123081223203e-05, "loss": 0.2158, "step": 7730 }, { "epoch": 1.5579286721740884, "grad_norm": 0.044214099645614624, "learning_rate": 8.24310890085672e-05, "loss": 0.195, "step": 7732 }, { "epoch": 1.5583316542413863, "grad_norm": 0.0432279109954834, "learning_rate": 8.242094490104967e-05, "loss": 0.1775, "step": 7734 }, { "epoch": 1.5587346363086843, "grad_norm": 0.05913606658577919, "learning_rate": 8.241079849040007e-05, "loss": 0.2104, "step": 7736 }, { "epoch": 1.5591376183759822, "grad_norm": 0.05148633196949959, "learning_rate": 8.240064977733916e-05, "loss": 0.1648, "step": 7738 }, { "epoch": 1.5595406004432801, "grad_norm": 0.054173316806554794, "learning_rate": 8.239049876258793e-05, "loss": 0.2285, "step": 7740 }, { "epoch": 1.5599435825105783, "grad_norm": 0.0662137120962143, "learning_rate": 8.238034544686746e-05, "loss": 0.2163, "step": 7742 }, { "epoch": 1.5603465645778762, "grad_norm": 0.061622608453035355, "learning_rate": 8.237018983089902e-05, "loss": 0.2353, "step": 7744 }, { "epoch": 1.5607495466451744, "grad_norm": 0.07295259088277817, "learning_rate": 8.236003191540408e-05, "loss": 0.2081, "step": 7746 }, { "epoch": 1.5611525287124723, "grad_norm": 0.04859080910682678, "learning_rate": 8.234987170110422e-05, "loss": 0.1894, "step": 7748 }, { "epoch": 1.5615555107797703, "grad_norm": 0.053113147616386414, "learning_rate": 8.233970918872122e-05, "loss": 0.2037, "step": 7750 }, { "epoch": 1.5619584928470682, "grad_norm": 0.060238417237997055, "learning_rate": 8.232954437897697e-05, "loss": 0.1833, "step": 7752 }, { "epoch": 1.5623614749143662, "grad_norm": 0.04282419756054878, "learning_rate": 8.231937727259363e-05, "loss": 0.2228, "step": 7754 }, { "epoch": 1.5627644569816643, "grad_norm": 0.04955311492085457, "learning_rate": 8.23092078702934e-05, "loss": 0.1778, "step": 7756 }, { "epoch": 1.5631674390489623, "grad_norm": 0.05403996258974075, "learning_rate": 8.229903617279869e-05, "loss": 0.2412, "step": 7758 }, { "epoch": 1.5635704211162604, "grad_norm": 0.05369593948125839, "learning_rate": 8.228886218083214e-05, "loss": 0.2124, "step": 7760 }, { "epoch": 1.5639734031835584, "grad_norm": 0.04346174746751785, "learning_rate": 8.227868589511643e-05, "loss": 0.1902, "step": 7762 }, { "epoch": 1.5643763852508563, "grad_norm": 0.05239605903625488, "learning_rate": 8.226850731637452e-05, "loss": 0.2224, "step": 7764 }, { "epoch": 1.5647793673181543, "grad_norm": 0.0549917034804821, "learning_rate": 8.225832644532945e-05, "loss": 0.1699, "step": 7766 }, { "epoch": 1.5651823493854522, "grad_norm": 0.05196690559387207, "learning_rate": 8.224814328270444e-05, "loss": 0.2061, "step": 7768 }, { "epoch": 1.5655853314527504, "grad_norm": 0.03883376717567444, "learning_rate": 8.223795782922292e-05, "loss": 0.1644, "step": 7770 }, { "epoch": 1.5659883135200483, "grad_norm": 0.04878639429807663, "learning_rate": 8.222777008560845e-05, "loss": 0.2061, "step": 7772 }, { "epoch": 1.5663912955873465, "grad_norm": 0.04065984860062599, "learning_rate": 8.22175800525847e-05, "loss": 0.1986, "step": 7774 }, { "epoch": 1.5667942776546444, "grad_norm": 0.05418518930673599, "learning_rate": 8.220738773087561e-05, "loss": 0.1841, "step": 7776 }, { "epoch": 1.5671972597219423, "grad_norm": 0.10020854324102402, "learning_rate": 8.21971931212052e-05, "loss": 0.2231, "step": 7778 }, { "epoch": 1.5676002417892403, "grad_norm": 0.057610880583524704, "learning_rate": 8.218699622429768e-05, "loss": 0.2071, "step": 7780 }, { "epoch": 1.5680032238565382, "grad_norm": 0.04992236942052841, "learning_rate": 8.217679704087742e-05, "loss": 0.2034, "step": 7782 }, { "epoch": 1.5684062059238364, "grad_norm": 0.04503064975142479, "learning_rate": 8.216659557166895e-05, "loss": 0.1688, "step": 7784 }, { "epoch": 1.5688091879911346, "grad_norm": 0.06223779916763306, "learning_rate": 8.2156391817397e-05, "loss": 0.2852, "step": 7786 }, { "epoch": 1.5692121700584325, "grad_norm": 0.042011719197034836, "learning_rate": 8.21461857787864e-05, "loss": 0.1581, "step": 7788 }, { "epoch": 1.5696151521257304, "grad_norm": 0.044512271881103516, "learning_rate": 8.213597745656214e-05, "loss": 0.1814, "step": 7790 }, { "epoch": 1.5700181341930284, "grad_norm": 0.036685794591903687, "learning_rate": 8.212576685144946e-05, "loss": 0.1937, "step": 7792 }, { "epoch": 1.5704211162603263, "grad_norm": 0.04587550461292267, "learning_rate": 8.211555396417367e-05, "loss": 0.1991, "step": 7794 }, { "epoch": 1.5708240983276243, "grad_norm": 0.04192047566175461, "learning_rate": 8.21053387954603e-05, "loss": 0.1697, "step": 7796 }, { "epoch": 1.5712270803949224, "grad_norm": 0.04757276922464371, "learning_rate": 8.209512134603499e-05, "loss": 0.2251, "step": 7798 }, { "epoch": 1.5716300624622206, "grad_norm": 0.03852641209959984, "learning_rate": 8.20849016166236e-05, "loss": 0.1986, "step": 7800 }, { "epoch": 1.5720330445295185, "grad_norm": 0.04816350340843201, "learning_rate": 8.20746796079521e-05, "loss": 0.2065, "step": 7802 }, { "epoch": 1.5724360265968165, "grad_norm": 0.0453607551753521, "learning_rate": 8.206445532074667e-05, "loss": 0.2135, "step": 7804 }, { "epoch": 1.5728390086641144, "grad_norm": 0.05096454173326492, "learning_rate": 8.20542287557336e-05, "loss": 0.1533, "step": 7806 }, { "epoch": 1.5732419907314124, "grad_norm": 0.04808863624930382, "learning_rate": 8.20439999136394e-05, "loss": 0.1495, "step": 7808 }, { "epoch": 1.5736449727987103, "grad_norm": 0.04415539652109146, "learning_rate": 8.20337687951907e-05, "loss": 0.1631, "step": 7810 }, { "epoch": 1.5740479548660085, "grad_norm": 0.040588121861219406, "learning_rate": 8.202353540111426e-05, "loss": 0.1696, "step": 7812 }, { "epoch": 1.5744509369333066, "grad_norm": 0.05861745402216911, "learning_rate": 8.201329973213709e-05, "loss": 0.2119, "step": 7814 }, { "epoch": 1.5748539190006046, "grad_norm": 0.09671831130981445, "learning_rate": 8.200306178898633e-05, "loss": 0.2133, "step": 7816 }, { "epoch": 1.5752569010679025, "grad_norm": 0.06667297333478928, "learning_rate": 8.19928215723892e-05, "loss": 0.1915, "step": 7818 }, { "epoch": 1.5756598831352004, "grad_norm": 0.05069169029593468, "learning_rate": 8.198257908307323e-05, "loss": 0.1999, "step": 7820 }, { "epoch": 1.5760628652024984, "grad_norm": 0.04458223283290863, "learning_rate": 8.197233432176597e-05, "loss": 0.2094, "step": 7822 }, { "epoch": 1.5764658472697963, "grad_norm": 0.04166608303785324, "learning_rate": 8.196208728919523e-05, "loss": 0.2149, "step": 7824 }, { "epoch": 1.5768688293370945, "grad_norm": 0.04502653703093529, "learning_rate": 8.195183798608891e-05, "loss": 0.1799, "step": 7826 }, { "epoch": 1.5772718114043927, "grad_norm": 0.04423825815320015, "learning_rate": 8.194158641317512e-05, "loss": 0.2456, "step": 7828 }, { "epoch": 1.5776747934716906, "grad_norm": 0.05600183457136154, "learning_rate": 8.193133257118211e-05, "loss": 0.2021, "step": 7830 }, { "epoch": 1.5780777755389885, "grad_norm": 0.049919243901968, "learning_rate": 8.19210764608383e-05, "loss": 0.2717, "step": 7832 }, { "epoch": 1.5784807576062865, "grad_norm": 0.04759713634848595, "learning_rate": 8.191081808287229e-05, "loss": 0.2121, "step": 7834 }, { "epoch": 1.5788837396735844, "grad_norm": 0.03916984051465988, "learning_rate": 8.190055743801278e-05, "loss": 0.1255, "step": 7836 }, { "epoch": 1.5792867217408826, "grad_norm": 0.06487026810646057, "learning_rate": 8.189029452698868e-05, "loss": 0.2221, "step": 7838 }, { "epoch": 1.5796897038081805, "grad_norm": 0.04453813284635544, "learning_rate": 8.188002935052907e-05, "loss": 0.1533, "step": 7840 }, { "epoch": 1.5800926858754787, "grad_norm": 0.04084227234125137, "learning_rate": 8.186976190936317e-05, "loss": 0.1486, "step": 7842 }, { "epoch": 1.5804956679427766, "grad_norm": 0.05148351192474365, "learning_rate": 8.185949220422034e-05, "loss": 0.2247, "step": 7844 }, { "epoch": 1.5808986500100746, "grad_norm": 0.03696500509977341, "learning_rate": 8.184922023583012e-05, "loss": 0.1468, "step": 7846 }, { "epoch": 1.5813016320773725, "grad_norm": 0.05501485615968704, "learning_rate": 8.183894600492225e-05, "loss": 0.1728, "step": 7848 }, { "epoch": 1.5817046141446705, "grad_norm": 0.05600307881832123, "learning_rate": 8.182866951222656e-05, "loss": 0.2311, "step": 7850 }, { "epoch": 1.5821075962119686, "grad_norm": 0.07867880910634995, "learning_rate": 8.181839075847311e-05, "loss": 0.1805, "step": 7852 }, { "epoch": 1.5825105782792666, "grad_norm": 0.06224419176578522, "learning_rate": 8.180810974439205e-05, "loss": 0.1766, "step": 7854 }, { "epoch": 1.5829135603465647, "grad_norm": 0.052312593907117844, "learning_rate": 8.179782647071374e-05, "loss": 0.2158, "step": 7856 }, { "epoch": 1.5833165424138627, "grad_norm": 0.05663037672638893, "learning_rate": 8.178754093816871e-05, "loss": 0.2584, "step": 7858 }, { "epoch": 1.5837195244811606, "grad_norm": 0.0479075126349926, "learning_rate": 8.17772531474876e-05, "loss": 0.195, "step": 7860 }, { "epoch": 1.5841225065484585, "grad_norm": 0.043161310255527496, "learning_rate": 8.176696309940124e-05, "loss": 0.1939, "step": 7862 }, { "epoch": 1.5845254886157565, "grad_norm": 0.050313014537096024, "learning_rate": 8.175667079464063e-05, "loss": 0.1988, "step": 7864 }, { "epoch": 1.5849284706830546, "grad_norm": 0.0593692846596241, "learning_rate": 8.174637623393692e-05, "loss": 0.2183, "step": 7866 }, { "epoch": 1.5853314527503526, "grad_norm": 0.0714186280965805, "learning_rate": 8.17360794180214e-05, "loss": 0.1856, "step": 7868 }, { "epoch": 1.5857344348176508, "grad_norm": 0.05151008814573288, "learning_rate": 8.172578034762557e-05, "loss": 0.2213, "step": 7870 }, { "epoch": 1.5861374168849487, "grad_norm": 0.0600179061293602, "learning_rate": 8.171547902348102e-05, "loss": 0.2162, "step": 7872 }, { "epoch": 1.5865403989522466, "grad_norm": 0.03982606157660484, "learning_rate": 8.170517544631957e-05, "loss": 0.1737, "step": 7874 }, { "epoch": 1.5869433810195446, "grad_norm": 0.04170709848403931, "learning_rate": 8.169486961687318e-05, "loss": 0.1791, "step": 7876 }, { "epoch": 1.5873463630868425, "grad_norm": 0.0685291662812233, "learning_rate": 8.168456153587391e-05, "loss": 0.1808, "step": 7878 }, { "epoch": 1.5877493451541407, "grad_norm": 0.03968818858265877, "learning_rate": 8.167425120405408e-05, "loss": 0.1677, "step": 7880 }, { "epoch": 1.5881523272214386, "grad_norm": 0.039801955223083496, "learning_rate": 8.166393862214609e-05, "loss": 0.2136, "step": 7882 }, { "epoch": 1.5885553092887368, "grad_norm": 0.0453149750828743, "learning_rate": 8.165362379088255e-05, "loss": 0.2101, "step": 7884 }, { "epoch": 1.5889582913560347, "grad_norm": 0.04556663706898689, "learning_rate": 8.16433067109962e-05, "loss": 0.1948, "step": 7886 }, { "epoch": 1.5893612734233327, "grad_norm": 0.05905143544077873, "learning_rate": 8.163298738321994e-05, "loss": 0.1713, "step": 7888 }, { "epoch": 1.5897642554906306, "grad_norm": 0.053225282579660416, "learning_rate": 8.162266580828684e-05, "loss": 0.1932, "step": 7890 }, { "epoch": 1.5901672375579285, "grad_norm": 0.05515950173139572, "learning_rate": 8.161234198693014e-05, "loss": 0.1598, "step": 7892 }, { "epoch": 1.5905702196252267, "grad_norm": 0.0776933878660202, "learning_rate": 8.160201591988322e-05, "loss": 0.1912, "step": 7894 }, { "epoch": 1.5909732016925247, "grad_norm": 0.06278533488512039, "learning_rate": 8.159168760787964e-05, "loss": 0.2172, "step": 7896 }, { "epoch": 1.5913761837598228, "grad_norm": 0.05375386029481888, "learning_rate": 8.158135705165309e-05, "loss": 0.1618, "step": 7898 }, { "epoch": 1.5917791658271208, "grad_norm": 0.04502701014280319, "learning_rate": 8.157102425193744e-05, "loss": 0.1778, "step": 7900 }, { "epoch": 1.5921821478944187, "grad_norm": 0.07836294174194336, "learning_rate": 8.156068920946672e-05, "loss": 0.2136, "step": 7902 }, { "epoch": 1.5925851299617166, "grad_norm": 0.0926329493522644, "learning_rate": 8.155035192497509e-05, "loss": 0.2013, "step": 7904 }, { "epoch": 1.5929881120290146, "grad_norm": 0.052508678287267685, "learning_rate": 8.154001239919694e-05, "loss": 0.2187, "step": 7906 }, { "epoch": 1.5933910940963127, "grad_norm": 0.060862090438604355, "learning_rate": 8.152967063286674e-05, "loss": 0.1776, "step": 7908 }, { "epoch": 1.5937940761636107, "grad_norm": 0.09297354519367218, "learning_rate": 8.151932662671918e-05, "loss": 0.171, "step": 7910 }, { "epoch": 1.5941970582309088, "grad_norm": 0.05653372034430504, "learning_rate": 8.150898038148904e-05, "loss": 0.2042, "step": 7912 }, { "epoch": 1.5946000402982068, "grad_norm": 0.07802308350801468, "learning_rate": 8.149863189791134e-05, "loss": 0.2464, "step": 7914 }, { "epoch": 1.5950030223655047, "grad_norm": 0.046020377427339554, "learning_rate": 8.14882811767212e-05, "loss": 0.2178, "step": 7916 }, { "epoch": 1.5954060044328027, "grad_norm": 0.05628088861703873, "learning_rate": 8.147792821865392e-05, "loss": 0.2068, "step": 7918 }, { "epoch": 1.5958089865001006, "grad_norm": 0.05584513023495674, "learning_rate": 8.146757302444496e-05, "loss": 0.1852, "step": 7920 }, { "epoch": 1.5962119685673988, "grad_norm": 0.05673876032233238, "learning_rate": 8.145721559482996e-05, "loss": 0.2101, "step": 7922 }, { "epoch": 1.5966149506346967, "grad_norm": 0.05031610652804375, "learning_rate": 8.144685593054465e-05, "loss": 0.2067, "step": 7924 }, { "epoch": 1.5970179327019949, "grad_norm": 0.05931559205055237, "learning_rate": 8.143649403232499e-05, "loss": 0.1883, "step": 7926 }, { "epoch": 1.5974209147692928, "grad_norm": 0.056502941995859146, "learning_rate": 8.142612990090708e-05, "loss": 0.2533, "step": 7928 }, { "epoch": 1.5978238968365908, "grad_norm": 0.06808840483427048, "learning_rate": 8.141576353702715e-05, "loss": 0.2198, "step": 7930 }, { "epoch": 1.5982268789038887, "grad_norm": 0.056388791650533676, "learning_rate": 8.14053949414216e-05, "loss": 0.244, "step": 7932 }, { "epoch": 1.5986298609711866, "grad_norm": 0.04594703018665314, "learning_rate": 8.139502411482705e-05, "loss": 0.2231, "step": 7934 }, { "epoch": 1.5990328430384848, "grad_norm": 0.07403270155191422, "learning_rate": 8.138465105798018e-05, "loss": 0.224, "step": 7936 }, { "epoch": 1.5994358251057827, "grad_norm": 0.06828963756561279, "learning_rate": 8.137427577161791e-05, "loss": 0.2217, "step": 7938 }, { "epoch": 1.599838807173081, "grad_norm": 0.04627368599176407, "learning_rate": 8.136389825647726e-05, "loss": 0.2574, "step": 7940 }, { "epoch": 1.6002417892403789, "grad_norm": 0.043321721255779266, "learning_rate": 8.135351851329543e-05, "loss": 0.1637, "step": 7942 }, { "epoch": 1.6006447713076768, "grad_norm": 0.06069161742925644, "learning_rate": 8.134313654280978e-05, "loss": 0.1878, "step": 7944 }, { "epoch": 1.6010477533749747, "grad_norm": 0.05267888680100441, "learning_rate": 8.133275234575784e-05, "loss": 0.2145, "step": 7946 }, { "epoch": 1.6014507354422727, "grad_norm": 0.03687436878681183, "learning_rate": 8.132236592287729e-05, "loss": 0.1785, "step": 7948 }, { "epoch": 1.6018537175095708, "grad_norm": 0.04854239895939827, "learning_rate": 8.131197727490596e-05, "loss": 0.2081, "step": 7950 }, { "epoch": 1.6022566995768688, "grad_norm": 0.05398216098546982, "learning_rate": 8.130158640258182e-05, "loss": 0.1895, "step": 7952 }, { "epoch": 1.602659681644167, "grad_norm": 0.05339875444769859, "learning_rate": 8.129119330664305e-05, "loss": 0.1989, "step": 7954 }, { "epoch": 1.6030626637114649, "grad_norm": 0.0771302655339241, "learning_rate": 8.128079798782798e-05, "loss": 0.2012, "step": 7956 }, { "epoch": 1.6034656457787628, "grad_norm": 0.04624316468834877, "learning_rate": 8.1270400446875e-05, "loss": 0.209, "step": 7958 }, { "epoch": 1.6038686278460608, "grad_norm": 0.04548148810863495, "learning_rate": 8.126000068452281e-05, "loss": 0.1821, "step": 7960 }, { "epoch": 1.6042716099133587, "grad_norm": 0.0673830509185791, "learning_rate": 8.124959870151017e-05, "loss": 0.252, "step": 7962 }, { "epoch": 1.6046745919806569, "grad_norm": 0.04191439598798752, "learning_rate": 8.1239194498576e-05, "loss": 0.1672, "step": 7964 }, { "epoch": 1.6050775740479548, "grad_norm": 0.044628314673900604, "learning_rate": 8.122878807645941e-05, "loss": 0.1743, "step": 7966 }, { "epoch": 1.605480556115253, "grad_norm": 0.06855150312185287, "learning_rate": 8.121837943589967e-05, "loss": 0.1659, "step": 7968 }, { "epoch": 1.605883538182551, "grad_norm": 0.04463125020265579, "learning_rate": 8.120796857763617e-05, "loss": 0.1596, "step": 7970 }, { "epoch": 1.6062865202498489, "grad_norm": 0.05310614034533501, "learning_rate": 8.119755550240849e-05, "loss": 0.1873, "step": 7972 }, { "epoch": 1.6066895023171468, "grad_norm": 0.07126377522945404, "learning_rate": 8.118714021095636e-05, "loss": 0.1796, "step": 7974 }, { "epoch": 1.6070924843844447, "grad_norm": 0.05647788941860199, "learning_rate": 8.117672270401969e-05, "loss": 0.1719, "step": 7976 }, { "epoch": 1.607495466451743, "grad_norm": 0.05399641394615173, "learning_rate": 8.116630298233847e-05, "loss": 0.1828, "step": 7978 }, { "epoch": 1.607898448519041, "grad_norm": 0.04089745134115219, "learning_rate": 8.115588104665294e-05, "loss": 0.1624, "step": 7980 }, { "epoch": 1.608301430586339, "grad_norm": 0.09755789488554001, "learning_rate": 8.114545689770345e-05, "loss": 0.2119, "step": 7982 }, { "epoch": 1.608704412653637, "grad_norm": 0.045670583844184875, "learning_rate": 8.113503053623051e-05, "loss": 0.1697, "step": 7984 }, { "epoch": 1.609107394720935, "grad_norm": 0.0721781849861145, "learning_rate": 8.11246019629748e-05, "loss": 0.2267, "step": 7986 }, { "epoch": 1.6095103767882328, "grad_norm": 0.04462890326976776, "learning_rate": 8.111417117867715e-05, "loss": 0.1754, "step": 7988 }, { "epoch": 1.6099133588555308, "grad_norm": 0.06390615552663803, "learning_rate": 8.110373818407852e-05, "loss": 0.2028, "step": 7990 }, { "epoch": 1.610316340922829, "grad_norm": 0.07268303632736206, "learning_rate": 8.109330297992009e-05, "loss": 0.2216, "step": 7992 }, { "epoch": 1.610719322990127, "grad_norm": 0.051655374467372894, "learning_rate": 8.10828655669431e-05, "loss": 0.1643, "step": 7994 }, { "epoch": 1.611122305057425, "grad_norm": 0.05964389815926552, "learning_rate": 8.10724259458891e-05, "loss": 0.1843, "step": 7996 }, { "epoch": 1.611525287124723, "grad_norm": 0.04912543296813965, "learning_rate": 8.106198411749964e-05, "loss": 0.2204, "step": 7998 }, { "epoch": 1.611928269192021, "grad_norm": 0.056842729449272156, "learning_rate": 8.10515400825165e-05, "loss": 0.2453, "step": 8000 }, { "epoch": 1.6123312512593189, "grad_norm": 0.05034971982240677, "learning_rate": 8.104109384168162e-05, "loss": 0.213, "step": 8002 }, { "epoch": 1.6127342333266168, "grad_norm": 0.042848989367485046, "learning_rate": 8.103064539573706e-05, "loss": 0.1757, "step": 8004 }, { "epoch": 1.613137215393915, "grad_norm": 0.04389515891671181, "learning_rate": 8.102019474542509e-05, "loss": 0.1823, "step": 8006 }, { "epoch": 1.6135401974612131, "grad_norm": 0.05041665956377983, "learning_rate": 8.100974189148809e-05, "loss": 0.2086, "step": 8008 }, { "epoch": 1.613943179528511, "grad_norm": 0.06093117967247963, "learning_rate": 8.099928683466861e-05, "loss": 0.2188, "step": 8010 }, { "epoch": 1.614346161595809, "grad_norm": 0.05891212821006775, "learning_rate": 8.098882957570937e-05, "loss": 0.1658, "step": 8012 }, { "epoch": 1.614749143663107, "grad_norm": 0.03832251578569412, "learning_rate": 8.097837011535325e-05, "loss": 0.1527, "step": 8014 }, { "epoch": 1.615152125730405, "grad_norm": 0.03451972082257271, "learning_rate": 8.096790845434326e-05, "loss": 0.1676, "step": 8016 }, { "epoch": 1.6155551077977028, "grad_norm": 0.04633672162890434, "learning_rate": 8.095744459342257e-05, "loss": 0.2141, "step": 8018 }, { "epoch": 1.615958089865001, "grad_norm": 0.037216588854789734, "learning_rate": 8.094697853333453e-05, "loss": 0.1653, "step": 8020 }, { "epoch": 1.6163610719322992, "grad_norm": 0.05595245212316513, "learning_rate": 8.093651027482263e-05, "loss": 0.2069, "step": 8022 }, { "epoch": 1.616764053999597, "grad_norm": 0.04852914810180664, "learning_rate": 8.092603981863051e-05, "loss": 0.1542, "step": 8024 }, { "epoch": 1.617167036066895, "grad_norm": 0.048641812056303024, "learning_rate": 8.091556716550198e-05, "loss": 0.2141, "step": 8026 }, { "epoch": 1.617570018134193, "grad_norm": 0.05191672965884209, "learning_rate": 8.0905092316181e-05, "loss": 0.2048, "step": 8028 }, { "epoch": 1.617973000201491, "grad_norm": 0.06447257846593857, "learning_rate": 8.089461527141169e-05, "loss": 0.1651, "step": 8030 }, { "epoch": 1.618375982268789, "grad_norm": 0.05003985017538071, "learning_rate": 8.088413603193831e-05, "loss": 0.1688, "step": 8032 }, { "epoch": 1.618778964336087, "grad_norm": 0.04257288575172424, "learning_rate": 8.087365459850531e-05, "loss": 0.194, "step": 8034 }, { "epoch": 1.6191819464033852, "grad_norm": 0.04626917093992233, "learning_rate": 8.086317097185727e-05, "loss": 0.229, "step": 8036 }, { "epoch": 1.6195849284706831, "grad_norm": 0.04448797181248665, "learning_rate": 8.085268515273891e-05, "loss": 0.1372, "step": 8038 }, { "epoch": 1.619987910537981, "grad_norm": 0.06380314379930496, "learning_rate": 8.084219714189514e-05, "loss": 0.1552, "step": 8040 }, { "epoch": 1.620390892605279, "grad_norm": 0.08029188960790634, "learning_rate": 8.083170694007102e-05, "loss": 0.2016, "step": 8042 }, { "epoch": 1.620793874672577, "grad_norm": 0.05923297628760338, "learning_rate": 8.082121454801174e-05, "loss": 0.2585, "step": 8044 }, { "epoch": 1.6211968567398751, "grad_norm": 0.06981176137924194, "learning_rate": 8.081071996646266e-05, "loss": 0.1868, "step": 8046 }, { "epoch": 1.621599838807173, "grad_norm": 0.0497167594730854, "learning_rate": 8.080022319616931e-05, "loss": 0.1648, "step": 8048 }, { "epoch": 1.6220028208744712, "grad_norm": 0.047734495252370834, "learning_rate": 8.078972423787738e-05, "loss": 0.1367, "step": 8050 }, { "epoch": 1.6224058029417692, "grad_norm": 0.05451458692550659, "learning_rate": 8.077922309233267e-05, "loss": 0.2072, "step": 8052 }, { "epoch": 1.6228087850090671, "grad_norm": 0.04716449975967407, "learning_rate": 8.076871976028117e-05, "loss": 0.2184, "step": 8054 }, { "epoch": 1.623211767076365, "grad_norm": 0.057981863617897034, "learning_rate": 8.075821424246904e-05, "loss": 0.1972, "step": 8056 }, { "epoch": 1.623614749143663, "grad_norm": 0.056067463010549545, "learning_rate": 8.074770653964254e-05, "loss": 0.2298, "step": 8058 }, { "epoch": 1.6240177312109612, "grad_norm": 0.05075881630182266, "learning_rate": 8.073719665254815e-05, "loss": 0.1309, "step": 8060 }, { "epoch": 1.624420713278259, "grad_norm": 0.057302046567201614, "learning_rate": 8.072668458193247e-05, "loss": 0.2061, "step": 8062 }, { "epoch": 1.6248236953455573, "grad_norm": 0.059380508959293365, "learning_rate": 8.071617032854226e-05, "loss": 0.1945, "step": 8064 }, { "epoch": 1.6252266774128552, "grad_norm": 0.058844953775405884, "learning_rate": 8.070565389312443e-05, "loss": 0.2018, "step": 8066 }, { "epoch": 1.6256296594801531, "grad_norm": 0.051923152059316635, "learning_rate": 8.069513527642605e-05, "loss": 0.1757, "step": 8068 }, { "epoch": 1.626032641547451, "grad_norm": 0.05240127071738243, "learning_rate": 8.068461447919435e-05, "loss": 0.2156, "step": 8070 }, { "epoch": 1.626435623614749, "grad_norm": 0.04726002365350723, "learning_rate": 8.06740915021767e-05, "loss": 0.2025, "step": 8072 }, { "epoch": 1.6268386056820472, "grad_norm": 0.048952165991067886, "learning_rate": 8.066356634612067e-05, "loss": 0.219, "step": 8074 }, { "epoch": 1.6272415877493451, "grad_norm": 0.04405834153294563, "learning_rate": 8.065303901177392e-05, "loss": 0.1825, "step": 8076 }, { "epoch": 1.6276445698166433, "grad_norm": 0.0471949465572834, "learning_rate": 8.064250949988429e-05, "loss": 0.1616, "step": 8078 }, { "epoch": 1.6280475518839412, "grad_norm": 0.05710560828447342, "learning_rate": 8.06319778111998e-05, "loss": 0.1621, "step": 8080 }, { "epoch": 1.6284505339512392, "grad_norm": 0.05007312446832657, "learning_rate": 8.062144394646858e-05, "loss": 0.2078, "step": 8082 }, { "epoch": 1.6288535160185371, "grad_norm": 0.043250661343336105, "learning_rate": 8.061090790643897e-05, "loss": 0.2212, "step": 8084 }, { "epoch": 1.629256498085835, "grad_norm": 0.10560256987810135, "learning_rate": 8.060036969185941e-05, "loss": 0.156, "step": 8086 }, { "epoch": 1.6296594801531332, "grad_norm": 0.048674967139959335, "learning_rate": 8.058982930347852e-05, "loss": 0.1581, "step": 8088 }, { "epoch": 1.6300624622204312, "grad_norm": 0.05943009629845619, "learning_rate": 8.05792867420451e-05, "loss": 0.2013, "step": 8090 }, { "epoch": 1.6304654442877293, "grad_norm": 0.043210141360759735, "learning_rate": 8.056874200830803e-05, "loss": 0.1863, "step": 8092 }, { "epoch": 1.6308684263550273, "grad_norm": 0.05146521329879761, "learning_rate": 8.055819510301642e-05, "loss": 0.2236, "step": 8094 }, { "epoch": 1.6312714084223252, "grad_norm": 0.0610971599817276, "learning_rate": 8.054764602691951e-05, "loss": 0.2232, "step": 8096 }, { "epoch": 1.6316743904896231, "grad_norm": 0.04712080955505371, "learning_rate": 8.053709478076668e-05, "loss": 0.2287, "step": 8098 }, { "epoch": 1.632077372556921, "grad_norm": 0.03435961529612541, "learning_rate": 8.052654136530746e-05, "loss": 0.1468, "step": 8100 }, { "epoch": 1.6324803546242193, "grad_norm": 0.07136266678571701, "learning_rate": 8.051598578129157e-05, "loss": 0.227, "step": 8102 }, { "epoch": 1.6328833366915172, "grad_norm": 0.05471991002559662, "learning_rate": 8.050542802946886e-05, "loss": 0.1854, "step": 8104 }, { "epoch": 1.6332863187588154, "grad_norm": 0.046276554465293884, "learning_rate": 8.04948681105893e-05, "loss": 0.1891, "step": 8106 }, { "epoch": 1.6336893008261133, "grad_norm": 0.050421275198459625, "learning_rate": 8.048430602540311e-05, "loss": 0.1964, "step": 8108 }, { "epoch": 1.6340922828934112, "grad_norm": 0.04822089895606041, "learning_rate": 8.047374177466056e-05, "loss": 0.2245, "step": 8110 }, { "epoch": 1.6344952649607092, "grad_norm": 0.055037956684827805, "learning_rate": 8.046317535911214e-05, "loss": 0.1881, "step": 8112 }, { "epoch": 1.6348982470280071, "grad_norm": 0.05557211488485336, "learning_rate": 8.045260677950846e-05, "loss": 0.1872, "step": 8114 }, { "epoch": 1.6353012290953053, "grad_norm": 0.1812106966972351, "learning_rate": 8.044203603660027e-05, "loss": 0.2067, "step": 8116 }, { "epoch": 1.6357042111626032, "grad_norm": 0.05329279601573944, "learning_rate": 8.043146313113854e-05, "loss": 0.2216, "step": 8118 }, { "epoch": 1.6361071932299014, "grad_norm": 0.03556535392999649, "learning_rate": 8.042088806387436e-05, "loss": 0.1244, "step": 8120 }, { "epoch": 1.6365101752971993, "grad_norm": 0.0379241406917572, "learning_rate": 8.041031083555892e-05, "loss": 0.2192, "step": 8122 }, { "epoch": 1.6369131573644973, "grad_norm": 0.07962547242641449, "learning_rate": 8.039973144694364e-05, "loss": 0.2132, "step": 8124 }, { "epoch": 1.6373161394317952, "grad_norm": 0.05779522284865379, "learning_rate": 8.038914989878005e-05, "loss": 0.1916, "step": 8126 }, { "epoch": 1.6377191214990932, "grad_norm": 0.06630147248506546, "learning_rate": 8.037856619181985e-05, "loss": 0.2015, "step": 8128 }, { "epoch": 1.6381221035663913, "grad_norm": 0.0491768941283226, "learning_rate": 8.03679803268149e-05, "loss": 0.2045, "step": 8130 }, { "epoch": 1.6385250856336893, "grad_norm": 0.04900708422064781, "learning_rate": 8.035739230451719e-05, "loss": 0.177, "step": 8132 }, { "epoch": 1.6389280677009874, "grad_norm": 0.033459994941949844, "learning_rate": 8.034680212567887e-05, "loss": 0.1713, "step": 8134 }, { "epoch": 1.6393310497682854, "grad_norm": 0.04298264533281326, "learning_rate": 8.033620979105227e-05, "loss": 0.2095, "step": 8136 }, { "epoch": 1.6397340318355833, "grad_norm": 0.04192821681499481, "learning_rate": 8.032561530138985e-05, "loss": 0.1744, "step": 8138 }, { "epoch": 1.6401370139028812, "grad_norm": 0.05017191916704178, "learning_rate": 8.03150186574442e-05, "loss": 0.1874, "step": 8140 }, { "epoch": 1.6405399959701792, "grad_norm": 0.04265674576163292, "learning_rate": 8.030441985996812e-05, "loss": 0.1553, "step": 8142 }, { "epoch": 1.6409429780374774, "grad_norm": 0.03842853009700775, "learning_rate": 8.02938189097145e-05, "loss": 0.1578, "step": 8144 }, { "epoch": 1.6413459601047753, "grad_norm": 0.04175066202878952, "learning_rate": 8.028321580743645e-05, "loss": 0.2033, "step": 8146 }, { "epoch": 1.6417489421720735, "grad_norm": 0.04019409045577049, "learning_rate": 8.027261055388717e-05, "loss": 0.1894, "step": 8148 }, { "epoch": 1.6421519242393714, "grad_norm": 0.04538341239094734, "learning_rate": 8.026200314982007e-05, "loss": 0.1652, "step": 8150 }, { "epoch": 1.6425549063066693, "grad_norm": 0.05302810296416283, "learning_rate": 8.025139359598863e-05, "loss": 0.2195, "step": 8152 }, { "epoch": 1.6429578883739673, "grad_norm": 0.06037846952676773, "learning_rate": 8.024078189314659e-05, "loss": 0.2235, "step": 8154 }, { "epoch": 1.6433608704412652, "grad_norm": 0.04620375484228134, "learning_rate": 8.023016804204777e-05, "loss": 0.1934, "step": 8156 }, { "epoch": 1.6437638525085634, "grad_norm": 0.04581563174724579, "learning_rate": 8.021955204344615e-05, "loss": 0.1899, "step": 8158 }, { "epoch": 1.6441668345758613, "grad_norm": 0.03631268069148064, "learning_rate": 8.020893389809589e-05, "loss": 0.1662, "step": 8160 }, { "epoch": 1.6445698166431595, "grad_norm": 0.048094492405653, "learning_rate": 8.019831360675127e-05, "loss": 0.2303, "step": 8162 }, { "epoch": 1.6449727987104574, "grad_norm": 0.04222455993294716, "learning_rate": 8.018769117016675e-05, "loss": 0.175, "step": 8164 }, { "epoch": 1.6453757807777554, "grad_norm": 0.05244714021682739, "learning_rate": 8.017706658909692e-05, "loss": 0.1874, "step": 8166 }, { "epoch": 1.6457787628450533, "grad_norm": 0.052215490490198135, "learning_rate": 8.016643986429655e-05, "loss": 0.1876, "step": 8168 }, { "epoch": 1.6461817449123513, "grad_norm": 0.12432952225208282, "learning_rate": 8.015581099652053e-05, "loss": 0.1976, "step": 8170 }, { "epoch": 1.6465847269796494, "grad_norm": 0.05211934074759483, "learning_rate": 8.014517998652393e-05, "loss": 0.2118, "step": 8172 }, { "epoch": 1.6469877090469474, "grad_norm": 0.050590209662914276, "learning_rate": 8.013454683506193e-05, "loss": 0.1852, "step": 8174 }, { "epoch": 1.6473906911142455, "grad_norm": 0.060968901962041855, "learning_rate": 8.012391154288995e-05, "loss": 0.208, "step": 8176 }, { "epoch": 1.6477936731815435, "grad_norm": 0.051988277584314346, "learning_rate": 8.011327411076346e-05, "loss": 0.2507, "step": 8178 }, { "epoch": 1.6481966552488414, "grad_norm": 0.03496647998690605, "learning_rate": 8.010263453943814e-05, "loss": 0.1696, "step": 8180 }, { "epoch": 1.6485996373161393, "grad_norm": 0.05741781368851662, "learning_rate": 8.00919928296698e-05, "loss": 0.1829, "step": 8182 }, { "epoch": 1.6490026193834373, "grad_norm": 0.04385066404938698, "learning_rate": 8.00813489822144e-05, "loss": 0.1922, "step": 8184 }, { "epoch": 1.6494056014507354, "grad_norm": 0.06760777533054352, "learning_rate": 8.007070299782808e-05, "loss": 0.2192, "step": 8186 }, { "epoch": 1.6498085835180336, "grad_norm": 0.03948403149843216, "learning_rate": 8.006005487726713e-05, "loss": 0.1596, "step": 8188 }, { "epoch": 1.6502115655853316, "grad_norm": 0.05976463481783867, "learning_rate": 8.004940462128794e-05, "loss": 0.196, "step": 8190 }, { "epoch": 1.6506145476526295, "grad_norm": 0.03693857043981552, "learning_rate": 8.003875223064711e-05, "loss": 0.1452, "step": 8192 }, { "epoch": 1.6510175297199274, "grad_norm": 0.04535726457834244, "learning_rate": 8.002809770610136e-05, "loss": 0.1711, "step": 8194 }, { "epoch": 1.6514205117872254, "grad_norm": 0.041834503412246704, "learning_rate": 8.001744104840756e-05, "loss": 0.1813, "step": 8196 }, { "epoch": 1.6518234938545233, "grad_norm": 0.055996619164943695, "learning_rate": 8.000678225832275e-05, "loss": 0.1748, "step": 8198 }, { "epoch": 1.6522264759218215, "grad_norm": 0.05592850595712662, "learning_rate": 7.999612133660413e-05, "loss": 0.2037, "step": 8200 }, { "epoch": 1.6526294579891196, "grad_norm": 0.04505985602736473, "learning_rate": 7.998545828400904e-05, "loss": 0.2294, "step": 8202 }, { "epoch": 1.6530324400564176, "grad_norm": 0.06555044651031494, "learning_rate": 7.997479310129491e-05, "loss": 0.2113, "step": 8204 }, { "epoch": 1.6534354221237155, "grad_norm": 0.051124464720487595, "learning_rate": 7.996412578921945e-05, "loss": 0.1841, "step": 8206 }, { "epoch": 1.6538384041910135, "grad_norm": 0.054283417761325836, "learning_rate": 7.995345634854039e-05, "loss": 0.1611, "step": 8208 }, { "epoch": 1.6542413862583114, "grad_norm": 0.04832978919148445, "learning_rate": 7.994278478001571e-05, "loss": 0.181, "step": 8210 }, { "epoch": 1.6546443683256093, "grad_norm": 0.05636703222990036, "learning_rate": 7.993211108440348e-05, "loss": 0.2443, "step": 8212 }, { "epoch": 1.6550473503929075, "grad_norm": 0.0764659196138382, "learning_rate": 7.992143526246195e-05, "loss": 0.2322, "step": 8214 }, { "epoch": 1.6554503324602057, "grad_norm": 0.0435616709291935, "learning_rate": 7.99107573149495e-05, "loss": 0.2397, "step": 8216 }, { "epoch": 1.6558533145275036, "grad_norm": 0.04968470335006714, "learning_rate": 7.99000772426247e-05, "loss": 0.2146, "step": 8218 }, { "epoch": 1.6562562965948016, "grad_norm": 0.039583925157785416, "learning_rate": 7.988939504624622e-05, "loss": 0.1396, "step": 8220 }, { "epoch": 1.6566592786620995, "grad_norm": 0.06294752657413483, "learning_rate": 7.987871072657293e-05, "loss": 0.1602, "step": 8222 }, { "epoch": 1.6570622607293974, "grad_norm": 0.04718125984072685, "learning_rate": 7.98680242843638e-05, "loss": 0.1643, "step": 8224 }, { "epoch": 1.6574652427966954, "grad_norm": 0.04380892589688301, "learning_rate": 7.985733572037802e-05, "loss": 0.1804, "step": 8226 }, { "epoch": 1.6578682248639935, "grad_norm": 0.05865201726555824, "learning_rate": 7.984664503537483e-05, "loss": 0.1857, "step": 8228 }, { "epoch": 1.6582712069312917, "grad_norm": 0.05377458035945892, "learning_rate": 7.983595223011371e-05, "loss": 0.2507, "step": 8230 }, { "epoch": 1.6586741889985896, "grad_norm": 0.05341102182865143, "learning_rate": 7.982525730535426e-05, "loss": 0.219, "step": 8232 }, { "epoch": 1.6590771710658876, "grad_norm": 0.04187353700399399, "learning_rate": 7.981456026185625e-05, "loss": 0.2516, "step": 8234 }, { "epoch": 1.6594801531331855, "grad_norm": 0.04068870469927788, "learning_rate": 7.980386110037954e-05, "loss": 0.2234, "step": 8236 }, { "epoch": 1.6598831352004835, "grad_norm": 0.04932933673262596, "learning_rate": 7.979315982168421e-05, "loss": 0.1944, "step": 8238 }, { "epoch": 1.6602861172677816, "grad_norm": 0.04017691686749458, "learning_rate": 7.978245642653044e-05, "loss": 0.168, "step": 8240 }, { "epoch": 1.6606890993350796, "grad_norm": 0.05114683881402016, "learning_rate": 7.977175091567862e-05, "loss": 0.1944, "step": 8242 }, { "epoch": 1.6610920814023777, "grad_norm": 0.06419112533330917, "learning_rate": 7.976104328988921e-05, "loss": 0.2418, "step": 8244 }, { "epoch": 1.6614950634696757, "grad_norm": 0.0576966255903244, "learning_rate": 7.97503335499229e-05, "loss": 0.2162, "step": 8246 }, { "epoch": 1.6618980455369736, "grad_norm": 0.044565919786691666, "learning_rate": 7.973962169654044e-05, "loss": 0.2053, "step": 8248 }, { "epoch": 1.6623010276042716, "grad_norm": 0.046498291194438934, "learning_rate": 7.972890773050284e-05, "loss": 0.2392, "step": 8250 }, { "epoch": 1.6627040096715695, "grad_norm": 0.058883845806121826, "learning_rate": 7.971819165257117e-05, "loss": 0.1683, "step": 8252 }, { "epoch": 1.6631069917388677, "grad_norm": 0.03590350225567818, "learning_rate": 7.97074734635067e-05, "loss": 0.1825, "step": 8254 }, { "epoch": 1.6635099738061656, "grad_norm": 0.04365016892552376, "learning_rate": 7.969675316407083e-05, "loss": 0.1716, "step": 8256 }, { "epoch": 1.6639129558734638, "grad_norm": 0.038824837654829025, "learning_rate": 7.96860307550251e-05, "loss": 0.1692, "step": 8258 }, { "epoch": 1.6643159379407617, "grad_norm": 0.0652041882276535, "learning_rate": 7.967530623713122e-05, "loss": 0.1787, "step": 8260 }, { "epoch": 1.6647189200080597, "grad_norm": 0.05665358901023865, "learning_rate": 7.966457961115104e-05, "loss": 0.2032, "step": 8262 }, { "epoch": 1.6651219020753576, "grad_norm": 0.04727930948138237, "learning_rate": 7.965385087784657e-05, "loss": 0.1644, "step": 8264 }, { "epoch": 1.6655248841426555, "grad_norm": 0.035444822162389755, "learning_rate": 7.964312003797996e-05, "loss": 0.1738, "step": 8266 }, { "epoch": 1.6659278662099537, "grad_norm": 0.045218631625175476, "learning_rate": 7.963238709231351e-05, "loss": 0.1681, "step": 8268 }, { "epoch": 1.6663308482772516, "grad_norm": 0.07512037456035614, "learning_rate": 7.962165204160966e-05, "loss": 0.2618, "step": 8270 }, { "epoch": 1.6667338303445498, "grad_norm": 0.03968672454357147, "learning_rate": 7.961091488663105e-05, "loss": 0.1868, "step": 8272 }, { "epoch": 1.6671368124118477, "grad_norm": 0.03970741853117943, "learning_rate": 7.960017562814038e-05, "loss": 0.1676, "step": 8274 }, { "epoch": 1.6675397944791457, "grad_norm": 0.0486428327858448, "learning_rate": 7.958943426690056e-05, "loss": 0.2157, "step": 8276 }, { "epoch": 1.6679427765464436, "grad_norm": 0.0547938346862793, "learning_rate": 7.957869080367466e-05, "loss": 0.1734, "step": 8278 }, { "epoch": 1.6683457586137416, "grad_norm": 0.03951823711395264, "learning_rate": 7.956794523922589e-05, "loss": 0.1634, "step": 8280 }, { "epoch": 1.6687487406810397, "grad_norm": 0.06093864515423775, "learning_rate": 7.955719757431755e-05, "loss": 0.1867, "step": 8282 }, { "epoch": 1.6691517227483377, "grad_norm": 0.04404790699481964, "learning_rate": 7.95464478097132e-05, "loss": 0.1829, "step": 8284 }, { "epoch": 1.6695547048156358, "grad_norm": 0.05264603719115257, "learning_rate": 7.95356959461764e-05, "loss": 0.1814, "step": 8286 }, { "epoch": 1.6699576868829338, "grad_norm": 0.05429494380950928, "learning_rate": 7.952494198447102e-05, "loss": 0.1782, "step": 8288 }, { "epoch": 1.6703606689502317, "grad_norm": 0.04684220254421234, "learning_rate": 7.9514185925361e-05, "loss": 0.2271, "step": 8290 }, { "epoch": 1.6707636510175297, "grad_norm": 0.04970792680978775, "learning_rate": 7.950342776961038e-05, "loss": 0.1925, "step": 8292 }, { "epoch": 1.6711666330848276, "grad_norm": 0.061497241258621216, "learning_rate": 7.949266751798345e-05, "loss": 0.2345, "step": 8294 }, { "epoch": 1.6715696151521258, "grad_norm": 0.0509205162525177, "learning_rate": 7.948190517124459e-05, "loss": 0.1747, "step": 8296 }, { "epoch": 1.6719725972194237, "grad_norm": 0.05777128040790558, "learning_rate": 7.947114073015833e-05, "loss": 0.2091, "step": 8298 }, { "epoch": 1.6723755792867219, "grad_norm": 0.03959937021136284, "learning_rate": 7.946037419548936e-05, "loss": 0.1686, "step": 8300 }, { "epoch": 1.6727785613540198, "grad_norm": 0.0511389821767807, "learning_rate": 7.944960556800254e-05, "loss": 0.2136, "step": 8302 }, { "epoch": 1.6731815434213178, "grad_norm": 0.05945146456360817, "learning_rate": 7.943883484846282e-05, "loss": 0.1785, "step": 8304 }, { "epoch": 1.6735845254886157, "grad_norm": 0.0385262668132782, "learning_rate": 7.942806203763535e-05, "loss": 0.1676, "step": 8306 }, { "epoch": 1.6739875075559136, "grad_norm": 0.05760958045721054, "learning_rate": 7.941728713628544e-05, "loss": 0.1924, "step": 8308 }, { "epoch": 1.6743904896232118, "grad_norm": 0.05645795539021492, "learning_rate": 7.940651014517848e-05, "loss": 0.1755, "step": 8310 }, { "epoch": 1.6747934716905097, "grad_norm": 0.04871126636862755, "learning_rate": 7.939573106508008e-05, "loss": 0.1909, "step": 8312 }, { "epoch": 1.675196453757808, "grad_norm": 0.043447740375995636, "learning_rate": 7.938494989675594e-05, "loss": 0.2446, "step": 8314 }, { "epoch": 1.6755994358251058, "grad_norm": 0.038542792201042175, "learning_rate": 7.937416664097195e-05, "loss": 0.1607, "step": 8316 }, { "epoch": 1.6760024178924038, "grad_norm": 0.05882372334599495, "learning_rate": 7.936338129849415e-05, "loss": 0.1976, "step": 8318 }, { "epoch": 1.6764053999597017, "grad_norm": 0.031100820749998093, "learning_rate": 7.935259387008871e-05, "loss": 0.1232, "step": 8320 }, { "epoch": 1.6768083820269997, "grad_norm": 0.04631880298256874, "learning_rate": 7.934180435652194e-05, "loss": 0.1579, "step": 8322 }, { "epoch": 1.6772113640942978, "grad_norm": 0.04775720462203026, "learning_rate": 7.93310127585603e-05, "loss": 0.1832, "step": 8324 }, { "epoch": 1.6776143461615958, "grad_norm": 0.04856050759553909, "learning_rate": 7.932021907697044e-05, "loss": 0.1944, "step": 8326 }, { "epoch": 1.678017328228894, "grad_norm": 0.04146652668714523, "learning_rate": 7.93094233125191e-05, "loss": 0.1789, "step": 8328 }, { "epoch": 1.6784203102961919, "grad_norm": 0.039745211601257324, "learning_rate": 7.92986254659732e-05, "loss": 0.2204, "step": 8330 }, { "epoch": 1.6788232923634898, "grad_norm": 0.047939032316207886, "learning_rate": 7.92878255380998e-05, "loss": 0.2244, "step": 8332 }, { "epoch": 1.6792262744307878, "grad_norm": 0.06156514957547188, "learning_rate": 7.927702352966611e-05, "loss": 0.2438, "step": 8334 }, { "epoch": 1.6796292564980857, "grad_norm": 0.049319975078105927, "learning_rate": 7.92662194414395e-05, "loss": 0.1942, "step": 8336 }, { "epoch": 1.6800322385653839, "grad_norm": 0.06015758216381073, "learning_rate": 7.925541327418747e-05, "loss": 0.197, "step": 8338 }, { "epoch": 1.6804352206326818, "grad_norm": 0.0507478229701519, "learning_rate": 7.924460502867766e-05, "loss": 0.2398, "step": 8340 }, { "epoch": 1.68083820269998, "grad_norm": 0.04949882626533508, "learning_rate": 7.923379470567787e-05, "loss": 0.2022, "step": 8342 }, { "epoch": 1.681241184767278, "grad_norm": 0.0555623322725296, "learning_rate": 7.922298230595607e-05, "loss": 0.2119, "step": 8344 }, { "epoch": 1.6816441668345758, "grad_norm": 0.04433630779385567, "learning_rate": 7.921216783028034e-05, "loss": 0.2112, "step": 8346 }, { "epoch": 1.6820471489018738, "grad_norm": 0.051720310002565384, "learning_rate": 7.920135127941893e-05, "loss": 0.2542, "step": 8348 }, { "epoch": 1.6824501309691717, "grad_norm": 0.05196801573038101, "learning_rate": 7.91905326541402e-05, "loss": 0.2263, "step": 8350 }, { "epoch": 1.68285311303647, "grad_norm": 0.03836916387081146, "learning_rate": 7.917971195521274e-05, "loss": 0.1765, "step": 8352 }, { "epoch": 1.6832560951037678, "grad_norm": 0.04000631347298622, "learning_rate": 7.916888918340521e-05, "loss": 0.2161, "step": 8354 }, { "epoch": 1.683659077171066, "grad_norm": 0.06888539344072342, "learning_rate": 7.915806433948643e-05, "loss": 0.2373, "step": 8356 }, { "epoch": 1.684062059238364, "grad_norm": 0.05565977841615677, "learning_rate": 7.914723742422539e-05, "loss": 0.1777, "step": 8358 }, { "epoch": 1.6844650413056619, "grad_norm": 0.047493912279605865, "learning_rate": 7.913640843839122e-05, "loss": 0.2043, "step": 8360 }, { "epoch": 1.6848680233729598, "grad_norm": 0.04517120495438576, "learning_rate": 7.912557738275319e-05, "loss": 0.1706, "step": 8362 }, { "epoch": 1.6852710054402578, "grad_norm": 0.07502885162830353, "learning_rate": 7.911474425808072e-05, "loss": 0.1983, "step": 8364 }, { "epoch": 1.685673987507556, "grad_norm": 0.06703903526067734, "learning_rate": 7.910390906514338e-05, "loss": 0.1789, "step": 8366 }, { "epoch": 1.6860769695748539, "grad_norm": 0.06102665886282921, "learning_rate": 7.90930718047109e-05, "loss": 0.2034, "step": 8368 }, { "epoch": 1.686479951642152, "grad_norm": 0.08277782052755356, "learning_rate": 7.90822324775531e-05, "loss": 0.2115, "step": 8370 }, { "epoch": 1.68688293370945, "grad_norm": 0.04407776519656181, "learning_rate": 7.907139108444004e-05, "loss": 0.1716, "step": 8372 }, { "epoch": 1.687285915776748, "grad_norm": 0.05880500376224518, "learning_rate": 7.906054762614184e-05, "loss": 0.1869, "step": 8374 }, { "epoch": 1.6876888978440459, "grad_norm": 0.055312976241111755, "learning_rate": 7.904970210342882e-05, "loss": 0.1981, "step": 8376 }, { "epoch": 1.6880918799113438, "grad_norm": 0.05081368237733841, "learning_rate": 7.90388545170714e-05, "loss": 0.1713, "step": 8378 }, { "epoch": 1.688494861978642, "grad_norm": 0.03774468973278999, "learning_rate": 7.902800486784021e-05, "loss": 0.1554, "step": 8380 }, { "epoch": 1.6888978440459401, "grad_norm": 0.04858670011162758, "learning_rate": 7.901715315650597e-05, "loss": 0.1874, "step": 8382 }, { "epoch": 1.689300826113238, "grad_norm": 0.03782944008708, "learning_rate": 7.900629938383959e-05, "loss": 0.1464, "step": 8384 }, { "epoch": 1.689703808180536, "grad_norm": 0.04310869425535202, "learning_rate": 7.899544355061209e-05, "loss": 0.1399, "step": 8386 }, { "epoch": 1.690106790247834, "grad_norm": 0.06209326907992363, "learning_rate": 7.898458565759463e-05, "loss": 0.1511, "step": 8388 }, { "epoch": 1.6905097723151319, "grad_norm": 0.05323124676942825, "learning_rate": 7.897372570555858e-05, "loss": 0.2171, "step": 8390 }, { "epoch": 1.6909127543824298, "grad_norm": 0.0539429634809494, "learning_rate": 7.89628636952754e-05, "loss": 0.2134, "step": 8392 }, { "epoch": 1.691315736449728, "grad_norm": 0.039286866784095764, "learning_rate": 7.895199962751668e-05, "loss": 0.1713, "step": 8394 }, { "epoch": 1.6917187185170262, "grad_norm": 0.0580926276743412, "learning_rate": 7.894113350305421e-05, "loss": 0.2415, "step": 8396 }, { "epoch": 1.692121700584324, "grad_norm": 0.056929927319288254, "learning_rate": 7.893026532265992e-05, "loss": 0.2097, "step": 8398 }, { "epoch": 1.692524682651622, "grad_norm": 0.039593666791915894, "learning_rate": 7.891939508710583e-05, "loss": 0.1637, "step": 8400 }, { "epoch": 1.69292766471892, "grad_norm": 0.052947916090488434, "learning_rate": 7.890852279716416e-05, "loss": 0.1893, "step": 8402 }, { "epoch": 1.693330646786218, "grad_norm": 0.0409802608191967, "learning_rate": 7.889764845360727e-05, "loss": 0.1594, "step": 8404 }, { "epoch": 1.6937336288535159, "grad_norm": 0.04634273424744606, "learning_rate": 7.888677205720767e-05, "loss": 0.2099, "step": 8406 }, { "epoch": 1.694136610920814, "grad_norm": 0.060330476611852646, "learning_rate": 7.887589360873794e-05, "loss": 0.2091, "step": 8408 }, { "epoch": 1.6945395929881122, "grad_norm": 0.05978460609912872, "learning_rate": 7.886501310897094e-05, "loss": 0.2053, "step": 8410 }, { "epoch": 1.6949425750554101, "grad_norm": 0.0454874113202095, "learning_rate": 7.885413055867956e-05, "loss": 0.1795, "step": 8412 }, { "epoch": 1.695345557122708, "grad_norm": 0.05972779542207718, "learning_rate": 7.884324595863688e-05, "loss": 0.2175, "step": 8414 }, { "epoch": 1.695748539190006, "grad_norm": 0.054317884147167206, "learning_rate": 7.883235930961617e-05, "loss": 0.1653, "step": 8416 }, { "epoch": 1.696151521257304, "grad_norm": 0.05237840861082077, "learning_rate": 7.882147061239074e-05, "loss": 0.192, "step": 8418 }, { "epoch": 1.696554503324602, "grad_norm": 0.04566609114408493, "learning_rate": 7.881057986773412e-05, "loss": 0.2368, "step": 8420 }, { "epoch": 1.6969574853919, "grad_norm": 0.05352894216775894, "learning_rate": 7.879968707642e-05, "loss": 0.2011, "step": 8422 }, { "epoch": 1.6973604674591982, "grad_norm": 0.051115117967128754, "learning_rate": 7.878879223922215e-05, "loss": 0.1936, "step": 8424 }, { "epoch": 1.6977634495264962, "grad_norm": 0.07551829516887665, "learning_rate": 7.877789535691455e-05, "loss": 0.2121, "step": 8426 }, { "epoch": 1.698166431593794, "grad_norm": 0.048008158802986145, "learning_rate": 7.87669964302713e-05, "loss": 0.1934, "step": 8428 }, { "epoch": 1.698569413661092, "grad_norm": 0.04184304550290108, "learning_rate": 7.875609546006661e-05, "loss": 0.1513, "step": 8430 }, { "epoch": 1.69897239572839, "grad_norm": 0.06528772413730621, "learning_rate": 7.87451924470749e-05, "loss": 0.1937, "step": 8432 }, { "epoch": 1.6993753777956881, "grad_norm": 0.06352069228887558, "learning_rate": 7.87342873920707e-05, "loss": 0.188, "step": 8434 }, { "epoch": 1.699778359862986, "grad_norm": 0.06633667647838593, "learning_rate": 7.872338029582867e-05, "loss": 0.1779, "step": 8436 }, { "epoch": 1.7001813419302843, "grad_norm": 0.04314619302749634, "learning_rate": 7.871247115912361e-05, "loss": 0.1713, "step": 8438 }, { "epoch": 1.7005843239975822, "grad_norm": 0.07472950220108032, "learning_rate": 7.870155998273055e-05, "loss": 0.1738, "step": 8440 }, { "epoch": 1.7009873060648801, "grad_norm": 0.09097413718700409, "learning_rate": 7.869064676742456e-05, "loss": 0.2039, "step": 8442 }, { "epoch": 1.701390288132178, "grad_norm": 0.052758365869522095, "learning_rate": 7.867973151398091e-05, "loss": 0.1801, "step": 8444 }, { "epoch": 1.701793270199476, "grad_norm": 0.05923820286989212, "learning_rate": 7.866881422317501e-05, "loss": 0.2125, "step": 8446 }, { "epoch": 1.7021962522667742, "grad_norm": 0.06283913552761078, "learning_rate": 7.865789489578239e-05, "loss": 0.1614, "step": 8448 }, { "epoch": 1.7025992343340721, "grad_norm": 0.045529961585998535, "learning_rate": 7.864697353257872e-05, "loss": 0.1559, "step": 8450 }, { "epoch": 1.7030022164013703, "grad_norm": 0.06094701588153839, "learning_rate": 7.86360501343399e-05, "loss": 0.1766, "step": 8452 }, { "epoch": 1.7034051984686682, "grad_norm": 0.10539116710424423, "learning_rate": 7.862512470184187e-05, "loss": 0.213, "step": 8454 }, { "epoch": 1.7038081805359662, "grad_norm": 0.06393177807331085, "learning_rate": 7.861419723586074e-05, "loss": 0.1821, "step": 8456 }, { "epoch": 1.704211162603264, "grad_norm": 0.053990524262189865, "learning_rate": 7.860326773717281e-05, "loss": 0.1965, "step": 8458 }, { "epoch": 1.704614144670562, "grad_norm": 0.06103832647204399, "learning_rate": 7.85923362065545e-05, "loss": 0.1929, "step": 8460 }, { "epoch": 1.7050171267378602, "grad_norm": 0.05043712630867958, "learning_rate": 7.858140264478233e-05, "loss": 0.2383, "step": 8462 }, { "epoch": 1.7054201088051582, "grad_norm": 0.058011483401060104, "learning_rate": 7.857046705263305e-05, "loss": 0.1691, "step": 8464 }, { "epoch": 1.7058230908724563, "grad_norm": 0.07188203185796738, "learning_rate": 7.855952943088346e-05, "loss": 0.2316, "step": 8466 }, { "epoch": 1.7062260729397543, "grad_norm": 0.0951547920703888, "learning_rate": 7.854858978031057e-05, "loss": 0.2202, "step": 8468 }, { "epoch": 1.7066290550070522, "grad_norm": 0.045940931886434555, "learning_rate": 7.853764810169153e-05, "loss": 0.1967, "step": 8470 }, { "epoch": 1.7070320370743501, "grad_norm": 0.06384126842021942, "learning_rate": 7.852670439580362e-05, "loss": 0.2567, "step": 8472 }, { "epoch": 1.707435019141648, "grad_norm": 0.05669160187244415, "learning_rate": 7.851575866342424e-05, "loss": 0.2404, "step": 8474 }, { "epoch": 1.7078380012089462, "grad_norm": 0.04451169818639755, "learning_rate": 7.850481090533097e-05, "loss": 0.1893, "step": 8476 }, { "epoch": 1.7082409832762442, "grad_norm": 0.06958547234535217, "learning_rate": 7.84938611223015e-05, "loss": 0.2434, "step": 8478 }, { "epoch": 1.7086439653435423, "grad_norm": 0.05522121861577034, "learning_rate": 7.848290931511372e-05, "loss": 0.217, "step": 8480 }, { "epoch": 1.7090469474108403, "grad_norm": 0.056487396359443665, "learning_rate": 7.847195548454564e-05, "loss": 0.212, "step": 8482 }, { "epoch": 1.7094499294781382, "grad_norm": 0.058997754007577896, "learning_rate": 7.846099963137535e-05, "loss": 0.2267, "step": 8484 }, { "epoch": 1.7098529115454362, "grad_norm": 0.042182806879282, "learning_rate": 7.845004175638116e-05, "loss": 0.2063, "step": 8486 }, { "epoch": 1.7102558936127341, "grad_norm": 0.05909043177962303, "learning_rate": 7.843908186034152e-05, "loss": 0.1918, "step": 8488 }, { "epoch": 1.7106588756800323, "grad_norm": 0.04273051396012306, "learning_rate": 7.842811994403496e-05, "loss": 0.2098, "step": 8490 }, { "epoch": 1.7110618577473302, "grad_norm": 0.04888763651251793, "learning_rate": 7.841715600824024e-05, "loss": 0.1562, "step": 8492 }, { "epoch": 1.7114648398146284, "grad_norm": 0.048436980694532394, "learning_rate": 7.840619005373621e-05, "loss": 0.2039, "step": 8494 }, { "epoch": 1.7118678218819263, "grad_norm": 0.03377070277929306, "learning_rate": 7.839522208130186e-05, "loss": 0.19, "step": 8496 }, { "epoch": 1.7122708039492243, "grad_norm": 0.04444821551442146, "learning_rate": 7.838425209171633e-05, "loss": 0.2197, "step": 8498 }, { "epoch": 1.7126737860165222, "grad_norm": 0.03357812389731407, "learning_rate": 7.837328008575895e-05, "loss": 0.1413, "step": 8500 }, { "epoch": 1.7130767680838201, "grad_norm": 0.045264676213264465, "learning_rate": 7.836230606420911e-05, "loss": 0.1612, "step": 8502 }, { "epoch": 1.7134797501511183, "grad_norm": 0.043273936957120895, "learning_rate": 7.835133002784642e-05, "loss": 0.1633, "step": 8504 }, { "epoch": 1.7138827322184162, "grad_norm": 0.0491492860019207, "learning_rate": 7.834035197745059e-05, "loss": 0.1686, "step": 8506 }, { "epoch": 1.7142857142857144, "grad_norm": 0.08964493870735168, "learning_rate": 7.832937191380147e-05, "loss": 0.1782, "step": 8508 }, { "epoch": 1.7146886963530124, "grad_norm": 0.04667485132813454, "learning_rate": 7.831838983767907e-05, "loss": 0.2296, "step": 8510 }, { "epoch": 1.7150916784203103, "grad_norm": 0.06557079404592514, "learning_rate": 7.830740574986355e-05, "loss": 0.1757, "step": 8512 }, { "epoch": 1.7154946604876082, "grad_norm": 0.03733866289258003, "learning_rate": 7.82964196511352e-05, "loss": 0.191, "step": 8514 }, { "epoch": 1.7158976425549062, "grad_norm": 0.04955060034990311, "learning_rate": 7.828543154227445e-05, "loss": 0.1708, "step": 8516 }, { "epoch": 1.7163006246222043, "grad_norm": 0.04552415385842323, "learning_rate": 7.827444142406188e-05, "loss": 0.1344, "step": 8518 }, { "epoch": 1.7167036066895023, "grad_norm": 0.0272463858127594, "learning_rate": 7.826344929727821e-05, "loss": 0.1609, "step": 8520 }, { "epoch": 1.7171065887568004, "grad_norm": 0.04317507520318031, "learning_rate": 7.82524551627043e-05, "loss": 0.1573, "step": 8522 }, { "epoch": 1.7175095708240984, "grad_norm": 0.05297327786684036, "learning_rate": 7.824145902112115e-05, "loss": 0.1895, "step": 8524 }, { "epoch": 1.7179125528913963, "grad_norm": 0.0684426873922348, "learning_rate": 7.823046087330992e-05, "loss": 0.2023, "step": 8526 }, { "epoch": 1.7183155349586943, "grad_norm": 0.05523200333118439, "learning_rate": 7.82194607200519e-05, "loss": 0.1781, "step": 8528 }, { "epoch": 1.7187185170259922, "grad_norm": 0.07310648262500763, "learning_rate": 7.820845856212853e-05, "loss": 0.2673, "step": 8530 }, { "epoch": 1.7191214990932904, "grad_norm": 0.05852164700627327, "learning_rate": 7.819745440032136e-05, "loss": 0.1842, "step": 8532 }, { "epoch": 1.7195244811605883, "grad_norm": 0.05170690268278122, "learning_rate": 7.818644823541215e-05, "loss": 0.1529, "step": 8534 }, { "epoch": 1.7199274632278865, "grad_norm": 0.04495406523346901, "learning_rate": 7.817544006818272e-05, "loss": 0.1847, "step": 8536 }, { "epoch": 1.7203304452951844, "grad_norm": 0.06066835671663284, "learning_rate": 7.816442989941508e-05, "loss": 0.2595, "step": 8538 }, { "epoch": 1.7207334273624824, "grad_norm": 0.04806002229452133, "learning_rate": 7.815341772989138e-05, "loss": 0.2052, "step": 8540 }, { "epoch": 1.7211364094297803, "grad_norm": 0.060100167989730835, "learning_rate": 7.814240356039392e-05, "loss": 0.2143, "step": 8542 }, { "epoch": 1.7215393914970782, "grad_norm": 0.04579418525099754, "learning_rate": 7.813138739170511e-05, "loss": 0.2379, "step": 8544 }, { "epoch": 1.7219423735643764, "grad_norm": 0.041652414947748184, "learning_rate": 7.812036922460754e-05, "loss": 0.1726, "step": 8546 }, { "epoch": 1.7223453556316743, "grad_norm": 0.051688678562641144, "learning_rate": 7.810934905988392e-05, "loss": 0.1802, "step": 8548 }, { "epoch": 1.7227483376989725, "grad_norm": 0.05031515657901764, "learning_rate": 7.809832689831707e-05, "loss": 0.2386, "step": 8550 }, { "epoch": 1.7231513197662705, "grad_norm": 0.04816931113600731, "learning_rate": 7.808730274069003e-05, "loss": 0.1636, "step": 8552 }, { "epoch": 1.7235543018335684, "grad_norm": 0.06899578869342804, "learning_rate": 7.807627658778592e-05, "loss": 0.1706, "step": 8554 }, { "epoch": 1.7239572839008663, "grad_norm": 0.03780083358287811, "learning_rate": 7.806524844038803e-05, "loss": 0.2101, "step": 8556 }, { "epoch": 1.7243602659681643, "grad_norm": 0.04826981946825981, "learning_rate": 7.805421829927977e-05, "loss": 0.1997, "step": 8558 }, { "epoch": 1.7247632480354624, "grad_norm": 0.050643905997276306, "learning_rate": 7.80431861652447e-05, "loss": 0.2093, "step": 8560 }, { "epoch": 1.7251662301027604, "grad_norm": 0.048531509935855865, "learning_rate": 7.803215203906655e-05, "loss": 0.1975, "step": 8562 }, { "epoch": 1.7255692121700585, "grad_norm": 0.0731339156627655, "learning_rate": 7.802111592152913e-05, "loss": 0.195, "step": 8564 }, { "epoch": 1.7259721942373565, "grad_norm": 0.05814792215824127, "learning_rate": 7.801007781341644e-05, "loss": 0.2539, "step": 8566 }, { "epoch": 1.7263751763046544, "grad_norm": 0.05993201211094856, "learning_rate": 7.799903771551265e-05, "loss": 0.1668, "step": 8568 }, { "epoch": 1.7267781583719524, "grad_norm": 0.04430992528796196, "learning_rate": 7.798799562860198e-05, "loss": 0.1859, "step": 8570 }, { "epoch": 1.7271811404392503, "grad_norm": 0.04470387473702431, "learning_rate": 7.797695155346887e-05, "loss": 0.1571, "step": 8572 }, { "epoch": 1.7275841225065485, "grad_norm": 0.03621676564216614, "learning_rate": 7.796590549089786e-05, "loss": 0.1829, "step": 8574 }, { "epoch": 1.7279871045738464, "grad_norm": 0.05174838379025459, "learning_rate": 7.795485744167365e-05, "loss": 0.2106, "step": 8576 }, { "epoch": 1.7283900866411446, "grad_norm": 0.042110104113817215, "learning_rate": 7.794380740658107e-05, "loss": 0.2247, "step": 8578 }, { "epoch": 1.7287930687084425, "grad_norm": 0.05196274816989899, "learning_rate": 7.79327553864051e-05, "loss": 0.1762, "step": 8580 }, { "epoch": 1.7291960507757405, "grad_norm": 0.051834408193826675, "learning_rate": 7.792170138193086e-05, "loss": 0.204, "step": 8582 }, { "epoch": 1.7295990328430384, "grad_norm": 0.036789216101169586, "learning_rate": 7.79106453939436e-05, "loss": 0.1595, "step": 8584 }, { "epoch": 1.7300020149103363, "grad_norm": 0.05568787083029747, "learning_rate": 7.789958742322873e-05, "loss": 0.1751, "step": 8586 }, { "epoch": 1.7304049969776345, "grad_norm": 0.061141084879636765, "learning_rate": 7.78885274705718e-05, "loss": 0.1976, "step": 8588 }, { "epoch": 1.7308079790449327, "grad_norm": 0.05548600107431412, "learning_rate": 7.787746553675848e-05, "loss": 0.1615, "step": 8590 }, { "epoch": 1.7312109611122306, "grad_norm": 0.05701598897576332, "learning_rate": 7.78664016225746e-05, "loss": 0.2356, "step": 8592 }, { "epoch": 1.7316139431795285, "grad_norm": 0.06706813722848892, "learning_rate": 7.785533572880609e-05, "loss": 0.2213, "step": 8594 }, { "epoch": 1.7320169252468265, "grad_norm": 0.05262723192572594, "learning_rate": 7.784426785623908e-05, "loss": 0.162, "step": 8596 }, { "epoch": 1.7324199073141244, "grad_norm": 0.03426643833518028, "learning_rate": 7.783319800565984e-05, "loss": 0.1828, "step": 8598 }, { "epoch": 1.7328228893814224, "grad_norm": 0.05540724843740463, "learning_rate": 7.782212617785469e-05, "loss": 0.2208, "step": 8600 }, { "epoch": 1.7332258714487205, "grad_norm": 0.06078488752245903, "learning_rate": 7.781105237361021e-05, "loss": 0.2496, "step": 8602 }, { "epoch": 1.7336288535160187, "grad_norm": 0.06109081953763962, "learning_rate": 7.779997659371305e-05, "loss": 0.1905, "step": 8604 }, { "epoch": 1.7340318355833166, "grad_norm": 0.07080352306365967, "learning_rate": 7.778889883895001e-05, "loss": 0.2268, "step": 8606 }, { "epoch": 1.7344348176506146, "grad_norm": 0.05866062641143799, "learning_rate": 7.777781911010804e-05, "loss": 0.2183, "step": 8608 }, { "epoch": 1.7348377997179125, "grad_norm": 0.06478538364171982, "learning_rate": 7.776673740797422e-05, "loss": 0.2019, "step": 8610 }, { "epoch": 1.7352407817852105, "grad_norm": 0.04579712077975273, "learning_rate": 7.775565373333578e-05, "loss": 0.2049, "step": 8612 }, { "epoch": 1.7356437638525084, "grad_norm": 0.06760866940021515, "learning_rate": 7.774456808698008e-05, "loss": 0.2231, "step": 8614 }, { "epoch": 1.7360467459198066, "grad_norm": 0.047087687999010086, "learning_rate": 7.773348046969465e-05, "loss": 0.1354, "step": 8616 }, { "epoch": 1.7364497279871047, "grad_norm": 0.0526084341108799, "learning_rate": 7.772239088226712e-05, "loss": 0.2409, "step": 8618 }, { "epoch": 1.7368527100544027, "grad_norm": 0.06447000801563263, "learning_rate": 7.771129932548527e-05, "loss": 0.2616, "step": 8620 }, { "epoch": 1.7372556921217006, "grad_norm": 0.06044398248195648, "learning_rate": 7.770020580013703e-05, "loss": 0.2157, "step": 8622 }, { "epoch": 1.7376586741889986, "grad_norm": 0.03387543931603432, "learning_rate": 7.768911030701047e-05, "loss": 0.164, "step": 8624 }, { "epoch": 1.7380616562562965, "grad_norm": 0.043008893728256226, "learning_rate": 7.76780128468938e-05, "loss": 0.1349, "step": 8626 }, { "epoch": 1.7384646383235944, "grad_norm": 0.06238474324345589, "learning_rate": 7.766691342057537e-05, "loss": 0.1753, "step": 8628 }, { "epoch": 1.7388676203908926, "grad_norm": 0.04955996945500374, "learning_rate": 7.765581202884365e-05, "loss": 0.2607, "step": 8630 }, { "epoch": 1.7392706024581908, "grad_norm": 0.03956250473856926, "learning_rate": 7.764470867248726e-05, "loss": 0.1929, "step": 8632 }, { "epoch": 1.7396735845254887, "grad_norm": 0.04237693175673485, "learning_rate": 7.7633603352295e-05, "loss": 0.2227, "step": 8634 }, { "epoch": 1.7400765665927866, "grad_norm": 0.04121999070048332, "learning_rate": 7.762249606905574e-05, "loss": 0.1772, "step": 8636 }, { "epoch": 1.7404795486600846, "grad_norm": 0.04537238925695419, "learning_rate": 7.761138682355854e-05, "loss": 0.1972, "step": 8638 }, { "epoch": 1.7408825307273825, "grad_norm": 0.04738117754459381, "learning_rate": 7.760027561659255e-05, "loss": 0.1604, "step": 8640 }, { "epoch": 1.7412855127946807, "grad_norm": 0.05630556121468544, "learning_rate": 7.758916244894716e-05, "loss": 0.2037, "step": 8642 }, { "epoch": 1.7416884948619786, "grad_norm": 0.05333465337753296, "learning_rate": 7.757804732141177e-05, "loss": 0.1861, "step": 8644 }, { "epoch": 1.7420914769292768, "grad_norm": 0.04941607639193535, "learning_rate": 7.7566930234776e-05, "loss": 0.1992, "step": 8646 }, { "epoch": 1.7424944589965747, "grad_norm": 0.04240027070045471, "learning_rate": 7.755581118982961e-05, "loss": 0.1641, "step": 8648 }, { "epoch": 1.7428974410638727, "grad_norm": 0.05365744233131409, "learning_rate": 7.754469018736245e-05, "loss": 0.2032, "step": 8650 }, { "epoch": 1.7433004231311706, "grad_norm": 0.052164193242788315, "learning_rate": 7.753356722816455e-05, "loss": 0.1564, "step": 8652 }, { "epoch": 1.7437034051984686, "grad_norm": 0.03998196870088577, "learning_rate": 7.752244231302608e-05, "loss": 0.1512, "step": 8654 }, { "epoch": 1.7441063872657667, "grad_norm": 0.03845955803990364, "learning_rate": 7.75113154427373e-05, "loss": 0.1582, "step": 8656 }, { "epoch": 1.7445093693330647, "grad_norm": 0.038581348955631256, "learning_rate": 7.750018661808869e-05, "loss": 0.2324, "step": 8658 }, { "epoch": 1.7449123514003628, "grad_norm": 0.05808325111865997, "learning_rate": 7.748905583987079e-05, "loss": 0.236, "step": 8660 }, { "epoch": 1.7453153334676608, "grad_norm": 0.06073828786611557, "learning_rate": 7.747792310887434e-05, "loss": 0.2215, "step": 8662 }, { "epoch": 1.7457183155349587, "grad_norm": 0.05565022677183151, "learning_rate": 7.746678842589017e-05, "loss": 0.2051, "step": 8664 }, { "epoch": 1.7461212976022567, "grad_norm": 0.07590685784816742, "learning_rate": 7.745565179170927e-05, "loss": 0.2145, "step": 8666 }, { "epoch": 1.7465242796695546, "grad_norm": 0.04306831210851669, "learning_rate": 7.744451320712278e-05, "loss": 0.1701, "step": 8668 }, { "epoch": 1.7469272617368528, "grad_norm": 0.052143242210149765, "learning_rate": 7.743337267292197e-05, "loss": 0.1892, "step": 8670 }, { "epoch": 1.7473302438041507, "grad_norm": 0.040976159274578094, "learning_rate": 7.742223018989822e-05, "loss": 0.1982, "step": 8672 }, { "epoch": 1.7477332258714489, "grad_norm": 0.04762955382466316, "learning_rate": 7.741108575884311e-05, "loss": 0.2214, "step": 8674 }, { "epoch": 1.7481362079387468, "grad_norm": 0.047516968101263046, "learning_rate": 7.73999393805483e-05, "loss": 0.167, "step": 8676 }, { "epoch": 1.7485391900060447, "grad_norm": 0.05364019423723221, "learning_rate": 7.738879105580562e-05, "loss": 0.1911, "step": 8678 }, { "epoch": 1.7489421720733427, "grad_norm": 0.036976758390665054, "learning_rate": 7.737764078540701e-05, "loss": 0.134, "step": 8680 }, { "epoch": 1.7493451541406406, "grad_norm": 0.062130097299814224, "learning_rate": 7.73664885701446e-05, "loss": 0.1687, "step": 8682 }, { "epoch": 1.7497481362079388, "grad_norm": 0.04362349212169647, "learning_rate": 7.73553344108106e-05, "loss": 0.1953, "step": 8684 }, { "epoch": 1.7501511182752367, "grad_norm": 0.045134756714105606, "learning_rate": 7.73441783081974e-05, "loss": 0.2189, "step": 8686 }, { "epoch": 1.750554100342535, "grad_norm": 0.04244035482406616, "learning_rate": 7.73330202630975e-05, "loss": 0.2119, "step": 8688 }, { "epoch": 1.7509570824098328, "grad_norm": 0.06525809317827225, "learning_rate": 7.732186027630355e-05, "loss": 0.2336, "step": 8690 }, { "epoch": 1.7513600644771308, "grad_norm": 0.04660410434007645, "learning_rate": 7.731069834860833e-05, "loss": 0.1688, "step": 8692 }, { "epoch": 1.7517630465444287, "grad_norm": 0.05042422562837601, "learning_rate": 7.729953448080481e-05, "loss": 0.2028, "step": 8694 }, { "epoch": 1.7521660286117267, "grad_norm": 0.05255540832877159, "learning_rate": 7.728836867368599e-05, "loss": 0.2158, "step": 8696 }, { "epoch": 1.7525690106790248, "grad_norm": 0.04372533783316612, "learning_rate": 7.72772009280451e-05, "loss": 0.1658, "step": 8698 }, { "epoch": 1.7529719927463228, "grad_norm": 0.054151248186826706, "learning_rate": 7.726603124467548e-05, "loss": 0.2391, "step": 8700 }, { "epoch": 1.753374974813621, "grad_norm": 0.05295997112989426, "learning_rate": 7.725485962437062e-05, "loss": 0.2183, "step": 8702 }, { "epoch": 1.7537779568809189, "grad_norm": 0.044501155614852905, "learning_rate": 7.724368606792412e-05, "loss": 0.2243, "step": 8704 }, { "epoch": 1.7541809389482168, "grad_norm": 0.04965506121516228, "learning_rate": 7.723251057612972e-05, "loss": 0.1998, "step": 8706 }, { "epoch": 1.7545839210155147, "grad_norm": 0.06588796526193619, "learning_rate": 7.722133314978133e-05, "loss": 0.196, "step": 8708 }, { "epoch": 1.7549869030828127, "grad_norm": 0.057937197387218475, "learning_rate": 7.721015378967296e-05, "loss": 0.2054, "step": 8710 }, { "epoch": 1.7553898851501109, "grad_norm": 0.04531354829668999, "learning_rate": 7.719897249659878e-05, "loss": 0.1873, "step": 8712 }, { "epoch": 1.7557928672174088, "grad_norm": 0.05281313881278038, "learning_rate": 7.71877892713531e-05, "loss": 0.2363, "step": 8714 }, { "epoch": 1.756195849284707, "grad_norm": 0.052877333015203476, "learning_rate": 7.717660411473035e-05, "loss": 0.2576, "step": 8716 }, { "epoch": 1.756598831352005, "grad_norm": 0.046693529933691025, "learning_rate": 7.71654170275251e-05, "loss": 0.1911, "step": 8718 }, { "epoch": 1.7570018134193028, "grad_norm": 0.045685358345508575, "learning_rate": 7.715422801053207e-05, "loss": 0.1662, "step": 8720 }, { "epoch": 1.7574047954866008, "grad_norm": 0.05251702293753624, "learning_rate": 7.714303706454611e-05, "loss": 0.217, "step": 8722 }, { "epoch": 1.7578077775538987, "grad_norm": 0.04720870032906532, "learning_rate": 7.713184419036222e-05, "loss": 0.2178, "step": 8724 }, { "epoch": 1.7582107596211969, "grad_norm": 0.04424897953867912, "learning_rate": 7.712064938877548e-05, "loss": 0.1485, "step": 8726 }, { "epoch": 1.7586137416884948, "grad_norm": 0.04398166388273239, "learning_rate": 7.71094526605812e-05, "loss": 0.1699, "step": 8728 }, { "epoch": 1.759016723755793, "grad_norm": 0.05234614759683609, "learning_rate": 7.709825400657475e-05, "loss": 0.1842, "step": 8730 }, { "epoch": 1.759419705823091, "grad_norm": 0.0670747384428978, "learning_rate": 7.708705342755169e-05, "loss": 0.1806, "step": 8732 }, { "epoch": 1.7598226878903889, "grad_norm": 0.03887630254030228, "learning_rate": 7.707585092430765e-05, "loss": 0.1773, "step": 8734 }, { "epoch": 1.7602256699576868, "grad_norm": 0.056282076984643936, "learning_rate": 7.706464649763847e-05, "loss": 0.1901, "step": 8736 }, { "epoch": 1.7606286520249848, "grad_norm": 0.04681971296668053, "learning_rate": 7.705344014834011e-05, "loss": 0.2137, "step": 8738 }, { "epoch": 1.761031634092283, "grad_norm": 0.05810059607028961, "learning_rate": 7.704223187720861e-05, "loss": 0.1969, "step": 8740 }, { "epoch": 1.7614346161595809, "grad_norm": 0.05052733048796654, "learning_rate": 7.70310216850402e-05, "loss": 0.2087, "step": 8742 }, { "epoch": 1.761837598226879, "grad_norm": 0.04968501254916191, "learning_rate": 7.701980957263123e-05, "loss": 0.1928, "step": 8744 }, { "epoch": 1.762240580294177, "grad_norm": 0.0517900288105011, "learning_rate": 7.700859554077821e-05, "loss": 0.2056, "step": 8746 }, { "epoch": 1.762643562361475, "grad_norm": 0.052969422191381454, "learning_rate": 7.699737959027776e-05, "loss": 0.2397, "step": 8748 }, { "epoch": 1.7630465444287728, "grad_norm": 0.07398225367069244, "learning_rate": 7.698616172192663e-05, "loss": 0.19, "step": 8750 }, { "epoch": 1.7634495264960708, "grad_norm": 0.04895591363310814, "learning_rate": 7.697494193652174e-05, "loss": 0.1756, "step": 8752 }, { "epoch": 1.763852508563369, "grad_norm": 0.058382175862789154, "learning_rate": 7.696372023486012e-05, "loss": 0.2415, "step": 8754 }, { "epoch": 1.764255490630667, "grad_norm": 0.044502172619104385, "learning_rate": 7.695249661773892e-05, "loss": 0.1885, "step": 8756 }, { "epoch": 1.764658472697965, "grad_norm": 0.04054013267159462, "learning_rate": 7.694127108595548e-05, "loss": 0.172, "step": 8758 }, { "epoch": 1.765061454765263, "grad_norm": 0.05540904030203819, "learning_rate": 7.693004364030723e-05, "loss": 0.1634, "step": 8760 }, { "epoch": 1.765464436832561, "grad_norm": 0.052257224917411804, "learning_rate": 7.691881428159172e-05, "loss": 0.2097, "step": 8762 }, { "epoch": 1.7658674188998589, "grad_norm": 0.051800746470689774, "learning_rate": 7.690758301060672e-05, "loss": 0.1682, "step": 8764 }, { "epoch": 1.7662704009671568, "grad_norm": 0.05630108341574669, "learning_rate": 7.689634982815005e-05, "loss": 0.1729, "step": 8766 }, { "epoch": 1.766673383034455, "grad_norm": 0.05631199851632118, "learning_rate": 7.68851147350197e-05, "loss": 0.1729, "step": 8768 }, { "epoch": 1.767076365101753, "grad_norm": 0.05669247731566429, "learning_rate": 7.687387773201379e-05, "loss": 0.1933, "step": 8770 }, { "epoch": 1.767479347169051, "grad_norm": 0.051117509603500366, "learning_rate": 7.686263881993059e-05, "loss": 0.1638, "step": 8772 }, { "epoch": 1.767882329236349, "grad_norm": 0.041834667325019836, "learning_rate": 7.685139799956848e-05, "loss": 0.1576, "step": 8774 }, { "epoch": 1.768285311303647, "grad_norm": 0.044466495513916016, "learning_rate": 7.684015527172601e-05, "loss": 0.2271, "step": 8776 }, { "epoch": 1.768688293370945, "grad_norm": 0.06349855661392212, "learning_rate": 7.682891063720184e-05, "loss": 0.2129, "step": 8778 }, { "epoch": 1.7690912754382428, "grad_norm": 0.06009896472096443, "learning_rate": 7.681766409679476e-05, "loss": 0.2256, "step": 8780 }, { "epoch": 1.769494257505541, "grad_norm": 0.056756600737571716, "learning_rate": 7.680641565130371e-05, "loss": 0.1993, "step": 8782 }, { "epoch": 1.769897239572839, "grad_norm": 0.05109809339046478, "learning_rate": 7.679516530152775e-05, "loss": 0.2196, "step": 8784 }, { "epoch": 1.7703002216401371, "grad_norm": 0.05256953090429306, "learning_rate": 7.67839130482661e-05, "loss": 0.19, "step": 8786 }, { "epoch": 1.770703203707435, "grad_norm": 0.045628610998392105, "learning_rate": 7.677265889231812e-05, "loss": 0.1846, "step": 8788 }, { "epoch": 1.771106185774733, "grad_norm": 0.049970727413892746, "learning_rate": 7.676140283448328e-05, "loss": 0.2571, "step": 8790 }, { "epoch": 1.771509167842031, "grad_norm": 0.06309273838996887, "learning_rate": 7.675014487556114e-05, "loss": 0.247, "step": 8792 }, { "epoch": 1.7719121499093289, "grad_norm": 0.04105915129184723, "learning_rate": 7.673888501635153e-05, "loss": 0.2053, "step": 8794 }, { "epoch": 1.772315131976627, "grad_norm": 0.052911341190338135, "learning_rate": 7.672762325765425e-05, "loss": 0.2354, "step": 8796 }, { "epoch": 1.7727181140439252, "grad_norm": 0.03674977645277977, "learning_rate": 7.671635960026939e-05, "loss": 0.1794, "step": 8798 }, { "epoch": 1.7731210961112231, "grad_norm": 0.051963258534669876, "learning_rate": 7.670509404499706e-05, "loss": 0.1442, "step": 8800 }, { "epoch": 1.773524078178521, "grad_norm": 0.07293074578046799, "learning_rate": 7.669382659263755e-05, "loss": 0.2144, "step": 8802 }, { "epoch": 1.773927060245819, "grad_norm": 0.03861695155501366, "learning_rate": 7.66825572439913e-05, "loss": 0.2062, "step": 8804 }, { "epoch": 1.774330042313117, "grad_norm": 0.04328519478440285, "learning_rate": 7.667128599985887e-05, "loss": 0.2074, "step": 8806 }, { "epoch": 1.774733024380415, "grad_norm": 0.08353982120752335, "learning_rate": 7.666001286104091e-05, "loss": 0.2031, "step": 8808 }, { "epoch": 1.775136006447713, "grad_norm": 0.03887191042304039, "learning_rate": 7.664873782833828e-05, "loss": 0.1916, "step": 8810 }, { "epoch": 1.7755389885150112, "grad_norm": 0.035796571522951126, "learning_rate": 7.663746090255194e-05, "loss": 0.2134, "step": 8812 }, { "epoch": 1.7759419705823092, "grad_norm": 0.07863356918096542, "learning_rate": 7.662618208448297e-05, "loss": 0.1853, "step": 8814 }, { "epoch": 1.7763449526496071, "grad_norm": 0.05820539966225624, "learning_rate": 7.66149013749326e-05, "loss": 0.2169, "step": 8816 }, { "epoch": 1.776747934716905, "grad_norm": 0.050097983330488205, "learning_rate": 7.660361877470221e-05, "loss": 0.2093, "step": 8818 }, { "epoch": 1.777150916784203, "grad_norm": 0.04237944260239601, "learning_rate": 7.65923342845933e-05, "loss": 0.2109, "step": 8820 }, { "epoch": 1.777553898851501, "grad_norm": 0.047271978110075, "learning_rate": 7.658104790540748e-05, "loss": 0.1822, "step": 8822 }, { "epoch": 1.777956880918799, "grad_norm": 0.045994047075510025, "learning_rate": 7.656975963794653e-05, "loss": 0.2027, "step": 8824 }, { "epoch": 1.7783598629860973, "grad_norm": 0.05817195773124695, "learning_rate": 7.655846948301233e-05, "loss": 0.1503, "step": 8826 }, { "epoch": 1.7787628450533952, "grad_norm": 0.08411876112222672, "learning_rate": 7.654717744140694e-05, "loss": 0.2695, "step": 8828 }, { "epoch": 1.7791658271206932, "grad_norm": 0.05115870013833046, "learning_rate": 7.653588351393255e-05, "loss": 0.2016, "step": 8830 }, { "epoch": 1.779568809187991, "grad_norm": 0.03854476660490036, "learning_rate": 7.652458770139139e-05, "loss": 0.1893, "step": 8832 }, { "epoch": 1.779971791255289, "grad_norm": 0.041677094995975494, "learning_rate": 7.651329000458596e-05, "loss": 0.19, "step": 8834 }, { "epoch": 1.780374773322587, "grad_norm": 0.04914252087473869, "learning_rate": 7.650199042431883e-05, "loss": 0.1929, "step": 8836 }, { "epoch": 1.7807777553898851, "grad_norm": 0.058383334428071976, "learning_rate": 7.649068896139264e-05, "loss": 0.2562, "step": 8838 }, { "epoch": 1.7811807374571833, "grad_norm": 0.03746351599693298, "learning_rate": 7.64793856166103e-05, "loss": 0.1706, "step": 8840 }, { "epoch": 1.7815837195244812, "grad_norm": 0.03564516454935074, "learning_rate": 7.646808039077475e-05, "loss": 0.114, "step": 8842 }, { "epoch": 1.7819867015917792, "grad_norm": 0.05532078444957733, "learning_rate": 7.64567732846891e-05, "loss": 0.1985, "step": 8844 }, { "epoch": 1.7823896836590771, "grad_norm": 0.061978355050086975, "learning_rate": 7.644546429915658e-05, "loss": 0.1941, "step": 8846 }, { "epoch": 1.782792665726375, "grad_norm": 0.05426686629652977, "learning_rate": 7.643415343498058e-05, "loss": 0.2307, "step": 8848 }, { "epoch": 1.7831956477936732, "grad_norm": 0.05495776608586311, "learning_rate": 7.642284069296458e-05, "loss": 0.2268, "step": 8850 }, { "epoch": 1.7835986298609712, "grad_norm": 0.047012731432914734, "learning_rate": 7.641152607391224e-05, "loss": 0.1788, "step": 8852 }, { "epoch": 1.7840016119282693, "grad_norm": 0.05379527434706688, "learning_rate": 7.640020957862733e-05, "loss": 0.1869, "step": 8854 }, { "epoch": 1.7844045939955673, "grad_norm": 0.04625250771641731, "learning_rate": 7.638889120791374e-05, "loss": 0.2064, "step": 8856 }, { "epoch": 1.7848075760628652, "grad_norm": 0.04690789431333542, "learning_rate": 7.637757096257554e-05, "loss": 0.2008, "step": 8858 }, { "epoch": 1.7852105581301632, "grad_norm": 0.06040222942829132, "learning_rate": 7.636624884341688e-05, "loss": 0.2081, "step": 8860 }, { "epoch": 1.785613540197461, "grad_norm": 0.04381508380174637, "learning_rate": 7.635492485124207e-05, "loss": 0.1608, "step": 8862 }, { "epoch": 1.7860165222647593, "grad_norm": 0.041126348078250885, "learning_rate": 7.634359898685554e-05, "loss": 0.1963, "step": 8864 }, { "epoch": 1.7864195043320572, "grad_norm": 0.04755943641066551, "learning_rate": 7.633227125106187e-05, "loss": 0.2042, "step": 8866 }, { "epoch": 1.7868224863993554, "grad_norm": 0.046007703989744186, "learning_rate": 7.632094164466577e-05, "loss": 0.1998, "step": 8868 }, { "epoch": 1.7872254684666533, "grad_norm": 0.047473929822444916, "learning_rate": 7.630961016847207e-05, "loss": 0.1677, "step": 8870 }, { "epoch": 1.7876284505339513, "grad_norm": 0.04671850427985191, "learning_rate": 7.629827682328572e-05, "loss": 0.2158, "step": 8872 }, { "epoch": 1.7880314326012492, "grad_norm": 0.05689868703484535, "learning_rate": 7.628694160991185e-05, "loss": 0.1887, "step": 8874 }, { "epoch": 1.7884344146685471, "grad_norm": 0.03737175464630127, "learning_rate": 7.62756045291557e-05, "loss": 0.2039, "step": 8876 }, { "epoch": 1.7888373967358453, "grad_norm": 0.10977429151535034, "learning_rate": 7.626426558182262e-05, "loss": 0.2176, "step": 8878 }, { "epoch": 1.7892403788031432, "grad_norm": 0.04432012513279915, "learning_rate": 7.62529247687181e-05, "loss": 0.2089, "step": 8880 }, { "epoch": 1.7896433608704414, "grad_norm": 0.04522010684013367, "learning_rate": 7.624158209064782e-05, "loss": 0.1346, "step": 8882 }, { "epoch": 1.7900463429377393, "grad_norm": 0.07001801580190659, "learning_rate": 7.62302375484175e-05, "loss": 0.1907, "step": 8884 }, { "epoch": 1.7904493250050373, "grad_norm": 0.05941377207636833, "learning_rate": 7.621889114283305e-05, "loss": 0.2282, "step": 8886 }, { "epoch": 1.7908523070723352, "grad_norm": 0.11536096781492233, "learning_rate": 7.620754287470051e-05, "loss": 0.1884, "step": 8888 }, { "epoch": 1.7912552891396332, "grad_norm": 0.044333018362522125, "learning_rate": 7.619619274482603e-05, "loss": 0.1956, "step": 8890 }, { "epoch": 1.7916582712069313, "grad_norm": 0.06309880316257477, "learning_rate": 7.618484075401591e-05, "loss": 0.2062, "step": 8892 }, { "epoch": 1.7920612532742293, "grad_norm": 0.034674037247896194, "learning_rate": 7.617348690307659e-05, "loss": 0.1751, "step": 8894 }, { "epoch": 1.7924642353415274, "grad_norm": 0.35374215245246887, "learning_rate": 7.616213119281462e-05, "loss": 0.1593, "step": 8896 }, { "epoch": 1.7928672174088254, "grad_norm": 0.043103184551000595, "learning_rate": 7.615077362403669e-05, "loss": 0.1558, "step": 8898 }, { "epoch": 1.7932701994761233, "grad_norm": 0.04136328399181366, "learning_rate": 7.613941419754961e-05, "loss": 0.1753, "step": 8900 }, { "epoch": 1.7936731815434213, "grad_norm": 0.04389548674225807, "learning_rate": 7.612805291416036e-05, "loss": 0.1998, "step": 8902 }, { "epoch": 1.7940761636107192, "grad_norm": 0.04551811143755913, "learning_rate": 7.6116689774676e-05, "loss": 0.1881, "step": 8904 }, { "epoch": 1.7944791456780174, "grad_norm": 0.05296977981925011, "learning_rate": 7.61053247799038e-05, "loss": 0.1489, "step": 8906 }, { "epoch": 1.7948821277453153, "grad_norm": 0.09822306036949158, "learning_rate": 7.609395793065107e-05, "loss": 0.2469, "step": 8908 }, { "epoch": 1.7952851098126135, "grad_norm": 0.04015516862273216, "learning_rate": 7.608258922772527e-05, "loss": 0.1482, "step": 8910 }, { "epoch": 1.7956880918799114, "grad_norm": 0.05408100038766861, "learning_rate": 7.607121867193407e-05, "loss": 0.2308, "step": 8912 }, { "epoch": 1.7960910739472093, "grad_norm": 0.04759914055466652, "learning_rate": 7.605984626408517e-05, "loss": 0.1598, "step": 8914 }, { "epoch": 1.7964940560145073, "grad_norm": 0.05472610890865326, "learning_rate": 7.604847200498649e-05, "loss": 0.1849, "step": 8916 }, { "epoch": 1.7968970380818052, "grad_norm": 0.043661490082740784, "learning_rate": 7.603709589544601e-05, "loss": 0.1537, "step": 8918 }, { "epoch": 1.7973000201491034, "grad_norm": 0.05160943791270256, "learning_rate": 7.602571793627187e-05, "loss": 0.1813, "step": 8920 }, { "epoch": 1.7977030022164013, "grad_norm": 0.05473243072628975, "learning_rate": 7.601433812827235e-05, "loss": 0.208, "step": 8922 }, { "epoch": 1.7981059842836995, "grad_norm": 0.04076346382498741, "learning_rate": 7.600295647225586e-05, "loss": 0.1423, "step": 8924 }, { "epoch": 1.7985089663509974, "grad_norm": 0.06232528015971184, "learning_rate": 7.599157296903092e-05, "loss": 0.2041, "step": 8926 }, { "epoch": 1.7989119484182954, "grad_norm": 0.07285811007022858, "learning_rate": 7.598018761940622e-05, "loss": 0.1965, "step": 8928 }, { "epoch": 1.7993149304855933, "grad_norm": 0.058572810143232346, "learning_rate": 7.596880042419053e-05, "loss": 0.1957, "step": 8930 }, { "epoch": 1.7997179125528913, "grad_norm": 0.06865711510181427, "learning_rate": 7.595741138419279e-05, "loss": 0.1719, "step": 8932 }, { "epoch": 1.8001208946201894, "grad_norm": 0.043107353150844574, "learning_rate": 7.594602050022207e-05, "loss": 0.1416, "step": 8934 }, { "epoch": 1.8005238766874874, "grad_norm": 0.047849707305431366, "learning_rate": 7.593462777308752e-05, "loss": 0.1889, "step": 8936 }, { "epoch": 1.8009268587547855, "grad_norm": 0.053915441036224365, "learning_rate": 7.592323320359849e-05, "loss": 0.2267, "step": 8938 }, { "epoch": 1.8013298408220835, "grad_norm": 0.04879898205399513, "learning_rate": 7.591183679256447e-05, "loss": 0.2082, "step": 8940 }, { "epoch": 1.8017328228893814, "grad_norm": 0.04230300709605217, "learning_rate": 7.590043854079496e-05, "loss": 0.2072, "step": 8942 }, { "epoch": 1.8021358049566794, "grad_norm": 0.05137898400425911, "learning_rate": 7.588903844909973e-05, "loss": 0.2026, "step": 8944 }, { "epoch": 1.8025387870239773, "grad_norm": 0.037783313542604446, "learning_rate": 7.587763651828863e-05, "loss": 0.1661, "step": 8946 }, { "epoch": 1.8029417690912755, "grad_norm": 0.055004462599754333, "learning_rate": 7.58662327491716e-05, "loss": 0.2076, "step": 8948 }, { "epoch": 1.8033447511585734, "grad_norm": 0.056807104498147964, "learning_rate": 7.585482714255877e-05, "loss": 0.1984, "step": 8950 }, { "epoch": 1.8037477332258716, "grad_norm": 0.03771530091762543, "learning_rate": 7.584341969926037e-05, "loss": 0.1587, "step": 8952 }, { "epoch": 1.8041507152931695, "grad_norm": 0.049856383353471756, "learning_rate": 7.583201042008677e-05, "loss": 0.2049, "step": 8954 }, { "epoch": 1.8045536973604674, "grad_norm": 0.051321081817150116, "learning_rate": 7.582059930584844e-05, "loss": 0.1883, "step": 8956 }, { "epoch": 1.8049566794277654, "grad_norm": 0.045879822224378586, "learning_rate": 7.580918635735605e-05, "loss": 0.1875, "step": 8958 }, { "epoch": 1.8053596614950633, "grad_norm": 0.0529901348054409, "learning_rate": 7.579777157542034e-05, "loss": 0.2356, "step": 8960 }, { "epoch": 1.8057626435623615, "grad_norm": 0.04600401595234871, "learning_rate": 7.578635496085218e-05, "loss": 0.2251, "step": 8962 }, { "epoch": 1.8061656256296594, "grad_norm": 0.2081703096628189, "learning_rate": 7.577493651446261e-05, "loss": 0.265, "step": 8964 }, { "epoch": 1.8065686076969576, "grad_norm": 0.0612410344183445, "learning_rate": 7.576351623706277e-05, "loss": 0.2312, "step": 8966 }, { "epoch": 1.8069715897642555, "grad_norm": 0.04466860368847847, "learning_rate": 7.575209412946394e-05, "loss": 0.1786, "step": 8968 }, { "epoch": 1.8073745718315535, "grad_norm": 0.10041309893131256, "learning_rate": 7.574067019247753e-05, "loss": 0.2351, "step": 8970 }, { "epoch": 1.8077775538988514, "grad_norm": 0.06266580522060394, "learning_rate": 7.572924442691505e-05, "loss": 0.2368, "step": 8972 }, { "epoch": 1.8081805359661494, "grad_norm": 0.04474787414073944, "learning_rate": 7.571781683358822e-05, "loss": 0.1473, "step": 8974 }, { "epoch": 1.8085835180334475, "grad_norm": 0.043090350925922394, "learning_rate": 7.57063874133088e-05, "loss": 0.1662, "step": 8976 }, { "epoch": 1.8089865001007455, "grad_norm": 0.22436150908470154, "learning_rate": 7.569495616688873e-05, "loss": 0.2034, "step": 8978 }, { "epoch": 1.8093894821680436, "grad_norm": 0.05931695178151131, "learning_rate": 7.568352309514008e-05, "loss": 0.233, "step": 8980 }, { "epoch": 1.8097924642353416, "grad_norm": 0.05479630082845688, "learning_rate": 7.567208819887502e-05, "loss": 0.1908, "step": 8982 }, { "epoch": 1.8101954463026395, "grad_norm": 0.04764629527926445, "learning_rate": 7.566065147890586e-05, "loss": 0.1856, "step": 8984 }, { "epoch": 1.8105984283699375, "grad_norm": 0.046009134501218796, "learning_rate": 7.564921293604508e-05, "loss": 0.167, "step": 8986 }, { "epoch": 1.8110014104372354, "grad_norm": 0.06495673209428787, "learning_rate": 7.56377725711052e-05, "loss": 0.1937, "step": 8988 }, { "epoch": 1.8114043925045336, "grad_norm": 0.049228839576244354, "learning_rate": 7.562633038489897e-05, "loss": 0.1917, "step": 8990 }, { "epoch": 1.8118073745718317, "grad_norm": 0.04694323614239693, "learning_rate": 7.561488637823924e-05, "loss": 0.1727, "step": 8992 }, { "epoch": 1.8122103566391297, "grad_norm": 0.04595312848687172, "learning_rate": 7.560344055193891e-05, "loss": 0.2031, "step": 8994 }, { "epoch": 1.8126133387064276, "grad_norm": 0.043079208582639694, "learning_rate": 7.559199290681112e-05, "loss": 0.2117, "step": 8996 }, { "epoch": 1.8130163207737255, "grad_norm": 0.052916523069143295, "learning_rate": 7.55805434436691e-05, "loss": 0.1722, "step": 8998 }, { "epoch": 1.8134193028410235, "grad_norm": 0.06618094444274902, "learning_rate": 7.556909216332617e-05, "loss": 0.1934, "step": 9000 }, { "epoch": 1.8138222849083214, "grad_norm": 0.059367429465055466, "learning_rate": 7.555763906659582e-05, "loss": 0.2204, "step": 9002 }, { "epoch": 1.8142252669756196, "grad_norm": 0.05537892505526543, "learning_rate": 7.554618415429168e-05, "loss": 0.1491, "step": 9004 }, { "epoch": 1.8146282490429178, "grad_norm": 0.05505523830652237, "learning_rate": 7.553472742722745e-05, "loss": 0.2061, "step": 9006 }, { "epoch": 1.8150312311102157, "grad_norm": 0.05641672760248184, "learning_rate": 7.552326888621703e-05, "loss": 0.1719, "step": 9008 }, { "epoch": 1.8154342131775136, "grad_norm": 0.04747428745031357, "learning_rate": 7.551180853207442e-05, "loss": 0.2033, "step": 9010 }, { "epoch": 1.8158371952448116, "grad_norm": 0.07291791588068008, "learning_rate": 7.550034636561371e-05, "loss": 0.2159, "step": 9012 }, { "epoch": 1.8162401773121095, "grad_norm": 0.06158110871911049, "learning_rate": 7.54888823876492e-05, "loss": 0.1543, "step": 9014 }, { "epoch": 1.8166431593794075, "grad_norm": 0.05462907254695892, "learning_rate": 7.547741659899523e-05, "loss": 0.1869, "step": 9016 }, { "epoch": 1.8170461414467056, "grad_norm": 0.09753762930631638, "learning_rate": 7.546594900046633e-05, "loss": 0.1726, "step": 9018 }, { "epoch": 1.8174491235140038, "grad_norm": 0.12133771181106567, "learning_rate": 7.545447959287714e-05, "loss": 0.1845, "step": 9020 }, { "epoch": 1.8178521055813017, "grad_norm": 0.05773913860321045, "learning_rate": 7.544300837704244e-05, "loss": 0.194, "step": 9022 }, { "epoch": 1.8182550876485997, "grad_norm": 0.049177125096321106, "learning_rate": 7.543153535377711e-05, "loss": 0.2086, "step": 9024 }, { "epoch": 1.8186580697158976, "grad_norm": 0.05512756481766701, "learning_rate": 7.542006052389619e-05, "loss": 0.2001, "step": 9026 }, { "epoch": 1.8190610517831955, "grad_norm": 0.04927990213036537, "learning_rate": 7.540858388821482e-05, "loss": 0.1936, "step": 9028 }, { "epoch": 1.8194640338504935, "grad_norm": 0.06231493130326271, "learning_rate": 7.539710544754826e-05, "loss": 0.2342, "step": 9030 }, { "epoch": 1.8198670159177917, "grad_norm": 0.040557861328125, "learning_rate": 7.538562520271197e-05, "loss": 0.1761, "step": 9032 }, { "epoch": 1.8202699979850898, "grad_norm": 0.05428093299269676, "learning_rate": 7.537414315452145e-05, "loss": 0.2394, "step": 9034 }, { "epoch": 1.8206729800523878, "grad_norm": 0.041098061949014664, "learning_rate": 7.536265930379239e-05, "loss": 0.2154, "step": 9036 }, { "epoch": 1.8210759621196857, "grad_norm": 0.04267246648669243, "learning_rate": 7.535117365134058e-05, "loss": 0.1659, "step": 9038 }, { "epoch": 1.8214789441869836, "grad_norm": 0.04948339983820915, "learning_rate": 7.533968619798193e-05, "loss": 0.2019, "step": 9040 }, { "epoch": 1.8218819262542816, "grad_norm": 0.08953138440847397, "learning_rate": 7.53281969445325e-05, "loss": 0.3046, "step": 9042 }, { "epoch": 1.8222849083215797, "grad_norm": 0.05375578626990318, "learning_rate": 7.531670589180846e-05, "loss": 0.1992, "step": 9044 }, { "epoch": 1.8226878903888777, "grad_norm": 0.0598725751042366, "learning_rate": 7.530521304062613e-05, "loss": 0.204, "step": 9046 }, { "epoch": 1.8230908724561758, "grad_norm": 0.06039509177207947, "learning_rate": 7.529371839180191e-05, "loss": 0.2076, "step": 9048 }, { "epoch": 1.8234938545234738, "grad_norm": 0.04848941043019295, "learning_rate": 7.528222194615242e-05, "loss": 0.1768, "step": 9050 }, { "epoch": 1.8238968365907717, "grad_norm": 0.04888831824064255, "learning_rate": 7.52707237044943e-05, "loss": 0.1861, "step": 9052 }, { "epoch": 1.8242998186580697, "grad_norm": 0.040995147079229355, "learning_rate": 7.525922366764437e-05, "loss": 0.2125, "step": 9054 }, { "epoch": 1.8247028007253676, "grad_norm": 0.051442645490169525, "learning_rate": 7.524772183641961e-05, "loss": 0.2137, "step": 9056 }, { "epoch": 1.8251057827926658, "grad_norm": 0.04963943362236023, "learning_rate": 7.523621821163707e-05, "loss": 0.2418, "step": 9058 }, { "epoch": 1.8255087648599637, "grad_norm": 0.04928060993552208, "learning_rate": 7.522471279411393e-05, "loss": 0.227, "step": 9060 }, { "epoch": 1.8259117469272619, "grad_norm": 0.04120678827166557, "learning_rate": 7.521320558466755e-05, "loss": 0.1801, "step": 9062 }, { "epoch": 1.8263147289945598, "grad_norm": 0.05170593783259392, "learning_rate": 7.520169658411535e-05, "loss": 0.2048, "step": 9064 }, { "epoch": 1.8267177110618578, "grad_norm": 0.05370767042040825, "learning_rate": 7.519018579327493e-05, "loss": 0.1983, "step": 9066 }, { "epoch": 1.8271206931291557, "grad_norm": 0.050350531935691833, "learning_rate": 7.517867321296402e-05, "loss": 0.219, "step": 9068 }, { "epoch": 1.8275236751964536, "grad_norm": 0.04988570138812065, "learning_rate": 7.51671588440004e-05, "loss": 0.1792, "step": 9070 }, { "epoch": 1.8279266572637518, "grad_norm": 0.04610089212656021, "learning_rate": 7.51556426872021e-05, "loss": 0.2189, "step": 9072 }, { "epoch": 1.8283296393310497, "grad_norm": 0.03649386391043663, "learning_rate": 7.514412474338715e-05, "loss": 0.1567, "step": 9074 }, { "epoch": 1.828732621398348, "grad_norm": 0.05270843580365181, "learning_rate": 7.51326050133738e-05, "loss": 0.1914, "step": 9076 }, { "epoch": 1.8291356034656459, "grad_norm": 0.05600623041391373, "learning_rate": 7.512108349798037e-05, "loss": 0.2287, "step": 9078 }, { "epoch": 1.8295385855329438, "grad_norm": 0.05488625913858414, "learning_rate": 7.510956019802537e-05, "loss": 0.2209, "step": 9080 }, { "epoch": 1.8299415676002417, "grad_norm": 0.0471048466861248, "learning_rate": 7.509803511432734e-05, "loss": 0.2048, "step": 9082 }, { "epoch": 1.8303445496675397, "grad_norm": 0.05616595223546028, "learning_rate": 7.508650824770505e-05, "loss": 0.1831, "step": 9084 }, { "epoch": 1.8307475317348378, "grad_norm": 0.05483395606279373, "learning_rate": 7.507497959897734e-05, "loss": 0.1852, "step": 9086 }, { "epoch": 1.8311505138021358, "grad_norm": 0.05591282621026039, "learning_rate": 7.506344916896317e-05, "loss": 0.1974, "step": 9088 }, { "epoch": 1.831553495869434, "grad_norm": 0.046104494482278824, "learning_rate": 7.505191695848165e-05, "loss": 0.215, "step": 9090 }, { "epoch": 1.8319564779367319, "grad_norm": 0.06968837231397629, "learning_rate": 7.504038296835203e-05, "loss": 0.2492, "step": 9092 }, { "epoch": 1.8323594600040298, "grad_norm": 0.048819273710250854, "learning_rate": 7.502884719939363e-05, "loss": 0.2016, "step": 9094 }, { "epoch": 1.8327624420713278, "grad_norm": 0.06848305463790894, "learning_rate": 7.501730965242598e-05, "loss": 0.1702, "step": 9096 }, { "epoch": 1.8331654241386257, "grad_norm": 0.05366634204983711, "learning_rate": 7.500577032826863e-05, "loss": 0.1789, "step": 9098 }, { "epoch": 1.8335684062059239, "grad_norm": 0.04202277213335037, "learning_rate": 7.499422922774137e-05, "loss": 0.1796, "step": 9100 }, { "epoch": 1.8339713882732218, "grad_norm": 0.04547832906246185, "learning_rate": 7.498268635166403e-05, "loss": 0.1709, "step": 9102 }, { "epoch": 1.83437437034052, "grad_norm": 0.04689347371459007, "learning_rate": 7.497114170085661e-05, "loss": 0.175, "step": 9104 }, { "epoch": 1.834777352407818, "grad_norm": 0.043508078902959824, "learning_rate": 7.495959527613921e-05, "loss": 0.2098, "step": 9106 }, { "epoch": 1.8351803344751159, "grad_norm": 0.05118690803647041, "learning_rate": 7.494804707833208e-05, "loss": 0.159, "step": 9108 }, { "epoch": 1.8355833165424138, "grad_norm": 0.037060126662254333, "learning_rate": 7.493649710825559e-05, "loss": 0.1626, "step": 9110 }, { "epoch": 1.8359862986097117, "grad_norm": 0.05963043123483658, "learning_rate": 7.492494536673021e-05, "loss": 0.1906, "step": 9112 }, { "epoch": 1.83638928067701, "grad_norm": 0.044435955584049225, "learning_rate": 7.49133918545766e-05, "loss": 0.1564, "step": 9114 }, { "epoch": 1.8367922627443078, "grad_norm": 0.058326657861471176, "learning_rate": 7.490183657261546e-05, "loss": 0.1771, "step": 9116 }, { "epoch": 1.837195244811606, "grad_norm": 0.06040395423769951, "learning_rate": 7.489027952166768e-05, "loss": 0.2254, "step": 9118 }, { "epoch": 1.837598226878904, "grad_norm": 0.04052739217877388, "learning_rate": 7.487872070255425e-05, "loss": 0.135, "step": 9120 }, { "epoch": 1.838001208946202, "grad_norm": 0.05821429565548897, "learning_rate": 7.486716011609627e-05, "loss": 0.1784, "step": 9122 }, { "epoch": 1.8384041910134998, "grad_norm": 0.05247338116168976, "learning_rate": 7.485559776311501e-05, "loss": 0.2273, "step": 9124 }, { "epoch": 1.8388071730807978, "grad_norm": 0.05803043395280838, "learning_rate": 7.484403364443185e-05, "loss": 0.2082, "step": 9126 }, { "epoch": 1.839210155148096, "grad_norm": 0.051874928176403046, "learning_rate": 7.483246776086827e-05, "loss": 0.2019, "step": 9128 }, { "epoch": 1.8396131372153939, "grad_norm": 0.048445116728544235, "learning_rate": 7.482090011324588e-05, "loss": 0.1848, "step": 9130 }, { "epoch": 1.840016119282692, "grad_norm": 0.06173400208353996, "learning_rate": 7.480933070238645e-05, "loss": 0.2716, "step": 9132 }, { "epoch": 1.84041910134999, "grad_norm": 0.04523913934826851, "learning_rate": 7.479775952911184e-05, "loss": 0.1917, "step": 9134 }, { "epoch": 1.840822083417288, "grad_norm": 0.054122623056173325, "learning_rate": 7.478618659424406e-05, "loss": 0.2317, "step": 9136 }, { "epoch": 1.8412250654845859, "grad_norm": 0.050042774528265, "learning_rate": 7.477461189860522e-05, "loss": 0.2069, "step": 9138 }, { "epoch": 1.8416280475518838, "grad_norm": 0.04367179423570633, "learning_rate": 7.476303544301757e-05, "loss": 0.1848, "step": 9140 }, { "epoch": 1.842031029619182, "grad_norm": 0.054118309170007706, "learning_rate": 7.475145722830348e-05, "loss": 0.1706, "step": 9142 }, { "epoch": 1.84243401168648, "grad_norm": 0.04461648315191269, "learning_rate": 7.473987725528547e-05, "loss": 0.2188, "step": 9144 }, { "epoch": 1.842836993753778, "grad_norm": 0.0384267196059227, "learning_rate": 7.472829552478613e-05, "loss": 0.1457, "step": 9146 }, { "epoch": 1.843239975821076, "grad_norm": 0.04002009332180023, "learning_rate": 7.471671203762822e-05, "loss": 0.1605, "step": 9148 }, { "epoch": 1.843642957888374, "grad_norm": 0.04789675399661064, "learning_rate": 7.470512679463463e-05, "loss": 0.169, "step": 9150 }, { "epoch": 1.844045939955672, "grad_norm": 0.054053302854299545, "learning_rate": 7.469353979662833e-05, "loss": 0.1935, "step": 9152 }, { "epoch": 1.8444489220229698, "grad_norm": 0.03856454789638519, "learning_rate": 7.468195104443246e-05, "loss": 0.1723, "step": 9154 }, { "epoch": 1.844851904090268, "grad_norm": 0.05694045126438141, "learning_rate": 7.467036053887027e-05, "loss": 0.1889, "step": 9156 }, { "epoch": 1.845254886157566, "grad_norm": 0.03868400678038597, "learning_rate": 7.46587682807651e-05, "loss": 0.2551, "step": 9158 }, { "epoch": 1.845657868224864, "grad_norm": 0.044934503734111786, "learning_rate": 7.464717427094048e-05, "loss": 0.2043, "step": 9160 }, { "epoch": 1.846060850292162, "grad_norm": 0.04237751290202141, "learning_rate": 7.463557851022001e-05, "loss": 0.219, "step": 9162 }, { "epoch": 1.84646383235946, "grad_norm": 0.050449177622795105, "learning_rate": 7.462398099942745e-05, "loss": 0.1908, "step": 9164 }, { "epoch": 1.846866814426758, "grad_norm": 0.03661469370126724, "learning_rate": 7.461238173938667e-05, "loss": 0.1453, "step": 9166 }, { "epoch": 1.8472697964940559, "grad_norm": 0.04814742133021355, "learning_rate": 7.460078073092163e-05, "loss": 0.1919, "step": 9168 }, { "epoch": 1.847672778561354, "grad_norm": 0.09877938777208328, "learning_rate": 7.458917797485648e-05, "loss": 0.2523, "step": 9170 }, { "epoch": 1.848075760628652, "grad_norm": 0.03724796697497368, "learning_rate": 7.457757347201545e-05, "loss": 0.1498, "step": 9172 }, { "epoch": 1.8484787426959501, "grad_norm": 0.04252644255757332, "learning_rate": 7.456596722322292e-05, "loss": 0.1498, "step": 9174 }, { "epoch": 1.848881724763248, "grad_norm": 0.048177529126405716, "learning_rate": 7.455435922930335e-05, "loss": 0.1798, "step": 9176 }, { "epoch": 1.849284706830546, "grad_norm": 0.038370076566934586, "learning_rate": 7.454274949108136e-05, "loss": 0.2012, "step": 9178 }, { "epoch": 1.849687688897844, "grad_norm": 0.04380296543240547, "learning_rate": 7.453113800938172e-05, "loss": 0.2188, "step": 9180 }, { "epoch": 1.850090670965142, "grad_norm": 0.05155162513256073, "learning_rate": 7.451952478502924e-05, "loss": 0.179, "step": 9182 }, { "epoch": 1.85049365303244, "grad_norm": 0.04221152514219284, "learning_rate": 7.450790981884896e-05, "loss": 0.1924, "step": 9184 }, { "epoch": 1.850896635099738, "grad_norm": 0.06590647250413895, "learning_rate": 7.449629311166595e-05, "loss": 0.2229, "step": 9186 }, { "epoch": 1.8512996171670362, "grad_norm": 0.040665872395038605, "learning_rate": 7.448467466430545e-05, "loss": 0.1418, "step": 9188 }, { "epoch": 1.8517025992343341, "grad_norm": 0.0483027920126915, "learning_rate": 7.447305447759282e-05, "loss": 0.2081, "step": 9190 }, { "epoch": 1.852105581301632, "grad_norm": 0.03717666119337082, "learning_rate": 7.446143255235355e-05, "loss": 0.157, "step": 9192 }, { "epoch": 1.85250856336893, "grad_norm": 0.04697210341691971, "learning_rate": 7.444980888941322e-05, "loss": 0.1733, "step": 9194 }, { "epoch": 1.852911545436228, "grad_norm": 0.05617213249206543, "learning_rate": 7.443818348959757e-05, "loss": 0.1802, "step": 9196 }, { "epoch": 1.853314527503526, "grad_norm": 0.057184625416994095, "learning_rate": 7.442655635373246e-05, "loss": 0.1908, "step": 9198 }, { "epoch": 1.8537175095708243, "grad_norm": 0.05311274901032448, "learning_rate": 7.441492748264384e-05, "loss": 0.2275, "step": 9200 }, { "epoch": 1.8541204916381222, "grad_norm": 0.05756756663322449, "learning_rate": 7.440329687715781e-05, "loss": 0.1991, "step": 9202 }, { "epoch": 1.8545234737054201, "grad_norm": 0.07354568690061569, "learning_rate": 7.439166453810061e-05, "loss": 0.227, "step": 9204 }, { "epoch": 1.854926455772718, "grad_norm": 0.07992871105670929, "learning_rate": 7.438003046629857e-05, "loss": 0.2499, "step": 9206 }, { "epoch": 1.855329437840016, "grad_norm": 0.04908233880996704, "learning_rate": 7.436839466257816e-05, "loss": 0.1945, "step": 9208 }, { "epoch": 1.855732419907314, "grad_norm": 0.04218778386712074, "learning_rate": 7.435675712776594e-05, "loss": 0.1944, "step": 9210 }, { "epoch": 1.8561354019746121, "grad_norm": 0.06127457693219185, "learning_rate": 7.434511786268866e-05, "loss": 0.2256, "step": 9212 }, { "epoch": 1.8565383840419103, "grad_norm": 0.05392537638545036, "learning_rate": 7.433347686817316e-05, "loss": 0.2313, "step": 9214 }, { "epoch": 1.8569413661092082, "grad_norm": 0.048670317977666855, "learning_rate": 7.432183414504635e-05, "loss": 0.1915, "step": 9216 }, { "epoch": 1.8573443481765062, "grad_norm": 0.04563140869140625, "learning_rate": 7.431018969413536e-05, "loss": 0.2058, "step": 9218 }, { "epoch": 1.8577473302438041, "grad_norm": 0.052207816392183304, "learning_rate": 7.429854351626737e-05, "loss": 0.2068, "step": 9220 }, { "epoch": 1.858150312311102, "grad_norm": 0.04309464991092682, "learning_rate": 7.428689561226969e-05, "loss": 0.169, "step": 9222 }, { "epoch": 1.8585532943784, "grad_norm": 0.05821385979652405, "learning_rate": 7.42752459829698e-05, "loss": 0.2067, "step": 9224 }, { "epoch": 1.8589562764456982, "grad_norm": 0.04867576062679291, "learning_rate": 7.426359462919527e-05, "loss": 0.186, "step": 9226 }, { "epoch": 1.8593592585129963, "grad_norm": 0.055768147110939026, "learning_rate": 7.425194155177377e-05, "loss": 0.1948, "step": 9228 }, { "epoch": 1.8597622405802943, "grad_norm": 0.055632948875427246, "learning_rate": 7.424028675153313e-05, "loss": 0.1917, "step": 9230 }, { "epoch": 1.8601652226475922, "grad_norm": 0.03492635115981102, "learning_rate": 7.422863022930128e-05, "loss": 0.1538, "step": 9232 }, { "epoch": 1.8605682047148902, "grad_norm": 0.1014142632484436, "learning_rate": 7.421697198590628e-05, "loss": 0.1698, "step": 9234 }, { "epoch": 1.860971186782188, "grad_norm": 0.05396107956767082, "learning_rate": 7.420531202217634e-05, "loss": 0.2376, "step": 9236 }, { "epoch": 1.861374168849486, "grad_norm": 0.058612339198589325, "learning_rate": 7.419365033893972e-05, "loss": 0.185, "step": 9238 }, { "epoch": 1.8617771509167842, "grad_norm": 0.07215067744255066, "learning_rate": 7.418198693702489e-05, "loss": 0.1545, "step": 9240 }, { "epoch": 1.8621801329840824, "grad_norm": 0.05233979597687721, "learning_rate": 7.417032181726038e-05, "loss": 0.1979, "step": 9242 }, { "epoch": 1.8625831150513803, "grad_norm": 0.03794995695352554, "learning_rate": 7.415865498047485e-05, "loss": 0.1615, "step": 9244 }, { "epoch": 1.8629860971186782, "grad_norm": 0.049930084496736526, "learning_rate": 7.414698642749712e-05, "loss": 0.1539, "step": 9246 }, { "epoch": 1.8633890791859762, "grad_norm": 0.05968122184276581, "learning_rate": 7.413531615915609e-05, "loss": 0.1558, "step": 9248 }, { "epoch": 1.8637920612532741, "grad_norm": 0.038739725947380066, "learning_rate": 7.41236441762808e-05, "loss": 0.1648, "step": 9250 }, { "epoch": 1.8641950433205723, "grad_norm": 0.06096603721380234, "learning_rate": 7.41119704797004e-05, "loss": 0.1747, "step": 9252 }, { "epoch": 1.8645980253878702, "grad_norm": 0.06738614290952682, "learning_rate": 7.410029507024418e-05, "loss": 0.2019, "step": 9254 }, { "epoch": 1.8650010074551684, "grad_norm": 0.05740448087453842, "learning_rate": 7.408861794874155e-05, "loss": 0.1431, "step": 9256 }, { "epoch": 1.8654039895224663, "grad_norm": 0.07034429907798767, "learning_rate": 7.407693911602201e-05, "loss": 0.18, "step": 9258 }, { "epoch": 1.8658069715897643, "grad_norm": 0.05217040330171585, "learning_rate": 7.406525857291523e-05, "loss": 0.2113, "step": 9260 }, { "epoch": 1.8662099536570622, "grad_norm": 0.05973799526691437, "learning_rate": 7.405357632025097e-05, "loss": 0.1966, "step": 9262 }, { "epoch": 1.8666129357243602, "grad_norm": 0.06974881887435913, "learning_rate": 7.40418923588591e-05, "loss": 0.2153, "step": 9264 }, { "epoch": 1.8670159177916583, "grad_norm": 0.08291905373334885, "learning_rate": 7.403020668956967e-05, "loss": 0.1821, "step": 9266 }, { "epoch": 1.8674188998589563, "grad_norm": 0.04557829722762108, "learning_rate": 7.401851931321278e-05, "loss": 0.2295, "step": 9268 }, { "epoch": 1.8678218819262544, "grad_norm": 0.04647723585367203, "learning_rate": 7.400683023061868e-05, "loss": 0.2267, "step": 9270 }, { "epoch": 1.8682248639935524, "grad_norm": 0.0661553293466568, "learning_rate": 7.399513944261776e-05, "loss": 0.2783, "step": 9272 }, { "epoch": 1.8686278460608503, "grad_norm": 0.03786701336503029, "learning_rate": 7.398344695004051e-05, "loss": 0.1326, "step": 9274 }, { "epoch": 1.8690308281281482, "grad_norm": 0.05944419279694557, "learning_rate": 7.397175275371754e-05, "loss": 0.1802, "step": 9276 }, { "epoch": 1.8694338101954462, "grad_norm": 0.042872168123722076, "learning_rate": 7.39600568544796e-05, "loss": 0.1665, "step": 9278 }, { "epoch": 1.8698367922627444, "grad_norm": 0.03952530026435852, "learning_rate": 7.394835925315753e-05, "loss": 0.2041, "step": 9280 }, { "epoch": 1.8702397743300423, "grad_norm": 0.045997776091098785, "learning_rate": 7.393665995058232e-05, "loss": 0.2029, "step": 9282 }, { "epoch": 1.8706427563973405, "grad_norm": 0.041152164340019226, "learning_rate": 7.392495894758508e-05, "loss": 0.1887, "step": 9284 }, { "epoch": 1.8710457384646384, "grad_norm": 0.057696882635354996, "learning_rate": 7.3913256244997e-05, "loss": 0.2322, "step": 9286 }, { "epoch": 1.8714487205319363, "grad_norm": 0.05366590991616249, "learning_rate": 7.390155184364944e-05, "loss": 0.1969, "step": 9288 }, { "epoch": 1.8718517025992343, "grad_norm": 0.05442323908209801, "learning_rate": 7.388984574437388e-05, "loss": 0.2094, "step": 9290 }, { "epoch": 1.8722546846665322, "grad_norm": 0.040767259895801544, "learning_rate": 7.387813794800187e-05, "loss": 0.1648, "step": 9292 }, { "epoch": 1.8726576667338304, "grad_norm": 0.04274585098028183, "learning_rate": 7.386642845536513e-05, "loss": 0.2038, "step": 9294 }, { "epoch": 1.8730606488011283, "grad_norm": 0.05937502160668373, "learning_rate": 7.385471726729549e-05, "loss": 0.2133, "step": 9296 }, { "epoch": 1.8734636308684265, "grad_norm": 0.04178241640329361, "learning_rate": 7.384300438462488e-05, "loss": 0.226, "step": 9298 }, { "epoch": 1.8738666129357244, "grad_norm": 0.04841248691082001, "learning_rate": 7.383128980818538e-05, "loss": 0.1665, "step": 9300 }, { "epoch": 1.8742695950030224, "grad_norm": 0.05001138895750046, "learning_rate": 7.381957353880916e-05, "loss": 0.2038, "step": 9302 }, { "epoch": 1.8746725770703203, "grad_norm": 0.047811415046453476, "learning_rate": 7.380785557732851e-05, "loss": 0.1678, "step": 9304 }, { "epoch": 1.8750755591376183, "grad_norm": 0.044050272554159164, "learning_rate": 7.37961359245759e-05, "loss": 0.215, "step": 9306 }, { "epoch": 1.8754785412049164, "grad_norm": 0.05505215376615524, "learning_rate": 7.378441458138383e-05, "loss": 0.1941, "step": 9308 }, { "epoch": 1.8758815232722144, "grad_norm": 0.06320565193891525, "learning_rate": 7.3772691548585e-05, "loss": 0.2459, "step": 9310 }, { "epoch": 1.8762845053395125, "grad_norm": 0.04323972761631012, "learning_rate": 7.376096682701217e-05, "loss": 0.2206, "step": 9312 }, { "epoch": 1.8766874874068105, "grad_norm": 0.054532576352357864, "learning_rate": 7.374924041749826e-05, "loss": 0.2011, "step": 9314 }, { "epoch": 1.8770904694741084, "grad_norm": 0.08867479860782623, "learning_rate": 7.373751232087629e-05, "loss": 0.1648, "step": 9316 }, { "epoch": 1.8774934515414063, "grad_norm": 0.050876423716545105, "learning_rate": 7.372578253797942e-05, "loss": 0.207, "step": 9318 }, { "epoch": 1.8778964336087043, "grad_norm": 0.03266465291380882, "learning_rate": 7.371405106964089e-05, "loss": 0.1775, "step": 9320 }, { "epoch": 1.8782994156760024, "grad_norm": 0.0411655530333519, "learning_rate": 7.37023179166941e-05, "loss": 0.2063, "step": 9322 }, { "epoch": 1.8787023977433004, "grad_norm": 0.04222915321588516, "learning_rate": 7.369058307997255e-05, "loss": 0.1833, "step": 9324 }, { "epoch": 1.8791053798105986, "grad_norm": 0.050624068826436996, "learning_rate": 7.367884656030987e-05, "loss": 0.1812, "step": 9326 }, { "epoch": 1.8795083618778965, "grad_norm": 0.04637574031949043, "learning_rate": 7.366710835853979e-05, "loss": 0.1842, "step": 9328 }, { "epoch": 1.8799113439451944, "grad_norm": 0.0690167099237442, "learning_rate": 7.36553684754962e-05, "loss": 0.1723, "step": 9330 }, { "epoch": 1.8803143260124924, "grad_norm": 0.05005061626434326, "learning_rate": 7.364362691201305e-05, "loss": 0.2336, "step": 9332 }, { "epoch": 1.8807173080797903, "grad_norm": 0.046944767236709595, "learning_rate": 7.363188366892445e-05, "loss": 0.2091, "step": 9334 }, { "epoch": 1.8811202901470885, "grad_norm": 0.05391280725598335, "learning_rate": 7.362013874706465e-05, "loss": 0.1837, "step": 9336 }, { "epoch": 1.8815232722143864, "grad_norm": 0.04615011438727379, "learning_rate": 7.360839214726796e-05, "loss": 0.215, "step": 9338 }, { "epoch": 1.8819262542816846, "grad_norm": 0.05353038012981415, "learning_rate": 7.359664387036884e-05, "loss": 0.1675, "step": 9340 }, { "epoch": 1.8823292363489825, "grad_norm": 0.0974080041050911, "learning_rate": 7.358489391720188e-05, "loss": 0.2209, "step": 9342 }, { "epoch": 1.8827322184162805, "grad_norm": 0.04070080816745758, "learning_rate": 7.357314228860177e-05, "loss": 0.1567, "step": 9344 }, { "epoch": 1.8831352004835784, "grad_norm": 0.05149964988231659, "learning_rate": 7.356138898540333e-05, "loss": 0.2078, "step": 9346 }, { "epoch": 1.8835381825508763, "grad_norm": 0.057487696409225464, "learning_rate": 7.354963400844151e-05, "loss": 0.2376, "step": 9348 }, { "epoch": 1.8839411646181745, "grad_norm": 0.04620504751801491, "learning_rate": 7.353787735855135e-05, "loss": 0.1923, "step": 9350 }, { "epoch": 1.8843441466854725, "grad_norm": 0.058796484023332596, "learning_rate": 7.352611903656802e-05, "loss": 0.2053, "step": 9352 }, { "epoch": 1.8847471287527706, "grad_norm": 0.04183940589427948, "learning_rate": 7.351435904332682e-05, "loss": 0.1519, "step": 9354 }, { "epoch": 1.8851501108200686, "grad_norm": 0.037236157804727554, "learning_rate": 7.350259737966317e-05, "loss": 0.1911, "step": 9356 }, { "epoch": 1.8855530928873665, "grad_norm": 0.05059423670172691, "learning_rate": 7.349083404641257e-05, "loss": 0.254, "step": 9358 }, { "epoch": 1.8859560749546644, "grad_norm": 0.05843758583068848, "learning_rate": 7.347906904441068e-05, "loss": 0.1811, "step": 9360 }, { "epoch": 1.8863590570219624, "grad_norm": 0.08169310539960861, "learning_rate": 7.34673023744933e-05, "loss": 0.1997, "step": 9362 }, { "epoch": 1.8867620390892605, "grad_norm": 0.03737678378820419, "learning_rate": 7.345553403749628e-05, "loss": 0.1459, "step": 9364 }, { "epoch": 1.8871650211565585, "grad_norm": 0.05280197039246559, "learning_rate": 7.344376403425563e-05, "loss": 0.2147, "step": 9366 }, { "epoch": 1.8875680032238566, "grad_norm": 0.04864995554089546, "learning_rate": 7.343199236560748e-05, "loss": 0.2175, "step": 9368 }, { "epoch": 1.8879709852911546, "grad_norm": 0.03643433377146721, "learning_rate": 7.342021903238808e-05, "loss": 0.1972, "step": 9370 }, { "epoch": 1.8883739673584525, "grad_norm": 0.04789574071764946, "learning_rate": 7.340844403543375e-05, "loss": 0.2055, "step": 9372 }, { "epoch": 1.8887769494257505, "grad_norm": 0.04603145644068718, "learning_rate": 7.3396667375581e-05, "loss": 0.1673, "step": 9374 }, { "epoch": 1.8891799314930484, "grad_norm": 0.041253577917814255, "learning_rate": 7.338488905366642e-05, "loss": 0.2102, "step": 9376 }, { "epoch": 1.8895829135603466, "grad_norm": 0.046039849519729614, "learning_rate": 7.337310907052672e-05, "loss": 0.1839, "step": 9378 }, { "epoch": 1.8899858956276445, "grad_norm": 0.03352159634232521, "learning_rate": 7.336132742699873e-05, "loss": 0.2224, "step": 9380 }, { "epoch": 1.8903888776949427, "grad_norm": 0.04696164280176163, "learning_rate": 7.33495441239194e-05, "loss": 0.1989, "step": 9382 }, { "epoch": 1.8907918597622406, "grad_norm": 0.03370807319879532, "learning_rate": 7.33377591621258e-05, "loss": 0.1577, "step": 9384 }, { "epoch": 1.8911948418295386, "grad_norm": 0.039031196385622025, "learning_rate": 7.33259725424551e-05, "loss": 0.2015, "step": 9386 }, { "epoch": 1.8915978238968365, "grad_norm": 0.04066183418035507, "learning_rate": 7.331418426574464e-05, "loss": 0.1625, "step": 9388 }, { "epoch": 1.8920008059641344, "grad_norm": 0.04581267014145851, "learning_rate": 7.330239433283179e-05, "loss": 0.1837, "step": 9390 }, { "epoch": 1.8924037880314326, "grad_norm": 0.05180869251489639, "learning_rate": 7.329060274455412e-05, "loss": 0.1885, "step": 9392 }, { "epoch": 1.8928067700987306, "grad_norm": 0.047756217420101166, "learning_rate": 7.32788095017493e-05, "loss": 0.2492, "step": 9394 }, { "epoch": 1.8932097521660287, "grad_norm": 0.03515281155705452, "learning_rate": 7.326701460525506e-05, "loss": 0.1625, "step": 9396 }, { "epoch": 1.8936127342333267, "grad_norm": 0.05632800981402397, "learning_rate": 7.325521805590932e-05, "loss": 0.1943, "step": 9398 }, { "epoch": 1.8940157163006246, "grad_norm": 0.04352201148867607, "learning_rate": 7.324341985455008e-05, "loss": 0.2154, "step": 9400 }, { "epoch": 1.8944186983679225, "grad_norm": 0.04481047764420509, "learning_rate": 7.323162000201547e-05, "loss": 0.2175, "step": 9402 }, { "epoch": 1.8948216804352205, "grad_norm": 0.0542139895260334, "learning_rate": 7.321981849914372e-05, "loss": 0.1365, "step": 9404 }, { "epoch": 1.8952246625025186, "grad_norm": 0.044655684381723404, "learning_rate": 7.32080153467732e-05, "loss": 0.2363, "step": 9406 }, { "epoch": 1.8956276445698168, "grad_norm": 0.045591697096824646, "learning_rate": 7.319621054574239e-05, "loss": 0.1815, "step": 9408 }, { "epoch": 1.8960306266371147, "grad_norm": 0.05051286518573761, "learning_rate": 7.318440409688988e-05, "loss": 0.2434, "step": 9410 }, { "epoch": 1.8964336087044127, "grad_norm": 0.04740666598081589, "learning_rate": 7.317259600105437e-05, "loss": 0.2038, "step": 9412 }, { "epoch": 1.8968365907717106, "grad_norm": 0.08768098801374435, "learning_rate": 7.31607862590747e-05, "loss": 0.1866, "step": 9414 }, { "epoch": 1.8972395728390086, "grad_norm": 0.055649664252996445, "learning_rate": 7.314897487178985e-05, "loss": 0.1873, "step": 9416 }, { "epoch": 1.8976425549063065, "grad_norm": 0.053928524255752563, "learning_rate": 7.313716184003881e-05, "loss": 0.2381, "step": 9418 }, { "epoch": 1.8980455369736047, "grad_norm": 0.06719769537448883, "learning_rate": 7.312534716466079e-05, "loss": 0.1988, "step": 9420 }, { "epoch": 1.8984485190409028, "grad_norm": 0.03923649340867996, "learning_rate": 7.311353084649511e-05, "loss": 0.1837, "step": 9422 }, { "epoch": 1.8988515011082008, "grad_norm": 0.05952714383602142, "learning_rate": 7.310171288638116e-05, "loss": 0.1996, "step": 9424 }, { "epoch": 1.8992544831754987, "grad_norm": 0.03760204464197159, "learning_rate": 7.308989328515847e-05, "loss": 0.1746, "step": 9426 }, { "epoch": 1.8996574652427967, "grad_norm": 0.04612984135746956, "learning_rate": 7.30780720436667e-05, "loss": 0.2176, "step": 9428 }, { "epoch": 1.9000604473100946, "grad_norm": 0.04813413694500923, "learning_rate": 7.306624916274557e-05, "loss": 0.1552, "step": 9430 }, { "epoch": 1.9004634293773925, "grad_norm": 0.060743726789951324, "learning_rate": 7.3054424643235e-05, "loss": 0.2085, "step": 9432 }, { "epoch": 1.9008664114446907, "grad_norm": 0.05094519630074501, "learning_rate": 7.3042598485975e-05, "loss": 0.1934, "step": 9434 }, { "epoch": 1.9012693935119889, "grad_norm": 0.060220785439014435, "learning_rate": 7.303077069180562e-05, "loss": 0.2007, "step": 9436 }, { "epoch": 1.9016723755792868, "grad_norm": 0.04000959172844887, "learning_rate": 7.301894126156713e-05, "loss": 0.2259, "step": 9438 }, { "epoch": 1.9020753576465848, "grad_norm": 0.04038378223776817, "learning_rate": 7.300711019609989e-05, "loss": 0.1858, "step": 9440 }, { "epoch": 1.9024783397138827, "grad_norm": 0.05266295000910759, "learning_rate": 7.299527749624431e-05, "loss": 0.2069, "step": 9442 }, { "epoch": 1.9028813217811806, "grad_norm": 0.045283444225788116, "learning_rate": 7.2983443162841e-05, "loss": 0.2187, "step": 9444 }, { "epoch": 1.9032843038484788, "grad_norm": 0.04584207758307457, "learning_rate": 7.297160719673064e-05, "loss": 0.1718, "step": 9446 }, { "epoch": 1.9036872859157767, "grad_norm": 0.049773987382650375, "learning_rate": 7.295976959875406e-05, "loss": 0.1761, "step": 9448 }, { "epoch": 1.904090267983075, "grad_norm": 0.04451719671487808, "learning_rate": 7.294793036975214e-05, "loss": 0.192, "step": 9450 }, { "epoch": 1.9044932500503728, "grad_norm": 0.04054448753595352, "learning_rate": 7.293608951056596e-05, "loss": 0.1766, "step": 9452 }, { "epoch": 1.9048962321176708, "grad_norm": 0.05210372805595398, "learning_rate": 7.292424702203666e-05, "loss": 0.2294, "step": 9454 }, { "epoch": 1.9052992141849687, "grad_norm": 0.03780307248234749, "learning_rate": 7.291240290500551e-05, "loss": 0.1625, "step": 9456 }, { "epoch": 1.9057021962522667, "grad_norm": 0.03918739780783653, "learning_rate": 7.290055716031392e-05, "loss": 0.1559, "step": 9458 }, { "epoch": 1.9061051783195648, "grad_norm": 0.050934337079524994, "learning_rate": 7.288870978880336e-05, "loss": 0.1769, "step": 9460 }, { "epoch": 1.9065081603868628, "grad_norm": 0.08649832010269165, "learning_rate": 7.287686079131548e-05, "loss": 0.2055, "step": 9462 }, { "epoch": 1.906911142454161, "grad_norm": 0.06893979012966156, "learning_rate": 7.286501016869197e-05, "loss": 0.2656, "step": 9464 }, { "epoch": 1.9073141245214589, "grad_norm": 0.035416193306446075, "learning_rate": 7.28531579217747e-05, "loss": 0.1738, "step": 9466 }, { "epoch": 1.9077171065887568, "grad_norm": 0.035585805773735046, "learning_rate": 7.284130405140565e-05, "loss": 0.1822, "step": 9468 }, { "epoch": 1.9081200886560548, "grad_norm": 0.06133156269788742, "learning_rate": 7.28294485584269e-05, "loss": 0.1803, "step": 9470 }, { "epoch": 1.9085230707233527, "grad_norm": 0.07323663681745529, "learning_rate": 7.281759144368062e-05, "loss": 0.1774, "step": 9472 }, { "epoch": 1.9089260527906509, "grad_norm": 0.07056137174367905, "learning_rate": 7.280573270800914e-05, "loss": 0.1799, "step": 9474 }, { "epoch": 1.9093290348579488, "grad_norm": 0.03919363394379616, "learning_rate": 7.279387235225488e-05, "loss": 0.1497, "step": 9476 }, { "epoch": 1.909732016925247, "grad_norm": 0.04808547720313072, "learning_rate": 7.278201037726038e-05, "loss": 0.1636, "step": 9478 }, { "epoch": 1.910134998992545, "grad_norm": 0.06041782721877098, "learning_rate": 7.277014678386831e-05, "loss": 0.221, "step": 9480 }, { "epoch": 1.9105379810598428, "grad_norm": 0.06125279888510704, "learning_rate": 7.275828157292142e-05, "loss": 0.1225, "step": 9482 }, { "epoch": 1.9109409631271408, "grad_norm": 0.04989524930715561, "learning_rate": 7.274641474526259e-05, "loss": 0.2219, "step": 9484 }, { "epoch": 1.9113439451944387, "grad_norm": 0.04934917762875557, "learning_rate": 7.273454630173485e-05, "loss": 0.2038, "step": 9486 }, { "epoch": 1.911746927261737, "grad_norm": 0.054218970239162445, "learning_rate": 7.27226762431813e-05, "loss": 0.215, "step": 9488 }, { "epoch": 1.9121499093290348, "grad_norm": 0.04823169857263565, "learning_rate": 7.271080457044515e-05, "loss": 0.1865, "step": 9490 }, { "epoch": 1.912552891396333, "grad_norm": 0.043067727237939835, "learning_rate": 7.26989312843698e-05, "loss": 0.2181, "step": 9492 }, { "epoch": 1.912955873463631, "grad_norm": 0.049720462411642075, "learning_rate": 7.268705638579865e-05, "loss": 0.2346, "step": 9494 }, { "epoch": 1.9133588555309289, "grad_norm": 0.05683385208249092, "learning_rate": 7.267517987557528e-05, "loss": 0.203, "step": 9496 }, { "epoch": 1.9137618375982268, "grad_norm": 0.044213853776454926, "learning_rate": 7.266330175454342e-05, "loss": 0.1637, "step": 9498 }, { "epoch": 1.9141648196655248, "grad_norm": 0.042827364057302475, "learning_rate": 7.265142202354684e-05, "loss": 0.2257, "step": 9500 }, { "epoch": 1.914567801732823, "grad_norm": 0.05701254680752754, "learning_rate": 7.263954068342946e-05, "loss": 0.2565, "step": 9502 }, { "epoch": 1.9149707838001209, "grad_norm": 0.04864765703678131, "learning_rate": 7.262765773503534e-05, "loss": 0.191, "step": 9504 }, { "epoch": 1.915373765867419, "grad_norm": 0.04549378156661987, "learning_rate": 7.261577317920857e-05, "loss": 0.147, "step": 9506 }, { "epoch": 1.915776747934717, "grad_norm": 0.04124518856406212, "learning_rate": 7.260388701679345e-05, "loss": 0.1651, "step": 9508 }, { "epoch": 1.916179730002015, "grad_norm": 0.055826831609010696, "learning_rate": 7.259199924863437e-05, "loss": 0.2172, "step": 9510 }, { "epoch": 1.9165827120693129, "grad_norm": 0.05360211059451103, "learning_rate": 7.258010987557577e-05, "loss": 0.2298, "step": 9512 }, { "epoch": 1.9169856941366108, "grad_norm": 0.047606151551008224, "learning_rate": 7.256821889846228e-05, "loss": 0.2104, "step": 9514 }, { "epoch": 1.917388676203909, "grad_norm": 0.049462106078863144, "learning_rate": 7.255632631813862e-05, "loss": 0.1967, "step": 9516 }, { "epoch": 1.917791658271207, "grad_norm": 0.04005942866206169, "learning_rate": 7.254443213544962e-05, "loss": 0.1788, "step": 9518 }, { "epoch": 1.918194640338505, "grad_norm": 0.048854950815439224, "learning_rate": 7.253253635124018e-05, "loss": 0.1889, "step": 9520 }, { "epoch": 1.918597622405803, "grad_norm": 0.049189258366823196, "learning_rate": 7.252063896635543e-05, "loss": 0.1683, "step": 9522 }, { "epoch": 1.919000604473101, "grad_norm": 0.06358414888381958, "learning_rate": 7.250873998164049e-05, "loss": 0.2469, "step": 9524 }, { "epoch": 1.9194035865403989, "grad_norm": 0.061269596219062805, "learning_rate": 7.249683939794065e-05, "loss": 0.1833, "step": 9526 }, { "epoch": 1.9198065686076968, "grad_norm": 0.045860689133405685, "learning_rate": 7.248493721610134e-05, "loss": 0.2043, "step": 9528 }, { "epoch": 1.920209550674995, "grad_norm": 0.04932362958788872, "learning_rate": 7.247303343696803e-05, "loss": 0.2568, "step": 9530 }, { "epoch": 1.920612532742293, "grad_norm": 0.04170341044664383, "learning_rate": 7.246112806138637e-05, "loss": 0.2479, "step": 9532 }, { "epoch": 1.921015514809591, "grad_norm": 0.038920141756534576, "learning_rate": 7.244922109020209e-05, "loss": 0.147, "step": 9534 }, { "epoch": 1.921418496876889, "grad_norm": 0.049470383673906326, "learning_rate": 7.243731252426105e-05, "loss": 0.2193, "step": 9536 }, { "epoch": 1.921821478944187, "grad_norm": 0.0644969716668129, "learning_rate": 7.242540236440922e-05, "loss": 0.176, "step": 9538 }, { "epoch": 1.922224461011485, "grad_norm": 0.035150207579135895, "learning_rate": 7.241349061149265e-05, "loss": 0.1792, "step": 9540 }, { "epoch": 1.9226274430787829, "grad_norm": 0.04241754859685898, "learning_rate": 7.240157726635757e-05, "loss": 0.2179, "step": 9542 }, { "epoch": 1.923030425146081, "grad_norm": 0.044732749462127686, "learning_rate": 7.238966232985027e-05, "loss": 0.2034, "step": 9544 }, { "epoch": 1.923433407213379, "grad_norm": 0.04975948482751846, "learning_rate": 7.237774580281716e-05, "loss": 0.1864, "step": 9546 }, { "epoch": 1.9238363892806771, "grad_norm": 0.053191766142845154, "learning_rate": 7.236582768610476e-05, "loss": 0.2078, "step": 9548 }, { "epoch": 1.924239371347975, "grad_norm": 0.04343879595398903, "learning_rate": 7.235390798055975e-05, "loss": 0.1637, "step": 9550 }, { "epoch": 1.924642353415273, "grad_norm": 0.059335123747587204, "learning_rate": 7.234198668702885e-05, "loss": 0.1638, "step": 9552 }, { "epoch": 1.925045335482571, "grad_norm": 0.052407678216695786, "learning_rate": 7.233006380635897e-05, "loss": 0.1892, "step": 9554 }, { "epoch": 1.925448317549869, "grad_norm": 0.05545216426253319, "learning_rate": 7.231813933939704e-05, "loss": 0.1467, "step": 9556 }, { "epoch": 1.925851299617167, "grad_norm": 0.0498746857047081, "learning_rate": 7.23062132869902e-05, "loss": 0.2632, "step": 9558 }, { "epoch": 1.926254281684465, "grad_norm": 0.04190506041049957, "learning_rate": 7.229428564998564e-05, "loss": 0.1336, "step": 9560 }, { "epoch": 1.9266572637517632, "grad_norm": 0.04394135996699333, "learning_rate": 7.228235642923069e-05, "loss": 0.2154, "step": 9562 }, { "epoch": 1.927060245819061, "grad_norm": 0.0600002259016037, "learning_rate": 7.227042562557276e-05, "loss": 0.1992, "step": 9564 }, { "epoch": 1.927463227886359, "grad_norm": 0.06488315016031265, "learning_rate": 7.225849323985941e-05, "loss": 0.1654, "step": 9566 }, { "epoch": 1.927866209953657, "grad_norm": 0.035505276173353195, "learning_rate": 7.22465592729383e-05, "loss": 0.1619, "step": 9568 }, { "epoch": 1.928269192020955, "grad_norm": 0.055548056960105896, "learning_rate": 7.223462372565721e-05, "loss": 0.1691, "step": 9570 }, { "epoch": 1.928672174088253, "grad_norm": 0.05716124176979065, "learning_rate": 7.2222686598864e-05, "loss": 0.1839, "step": 9572 }, { "epoch": 1.929075156155551, "grad_norm": 0.07215842604637146, "learning_rate": 7.221074789340667e-05, "loss": 0.179, "step": 9574 }, { "epoch": 1.9294781382228492, "grad_norm": 0.04680660739541054, "learning_rate": 7.219880761013334e-05, "loss": 0.2074, "step": 9576 }, { "epoch": 1.9298811202901471, "grad_norm": 0.04360821843147278, "learning_rate": 7.21868657498922e-05, "loss": 0.1993, "step": 9578 }, { "epoch": 1.930284102357445, "grad_norm": 0.04149682819843292, "learning_rate": 7.217492231353164e-05, "loss": 0.2054, "step": 9580 }, { "epoch": 1.930687084424743, "grad_norm": 0.03975476324558258, "learning_rate": 7.216297730190003e-05, "loss": 0.2075, "step": 9582 }, { "epoch": 1.931090066492041, "grad_norm": 0.07294421643018723, "learning_rate": 7.215103071584596e-05, "loss": 0.2055, "step": 9584 }, { "epoch": 1.9314930485593391, "grad_norm": 0.07211649417877197, "learning_rate": 7.21390825562181e-05, "loss": 0.1824, "step": 9586 }, { "epoch": 1.931896030626637, "grad_norm": 0.04363333433866501, "learning_rate": 7.212713282386521e-05, "loss": 0.1955, "step": 9588 }, { "epoch": 1.9322990126939352, "grad_norm": 0.04804873839020729, "learning_rate": 7.21151815196362e-05, "loss": 0.2456, "step": 9590 }, { "epoch": 1.9327019947612332, "grad_norm": 0.04542528837919235, "learning_rate": 7.210322864438006e-05, "loss": 0.1742, "step": 9592 }, { "epoch": 1.933104976828531, "grad_norm": 0.04244324937462807, "learning_rate": 7.209127419894591e-05, "loss": 0.1685, "step": 9594 }, { "epoch": 1.933507958895829, "grad_norm": 0.044985342770814896, "learning_rate": 7.207931818418297e-05, "loss": 0.1957, "step": 9596 }, { "epoch": 1.933910940963127, "grad_norm": 0.04435814917087555, "learning_rate": 7.206736060094059e-05, "loss": 0.1631, "step": 9598 }, { "epoch": 1.9343139230304252, "grad_norm": 0.04762697592377663, "learning_rate": 7.205540145006818e-05, "loss": 0.2069, "step": 9600 }, { "epoch": 1.9347169050977233, "grad_norm": 0.05355559661984444, "learning_rate": 7.204344073241534e-05, "loss": 0.1931, "step": 9602 }, { "epoch": 1.9351198871650213, "grad_norm": 0.05477927625179291, "learning_rate": 7.203147844883172e-05, "loss": 0.2047, "step": 9604 }, { "epoch": 1.9355228692323192, "grad_norm": 0.04428846761584282, "learning_rate": 7.201951460016709e-05, "loss": 0.1725, "step": 9606 }, { "epoch": 1.9359258512996171, "grad_norm": 0.041742779314517975, "learning_rate": 7.200754918727137e-05, "loss": 0.166, "step": 9608 }, { "epoch": 1.936328833366915, "grad_norm": 0.04589561000466347, "learning_rate": 7.199558221099456e-05, "loss": 0.169, "step": 9610 }, { "epoch": 1.936731815434213, "grad_norm": 0.04550304263830185, "learning_rate": 7.198361367218676e-05, "loss": 0.1958, "step": 9612 }, { "epoch": 1.9371347975015112, "grad_norm": 0.047180287539958954, "learning_rate": 7.19716435716982e-05, "loss": 0.1825, "step": 9614 }, { "epoch": 1.9375377795688093, "grad_norm": 0.05457916855812073, "learning_rate": 7.195967191037922e-05, "loss": 0.2018, "step": 9616 }, { "epoch": 1.9379407616361073, "grad_norm": 0.04042569547891617, "learning_rate": 7.194769868908026e-05, "loss": 0.2112, "step": 9618 }, { "epoch": 1.9383437437034052, "grad_norm": 0.042039696127176285, "learning_rate": 7.19357239086519e-05, "loss": 0.173, "step": 9620 }, { "epoch": 1.9387467257707032, "grad_norm": 0.03700724244117737, "learning_rate": 7.192374756994477e-05, "loss": 0.1802, "step": 9622 }, { "epoch": 1.9391497078380011, "grad_norm": 0.06561025232076645, "learning_rate": 7.19117696738097e-05, "loss": 0.1846, "step": 9624 }, { "epoch": 1.939552689905299, "grad_norm": 0.06743749976158142, "learning_rate": 7.189979022109755e-05, "loss": 0.1871, "step": 9626 }, { "epoch": 1.9399556719725972, "grad_norm": 0.06319016963243484, "learning_rate": 7.188780921265932e-05, "loss": 0.2142, "step": 9628 }, { "epoch": 1.9403586540398954, "grad_norm": 0.06255649775266647, "learning_rate": 7.187582664934613e-05, "loss": 0.1843, "step": 9630 }, { "epoch": 1.9407616361071933, "grad_norm": 0.04725359007716179, "learning_rate": 7.186384253200919e-05, "loss": 0.166, "step": 9632 }, { "epoch": 1.9411646181744913, "grad_norm": 0.08775375038385391, "learning_rate": 7.185185686149987e-05, "loss": 0.1707, "step": 9634 }, { "epoch": 1.9415676002417892, "grad_norm": 0.06302014738321304, "learning_rate": 7.183986963866955e-05, "loss": 0.2212, "step": 9636 }, { "epoch": 1.9419705823090871, "grad_norm": 0.05067121982574463, "learning_rate": 7.182788086436985e-05, "loss": 0.1931, "step": 9638 }, { "epoch": 1.942373564376385, "grad_norm": 0.05320592224597931, "learning_rate": 7.181589053945239e-05, "loss": 0.1897, "step": 9640 }, { "epoch": 1.9427765464436832, "grad_norm": 0.07704076915979385, "learning_rate": 7.180389866476895e-05, "loss": 0.2012, "step": 9642 }, { "epoch": 1.9431795285109814, "grad_norm": 0.03068099170923233, "learning_rate": 7.179190524117143e-05, "loss": 0.1731, "step": 9644 }, { "epoch": 1.9435825105782794, "grad_norm": 0.0580390989780426, "learning_rate": 7.177991026951179e-05, "loss": 0.1883, "step": 9646 }, { "epoch": 1.9439854926455773, "grad_norm": 0.0502253882586956, "learning_rate": 7.176791375064217e-05, "loss": 0.1973, "step": 9648 }, { "epoch": 1.9443884747128752, "grad_norm": 0.048907410353422165, "learning_rate": 7.175591568541479e-05, "loss": 0.201, "step": 9650 }, { "epoch": 1.9447914567801732, "grad_norm": 0.056508488953113556, "learning_rate": 7.174391607468193e-05, "loss": 0.1606, "step": 9652 }, { "epoch": 1.9451944388474713, "grad_norm": 0.037321534007787704, "learning_rate": 7.173191491929605e-05, "loss": 0.1595, "step": 9654 }, { "epoch": 1.9455974209147693, "grad_norm": 0.04397205635905266, "learning_rate": 7.17199122201097e-05, "loss": 0.2088, "step": 9656 }, { "epoch": 1.9460004029820674, "grad_norm": 0.041947152465581894, "learning_rate": 7.17079079779755e-05, "loss": 0.1825, "step": 9658 }, { "epoch": 1.9464033850493654, "grad_norm": 0.053206928074359894, "learning_rate": 7.169590219374625e-05, "loss": 0.2055, "step": 9660 }, { "epoch": 1.9468063671166633, "grad_norm": 0.04078378155827522, "learning_rate": 7.16838948682748e-05, "loss": 0.1977, "step": 9662 }, { "epoch": 1.9472093491839613, "grad_norm": 0.08155234903097153, "learning_rate": 7.167188600241413e-05, "loss": 0.2538, "step": 9664 }, { "epoch": 1.9476123312512592, "grad_norm": 0.04778615012764931, "learning_rate": 7.165987559701735e-05, "loss": 0.1354, "step": 9666 }, { "epoch": 1.9480153133185574, "grad_norm": 0.05697598680853844, "learning_rate": 7.164786365293765e-05, "loss": 0.2056, "step": 9668 }, { "epoch": 1.9484182953858553, "grad_norm": 0.035130467265844345, "learning_rate": 7.163585017102833e-05, "loss": 0.2103, "step": 9670 }, { "epoch": 1.9488212774531535, "grad_norm": 0.06631353497505188, "learning_rate": 7.162383515214281e-05, "loss": 0.2048, "step": 9672 }, { "epoch": 1.9492242595204514, "grad_norm": 0.04498956725001335, "learning_rate": 7.161181859713463e-05, "loss": 0.2305, "step": 9674 }, { "epoch": 1.9496272415877494, "grad_norm": 0.04058938845992088, "learning_rate": 7.159980050685742e-05, "loss": 0.2126, "step": 9676 }, { "epoch": 1.9500302236550473, "grad_norm": 0.0352054089307785, "learning_rate": 7.158778088216494e-05, "loss": 0.1872, "step": 9678 }, { "epoch": 1.9504332057223452, "grad_norm": 0.03873911872506142, "learning_rate": 7.1575759723911e-05, "loss": 0.1815, "step": 9680 }, { "epoch": 1.9508361877896434, "grad_norm": 0.030585993081331253, "learning_rate": 7.156373703294961e-05, "loss": 0.15, "step": 9682 }, { "epoch": 1.9512391698569413, "grad_norm": 0.059601254761219025, "learning_rate": 7.155171281013483e-05, "loss": 0.189, "step": 9684 }, { "epoch": 1.9516421519242395, "grad_norm": 0.04292791336774826, "learning_rate": 7.153968705632083e-05, "loss": 0.2102, "step": 9686 }, { "epoch": 1.9520451339915375, "grad_norm": 0.05211935192346573, "learning_rate": 7.152765977236191e-05, "loss": 0.1977, "step": 9688 }, { "epoch": 1.9524481160588354, "grad_norm": 0.03731980919837952, "learning_rate": 7.15156309591125e-05, "loss": 0.1749, "step": 9690 }, { "epoch": 1.9528510981261333, "grad_norm": 0.047702062875032425, "learning_rate": 7.150360061742702e-05, "loss": 0.2043, "step": 9692 }, { "epoch": 1.9532540801934313, "grad_norm": 0.048761576414108276, "learning_rate": 7.149156874816018e-05, "loss": 0.2322, "step": 9694 }, { "epoch": 1.9536570622607294, "grad_norm": 0.07745224982500076, "learning_rate": 7.147953535216666e-05, "loss": 0.2188, "step": 9696 }, { "epoch": 1.9540600443280274, "grad_norm": 0.06417658925056458, "learning_rate": 7.14675004303013e-05, "loss": 0.2092, "step": 9698 }, { "epoch": 1.9544630263953255, "grad_norm": 0.04546617716550827, "learning_rate": 7.145546398341903e-05, "loss": 0.1456, "step": 9700 }, { "epoch": 1.9548660084626235, "grad_norm": 0.07141395658254623, "learning_rate": 7.144342601237493e-05, "loss": 0.2023, "step": 9702 }, { "epoch": 1.9552689905299214, "grad_norm": 0.06302723288536072, "learning_rate": 7.143138651802412e-05, "loss": 0.1423, "step": 9704 }, { "epoch": 1.9556719725972194, "grad_norm": 0.050656940788030624, "learning_rate": 7.14193455012219e-05, "loss": 0.174, "step": 9706 }, { "epoch": 1.9560749546645173, "grad_norm": 0.05971457064151764, "learning_rate": 7.140730296282363e-05, "loss": 0.194, "step": 9708 }, { "epoch": 1.9564779367318155, "grad_norm": 0.07366830855607986, "learning_rate": 7.139525890368479e-05, "loss": 0.1877, "step": 9710 }, { "epoch": 1.9568809187991134, "grad_norm": 0.04481309652328491, "learning_rate": 7.138321332466097e-05, "loss": 0.2125, "step": 9712 }, { "epoch": 1.9572839008664116, "grad_norm": 0.0536804161965847, "learning_rate": 7.137116622660788e-05, "loss": 0.2313, "step": 9714 }, { "epoch": 1.9576868829337095, "grad_norm": 0.05160369351506233, "learning_rate": 7.135911761038132e-05, "loss": 0.2034, "step": 9716 }, { "epoch": 1.9580898650010075, "grad_norm": 0.04648306593298912, "learning_rate": 7.13470674768372e-05, "loss": 0.1625, "step": 9718 }, { "epoch": 1.9584928470683054, "grad_norm": 0.06373865157365799, "learning_rate": 7.133501582683155e-05, "loss": 0.1981, "step": 9720 }, { "epoch": 1.9588958291356033, "grad_norm": 0.07009351998567581, "learning_rate": 7.132296266122049e-05, "loss": 0.2067, "step": 9722 }, { "epoch": 1.9592988112029015, "grad_norm": 0.06840388476848602, "learning_rate": 7.131090798086026e-05, "loss": 0.2469, "step": 9724 }, { "epoch": 1.9597017932701994, "grad_norm": 0.04283274710178375, "learning_rate": 7.129885178660722e-05, "loss": 0.1898, "step": 9726 }, { "epoch": 1.9601047753374976, "grad_norm": 0.04922349750995636, "learning_rate": 7.128679407931781e-05, "loss": 0.2174, "step": 9728 }, { "epoch": 1.9605077574047955, "grad_norm": 0.049361422657966614, "learning_rate": 7.127473485984859e-05, "loss": 0.2225, "step": 9730 }, { "epoch": 1.9609107394720935, "grad_norm": 0.05047673359513283, "learning_rate": 7.126267412905623e-05, "loss": 0.1927, "step": 9732 }, { "epoch": 1.9613137215393914, "grad_norm": 0.05042770877480507, "learning_rate": 7.125061188779751e-05, "loss": 0.2007, "step": 9734 }, { "epoch": 1.9617167036066894, "grad_norm": 0.05922012776136398, "learning_rate": 7.123854813692929e-05, "loss": 0.1867, "step": 9736 }, { "epoch": 1.9621196856739875, "grad_norm": 0.0502203106880188, "learning_rate": 7.122648287730859e-05, "loss": 0.2325, "step": 9738 }, { "epoch": 1.9625226677412855, "grad_norm": 0.04548550397157669, "learning_rate": 7.12144161097925e-05, "loss": 0.1761, "step": 9740 }, { "epoch": 1.9629256498085836, "grad_norm": 0.04983745142817497, "learning_rate": 7.12023478352382e-05, "loss": 0.1964, "step": 9742 }, { "epoch": 1.9633286318758816, "grad_norm": 0.06270463019609451, "learning_rate": 7.119027805450301e-05, "loss": 0.1915, "step": 9744 }, { "epoch": 1.9637316139431795, "grad_norm": 0.05191759392619133, "learning_rate": 7.117820676844437e-05, "loss": 0.2485, "step": 9746 }, { "epoch": 1.9641345960104775, "grad_norm": 0.061899662017822266, "learning_rate": 7.116613397791978e-05, "loss": 0.228, "step": 9748 }, { "epoch": 1.9645375780777754, "grad_norm": 0.09284082800149918, "learning_rate": 7.11540596837869e-05, "loss": 0.2192, "step": 9750 }, { "epoch": 1.9649405601450736, "grad_norm": 0.054707858711481094, "learning_rate": 7.114198388690344e-05, "loss": 0.1977, "step": 9752 }, { "epoch": 1.9653435422123715, "grad_norm": 0.05887407064437866, "learning_rate": 7.112990658812727e-05, "loss": 0.2365, "step": 9754 }, { "epoch": 1.9657465242796697, "grad_norm": 0.04575325548648834, "learning_rate": 7.111782778831632e-05, "loss": 0.1869, "step": 9756 }, { "epoch": 1.9661495063469676, "grad_norm": 0.05046350136399269, "learning_rate": 7.110574748832864e-05, "loss": 0.1999, "step": 9758 }, { "epoch": 1.9665524884142656, "grad_norm": 0.056472841650247574, "learning_rate": 7.109366568902245e-05, "loss": 0.1741, "step": 9760 }, { "epoch": 1.9669554704815635, "grad_norm": 0.07386624068021774, "learning_rate": 7.108158239125595e-05, "loss": 0.2023, "step": 9762 }, { "epoch": 1.9673584525488614, "grad_norm": 0.049812037497758865, "learning_rate": 7.106949759588757e-05, "loss": 0.191, "step": 9764 }, { "epoch": 1.9677614346161596, "grad_norm": 0.05968543142080307, "learning_rate": 7.105741130377577e-05, "loss": 0.1909, "step": 9766 }, { "epoch": 1.9681644166834575, "grad_norm": 0.04280983284115791, "learning_rate": 7.104532351577914e-05, "loss": 0.2065, "step": 9768 }, { "epoch": 1.9685673987507557, "grad_norm": 0.03398040309548378, "learning_rate": 7.10332342327564e-05, "loss": 0.1459, "step": 9770 }, { "epoch": 1.9689703808180536, "grad_norm": 0.06277387589216232, "learning_rate": 7.102114345556632e-05, "loss": 0.1871, "step": 9772 }, { "epoch": 1.9693733628853516, "grad_norm": 0.04359853267669678, "learning_rate": 7.100905118506785e-05, "loss": 0.2372, "step": 9774 }, { "epoch": 1.9697763449526495, "grad_norm": 0.04512301832437515, "learning_rate": 7.099695742211996e-05, "loss": 0.1822, "step": 9776 }, { "epoch": 1.9701793270199475, "grad_norm": 0.04159922897815704, "learning_rate": 7.09848621675818e-05, "loss": 0.2415, "step": 9778 }, { "epoch": 1.9705823090872456, "grad_norm": 0.047807469964027405, "learning_rate": 7.097276542231259e-05, "loss": 0.1432, "step": 9780 }, { "epoch": 1.9709852911545436, "grad_norm": 0.06380611658096313, "learning_rate": 7.096066718717169e-05, "loss": 0.2216, "step": 9782 }, { "epoch": 1.9713882732218417, "grad_norm": 0.04591721296310425, "learning_rate": 7.09485674630185e-05, "loss": 0.1293, "step": 9784 }, { "epoch": 1.9717912552891397, "grad_norm": 0.056421924382448196, "learning_rate": 7.093646625071256e-05, "loss": 0.2394, "step": 9786 }, { "epoch": 1.9721942373564376, "grad_norm": 0.052102234214544296, "learning_rate": 7.092436355111356e-05, "loss": 0.1894, "step": 9788 }, { "epoch": 1.9725972194237356, "grad_norm": 0.04517102986574173, "learning_rate": 7.091225936508124e-05, "loss": 0.1867, "step": 9790 }, { "epoch": 1.9730002014910335, "grad_norm": 0.04084205627441406, "learning_rate": 7.090015369347544e-05, "loss": 0.1919, "step": 9792 }, { "epoch": 1.9734031835583317, "grad_norm": 0.04202771186828613, "learning_rate": 7.088804653715617e-05, "loss": 0.173, "step": 9794 }, { "epoch": 1.9738061656256296, "grad_norm": 0.05949341878294945, "learning_rate": 7.087593789698345e-05, "loss": 0.1754, "step": 9796 }, { "epoch": 1.9742091476929278, "grad_norm": 0.05214182287454605, "learning_rate": 7.086382777381751e-05, "loss": 0.17, "step": 9798 }, { "epoch": 1.9746121297602257, "grad_norm": 0.058549653738737106, "learning_rate": 7.085171616851862e-05, "loss": 0.193, "step": 9800 }, { "epoch": 1.9750151118275237, "grad_norm": 0.050743550062179565, "learning_rate": 7.083960308194715e-05, "loss": 0.2091, "step": 9802 }, { "epoch": 1.9754180938948216, "grad_norm": 0.06692482531070709, "learning_rate": 7.08274885149636e-05, "loss": 0.2314, "step": 9804 }, { "epoch": 1.9758210759621195, "grad_norm": 0.0435248427093029, "learning_rate": 7.081537246842857e-05, "loss": 0.2488, "step": 9806 }, { "epoch": 1.9762240580294177, "grad_norm": 0.0509062334895134, "learning_rate": 7.080325494320279e-05, "loss": 0.1879, "step": 9808 }, { "epoch": 1.9766270400967159, "grad_norm": 0.05188576877117157, "learning_rate": 7.079113594014702e-05, "loss": 0.2126, "step": 9810 }, { "epoch": 1.9770300221640138, "grad_norm": 0.06566416472196579, "learning_rate": 7.077901546012223e-05, "loss": 0.1967, "step": 9812 }, { "epoch": 1.9774330042313117, "grad_norm": 0.05517309531569481, "learning_rate": 7.076689350398939e-05, "loss": 0.145, "step": 9814 }, { "epoch": 1.9778359862986097, "grad_norm": 0.04569048807024956, "learning_rate": 7.075477007260966e-05, "loss": 0.161, "step": 9816 }, { "epoch": 1.9782389683659076, "grad_norm": 0.04703947901725769, "learning_rate": 7.074264516684427e-05, "loss": 0.1748, "step": 9818 }, { "epoch": 1.9786419504332056, "grad_norm": 0.045801058411598206, "learning_rate": 7.073051878755452e-05, "loss": 0.1644, "step": 9820 }, { "epoch": 1.9790449325005037, "grad_norm": 0.03103361465036869, "learning_rate": 7.071839093560188e-05, "loss": 0.1535, "step": 9822 }, { "epoch": 1.979447914567802, "grad_norm": 0.04012531787157059, "learning_rate": 7.070626161184788e-05, "loss": 0.1788, "step": 9824 }, { "epoch": 1.9798508966350998, "grad_norm": 0.04191760718822479, "learning_rate": 7.069413081715416e-05, "loss": 0.1657, "step": 9826 }, { "epoch": 1.9802538787023978, "grad_norm": 0.06015370413661003, "learning_rate": 7.068199855238249e-05, "loss": 0.2029, "step": 9828 }, { "epoch": 1.9806568607696957, "grad_norm": 0.049391429871320724, "learning_rate": 7.066986481839471e-05, "loss": 0.1951, "step": 9830 }, { "epoch": 1.9810598428369937, "grad_norm": 0.05129459872841835, "learning_rate": 7.065772961605281e-05, "loss": 0.2027, "step": 9832 }, { "epoch": 1.9814628249042916, "grad_norm": 0.05638004466891289, "learning_rate": 7.064559294621882e-05, "loss": 0.1905, "step": 9834 }, { "epoch": 1.9818658069715898, "grad_norm": 0.049807120114564896, "learning_rate": 7.063345480975493e-05, "loss": 0.2021, "step": 9836 }, { "epoch": 1.982268789038888, "grad_norm": 0.056036632508039474, "learning_rate": 7.06213152075234e-05, "loss": 0.2191, "step": 9838 }, { "epoch": 1.9826717711061859, "grad_norm": 0.04058406874537468, "learning_rate": 7.060917414038663e-05, "loss": 0.2015, "step": 9840 }, { "epoch": 1.9830747531734838, "grad_norm": 0.044951487332582474, "learning_rate": 7.059703160920707e-05, "loss": 0.1379, "step": 9842 }, { "epoch": 1.9834777352407817, "grad_norm": 0.06562306731939316, "learning_rate": 7.058488761484735e-05, "loss": 0.2462, "step": 9844 }, { "epoch": 1.9838807173080797, "grad_norm": 0.05451963469386101, "learning_rate": 7.057274215817011e-05, "loss": 0.217, "step": 9846 }, { "epoch": 1.9842836993753776, "grad_norm": 0.04903053119778633, "learning_rate": 7.056059524003818e-05, "loss": 0.1458, "step": 9848 }, { "epoch": 1.9846866814426758, "grad_norm": 0.05418461188673973, "learning_rate": 7.054844686131445e-05, "loss": 0.2123, "step": 9850 }, { "epoch": 1.985089663509974, "grad_norm": 0.04938149452209473, "learning_rate": 7.05362970228619e-05, "loss": 0.2197, "step": 9852 }, { "epoch": 1.985492645577272, "grad_norm": 0.06866676360368729, "learning_rate": 7.052414572554367e-05, "loss": 0.2042, "step": 9854 }, { "epoch": 1.9858956276445698, "grad_norm": 0.04523222893476486, "learning_rate": 7.051199297022295e-05, "loss": 0.1729, "step": 9856 }, { "epoch": 1.9862986097118678, "grad_norm": 0.037189189344644547, "learning_rate": 7.049983875776305e-05, "loss": 0.1686, "step": 9858 }, { "epoch": 1.9867015917791657, "grad_norm": 0.05150702968239784, "learning_rate": 7.048768308902739e-05, "loss": 0.1988, "step": 9860 }, { "epoch": 1.9871045738464639, "grad_norm": 0.04161156713962555, "learning_rate": 7.047552596487947e-05, "loss": 0.1352, "step": 9862 }, { "epoch": 1.9875075559137618, "grad_norm": 0.03971536085009575, "learning_rate": 7.046336738618296e-05, "loss": 0.2015, "step": 9864 }, { "epoch": 1.98791053798106, "grad_norm": 0.05211701616644859, "learning_rate": 7.045120735380155e-05, "loss": 0.1902, "step": 9866 }, { "epoch": 1.988313520048358, "grad_norm": 0.0716228112578392, "learning_rate": 7.043904586859906e-05, "loss": 0.2204, "step": 9868 }, { "epoch": 1.9887165021156559, "grad_norm": 0.0337841659784317, "learning_rate": 7.042688293143946e-05, "loss": 0.1608, "step": 9870 }, { "epoch": 1.9891194841829538, "grad_norm": 0.05220310389995575, "learning_rate": 7.041471854318675e-05, "loss": 0.1816, "step": 9872 }, { "epoch": 1.9895224662502518, "grad_norm": 0.053605400025844574, "learning_rate": 7.040255270470509e-05, "loss": 0.188, "step": 9874 }, { "epoch": 1.98992544831755, "grad_norm": 0.04616335779428482, "learning_rate": 7.039038541685872e-05, "loss": 0.1861, "step": 9876 }, { "epoch": 1.9903284303848479, "grad_norm": 0.041820913553237915, "learning_rate": 7.037821668051196e-05, "loss": 0.1961, "step": 9878 }, { "epoch": 1.990731412452146, "grad_norm": 0.041626784950494766, "learning_rate": 7.03660464965293e-05, "loss": 0.1752, "step": 9880 }, { "epoch": 1.991134394519444, "grad_norm": 0.03145499899983406, "learning_rate": 7.035387486577527e-05, "loss": 0.1503, "step": 9882 }, { "epoch": 1.991537376586742, "grad_norm": 0.03995690122246742, "learning_rate": 7.03417017891145e-05, "loss": 0.2092, "step": 9884 }, { "epoch": 1.9919403586540398, "grad_norm": 0.0410347580909729, "learning_rate": 7.032952726741178e-05, "loss": 0.1642, "step": 9886 }, { "epoch": 1.9923433407213378, "grad_norm": 0.04722330719232559, "learning_rate": 7.031735130153194e-05, "loss": 0.171, "step": 9888 }, { "epoch": 1.992746322788636, "grad_norm": 0.05450482293963432, "learning_rate": 7.030517389233997e-05, "loss": 0.1765, "step": 9890 }, { "epoch": 1.993149304855934, "grad_norm": 0.043954089283943176, "learning_rate": 7.029299504070091e-05, "loss": 0.1902, "step": 9892 }, { "epoch": 1.993552286923232, "grad_norm": 0.051854848861694336, "learning_rate": 7.028081474747996e-05, "loss": 0.2037, "step": 9894 }, { "epoch": 1.99395526899053, "grad_norm": 0.05640044063329697, "learning_rate": 7.026863301354234e-05, "loss": 0.1626, "step": 9896 }, { "epoch": 1.994358251057828, "grad_norm": 0.04978862777352333, "learning_rate": 7.025644983975345e-05, "loss": 0.2004, "step": 9898 }, { "epoch": 1.9947612331251259, "grad_norm": 0.06613018363714218, "learning_rate": 7.024426522697877e-05, "loss": 0.1726, "step": 9900 }, { "epoch": 1.9951642151924238, "grad_norm": 0.043966181576251984, "learning_rate": 7.023207917608385e-05, "loss": 0.1715, "step": 9902 }, { "epoch": 1.995567197259722, "grad_norm": 0.05377311259508133, "learning_rate": 7.021989168793439e-05, "loss": 0.1577, "step": 9904 }, { "epoch": 1.99597017932702, "grad_norm": 0.055230725556612015, "learning_rate": 7.020770276339617e-05, "loss": 0.215, "step": 9906 }, { "epoch": 1.996373161394318, "grad_norm": 0.061077140271663666, "learning_rate": 7.019551240333504e-05, "loss": 0.1898, "step": 9908 }, { "epoch": 1.996776143461616, "grad_norm": 0.06587786227464676, "learning_rate": 7.018332060861704e-05, "loss": 0.215, "step": 9910 }, { "epoch": 1.997179125528914, "grad_norm": 0.04662555083632469, "learning_rate": 7.017112738010819e-05, "loss": 0.2062, "step": 9912 }, { "epoch": 1.997582107596212, "grad_norm": 0.07606983929872513, "learning_rate": 7.01589327186747e-05, "loss": 0.2201, "step": 9914 }, { "epoch": 1.9979850896635099, "grad_norm": 0.051272887736558914, "learning_rate": 7.01467366251829e-05, "loss": 0.1999, "step": 9916 }, { "epoch": 1.998388071730808, "grad_norm": 0.059559039771556854, "learning_rate": 7.013453910049914e-05, "loss": 0.2025, "step": 9918 }, { "epoch": 1.998791053798106, "grad_norm": 0.04579564183950424, "learning_rate": 7.012234014548993e-05, "loss": 0.1785, "step": 9920 }, { "epoch": 1.9991940358654041, "grad_norm": 0.05059002712368965, "learning_rate": 7.011013976102185e-05, "loss": 0.2108, "step": 9922 }, { "epoch": 1.999597017932702, "grad_norm": 0.041644349694252014, "learning_rate": 7.00979379479616e-05, "loss": 0.2111, "step": 9924 }, { "epoch": 2.0, "grad_norm": 0.0676032155752182, "learning_rate": 7.008573470717599e-05, "loss": 0.2367, "step": 9926 }, { "epoch": 2.000402982067298, "grad_norm": 0.04403241351246834, "learning_rate": 7.00735300395319e-05, "loss": 0.2116, "step": 9928 }, { "epoch": 2.000805964134596, "grad_norm": 0.05903521552681923, "learning_rate": 7.006132394589634e-05, "loss": 0.216, "step": 9930 }, { "epoch": 2.001208946201894, "grad_norm": 0.04107736051082611, "learning_rate": 7.004911642713641e-05, "loss": 0.1867, "step": 9932 }, { "epoch": 2.001611928269192, "grad_norm": 0.039992742240428925, "learning_rate": 7.003690748411932e-05, "loss": 0.1964, "step": 9934 }, { "epoch": 2.00201491033649, "grad_norm": 0.04343795403838158, "learning_rate": 7.002469711771236e-05, "loss": 0.1488, "step": 9936 }, { "epoch": 2.002417892403788, "grad_norm": 0.04299810156226158, "learning_rate": 7.001248532878293e-05, "loss": 0.1904, "step": 9938 }, { "epoch": 2.002820874471086, "grad_norm": 0.02997422404587269, "learning_rate": 7.000027211819857e-05, "loss": 0.1424, "step": 9940 }, { "epoch": 2.003223856538384, "grad_norm": 0.04063683748245239, "learning_rate": 6.998805748682686e-05, "loss": 0.1956, "step": 9942 }, { "epoch": 2.003626838605682, "grad_norm": 0.03808411955833435, "learning_rate": 6.99758414355355e-05, "loss": 0.1597, "step": 9944 }, { "epoch": 2.00402982067298, "grad_norm": 0.04115324094891548, "learning_rate": 6.996362396519232e-05, "loss": 0.1725, "step": 9946 }, { "epoch": 2.0044328027402782, "grad_norm": 0.054356805980205536, "learning_rate": 6.995140507666523e-05, "loss": 0.2477, "step": 9948 }, { "epoch": 2.004835784807576, "grad_norm": 0.04506862163543701, "learning_rate": 6.993918477082221e-05, "loss": 0.1809, "step": 9950 }, { "epoch": 2.005238766874874, "grad_norm": 0.05171862989664078, "learning_rate": 6.99269630485314e-05, "loss": 0.184, "step": 9952 }, { "epoch": 2.005641748942172, "grad_norm": 0.06944061070680618, "learning_rate": 6.9914739910661e-05, "loss": 0.1542, "step": 9954 }, { "epoch": 2.00604473100947, "grad_norm": 0.04488156735897064, "learning_rate": 6.990251535807934e-05, "loss": 0.1921, "step": 9956 }, { "epoch": 2.006447713076768, "grad_norm": 0.031162697821855545, "learning_rate": 6.98902893916548e-05, "loss": 0.1407, "step": 9958 }, { "epoch": 2.006850695144066, "grad_norm": 0.04962150752544403, "learning_rate": 6.987806201225592e-05, "loss": 0.1829, "step": 9960 }, { "epoch": 2.0072536772113643, "grad_norm": 0.04459129646420479, "learning_rate": 6.98658332207513e-05, "loss": 0.1636, "step": 9962 }, { "epoch": 2.007656659278662, "grad_norm": 0.04113384708762169, "learning_rate": 6.985360301800967e-05, "loss": 0.1955, "step": 9964 }, { "epoch": 2.00805964134596, "grad_norm": 0.07087098807096481, "learning_rate": 6.984137140489982e-05, "loss": 0.1973, "step": 9966 }, { "epoch": 2.008462623413258, "grad_norm": 0.052531544119119644, "learning_rate": 6.982913838229068e-05, "loss": 0.1526, "step": 9968 }, { "epoch": 2.008865605480556, "grad_norm": 0.06382293254137039, "learning_rate": 6.981690395105128e-05, "loss": 0.2112, "step": 9970 }, { "epoch": 2.009268587547854, "grad_norm": 0.05440857633948326, "learning_rate": 6.98046681120507e-05, "loss": 0.2131, "step": 9972 }, { "epoch": 2.009671569615152, "grad_norm": 0.060303978621959686, "learning_rate": 6.979243086615818e-05, "loss": 0.2324, "step": 9974 }, { "epoch": 2.0100745516824503, "grad_norm": 0.036824069917201996, "learning_rate": 6.978019221424302e-05, "loss": 0.2191, "step": 9976 }, { "epoch": 2.0104775337497482, "grad_norm": 0.048221174627542496, "learning_rate": 6.976795215717462e-05, "loss": 0.1703, "step": 9978 }, { "epoch": 2.010880515817046, "grad_norm": 0.0401717834174633, "learning_rate": 6.975571069582253e-05, "loss": 0.1442, "step": 9980 }, { "epoch": 2.011283497884344, "grad_norm": 0.05119583383202553, "learning_rate": 6.974346783105634e-05, "loss": 0.2227, "step": 9982 }, { "epoch": 2.011686479951642, "grad_norm": 0.06163738667964935, "learning_rate": 6.973122356374578e-05, "loss": 0.1731, "step": 9984 }, { "epoch": 2.01208946201894, "grad_norm": 0.0363716222345829, "learning_rate": 6.971897789476065e-05, "loss": 0.1504, "step": 9986 }, { "epoch": 2.012492444086238, "grad_norm": 0.047141000628471375, "learning_rate": 6.970673082497085e-05, "loss": 0.184, "step": 9988 }, { "epoch": 2.0128954261535363, "grad_norm": 0.07161784917116165, "learning_rate": 6.969448235524643e-05, "loss": 0.2386, "step": 9990 }, { "epoch": 2.0132984082208343, "grad_norm": 0.038642749190330505, "learning_rate": 6.968223248645748e-05, "loss": 0.2215, "step": 9992 }, { "epoch": 2.013701390288132, "grad_norm": 0.033294469118118286, "learning_rate": 6.966998121947419e-05, "loss": 0.1225, "step": 9994 }, { "epoch": 2.01410437235543, "grad_norm": 0.0679098516702652, "learning_rate": 6.965772855516691e-05, "loss": 0.2254, "step": 9996 }, { "epoch": 2.014507354422728, "grad_norm": 0.05759232118725777, "learning_rate": 6.964547449440602e-05, "loss": 0.2548, "step": 9998 }, { "epoch": 2.014910336490026, "grad_norm": 0.04030608758330345, "learning_rate": 6.963321903806206e-05, "loss": 0.132, "step": 10000 }, { "epoch": 2.015313318557324, "grad_norm": 0.0534801110625267, "learning_rate": 6.96209621870056e-05, "loss": 0.1883, "step": 10002 }, { "epoch": 2.0157163006246224, "grad_norm": 0.036523666232824326, "learning_rate": 6.960870394210737e-05, "loss": 0.1703, "step": 10004 }, { "epoch": 2.0161192826919203, "grad_norm": 0.052369460463523865, "learning_rate": 6.959644430423818e-05, "loss": 0.1431, "step": 10006 }, { "epoch": 2.0165222647592183, "grad_norm": 0.051367372274398804, "learning_rate": 6.958418327426889e-05, "loss": 0.1807, "step": 10008 }, { "epoch": 2.016925246826516, "grad_norm": 0.04442469775676727, "learning_rate": 6.95719208530706e-05, "loss": 0.1667, "step": 10010 }, { "epoch": 2.017328228893814, "grad_norm": 0.05809894576668739, "learning_rate": 6.95596570415143e-05, "loss": 0.1925, "step": 10012 }, { "epoch": 2.017731210961112, "grad_norm": 0.07553379237651825, "learning_rate": 6.954739184047127e-05, "loss": 0.1611, "step": 10014 }, { "epoch": 2.01813419302841, "grad_norm": 0.058313459157943726, "learning_rate": 6.953512525081279e-05, "loss": 0.1802, "step": 10016 }, { "epoch": 2.0185371750957084, "grad_norm": 0.03649749606847763, "learning_rate": 6.952285727341025e-05, "loss": 0.1916, "step": 10018 }, { "epoch": 2.0189401571630063, "grad_norm": 0.06322020292282104, "learning_rate": 6.951058790913514e-05, "loss": 0.241, "step": 10020 }, { "epoch": 2.0193431392303043, "grad_norm": 0.06284866482019424, "learning_rate": 6.949831715885909e-05, "loss": 0.22, "step": 10022 }, { "epoch": 2.0197461212976022, "grad_norm": 0.04919195920228958, "learning_rate": 6.948604502345375e-05, "loss": 0.1511, "step": 10024 }, { "epoch": 2.0201491033649, "grad_norm": 0.04934461787343025, "learning_rate": 6.947377150379092e-05, "loss": 0.1878, "step": 10026 }, { "epoch": 2.020552085432198, "grad_norm": 0.04316118359565735, "learning_rate": 6.946149660074255e-05, "loss": 0.206, "step": 10028 }, { "epoch": 2.0209550674994965, "grad_norm": 0.047829512506723404, "learning_rate": 6.944922031518055e-05, "loss": 0.1894, "step": 10030 }, { "epoch": 2.0213580495667944, "grad_norm": 0.06630343943834305, "learning_rate": 6.943694264797707e-05, "loss": 0.2087, "step": 10032 }, { "epoch": 2.0217610316340924, "grad_norm": 0.05703127011656761, "learning_rate": 6.942466360000426e-05, "loss": 0.1625, "step": 10034 }, { "epoch": 2.0221640137013903, "grad_norm": 0.051246266812086105, "learning_rate": 6.94123831721344e-05, "loss": 0.2249, "step": 10036 }, { "epoch": 2.0225669957686883, "grad_norm": 0.06704655289649963, "learning_rate": 6.94001013652399e-05, "loss": 0.1971, "step": 10038 }, { "epoch": 2.022969977835986, "grad_norm": 0.07502034306526184, "learning_rate": 6.938781818019322e-05, "loss": 0.2133, "step": 10040 }, { "epoch": 2.023372959903284, "grad_norm": 0.06751389801502228, "learning_rate": 6.937553361786693e-05, "loss": 0.2083, "step": 10042 }, { "epoch": 2.0237759419705825, "grad_norm": 0.056213442236185074, "learning_rate": 6.936324767913373e-05, "loss": 0.2121, "step": 10044 }, { "epoch": 2.0241789240378805, "grad_norm": 0.05019683390855789, "learning_rate": 6.935096036486639e-05, "loss": 0.2064, "step": 10046 }, { "epoch": 2.0245819061051784, "grad_norm": 0.05334010347723961, "learning_rate": 6.933867167593776e-05, "loss": 0.2139, "step": 10048 }, { "epoch": 2.0249848881724763, "grad_norm": 0.07266365736722946, "learning_rate": 6.932638161322082e-05, "loss": 0.2378, "step": 10050 }, { "epoch": 2.0253878702397743, "grad_norm": 0.05368518829345703, "learning_rate": 6.931409017758866e-05, "loss": 0.1434, "step": 10052 }, { "epoch": 2.0257908523070722, "grad_norm": 0.06453645974397659, "learning_rate": 6.93017973699144e-05, "loss": 0.204, "step": 10054 }, { "epoch": 2.02619383437437, "grad_norm": 0.0673045814037323, "learning_rate": 6.928950319107134e-05, "loss": 0.2312, "step": 10056 }, { "epoch": 2.0265968164416686, "grad_norm": 0.0639619305729866, "learning_rate": 6.927720764193279e-05, "loss": 0.1998, "step": 10058 }, { "epoch": 2.0269997985089665, "grad_norm": 0.07452709972858429, "learning_rate": 6.926491072337226e-05, "loss": 0.251, "step": 10060 }, { "epoch": 2.0274027805762644, "grad_norm": 0.04336640611290932, "learning_rate": 6.92526124362633e-05, "loss": 0.1963, "step": 10062 }, { "epoch": 2.0278057626435624, "grad_norm": 0.041949085891246796, "learning_rate": 6.924031278147952e-05, "loss": 0.2065, "step": 10064 }, { "epoch": 2.0282087447108603, "grad_norm": 0.041135333478450775, "learning_rate": 6.922801175989469e-05, "loss": 0.1776, "step": 10066 }, { "epoch": 2.0286117267781583, "grad_norm": 0.048920802772045135, "learning_rate": 6.921570937238266e-05, "loss": 0.209, "step": 10068 }, { "epoch": 2.029014708845456, "grad_norm": 0.049562010914087296, "learning_rate": 6.920340561981738e-05, "loss": 0.1398, "step": 10070 }, { "epoch": 2.0294176909127546, "grad_norm": 0.043229840695858, "learning_rate": 6.919110050307286e-05, "loss": 0.1732, "step": 10072 }, { "epoch": 2.0298206729800525, "grad_norm": 0.04172005504369736, "learning_rate": 6.917879402302327e-05, "loss": 0.1695, "step": 10074 }, { "epoch": 2.0302236550473505, "grad_norm": 0.05273672565817833, "learning_rate": 6.91664861805428e-05, "loss": 0.1919, "step": 10076 }, { "epoch": 2.0306266371146484, "grad_norm": 0.04306695610284805, "learning_rate": 6.915417697650582e-05, "loss": 0.1703, "step": 10078 }, { "epoch": 2.0310296191819464, "grad_norm": 0.04110530763864517, "learning_rate": 6.914186641178672e-05, "loss": 0.1616, "step": 10080 }, { "epoch": 2.0314326012492443, "grad_norm": 0.05396244302392006, "learning_rate": 6.912955448726006e-05, "loss": 0.2022, "step": 10082 }, { "epoch": 2.0318355833165422, "grad_norm": 0.07104260474443436, "learning_rate": 6.911724120380045e-05, "loss": 0.1944, "step": 10084 }, { "epoch": 2.0322385653838406, "grad_norm": 0.0550391860306263, "learning_rate": 6.910492656228258e-05, "loss": 0.1653, "step": 10086 }, { "epoch": 2.0326415474511386, "grad_norm": 0.0574769526720047, "learning_rate": 6.90926105635813e-05, "loss": 0.2217, "step": 10088 }, { "epoch": 2.0330445295184365, "grad_norm": 0.043378256261348724, "learning_rate": 6.908029320857147e-05, "loss": 0.2421, "step": 10090 }, { "epoch": 2.0334475115857344, "grad_norm": 0.047401294112205505, "learning_rate": 6.906797449812817e-05, "loss": 0.192, "step": 10092 }, { "epoch": 2.0338504936530324, "grad_norm": 0.04810367152094841, "learning_rate": 6.905565443312642e-05, "loss": 0.1818, "step": 10094 }, { "epoch": 2.0342534757203303, "grad_norm": 0.03586292639374733, "learning_rate": 6.904333301444146e-05, "loss": 0.1771, "step": 10096 }, { "epoch": 2.0346564577876283, "grad_norm": 0.042285818606615067, "learning_rate": 6.903101024294858e-05, "loss": 0.2083, "step": 10098 }, { "epoch": 2.0350594398549267, "grad_norm": 0.0737522765994072, "learning_rate": 6.901868611952317e-05, "loss": 0.238, "step": 10100 }, { "epoch": 2.0354624219222246, "grad_norm": 0.05117824673652649, "learning_rate": 6.900636064504071e-05, "loss": 0.1915, "step": 10102 }, { "epoch": 2.0358654039895225, "grad_norm": 0.07703465223312378, "learning_rate": 6.899403382037681e-05, "loss": 0.1974, "step": 10104 }, { "epoch": 2.0362683860568205, "grad_norm": 0.04766885191202164, "learning_rate": 6.898170564640709e-05, "loss": 0.1444, "step": 10106 }, { "epoch": 2.0366713681241184, "grad_norm": 0.07825704663991928, "learning_rate": 6.896937612400738e-05, "loss": 0.1678, "step": 10108 }, { "epoch": 2.0370743501914164, "grad_norm": 0.049602579325437546, "learning_rate": 6.895704525405351e-05, "loss": 0.1925, "step": 10110 }, { "epoch": 2.0374773322587143, "grad_norm": 0.060352474451065063, "learning_rate": 6.894471303742147e-05, "loss": 0.1548, "step": 10112 }, { "epoch": 2.0378803143260127, "grad_norm": 0.060082901269197464, "learning_rate": 6.893237947498732e-05, "loss": 0.1508, "step": 10114 }, { "epoch": 2.0382832963933106, "grad_norm": 0.0637107640504837, "learning_rate": 6.89200445676272e-05, "loss": 0.2168, "step": 10116 }, { "epoch": 2.0386862784606086, "grad_norm": 0.05720100551843643, "learning_rate": 6.890770831621738e-05, "loss": 0.1759, "step": 10118 }, { "epoch": 2.0390892605279065, "grad_norm": 0.048458032310009, "learning_rate": 6.88953707216342e-05, "loss": 0.1957, "step": 10120 }, { "epoch": 2.0394922425952045, "grad_norm": 0.06728371232748032, "learning_rate": 6.888303178475411e-05, "loss": 0.176, "step": 10122 }, { "epoch": 2.0398952246625024, "grad_norm": 0.053860168904066086, "learning_rate": 6.887069150645362e-05, "loss": 0.21, "step": 10124 }, { "epoch": 2.0402982067298003, "grad_norm": 0.06876907497644424, "learning_rate": 6.88583498876094e-05, "loss": 0.197, "step": 10126 }, { "epoch": 2.0407011887970987, "grad_norm": 0.05635106936097145, "learning_rate": 6.884600692909815e-05, "loss": 0.2058, "step": 10128 }, { "epoch": 2.0411041708643967, "grad_norm": 0.04968864470720291, "learning_rate": 6.88336626317967e-05, "loss": 0.2126, "step": 10130 }, { "epoch": 2.0415071529316946, "grad_norm": 0.06307017803192139, "learning_rate": 6.8821316996582e-05, "loss": 0.1865, "step": 10132 }, { "epoch": 2.0419101349989925, "grad_norm": 0.05695520341396332, "learning_rate": 6.880897002433104e-05, "loss": 0.1708, "step": 10134 }, { "epoch": 2.0423131170662905, "grad_norm": 0.056855130940675735, "learning_rate": 6.879662171592092e-05, "loss": 0.1786, "step": 10136 }, { "epoch": 2.0427160991335884, "grad_norm": 0.05637526512145996, "learning_rate": 6.878427207222887e-05, "loss": 0.2307, "step": 10138 }, { "epoch": 2.0431190812008864, "grad_norm": 0.07096744328737259, "learning_rate": 6.877192109413214e-05, "loss": 0.1843, "step": 10140 }, { "epoch": 2.0435220632681848, "grad_norm": 0.06431825459003448, "learning_rate": 6.875956878250819e-05, "loss": 0.2259, "step": 10142 }, { "epoch": 2.0439250453354827, "grad_norm": 0.055992890149354935, "learning_rate": 6.874721513823445e-05, "loss": 0.1992, "step": 10144 }, { "epoch": 2.0443280274027806, "grad_norm": 0.05946403741836548, "learning_rate": 6.873486016218854e-05, "loss": 0.2288, "step": 10146 }, { "epoch": 2.0447310094700786, "grad_norm": 0.07745692878961563, "learning_rate": 6.872250385524813e-05, "loss": 0.2044, "step": 10148 }, { "epoch": 2.0451339915373765, "grad_norm": 0.05934702232480049, "learning_rate": 6.871014621829099e-05, "loss": 0.2129, "step": 10150 }, { "epoch": 2.0455369736046745, "grad_norm": 0.06850776076316833, "learning_rate": 6.869778725219498e-05, "loss": 0.1735, "step": 10152 }, { "epoch": 2.0459399556719724, "grad_norm": 0.06359641253948212, "learning_rate": 6.868542695783806e-05, "loss": 0.164, "step": 10154 }, { "epoch": 2.046342937739271, "grad_norm": 0.05588820204138756, "learning_rate": 6.867306533609829e-05, "loss": 0.1713, "step": 10156 }, { "epoch": 2.0467459198065687, "grad_norm": 0.053674276918172836, "learning_rate": 6.866070238785384e-05, "loss": 0.1908, "step": 10158 }, { "epoch": 2.0471489018738667, "grad_norm": 0.04785943776369095, "learning_rate": 6.864833811398292e-05, "loss": 0.2058, "step": 10160 }, { "epoch": 2.0475518839411646, "grad_norm": 0.05280066281557083, "learning_rate": 6.863597251536389e-05, "loss": 0.2018, "step": 10162 }, { "epoch": 2.0479548660084625, "grad_norm": 0.05563594773411751, "learning_rate": 6.862360559287517e-05, "loss": 0.2189, "step": 10164 }, { "epoch": 2.0483578480757605, "grad_norm": 0.0552031509578228, "learning_rate": 6.86112373473953e-05, "loss": 0.1998, "step": 10166 }, { "epoch": 2.0487608301430584, "grad_norm": 0.07908007502555847, "learning_rate": 6.85988677798029e-05, "loss": 0.1639, "step": 10168 }, { "epoch": 2.049163812210357, "grad_norm": 0.05633252486586571, "learning_rate": 6.858649689097667e-05, "loss": 0.1182, "step": 10170 }, { "epoch": 2.0495667942776548, "grad_norm": 0.04489020258188248, "learning_rate": 6.857412468179543e-05, "loss": 0.1718, "step": 10172 }, { "epoch": 2.0499697763449527, "grad_norm": 0.049923308193683624, "learning_rate": 6.856175115313806e-05, "loss": 0.2262, "step": 10174 }, { "epoch": 2.0503727584122506, "grad_norm": 0.0499308817088604, "learning_rate": 6.854937630588359e-05, "loss": 0.1507, "step": 10176 }, { "epoch": 2.0507757404795486, "grad_norm": 0.047174129635095596, "learning_rate": 6.853700014091108e-05, "loss": 0.1446, "step": 10178 }, { "epoch": 2.0511787225468465, "grad_norm": 0.03964932635426521, "learning_rate": 6.852462265909973e-05, "loss": 0.1218, "step": 10180 }, { "epoch": 2.0515817046141445, "grad_norm": 0.06714117527008057, "learning_rate": 6.851224386132882e-05, "loss": 0.2253, "step": 10182 }, { "epoch": 2.051984686681443, "grad_norm": 0.05636550858616829, "learning_rate": 6.849986374847773e-05, "loss": 0.2178, "step": 10184 }, { "epoch": 2.052387668748741, "grad_norm": 0.06559525430202484, "learning_rate": 6.848748232142586e-05, "loss": 0.2283, "step": 10186 }, { "epoch": 2.0527906508160387, "grad_norm": 0.04787338525056839, "learning_rate": 6.847509958105283e-05, "loss": 0.1585, "step": 10188 }, { "epoch": 2.0531936328833367, "grad_norm": 0.060912150889635086, "learning_rate": 6.84627155282383e-05, "loss": 0.1765, "step": 10190 }, { "epoch": 2.0535966149506346, "grad_norm": 0.04154639318585396, "learning_rate": 6.845033016386196e-05, "loss": 0.1741, "step": 10192 }, { "epoch": 2.0539995970179326, "grad_norm": 0.07247091829776764, "learning_rate": 6.843794348880367e-05, "loss": 0.2379, "step": 10194 }, { "epoch": 2.0544025790852305, "grad_norm": 0.05615399777889252, "learning_rate": 6.842555550394338e-05, "loss": 0.2099, "step": 10196 }, { "epoch": 2.054805561152529, "grad_norm": 0.056154392659664154, "learning_rate": 6.841316621016107e-05, "loss": 0.233, "step": 10198 }, { "epoch": 2.055208543219827, "grad_norm": 0.04954691231250763, "learning_rate": 6.840077560833688e-05, "loss": 0.2172, "step": 10200 }, { "epoch": 2.0556115252871248, "grad_norm": 0.03744814172387123, "learning_rate": 6.838838369935104e-05, "loss": 0.1399, "step": 10202 }, { "epoch": 2.0560145073544227, "grad_norm": 0.06068943440914154, "learning_rate": 6.837599048408381e-05, "loss": 0.1913, "step": 10204 }, { "epoch": 2.0564174894217206, "grad_norm": 0.06408362835645676, "learning_rate": 6.836359596341563e-05, "loss": 0.2003, "step": 10206 }, { "epoch": 2.0568204714890186, "grad_norm": 0.054449256509542465, "learning_rate": 6.835120013822694e-05, "loss": 0.1994, "step": 10208 }, { "epoch": 2.0572234535563165, "grad_norm": 0.055184461176395416, "learning_rate": 6.833880300939835e-05, "loss": 0.1977, "step": 10210 }, { "epoch": 2.057626435623615, "grad_norm": 0.04949701577425003, "learning_rate": 6.832640457781053e-05, "loss": 0.1376, "step": 10212 }, { "epoch": 2.058029417690913, "grad_norm": 0.07206345349550247, "learning_rate": 6.83140048443442e-05, "loss": 0.1795, "step": 10214 }, { "epoch": 2.058432399758211, "grad_norm": 0.060080841183662415, "learning_rate": 6.830160380988029e-05, "loss": 0.2113, "step": 10216 }, { "epoch": 2.0588353818255087, "grad_norm": 0.03272469714283943, "learning_rate": 6.828920147529971e-05, "loss": 0.1516, "step": 10218 }, { "epoch": 2.0592383638928067, "grad_norm": 0.052576545625925064, "learning_rate": 6.82767978414835e-05, "loss": 0.2245, "step": 10220 }, { "epoch": 2.0596413459601046, "grad_norm": 0.05587625131011009, "learning_rate": 6.826439290931279e-05, "loss": 0.2133, "step": 10222 }, { "epoch": 2.060044328027403, "grad_norm": 0.033351000398397446, "learning_rate": 6.825198667966883e-05, "loss": 0.1667, "step": 10224 }, { "epoch": 2.060447310094701, "grad_norm": 0.0870516449213028, "learning_rate": 6.823957915343293e-05, "loss": 0.1779, "step": 10226 }, { "epoch": 2.060850292161999, "grad_norm": 0.06419102102518082, "learning_rate": 6.822717033148649e-05, "loss": 0.2087, "step": 10228 }, { "epoch": 2.061253274229297, "grad_norm": 0.055633675307035446, "learning_rate": 6.821476021471103e-05, "loss": 0.212, "step": 10230 }, { "epoch": 2.0616562562965948, "grad_norm": 0.04498250037431717, "learning_rate": 6.820234880398813e-05, "loss": 0.1942, "step": 10232 }, { "epoch": 2.0620592383638927, "grad_norm": 0.05661292001605034, "learning_rate": 6.818993610019947e-05, "loss": 0.1667, "step": 10234 }, { "epoch": 2.0624622204311907, "grad_norm": 0.06287211179733276, "learning_rate": 6.817752210422686e-05, "loss": 0.216, "step": 10236 }, { "epoch": 2.0628652024984886, "grad_norm": 0.04590194299817085, "learning_rate": 6.816510681695213e-05, "loss": 0.1958, "step": 10238 }, { "epoch": 2.063268184565787, "grad_norm": 0.03706109896302223, "learning_rate": 6.815269023925726e-05, "loss": 0.1317, "step": 10240 }, { "epoch": 2.063671166633085, "grad_norm": 0.04809437692165375, "learning_rate": 6.814027237202433e-05, "loss": 0.203, "step": 10242 }, { "epoch": 2.064074148700383, "grad_norm": 0.05192924663424492, "learning_rate": 6.812785321613545e-05, "loss": 0.2064, "step": 10244 }, { "epoch": 2.064477130767681, "grad_norm": 0.050232961773872375, "learning_rate": 6.811543277247285e-05, "loss": 0.1612, "step": 10246 }, { "epoch": 2.0648801128349787, "grad_norm": 0.051082853227853775, "learning_rate": 6.810301104191891e-05, "loss": 0.1402, "step": 10248 }, { "epoch": 2.0652830949022767, "grad_norm": 0.05042251944541931, "learning_rate": 6.8090588025356e-05, "loss": 0.1719, "step": 10250 }, { "epoch": 2.065686076969575, "grad_norm": 0.09551627933979034, "learning_rate": 6.807816372366664e-05, "loss": 0.2145, "step": 10252 }, { "epoch": 2.066089059036873, "grad_norm": 0.07502955198287964, "learning_rate": 6.806573813773346e-05, "loss": 0.2287, "step": 10254 }, { "epoch": 2.066492041104171, "grad_norm": 0.06268194317817688, "learning_rate": 6.805331126843912e-05, "loss": 0.2206, "step": 10256 }, { "epoch": 2.066895023171469, "grad_norm": 0.05798924341797829, "learning_rate": 6.804088311666642e-05, "loss": 0.2267, "step": 10258 }, { "epoch": 2.067298005238767, "grad_norm": 0.06151028349995613, "learning_rate": 6.802845368329825e-05, "loss": 0.1738, "step": 10260 }, { "epoch": 2.0677009873060648, "grad_norm": 0.051942527294158936, "learning_rate": 6.801602296921755e-05, "loss": 0.1722, "step": 10262 }, { "epoch": 2.0681039693733627, "grad_norm": 0.048653073608875275, "learning_rate": 6.800359097530739e-05, "loss": 0.1447, "step": 10264 }, { "epoch": 2.068506951440661, "grad_norm": 0.07658076286315918, "learning_rate": 6.799115770245093e-05, "loss": 0.2365, "step": 10266 }, { "epoch": 2.068909933507959, "grad_norm": 0.055393122136592865, "learning_rate": 6.797872315153139e-05, "loss": 0.2255, "step": 10268 }, { "epoch": 2.069312915575257, "grad_norm": 0.07611057907342911, "learning_rate": 6.796628732343212e-05, "loss": 0.2032, "step": 10270 }, { "epoch": 2.069715897642555, "grad_norm": 0.05497412756085396, "learning_rate": 6.795385021903652e-05, "loss": 0.201, "step": 10272 }, { "epoch": 2.070118879709853, "grad_norm": 0.0471356138586998, "learning_rate": 6.79414118392281e-05, "loss": 0.2179, "step": 10274 }, { "epoch": 2.070521861777151, "grad_norm": 0.06860160082578659, "learning_rate": 6.792897218489051e-05, "loss": 0.1896, "step": 10276 }, { "epoch": 2.0709248438444487, "grad_norm": 0.04450426623225212, "learning_rate": 6.79165312569074e-05, "loss": 0.196, "step": 10278 }, { "epoch": 2.071327825911747, "grad_norm": 0.06577623635530472, "learning_rate": 6.790408905616254e-05, "loss": 0.1959, "step": 10280 }, { "epoch": 2.071730807979045, "grad_norm": 0.0391886830329895, "learning_rate": 6.789164558353985e-05, "loss": 0.1776, "step": 10282 }, { "epoch": 2.072133790046343, "grad_norm": 0.06485151499509811, "learning_rate": 6.787920083992326e-05, "loss": 0.1951, "step": 10284 }, { "epoch": 2.072536772113641, "grad_norm": 0.06771334260702133, "learning_rate": 6.786675482619684e-05, "loss": 0.1854, "step": 10286 }, { "epoch": 2.072939754180939, "grad_norm": 0.04598645493388176, "learning_rate": 6.785430754324473e-05, "loss": 0.1892, "step": 10288 }, { "epoch": 2.073342736248237, "grad_norm": 0.05460431054234505, "learning_rate": 6.784185899195117e-05, "loss": 0.1618, "step": 10290 }, { "epoch": 2.073745718315535, "grad_norm": 0.0572720542550087, "learning_rate": 6.782940917320048e-05, "loss": 0.194, "step": 10292 }, { "epoch": 2.074148700382833, "grad_norm": 0.062214624136686325, "learning_rate": 6.781695808787708e-05, "loss": 0.1985, "step": 10294 }, { "epoch": 2.074551682450131, "grad_norm": 0.0574166476726532, "learning_rate": 6.780450573686545e-05, "loss": 0.2226, "step": 10296 }, { "epoch": 2.074954664517429, "grad_norm": 0.05803331732749939, "learning_rate": 6.779205212105022e-05, "loss": 0.189, "step": 10298 }, { "epoch": 2.075357646584727, "grad_norm": 0.050910577178001404, "learning_rate": 6.777959724131607e-05, "loss": 0.1377, "step": 10300 }, { "epoch": 2.075760628652025, "grad_norm": 0.04367856681346893, "learning_rate": 6.776714109854777e-05, "loss": 0.1531, "step": 10302 }, { "epoch": 2.076163610719323, "grad_norm": 0.06604331731796265, "learning_rate": 6.775468369363015e-05, "loss": 0.2169, "step": 10304 }, { "epoch": 2.076566592786621, "grad_norm": 0.07328059524297714, "learning_rate": 6.774222502744823e-05, "loss": 0.1856, "step": 10306 }, { "epoch": 2.076969574853919, "grad_norm": 0.06949375569820404, "learning_rate": 6.7729765100887e-05, "loss": 0.2027, "step": 10308 }, { "epoch": 2.077372556921217, "grad_norm": 0.07183837890625, "learning_rate": 6.77173039148316e-05, "loss": 0.2086, "step": 10310 }, { "epoch": 2.077775538988515, "grad_norm": 0.05574171990156174, "learning_rate": 6.77048414701673e-05, "loss": 0.1783, "step": 10312 }, { "epoch": 2.078178521055813, "grad_norm": 0.0639929473400116, "learning_rate": 6.769237776777934e-05, "loss": 0.1766, "step": 10314 }, { "epoch": 2.078581503123111, "grad_norm": 0.06870213896036148, "learning_rate": 6.767991280855316e-05, "loss": 0.2004, "step": 10316 }, { "epoch": 2.078984485190409, "grad_norm": 0.04897087812423706, "learning_rate": 6.766744659337429e-05, "loss": 0.1685, "step": 10318 }, { "epoch": 2.079387467257707, "grad_norm": 0.04760335013270378, "learning_rate": 6.765497912312824e-05, "loss": 0.1805, "step": 10320 }, { "epoch": 2.0797904493250052, "grad_norm": 0.06363385915756226, "learning_rate": 6.76425103987007e-05, "loss": 0.212, "step": 10322 }, { "epoch": 2.080193431392303, "grad_norm": 0.072906494140625, "learning_rate": 6.763004042097745e-05, "loss": 0.18, "step": 10324 }, { "epoch": 2.080596413459601, "grad_norm": 0.08023212850093842, "learning_rate": 6.761756919084432e-05, "loss": 0.1753, "step": 10326 }, { "epoch": 2.080999395526899, "grad_norm": 0.05875491350889206, "learning_rate": 6.760509670918725e-05, "loss": 0.1478, "step": 10328 }, { "epoch": 2.081402377594197, "grad_norm": 0.06197643280029297, "learning_rate": 6.759262297689227e-05, "loss": 0.1899, "step": 10330 }, { "epoch": 2.081805359661495, "grad_norm": 0.07489572465419769, "learning_rate": 6.758014799484548e-05, "loss": 0.2358, "step": 10332 }, { "epoch": 2.082208341728793, "grad_norm": 0.06615934520959854, "learning_rate": 6.75676717639331e-05, "loss": 0.2092, "step": 10334 }, { "epoch": 2.0826113237960913, "grad_norm": 0.05848393589258194, "learning_rate": 6.75551942850414e-05, "loss": 0.182, "step": 10336 }, { "epoch": 2.083014305863389, "grad_norm": 0.054525017738342285, "learning_rate": 6.754271555905678e-05, "loss": 0.1952, "step": 10338 }, { "epoch": 2.083417287930687, "grad_norm": 0.07348795980215073, "learning_rate": 6.753023558686572e-05, "loss": 0.2201, "step": 10340 }, { "epoch": 2.083820269997985, "grad_norm": 0.06381559371948242, "learning_rate": 6.751775436935474e-05, "loss": 0.2075, "step": 10342 }, { "epoch": 2.084223252065283, "grad_norm": 0.05462156981229782, "learning_rate": 6.75052719074105e-05, "loss": 0.2108, "step": 10344 }, { "epoch": 2.084626234132581, "grad_norm": 0.05999904125928879, "learning_rate": 6.749278820191976e-05, "loss": 0.1819, "step": 10346 }, { "epoch": 2.085029216199879, "grad_norm": 0.06505034118890762, "learning_rate": 6.74803032537693e-05, "loss": 0.1821, "step": 10348 }, { "epoch": 2.0854321982671773, "grad_norm": 0.05717167258262634, "learning_rate": 6.746781706384606e-05, "loss": 0.261, "step": 10350 }, { "epoch": 2.0858351803344752, "grad_norm": 0.06013219431042671, "learning_rate": 6.745532963303705e-05, "loss": 0.1888, "step": 10352 }, { "epoch": 2.086238162401773, "grad_norm": 0.043344974517822266, "learning_rate": 6.744284096222932e-05, "loss": 0.1607, "step": 10354 }, { "epoch": 2.086641144469071, "grad_norm": 0.0586077980697155, "learning_rate": 6.743035105231006e-05, "loss": 0.1978, "step": 10356 }, { "epoch": 2.087044126536369, "grad_norm": 0.05550181493163109, "learning_rate": 6.741785990416654e-05, "loss": 0.1743, "step": 10358 }, { "epoch": 2.087447108603667, "grad_norm": 0.049226608127355576, "learning_rate": 6.740536751868611e-05, "loss": 0.1788, "step": 10360 }, { "epoch": 2.087850090670965, "grad_norm": 0.05161656439304352, "learning_rate": 6.739287389675621e-05, "loss": 0.1803, "step": 10362 }, { "epoch": 2.0882530727382633, "grad_norm": 0.0661526471376419, "learning_rate": 6.738037903926437e-05, "loss": 0.241, "step": 10364 }, { "epoch": 2.0886560548055613, "grad_norm": 0.0597810298204422, "learning_rate": 6.736788294709819e-05, "loss": 0.1723, "step": 10366 }, { "epoch": 2.089059036872859, "grad_norm": 0.0493599995970726, "learning_rate": 6.735538562114538e-05, "loss": 0.1674, "step": 10368 }, { "epoch": 2.089462018940157, "grad_norm": 0.04726816341280937, "learning_rate": 6.734288706229373e-05, "loss": 0.1667, "step": 10370 }, { "epoch": 2.089865001007455, "grad_norm": 0.06035396829247475, "learning_rate": 6.733038727143113e-05, "loss": 0.2051, "step": 10372 }, { "epoch": 2.090267983074753, "grad_norm": 0.04860403761267662, "learning_rate": 6.731788624944551e-05, "loss": 0.1549, "step": 10374 }, { "epoch": 2.090670965142051, "grad_norm": 0.05029008165001869, "learning_rate": 6.730538399722497e-05, "loss": 0.1953, "step": 10376 }, { "epoch": 2.0910739472093494, "grad_norm": 0.053353093564510345, "learning_rate": 6.729288051565763e-05, "loss": 0.2012, "step": 10378 }, { "epoch": 2.0914769292766473, "grad_norm": 0.07299556583166122, "learning_rate": 6.72803758056317e-05, "loss": 0.1612, "step": 10380 }, { "epoch": 2.0918799113439452, "grad_norm": 0.0511019267141819, "learning_rate": 6.726786986803552e-05, "loss": 0.1902, "step": 10382 }, { "epoch": 2.092282893411243, "grad_norm": 0.06305370479822159, "learning_rate": 6.725536270375747e-05, "loss": 0.1789, "step": 10384 }, { "epoch": 2.092685875478541, "grad_norm": 0.05989459529519081, "learning_rate": 6.724285431368604e-05, "loss": 0.2489, "step": 10386 }, { "epoch": 2.093088857545839, "grad_norm": 0.07147608697414398, "learning_rate": 6.723034469870983e-05, "loss": 0.1998, "step": 10388 }, { "epoch": 2.093491839613137, "grad_norm": 0.05758042261004448, "learning_rate": 6.72178338597175e-05, "loss": 0.1878, "step": 10390 }, { "epoch": 2.0938948216804354, "grad_norm": 0.05327797308564186, "learning_rate": 6.720532179759777e-05, "loss": 0.2193, "step": 10392 }, { "epoch": 2.0942978037477333, "grad_norm": 0.061337072402238846, "learning_rate": 6.71928085132395e-05, "loss": 0.2337, "step": 10394 }, { "epoch": 2.0947007858150313, "grad_norm": 0.03611127659678459, "learning_rate": 6.718029400753161e-05, "loss": 0.1217, "step": 10396 }, { "epoch": 2.095103767882329, "grad_norm": 0.05926065146923065, "learning_rate": 6.71677782813631e-05, "loss": 0.2102, "step": 10398 }, { "epoch": 2.095506749949627, "grad_norm": 0.04807904362678528, "learning_rate": 6.71552613356231e-05, "loss": 0.1536, "step": 10400 }, { "epoch": 2.095909732016925, "grad_norm": 0.06720544397830963, "learning_rate": 6.714274317120076e-05, "loss": 0.1666, "step": 10402 }, { "epoch": 2.096312714084223, "grad_norm": 0.034379951655864716, "learning_rate": 6.713022378898535e-05, "loss": 0.121, "step": 10404 }, { "epoch": 2.0967156961515214, "grad_norm": 0.0607600137591362, "learning_rate": 6.711770318986624e-05, "loss": 0.2098, "step": 10406 }, { "epoch": 2.0971186782188194, "grad_norm": 0.051908619701862335, "learning_rate": 6.710518137473288e-05, "loss": 0.179, "step": 10408 }, { "epoch": 2.0975216602861173, "grad_norm": 0.049980923533439636, "learning_rate": 6.709265834447479e-05, "loss": 0.1668, "step": 10410 }, { "epoch": 2.0979246423534152, "grad_norm": 0.05012454465031624, "learning_rate": 6.708013409998158e-05, "loss": 0.1529, "step": 10412 }, { "epoch": 2.098327624420713, "grad_norm": 0.12584945559501648, "learning_rate": 6.706760864214297e-05, "loss": 0.1905, "step": 10414 }, { "epoch": 2.098730606488011, "grad_norm": 0.05351203680038452, "learning_rate": 6.705508197184873e-05, "loss": 0.182, "step": 10416 }, { "epoch": 2.0991335885553095, "grad_norm": 0.07033678144216537, "learning_rate": 6.704255408998873e-05, "loss": 0.2062, "step": 10418 }, { "epoch": 2.0995365706226075, "grad_norm": 0.044345423579216, "learning_rate": 6.703002499745296e-05, "loss": 0.1581, "step": 10420 }, { "epoch": 2.0999395526899054, "grad_norm": 0.05711236223578453, "learning_rate": 6.701749469513146e-05, "loss": 0.1727, "step": 10422 }, { "epoch": 2.1003425347572033, "grad_norm": 0.04664299264550209, "learning_rate": 6.700496318391432e-05, "loss": 0.1864, "step": 10424 }, { "epoch": 2.1007455168245013, "grad_norm": 0.06930460780858994, "learning_rate": 6.699243046469182e-05, "loss": 0.2205, "step": 10426 }, { "epoch": 2.101148498891799, "grad_norm": 0.056150782853364944, "learning_rate": 6.697989653835423e-05, "loss": 0.2011, "step": 10428 }, { "epoch": 2.101551480959097, "grad_norm": 0.050900932401418686, "learning_rate": 6.696736140579193e-05, "loss": 0.2052, "step": 10430 }, { "epoch": 2.101954463026395, "grad_norm": 0.060148272663354874, "learning_rate": 6.695482506789542e-05, "loss": 0.1641, "step": 10432 }, { "epoch": 2.1023574450936935, "grad_norm": 0.050767235457897186, "learning_rate": 6.694228752555526e-05, "loss": 0.1799, "step": 10434 }, { "epoch": 2.1027604271609914, "grad_norm": 0.07405157387256622, "learning_rate": 6.692974877966208e-05, "loss": 0.2153, "step": 10436 }, { "epoch": 2.1031634092282894, "grad_norm": 0.05881273001432419, "learning_rate": 6.691720883110662e-05, "loss": 0.177, "step": 10438 }, { "epoch": 2.1035663912955873, "grad_norm": 0.07139863073825836, "learning_rate": 6.69046676807797e-05, "loss": 0.2424, "step": 10440 }, { "epoch": 2.1039693733628853, "grad_norm": 0.05423678830265999, "learning_rate": 6.689212532957224e-05, "loss": 0.1913, "step": 10442 }, { "epoch": 2.104372355430183, "grad_norm": 0.054789528250694275, "learning_rate": 6.687958177837518e-05, "loss": 0.1351, "step": 10444 }, { "epoch": 2.1047753374974816, "grad_norm": 0.06818938255310059, "learning_rate": 6.686703702807965e-05, "loss": 0.2081, "step": 10446 }, { "epoch": 2.1051783195647795, "grad_norm": 0.09108281880617142, "learning_rate": 6.685449107957678e-05, "loss": 0.1892, "step": 10448 }, { "epoch": 2.1055813016320775, "grad_norm": 0.04925645515322685, "learning_rate": 6.684194393375781e-05, "loss": 0.1537, "step": 10450 }, { "epoch": 2.1059842836993754, "grad_norm": 0.05035284161567688, "learning_rate": 6.682939559151409e-05, "loss": 0.1828, "step": 10452 }, { "epoch": 2.1063872657666733, "grad_norm": 0.04909108206629753, "learning_rate": 6.681684605373702e-05, "loss": 0.1615, "step": 10454 }, { "epoch": 2.1067902478339713, "grad_norm": 0.0470598042011261, "learning_rate": 6.680429532131809e-05, "loss": 0.1839, "step": 10456 }, { "epoch": 2.1071932299012692, "grad_norm": 0.04207504913210869, "learning_rate": 6.679174339514891e-05, "loss": 0.145, "step": 10458 }, { "epoch": 2.1075962119685676, "grad_norm": 0.04711586982011795, "learning_rate": 6.677919027612112e-05, "loss": 0.1561, "step": 10460 }, { "epoch": 2.1079991940358656, "grad_norm": 0.13398267328739166, "learning_rate": 6.676663596512649e-05, "loss": 0.1932, "step": 10462 }, { "epoch": 2.1084021761031635, "grad_norm": 0.060606952756643295, "learning_rate": 6.675408046305686e-05, "loss": 0.1956, "step": 10464 }, { "epoch": 2.1088051581704614, "grad_norm": 0.09983116388320923, "learning_rate": 6.674152377080414e-05, "loss": 0.2291, "step": 10466 }, { "epoch": 2.1092081402377594, "grad_norm": 0.0655071958899498, "learning_rate": 6.672896588926035e-05, "loss": 0.2101, "step": 10468 }, { "epoch": 2.1096111223050573, "grad_norm": 0.0752595067024231, "learning_rate": 6.671640681931759e-05, "loss": 0.2017, "step": 10470 }, { "epoch": 2.1100141043723553, "grad_norm": 0.061998266726732254, "learning_rate": 6.670384656186801e-05, "loss": 0.1757, "step": 10472 }, { "epoch": 2.1104170864396536, "grad_norm": 0.04654568433761597, "learning_rate": 6.669128511780388e-05, "loss": 0.1429, "step": 10474 }, { "epoch": 2.1108200685069516, "grad_norm": 0.07311231642961502, "learning_rate": 6.667872248801756e-05, "loss": 0.1845, "step": 10476 }, { "epoch": 2.1112230505742495, "grad_norm": 0.059998735785484314, "learning_rate": 6.666615867340146e-05, "loss": 0.1876, "step": 10478 }, { "epoch": 2.1116260326415475, "grad_norm": 0.046271517872810364, "learning_rate": 6.665359367484812e-05, "loss": 0.1738, "step": 10480 }, { "epoch": 2.1120290147088454, "grad_norm": 0.09801926463842392, "learning_rate": 6.66410274932501e-05, "loss": 0.2305, "step": 10482 }, { "epoch": 2.1124319967761434, "grad_norm": 0.06510908156633377, "learning_rate": 6.66284601295001e-05, "loss": 0.1822, "step": 10484 }, { "epoch": 2.1128349788434413, "grad_norm": 0.07298275083303452, "learning_rate": 6.661589158449089e-05, "loss": 0.2025, "step": 10486 }, { "epoch": 2.1132379609107397, "grad_norm": 0.06558576226234436, "learning_rate": 6.660332185911531e-05, "loss": 0.2144, "step": 10488 }, { "epoch": 2.1136409429780376, "grad_norm": 0.08517705649137497, "learning_rate": 6.65907509542663e-05, "loss": 0.1633, "step": 10490 }, { "epoch": 2.1140439250453356, "grad_norm": 0.06583531200885773, "learning_rate": 6.657817887083688e-05, "loss": 0.1851, "step": 10492 }, { "epoch": 2.1144469071126335, "grad_norm": 0.051145102828741074, "learning_rate": 6.656560560972014e-05, "loss": 0.1252, "step": 10494 }, { "epoch": 2.1148498891799314, "grad_norm": 0.05931749939918518, "learning_rate": 6.655303117180927e-05, "loss": 0.1682, "step": 10496 }, { "epoch": 2.1152528712472294, "grad_norm": 0.057506263256073, "learning_rate": 6.654045555799754e-05, "loss": 0.2283, "step": 10498 }, { "epoch": 2.1156558533145273, "grad_norm": 0.0503305122256279, "learning_rate": 6.652787876917831e-05, "loss": 0.1649, "step": 10500 }, { "epoch": 2.1160588353818257, "grad_norm": 0.05089758709073067, "learning_rate": 6.6515300806245e-05, "loss": 0.201, "step": 10502 }, { "epoch": 2.1164618174491237, "grad_norm": 0.0781029537320137, "learning_rate": 6.650272167009113e-05, "loss": 0.2012, "step": 10504 }, { "epoch": 2.1168647995164216, "grad_norm": 0.050134409219026566, "learning_rate": 6.64901413616103e-05, "loss": 0.1465, "step": 10506 }, { "epoch": 2.1172677815837195, "grad_norm": 0.06083032488822937, "learning_rate": 6.64775598816962e-05, "loss": 0.1987, "step": 10508 }, { "epoch": 2.1176707636510175, "grad_norm": 0.2716810703277588, "learning_rate": 6.646497723124262e-05, "loss": 0.2102, "step": 10510 }, { "epoch": 2.1180737457183154, "grad_norm": 0.0632266253232956, "learning_rate": 6.645239341114335e-05, "loss": 0.1823, "step": 10512 }, { "epoch": 2.1184767277856134, "grad_norm": 0.0860210657119751, "learning_rate": 6.64398084222924e-05, "loss": 0.2009, "step": 10514 }, { "epoch": 2.1188797098529117, "grad_norm": 0.07961370795965195, "learning_rate": 6.642722226558374e-05, "loss": 0.1576, "step": 10516 }, { "epoch": 2.1192826919202097, "grad_norm": 0.054681435227394104, "learning_rate": 6.641463494191146e-05, "loss": 0.2364, "step": 10518 }, { "epoch": 2.1196856739875076, "grad_norm": 0.052845437079668045, "learning_rate": 6.640204645216978e-05, "loss": 0.1589, "step": 10520 }, { "epoch": 2.1200886560548056, "grad_norm": 0.5749832987785339, "learning_rate": 6.638945679725295e-05, "loss": 0.1696, "step": 10522 }, { "epoch": 2.1204916381221035, "grad_norm": 0.06033914163708687, "learning_rate": 6.637686597805533e-05, "loss": 0.2144, "step": 10524 }, { "epoch": 2.1208946201894014, "grad_norm": 0.053227026015520096, "learning_rate": 6.636427399547133e-05, "loss": 0.1954, "step": 10526 }, { "epoch": 2.1212976022566994, "grad_norm": 0.04543086886405945, "learning_rate": 6.635168085039549e-05, "loss": 0.1772, "step": 10528 }, { "epoch": 2.1217005843239978, "grad_norm": 0.05598034709692001, "learning_rate": 6.633908654372239e-05, "loss": 0.2335, "step": 10530 }, { "epoch": 2.1221035663912957, "grad_norm": 0.061585236340761185, "learning_rate": 6.63264910763467e-05, "loss": 0.2383, "step": 10532 }, { "epoch": 2.1225065484585937, "grad_norm": 0.0683068037033081, "learning_rate": 6.63138944491632e-05, "loss": 0.1791, "step": 10534 }, { "epoch": 2.1229095305258916, "grad_norm": 0.053205978125333786, "learning_rate": 6.630129666306674e-05, "loss": 0.1677, "step": 10536 }, { "epoch": 2.1233125125931895, "grad_norm": 0.06105554848909378, "learning_rate": 6.628869771895223e-05, "loss": 0.2059, "step": 10538 }, { "epoch": 2.1237154946604875, "grad_norm": 0.053694818168878555, "learning_rate": 6.627609761771467e-05, "loss": 0.1753, "step": 10540 }, { "epoch": 2.1241184767277854, "grad_norm": 0.04918495565652847, "learning_rate": 6.626349636024918e-05, "loss": 0.1985, "step": 10542 }, { "epoch": 2.124521458795084, "grad_norm": 0.04894405975937843, "learning_rate": 6.625089394745092e-05, "loss": 0.1918, "step": 10544 }, { "epoch": 2.1249244408623817, "grad_norm": 0.04774921387434006, "learning_rate": 6.623829038021512e-05, "loss": 0.1785, "step": 10546 }, { "epoch": 2.1253274229296797, "grad_norm": 0.06385761499404907, "learning_rate": 6.622568565943717e-05, "loss": 0.1786, "step": 10548 }, { "epoch": 2.1257304049969776, "grad_norm": 0.05672699585556984, "learning_rate": 6.621307978601246e-05, "loss": 0.174, "step": 10550 }, { "epoch": 2.1261333870642756, "grad_norm": 0.04841500148177147, "learning_rate": 6.620047276083646e-05, "loss": 0.1782, "step": 10552 }, { "epoch": 2.1265363691315735, "grad_norm": 0.050164107233285904, "learning_rate": 6.61878645848048e-05, "loss": 0.1399, "step": 10554 }, { "epoch": 2.1269393511988715, "grad_norm": 0.05736628547310829, "learning_rate": 6.617525525881315e-05, "loss": 0.2191, "step": 10556 }, { "epoch": 2.12734233326617, "grad_norm": 0.055756233632564545, "learning_rate": 6.61626447837572e-05, "loss": 0.1473, "step": 10558 }, { "epoch": 2.127745315333468, "grad_norm": 0.06790580600500107, "learning_rate": 6.615003316053283e-05, "loss": 0.2168, "step": 10560 }, { "epoch": 2.1281482974007657, "grad_norm": 0.05667269602417946, "learning_rate": 6.613742039003594e-05, "loss": 0.1785, "step": 10562 }, { "epoch": 2.1285512794680637, "grad_norm": 0.043748367577791214, "learning_rate": 6.612480647316251e-05, "loss": 0.1818, "step": 10564 }, { "epoch": 2.1289542615353616, "grad_norm": 0.05138718709349632, "learning_rate": 6.61121914108086e-05, "loss": 0.1582, "step": 10566 }, { "epoch": 2.1293572436026595, "grad_norm": 0.043073199689388275, "learning_rate": 6.609957520387039e-05, "loss": 0.1383, "step": 10568 }, { "epoch": 2.1297602256699575, "grad_norm": 0.07397390902042389, "learning_rate": 6.60869578532441e-05, "loss": 0.1539, "step": 10570 }, { "epoch": 2.130163207737256, "grad_norm": 0.07981264591217041, "learning_rate": 6.607433935982607e-05, "loss": 0.2419, "step": 10572 }, { "epoch": 2.130566189804554, "grad_norm": 0.041248392313718796, "learning_rate": 6.606171972451266e-05, "loss": 0.1984, "step": 10574 }, { "epoch": 2.1309691718718518, "grad_norm": 0.055371470749378204, "learning_rate": 6.604909894820037e-05, "loss": 0.1927, "step": 10576 }, { "epoch": 2.1313721539391497, "grad_norm": 0.08325458317995071, "learning_rate": 6.603647703178577e-05, "loss": 0.2336, "step": 10578 }, { "epoch": 2.1317751360064476, "grad_norm": 0.053513310849666595, "learning_rate": 6.602385397616547e-05, "loss": 0.1879, "step": 10580 }, { "epoch": 2.1321781180737456, "grad_norm": 0.05937230959534645, "learning_rate": 6.601122978223622e-05, "loss": 0.2393, "step": 10582 }, { "epoch": 2.1325811001410435, "grad_norm": 0.05260048806667328, "learning_rate": 6.599860445089481e-05, "loss": 0.1531, "step": 10584 }, { "epoch": 2.132984082208342, "grad_norm": 0.06879477202892303, "learning_rate": 6.598597798303813e-05, "loss": 0.1872, "step": 10586 }, { "epoch": 2.13338706427564, "grad_norm": 0.11585239320993423, "learning_rate": 6.597335037956313e-05, "loss": 0.2035, "step": 10588 }, { "epoch": 2.133790046342938, "grad_norm": 0.04980898275971413, "learning_rate": 6.596072164136689e-05, "loss": 0.17, "step": 10590 }, { "epoch": 2.1341930284102357, "grad_norm": 0.05175916105508804, "learning_rate": 6.594809176934649e-05, "loss": 0.2088, "step": 10592 }, { "epoch": 2.1345960104775337, "grad_norm": 0.0495474636554718, "learning_rate": 6.593546076439915e-05, "loss": 0.1725, "step": 10594 }, { "epoch": 2.1349989925448316, "grad_norm": 0.07358019053936005, "learning_rate": 6.592282862742217e-05, "loss": 0.218, "step": 10596 }, { "epoch": 2.1354019746121295, "grad_norm": 0.05674292892217636, "learning_rate": 6.591019535931291e-05, "loss": 0.2143, "step": 10598 }, { "epoch": 2.135804956679428, "grad_norm": 0.054214123636484146, "learning_rate": 6.589756096096881e-05, "loss": 0.1864, "step": 10600 }, { "epoch": 2.136207938746726, "grad_norm": 0.06335791945457458, "learning_rate": 6.588492543328741e-05, "loss": 0.2023, "step": 10602 }, { "epoch": 2.136610920814024, "grad_norm": 0.05495510995388031, "learning_rate": 6.587228877716632e-05, "loss": 0.1576, "step": 10604 }, { "epoch": 2.1370139028813218, "grad_norm": 0.05250907689332962, "learning_rate": 6.58596509935032e-05, "loss": 0.1167, "step": 10606 }, { "epoch": 2.1374168849486197, "grad_norm": 0.05653761699795723, "learning_rate": 6.584701208319586e-05, "loss": 0.1657, "step": 10608 }, { "epoch": 2.1378198670159176, "grad_norm": 0.046690188348293304, "learning_rate": 6.58343720471421e-05, "loss": 0.2224, "step": 10610 }, { "epoch": 2.138222849083216, "grad_norm": 0.06347212195396423, "learning_rate": 6.582173088623988e-05, "loss": 0.2408, "step": 10612 }, { "epoch": 2.138625831150514, "grad_norm": 0.08230816572904587, "learning_rate": 6.580908860138722e-05, "loss": 0.2525, "step": 10614 }, { "epoch": 2.139028813217812, "grad_norm": 0.079873226583004, "learning_rate": 6.579644519348217e-05, "loss": 0.1941, "step": 10616 }, { "epoch": 2.13943179528511, "grad_norm": 0.0678006038069725, "learning_rate": 6.578380066342291e-05, "loss": 0.1839, "step": 10618 }, { "epoch": 2.139834777352408, "grad_norm": 0.055864691734313965, "learning_rate": 6.577115501210771e-05, "loss": 0.2021, "step": 10620 }, { "epoch": 2.1402377594197057, "grad_norm": 0.08479974418878555, "learning_rate": 6.575850824043488e-05, "loss": 0.233, "step": 10622 }, { "epoch": 2.1406407414870037, "grad_norm": 0.08520139008760452, "learning_rate": 6.574586034930282e-05, "loss": 0.1911, "step": 10624 }, { "epoch": 2.1410437235543016, "grad_norm": 0.06425566226243973, "learning_rate": 6.573321133961003e-05, "loss": 0.1932, "step": 10626 }, { "epoch": 2.1414467056216, "grad_norm": 0.04391736537218094, "learning_rate": 6.572056121225505e-05, "loss": 0.1764, "step": 10628 }, { "epoch": 2.141849687688898, "grad_norm": 0.052960388362407684, "learning_rate": 6.570790996813655e-05, "loss": 0.1735, "step": 10630 }, { "epoch": 2.142252669756196, "grad_norm": 0.06271100789308548, "learning_rate": 6.569525760815326e-05, "loss": 0.1523, "step": 10632 }, { "epoch": 2.142655651823494, "grad_norm": 0.061068955808877945, "learning_rate": 6.568260413320397e-05, "loss": 0.184, "step": 10634 }, { "epoch": 2.1430586338907918, "grad_norm": 0.04499977454543114, "learning_rate": 6.566994954418755e-05, "loss": 0.1898, "step": 10636 }, { "epoch": 2.1434616159580897, "grad_norm": 0.06739667057991028, "learning_rate": 6.565729384200297e-05, "loss": 0.203, "step": 10638 }, { "epoch": 2.143864598025388, "grad_norm": 0.041490063071250916, "learning_rate": 6.564463702754929e-05, "loss": 0.1897, "step": 10640 }, { "epoch": 2.144267580092686, "grad_norm": 0.04645445942878723, "learning_rate": 6.563197910172562e-05, "loss": 0.1341, "step": 10642 }, { "epoch": 2.144670562159984, "grad_norm": 0.043481625616550446, "learning_rate": 6.561932006543115e-05, "loss": 0.1423, "step": 10644 }, { "epoch": 2.145073544227282, "grad_norm": 0.06526371836662292, "learning_rate": 6.560665991956514e-05, "loss": 0.1584, "step": 10646 }, { "epoch": 2.14547652629458, "grad_norm": 0.0430811382830143, "learning_rate": 6.5593998665027e-05, "loss": 0.1757, "step": 10648 }, { "epoch": 2.145879508361878, "grad_norm": 0.08176197856664658, "learning_rate": 6.558133630271611e-05, "loss": 0.1898, "step": 10650 }, { "epoch": 2.1462824904291757, "grad_norm": 0.05684948340058327, "learning_rate": 6.5568672833532e-05, "loss": 0.1899, "step": 10652 }, { "epoch": 2.1466854724964737, "grad_norm": 0.04240806773304939, "learning_rate": 6.555600825837431e-05, "loss": 0.1811, "step": 10654 }, { "epoch": 2.147088454563772, "grad_norm": 0.07894350588321686, "learning_rate": 6.554334257814264e-05, "loss": 0.204, "step": 10656 }, { "epoch": 2.14749143663107, "grad_norm": 0.06273147463798523, "learning_rate": 6.553067579373677e-05, "loss": 0.2152, "step": 10658 }, { "epoch": 2.147894418698368, "grad_norm": 0.06141069903969765, "learning_rate": 6.551800790605655e-05, "loss": 0.1701, "step": 10660 }, { "epoch": 2.148297400765666, "grad_norm": 0.04876910522580147, "learning_rate": 6.550533891600186e-05, "loss": 0.1384, "step": 10662 }, { "epoch": 2.148700382832964, "grad_norm": 0.0704379603266716, "learning_rate": 6.549266882447268e-05, "loss": 0.1589, "step": 10664 }, { "epoch": 2.1491033649002618, "grad_norm": 0.05774037539958954, "learning_rate": 6.547999763236909e-05, "loss": 0.1674, "step": 10666 }, { "epoch": 2.14950634696756, "grad_norm": 0.05031192675232887, "learning_rate": 6.546732534059122e-05, "loss": 0.2287, "step": 10668 }, { "epoch": 2.149909329034858, "grad_norm": 0.06176503375172615, "learning_rate": 6.54546519500393e-05, "loss": 0.1826, "step": 10670 }, { "epoch": 2.150312311102156, "grad_norm": 0.05960913375020027, "learning_rate": 6.544197746161363e-05, "loss": 0.2152, "step": 10672 }, { "epoch": 2.150715293169454, "grad_norm": 0.07033538818359375, "learning_rate": 6.542930187621455e-05, "loss": 0.1944, "step": 10674 }, { "epoch": 2.151118275236752, "grad_norm": 0.05268462374806404, "learning_rate": 6.541662519474256e-05, "loss": 0.2345, "step": 10676 }, { "epoch": 2.15152125730405, "grad_norm": 0.05274904519319534, "learning_rate": 6.540394741809818e-05, "loss": 0.1759, "step": 10678 }, { "epoch": 2.151924239371348, "grad_norm": 0.06389094144105911, "learning_rate": 6.539126854718198e-05, "loss": 0.1585, "step": 10680 }, { "epoch": 2.152327221438646, "grad_norm": 0.07119549810886383, "learning_rate": 6.53785885828947e-05, "loss": 0.2174, "step": 10682 }, { "epoch": 2.152730203505944, "grad_norm": 0.0506424643099308, "learning_rate": 6.536590752613708e-05, "loss": 0.1525, "step": 10684 }, { "epoch": 2.153133185573242, "grad_norm": 0.06556964665651321, "learning_rate": 6.535322537780997e-05, "loss": 0.1803, "step": 10686 }, { "epoch": 2.15353616764054, "grad_norm": 0.0708077996969223, "learning_rate": 6.534054213881426e-05, "loss": 0.2223, "step": 10688 }, { "epoch": 2.153939149707838, "grad_norm": 0.05754520371556282, "learning_rate": 6.5327857810051e-05, "loss": 0.2328, "step": 10690 }, { "epoch": 2.154342131775136, "grad_norm": 0.0400908887386322, "learning_rate": 6.531517239242121e-05, "loss": 0.1488, "step": 10692 }, { "epoch": 2.154745113842434, "grad_norm": 0.0965040922164917, "learning_rate": 6.530248588682607e-05, "loss": 0.1879, "step": 10694 }, { "epoch": 2.155148095909732, "grad_norm": 0.05838022381067276, "learning_rate": 6.528979829416682e-05, "loss": 0.2306, "step": 10696 }, { "epoch": 2.15555107797703, "grad_norm": 0.05278877168893814, "learning_rate": 6.527710961534473e-05, "loss": 0.2185, "step": 10698 }, { "epoch": 2.155954060044328, "grad_norm": 0.0592544861137867, "learning_rate": 6.526441985126121e-05, "loss": 0.18, "step": 10700 }, { "epoch": 2.156357042111626, "grad_norm": 0.17698846757411957, "learning_rate": 6.525172900281774e-05, "loss": 0.1923, "step": 10702 }, { "epoch": 2.156760024178924, "grad_norm": 0.05410192534327507, "learning_rate": 6.523903707091581e-05, "loss": 0.2426, "step": 10704 }, { "epoch": 2.157163006246222, "grad_norm": 0.049191661179065704, "learning_rate": 6.522634405645705e-05, "loss": 0.16, "step": 10706 }, { "epoch": 2.15756598831352, "grad_norm": 0.049240659922361374, "learning_rate": 6.521364996034318e-05, "loss": 0.1309, "step": 10708 }, { "epoch": 2.1579689703808183, "grad_norm": 0.04705396294593811, "learning_rate": 6.520095478347594e-05, "loss": 0.1748, "step": 10710 }, { "epoch": 2.158371952448116, "grad_norm": 0.0652155801653862, "learning_rate": 6.518825852675719e-05, "loss": 0.206, "step": 10712 }, { "epoch": 2.158774934515414, "grad_norm": 0.06381936371326447, "learning_rate": 6.517556119108882e-05, "loss": 0.2326, "step": 10714 }, { "epoch": 2.159177916582712, "grad_norm": 0.05069877207279205, "learning_rate": 6.516286277737287e-05, "loss": 0.2082, "step": 10716 }, { "epoch": 2.15958089865001, "grad_norm": 0.05980228632688522, "learning_rate": 6.51501632865114e-05, "loss": 0.2605, "step": 10718 }, { "epoch": 2.159983880717308, "grad_norm": 0.054766424000263214, "learning_rate": 6.513746271940656e-05, "loss": 0.1951, "step": 10720 }, { "epoch": 2.160386862784606, "grad_norm": 0.07653187215328217, "learning_rate": 6.512476107696058e-05, "loss": 0.2033, "step": 10722 }, { "epoch": 2.1607898448519043, "grad_norm": 0.06018948182463646, "learning_rate": 6.511205836007575e-05, "loss": 0.186, "step": 10724 }, { "epoch": 2.1611928269192022, "grad_norm": 0.06138040870428085, "learning_rate": 6.509935456965446e-05, "loss": 0.1764, "step": 10726 }, { "epoch": 2.1615958089865, "grad_norm": 0.055142004042863846, "learning_rate": 6.508664970659917e-05, "loss": 0.179, "step": 10728 }, { "epoch": 2.161998791053798, "grad_norm": 0.07405485212802887, "learning_rate": 6.507394377181243e-05, "loss": 0.2281, "step": 10730 }, { "epoch": 2.162401773121096, "grad_norm": 0.06127721071243286, "learning_rate": 6.506123676619682e-05, "loss": 0.2391, "step": 10732 }, { "epoch": 2.162804755188394, "grad_norm": 0.178990438580513, "learning_rate": 6.504852869065503e-05, "loss": 0.1965, "step": 10734 }, { "epoch": 2.163207737255692, "grad_norm": 0.05538182333111763, "learning_rate": 6.503581954608984e-05, "loss": 0.1992, "step": 10736 }, { "epoch": 2.1636107193229903, "grad_norm": 0.05879183113574982, "learning_rate": 6.502310933340407e-05, "loss": 0.1626, "step": 10738 }, { "epoch": 2.1640137013902883, "grad_norm": 0.06185007095336914, "learning_rate": 6.501039805350063e-05, "loss": 0.1972, "step": 10740 }, { "epoch": 2.164416683457586, "grad_norm": 0.06894201785326004, "learning_rate": 6.499768570728254e-05, "loss": 0.2017, "step": 10742 }, { "epoch": 2.164819665524884, "grad_norm": 0.07132090628147125, "learning_rate": 6.498497229565283e-05, "loss": 0.2323, "step": 10744 }, { "epoch": 2.165222647592182, "grad_norm": 0.06672022491693497, "learning_rate": 6.497225781951465e-05, "loss": 0.2459, "step": 10746 }, { "epoch": 2.16562562965948, "grad_norm": 0.04566428065299988, "learning_rate": 6.495954227977123e-05, "loss": 0.1642, "step": 10748 }, { "epoch": 2.166028611726778, "grad_norm": 0.077194444835186, "learning_rate": 6.494682567732584e-05, "loss": 0.2101, "step": 10750 }, { "epoch": 2.1664315937940763, "grad_norm": 0.07681397348642349, "learning_rate": 6.493410801308185e-05, "loss": 0.1838, "step": 10752 }, { "epoch": 2.1668345758613743, "grad_norm": 0.08021048456430435, "learning_rate": 6.492138928794274e-05, "loss": 0.2141, "step": 10754 }, { "epoch": 2.1672375579286722, "grad_norm": 0.0956893116235733, "learning_rate": 6.490866950281196e-05, "loss": 0.1837, "step": 10756 }, { "epoch": 2.16764053999597, "grad_norm": 0.06688991189002991, "learning_rate": 6.489594865859316e-05, "loss": 0.1923, "step": 10758 }, { "epoch": 2.168043522063268, "grad_norm": 0.06143457069993019, "learning_rate": 6.488322675619e-05, "loss": 0.1785, "step": 10760 }, { "epoch": 2.168446504130566, "grad_norm": 0.06079430878162384, "learning_rate": 6.487050379650622e-05, "loss": 0.1553, "step": 10762 }, { "epoch": 2.168849486197864, "grad_norm": 0.03899361938238144, "learning_rate": 6.48577797804456e-05, "loss": 0.1434, "step": 10764 }, { "epoch": 2.1692524682651624, "grad_norm": 0.05817665159702301, "learning_rate": 6.484505470891209e-05, "loss": 0.1804, "step": 10766 }, { "epoch": 2.1696554503324603, "grad_norm": 0.06685777008533478, "learning_rate": 6.483232858280962e-05, "loss": 0.195, "step": 10768 }, { "epoch": 2.1700584323997583, "grad_norm": 0.05597671493887901, "learning_rate": 6.481960140304225e-05, "loss": 0.1987, "step": 10770 }, { "epoch": 2.170461414467056, "grad_norm": 0.059224747121334076, "learning_rate": 6.48068731705141e-05, "loss": 0.1825, "step": 10772 }, { "epoch": 2.170864396534354, "grad_norm": 0.07822896540164948, "learning_rate": 6.479414388612936e-05, "loss": 0.1742, "step": 10774 }, { "epoch": 2.171267378601652, "grad_norm": 0.070325568318367, "learning_rate": 6.47814135507923e-05, "loss": 0.2235, "step": 10776 }, { "epoch": 2.17167036066895, "grad_norm": 0.0596688948571682, "learning_rate": 6.476868216540728e-05, "loss": 0.1769, "step": 10778 }, { "epoch": 2.1720733427362484, "grad_norm": 0.04775102809071541, "learning_rate": 6.475594973087866e-05, "loss": 0.1415, "step": 10780 }, { "epoch": 2.1724763248035464, "grad_norm": 0.06646701693534851, "learning_rate": 6.474321624811101e-05, "loss": 0.1886, "step": 10782 }, { "epoch": 2.1728793068708443, "grad_norm": 0.05909574404358864, "learning_rate": 6.473048171800882e-05, "loss": 0.2096, "step": 10784 }, { "epoch": 2.1732822889381422, "grad_norm": 0.10706569254398346, "learning_rate": 6.471774614147678e-05, "loss": 0.214, "step": 10786 }, { "epoch": 2.17368527100544, "grad_norm": 0.06290362775325775, "learning_rate": 6.47050095194196e-05, "loss": 0.2276, "step": 10788 }, { "epoch": 2.174088253072738, "grad_norm": 0.06873378157615662, "learning_rate": 6.469227185274204e-05, "loss": 0.1938, "step": 10790 }, { "epoch": 2.174491235140036, "grad_norm": 0.14236712455749512, "learning_rate": 6.4679533142349e-05, "loss": 0.1875, "step": 10792 }, { "epoch": 2.1748942172073344, "grad_norm": 0.05300990492105484, "learning_rate": 6.466679338914542e-05, "loss": 0.1466, "step": 10794 }, { "epoch": 2.1752971992746324, "grad_norm": 0.06521982699632645, "learning_rate": 6.465405259403626e-05, "loss": 0.2241, "step": 10796 }, { "epoch": 2.1757001813419303, "grad_norm": 0.08661910891532898, "learning_rate": 6.464131075792665e-05, "loss": 0.2403, "step": 10798 }, { "epoch": 2.1761031634092283, "grad_norm": 0.05931783467531204, "learning_rate": 6.462856788172175e-05, "loss": 0.1893, "step": 10800 }, { "epoch": 2.176506145476526, "grad_norm": 0.07565109431743622, "learning_rate": 6.461582396632677e-05, "loss": 0.1655, "step": 10802 }, { "epoch": 2.176909127543824, "grad_norm": 0.08917911350727081, "learning_rate": 6.460307901264704e-05, "loss": 0.167, "step": 10804 }, { "epoch": 2.1773121096111225, "grad_norm": 0.053038131445646286, "learning_rate": 6.459033302158793e-05, "loss": 0.1885, "step": 10806 }, { "epoch": 2.1777150916784205, "grad_norm": 0.062328778207302094, "learning_rate": 6.457758599405489e-05, "loss": 0.1445, "step": 10808 }, { "epoch": 2.1781180737457184, "grad_norm": 0.06661716103553772, "learning_rate": 6.456483793095345e-05, "loss": 0.1779, "step": 10810 }, { "epoch": 2.1785210558130164, "grad_norm": 0.07117008417844772, "learning_rate": 6.455208883318923e-05, "loss": 0.2424, "step": 10812 }, { "epoch": 2.1789240378803143, "grad_norm": 0.05308781564235687, "learning_rate": 6.453933870166788e-05, "loss": 0.2022, "step": 10814 }, { "epoch": 2.1793270199476122, "grad_norm": 0.07809494435787201, "learning_rate": 6.452658753729517e-05, "loss": 0.2611, "step": 10816 }, { "epoch": 2.17973000201491, "grad_norm": 0.06897277384996414, "learning_rate": 6.451383534097692e-05, "loss": 0.1904, "step": 10818 }, { "epoch": 2.180132984082208, "grad_norm": 0.050235819071531296, "learning_rate": 6.450108211361899e-05, "loss": 0.1894, "step": 10820 }, { "epoch": 2.1805359661495065, "grad_norm": 0.053104523569345474, "learning_rate": 6.448832785612739e-05, "loss": 0.1917, "step": 10822 }, { "epoch": 2.1809389482168045, "grad_norm": 0.049236420542001724, "learning_rate": 6.447557256940817e-05, "loss": 0.169, "step": 10824 }, { "epoch": 2.1813419302841024, "grad_norm": 0.10533499717712402, "learning_rate": 6.446281625436741e-05, "loss": 0.2435, "step": 10826 }, { "epoch": 2.1817449123514003, "grad_norm": 0.05212007090449333, "learning_rate": 6.44500589119113e-05, "loss": 0.1979, "step": 10828 }, { "epoch": 2.1821478944186983, "grad_norm": 0.05394808202981949, "learning_rate": 6.443730054294614e-05, "loss": 0.202, "step": 10830 }, { "epoch": 2.182550876485996, "grad_norm": 0.05483834818005562, "learning_rate": 6.442454114837823e-05, "loss": 0.1981, "step": 10832 }, { "epoch": 2.1829538585532946, "grad_norm": 0.05570824816823006, "learning_rate": 6.441178072911398e-05, "loss": 0.1896, "step": 10834 }, { "epoch": 2.1833568406205925, "grad_norm": 0.07239922881126404, "learning_rate": 6.439901928605988e-05, "loss": 0.2285, "step": 10836 }, { "epoch": 2.1837598226878905, "grad_norm": 0.04814567789435387, "learning_rate": 6.438625682012248e-05, "loss": 0.1692, "step": 10838 }, { "epoch": 2.1841628047551884, "grad_norm": 0.07324769347906113, "learning_rate": 6.437349333220838e-05, "loss": 0.2088, "step": 10840 }, { "epoch": 2.1845657868224864, "grad_norm": 0.04402107000350952, "learning_rate": 6.436072882322432e-05, "loss": 0.1789, "step": 10842 }, { "epoch": 2.1849687688897843, "grad_norm": 2.5281331539154053, "learning_rate": 6.434796329407705e-05, "loss": 0.1853, "step": 10844 }, { "epoch": 2.1853717509570822, "grad_norm": 0.04652038589119911, "learning_rate": 6.433519674567342e-05, "loss": 0.1921, "step": 10846 }, { "epoch": 2.18577473302438, "grad_norm": 0.04080631211400032, "learning_rate": 6.432242917892033e-05, "loss": 0.1401, "step": 10848 }, { "epoch": 2.1861777150916786, "grad_norm": 0.062017377465963364, "learning_rate": 6.430966059472478e-05, "loss": 0.1588, "step": 10850 }, { "epoch": 2.1865806971589765, "grad_norm": 0.034065987914800644, "learning_rate": 6.429689099399383e-05, "loss": 0.133, "step": 10852 }, { "epoch": 2.1869836792262745, "grad_norm": 0.044415101408958435, "learning_rate": 6.428412037763459e-05, "loss": 0.1488, "step": 10854 }, { "epoch": 2.1873866612935724, "grad_norm": 0.07276732474565506, "learning_rate": 6.42713487465543e-05, "loss": 0.2159, "step": 10856 }, { "epoch": 2.1877896433608703, "grad_norm": 0.040792450308799744, "learning_rate": 6.425857610166021e-05, "loss": 0.1691, "step": 10858 }, { "epoch": 2.1881926254281683, "grad_norm": 0.0323706679046154, "learning_rate": 6.42458024438597e-05, "loss": 0.1333, "step": 10860 }, { "epoch": 2.1885956074954667, "grad_norm": 0.053765907883644104, "learning_rate": 6.423302777406013e-05, "loss": 0.2161, "step": 10862 }, { "epoch": 2.1889985895627646, "grad_norm": 0.06381344050168991, "learning_rate": 6.422025209316906e-05, "loss": 0.2325, "step": 10864 }, { "epoch": 2.1894015716300625, "grad_norm": 0.046741727739572525, "learning_rate": 6.4207475402094e-05, "loss": 0.1963, "step": 10866 }, { "epoch": 2.1898045536973605, "grad_norm": 0.06143912300467491, "learning_rate": 6.419469770174263e-05, "loss": 0.1893, "step": 10868 }, { "epoch": 2.1902075357646584, "grad_norm": 0.06888391822576523, "learning_rate": 6.418191899302263e-05, "loss": 0.16, "step": 10870 }, { "epoch": 2.1906105178319564, "grad_norm": 0.0596514530479908, "learning_rate": 6.416913927684177e-05, "loss": 0.2057, "step": 10872 }, { "epoch": 2.1910134998992543, "grad_norm": 0.0417722687125206, "learning_rate": 6.415635855410793e-05, "loss": 0.1721, "step": 10874 }, { "epoch": 2.1914164819665523, "grad_norm": 0.06044435501098633, "learning_rate": 6.414357682572903e-05, "loss": 0.2167, "step": 10876 }, { "epoch": 2.1918194640338506, "grad_norm": 0.05764295905828476, "learning_rate": 6.413079409261302e-05, "loss": 0.1648, "step": 10878 }, { "epoch": 2.1922224461011486, "grad_norm": 0.0555766336619854, "learning_rate": 6.411801035566801e-05, "loss": 0.1291, "step": 10880 }, { "epoch": 2.1926254281684465, "grad_norm": 0.06001344323158264, "learning_rate": 6.410522561580213e-05, "loss": 0.1942, "step": 10882 }, { "epoch": 2.1930284102357445, "grad_norm": 0.05169009789824486, "learning_rate": 6.409243987392358e-05, "loss": 0.1904, "step": 10884 }, { "epoch": 2.1934313923030424, "grad_norm": 0.0572587326169014, "learning_rate": 6.407965313094063e-05, "loss": 0.1922, "step": 10886 }, { "epoch": 2.1938343743703403, "grad_norm": 0.05167214199900627, "learning_rate": 6.406686538776166e-05, "loss": 0.1879, "step": 10888 }, { "epoch": 2.1942373564376387, "grad_norm": 0.056762244552373886, "learning_rate": 6.405407664529503e-05, "loss": 0.1969, "step": 10890 }, { "epoch": 2.1946403385049367, "grad_norm": 0.056261587888002396, "learning_rate": 6.40412869044493e-05, "loss": 0.233, "step": 10892 }, { "epoch": 2.1950433205722346, "grad_norm": 0.0477316789329052, "learning_rate": 6.4028496166133e-05, "loss": 0.1761, "step": 10894 }, { "epoch": 2.1954463026395326, "grad_norm": 0.06606003642082214, "learning_rate": 6.401570443125477e-05, "loss": 0.205, "step": 10896 }, { "epoch": 2.1958492847068305, "grad_norm": 0.050838105380535126, "learning_rate": 6.40029117007233e-05, "loss": 0.1704, "step": 10898 }, { "epoch": 2.1962522667741284, "grad_norm": 0.04778251424431801, "learning_rate": 6.399011797544739e-05, "loss": 0.1734, "step": 10900 }, { "epoch": 2.1966552488414264, "grad_norm": 0.06802946329116821, "learning_rate": 6.397732325633587e-05, "loss": 0.2171, "step": 10902 }, { "epoch": 2.1970582309087248, "grad_norm": 0.07286917418241501, "learning_rate": 6.396452754429766e-05, "loss": 0.1792, "step": 10904 }, { "epoch": 2.1974612129760227, "grad_norm": 0.055850863456726074, "learning_rate": 6.395173084024174e-05, "loss": 0.1678, "step": 10906 }, { "epoch": 2.1978641950433206, "grad_norm": 0.0752958357334137, "learning_rate": 6.393893314507717e-05, "loss": 0.2086, "step": 10908 }, { "epoch": 2.1982671771106186, "grad_norm": 0.06332895159721375, "learning_rate": 6.39261344597131e-05, "loss": 0.2271, "step": 10910 }, { "epoch": 2.1986701591779165, "grad_norm": 0.06559202075004578, "learning_rate": 6.39133347850587e-05, "loss": 0.2126, "step": 10912 }, { "epoch": 2.1990731412452145, "grad_norm": 0.08061282336711884, "learning_rate": 6.390053412202324e-05, "loss": 0.2651, "step": 10914 }, { "epoch": 2.1994761233125124, "grad_norm": 0.06229247525334358, "learning_rate": 6.388773247151606e-05, "loss": 0.2288, "step": 10916 }, { "epoch": 2.199879105379811, "grad_norm": 0.0519050732254982, "learning_rate": 6.38749298344466e-05, "loss": 0.1691, "step": 10918 }, { "epoch": 2.2002820874471087, "grad_norm": 0.059164535254240036, "learning_rate": 6.38621262117243e-05, "loss": 0.1838, "step": 10920 }, { "epoch": 2.2006850695144067, "grad_norm": 0.054849907755851746, "learning_rate": 6.384932160425873e-05, "loss": 0.1956, "step": 10922 }, { "epoch": 2.2010880515817046, "grad_norm": 0.05935594439506531, "learning_rate": 6.38365160129595e-05, "loss": 0.2014, "step": 10924 }, { "epoch": 2.2014910336490026, "grad_norm": 0.06581877917051315, "learning_rate": 6.382370943873629e-05, "loss": 0.1512, "step": 10926 }, { "epoch": 2.2018940157163005, "grad_norm": 0.03840584307909012, "learning_rate": 6.381090188249889e-05, "loss": 0.157, "step": 10928 }, { "epoch": 2.2022969977835984, "grad_norm": 0.05577847361564636, "learning_rate": 6.37980933451571e-05, "loss": 0.1547, "step": 10930 }, { "epoch": 2.202699979850897, "grad_norm": 0.07472618669271469, "learning_rate": 6.378528382762082e-05, "loss": 0.1814, "step": 10932 }, { "epoch": 2.2031029619181948, "grad_norm": 0.049337152391672134, "learning_rate": 6.377247333080002e-05, "loss": 0.1926, "step": 10934 }, { "epoch": 2.2035059439854927, "grad_norm": 0.06512777507305145, "learning_rate": 6.375966185560473e-05, "loss": 0.1905, "step": 10936 }, { "epoch": 2.2039089260527907, "grad_norm": 0.07381617277860641, "learning_rate": 6.374684940294508e-05, "loss": 0.1729, "step": 10938 }, { "epoch": 2.2043119081200886, "grad_norm": 0.0456584207713604, "learning_rate": 6.373403597373125e-05, "loss": 0.1729, "step": 10940 }, { "epoch": 2.2047148901873865, "grad_norm": 0.0703442320227623, "learning_rate": 6.372122156887345e-05, "loss": 0.1971, "step": 10942 }, { "epoch": 2.2051178722546845, "grad_norm": 0.06072754040360451, "learning_rate": 6.370840618928202e-05, "loss": 0.2005, "step": 10944 }, { "epoch": 2.205520854321983, "grad_norm": 0.06688905507326126, "learning_rate": 6.369558983586733e-05, "loss": 0.1741, "step": 10946 }, { "epoch": 2.205923836389281, "grad_norm": 0.07325995713472366, "learning_rate": 6.368277250953985e-05, "loss": 0.1826, "step": 10948 }, { "epoch": 2.2063268184565787, "grad_norm": 0.06107252091169357, "learning_rate": 6.366995421121009e-05, "loss": 0.1818, "step": 10950 }, { "epoch": 2.2067298005238767, "grad_norm": 0.07642047852277756, "learning_rate": 6.365713494178865e-05, "loss": 0.1636, "step": 10952 }, { "epoch": 2.2071327825911746, "grad_norm": 0.05578227341175079, "learning_rate": 6.364431470218617e-05, "loss": 0.1681, "step": 10954 }, { "epoch": 2.2075357646584726, "grad_norm": 0.054211586713790894, "learning_rate": 6.363149349331341e-05, "loss": 0.1663, "step": 10956 }, { "epoch": 2.2079387467257705, "grad_norm": 0.054358530789613724, "learning_rate": 6.361867131608115e-05, "loss": 0.1941, "step": 10958 }, { "epoch": 2.208341728793069, "grad_norm": 0.0320722721517086, "learning_rate": 6.360584817140025e-05, "loss": 0.114, "step": 10960 }, { "epoch": 2.208744710860367, "grad_norm": 0.06492000073194504, "learning_rate": 6.359302406018166e-05, "loss": 0.2156, "step": 10962 }, { "epoch": 2.2091476929276648, "grad_norm": 0.06293896585702896, "learning_rate": 6.358019898333638e-05, "loss": 0.1998, "step": 10964 }, { "epoch": 2.2095506749949627, "grad_norm": 0.06004882603883743, "learning_rate": 6.356737294177547e-05, "loss": 0.2076, "step": 10966 }, { "epoch": 2.2099536570622607, "grad_norm": 0.05569273978471756, "learning_rate": 6.35545459364101e-05, "loss": 0.208, "step": 10968 }, { "epoch": 2.2103566391295586, "grad_norm": 0.061380334198474884, "learning_rate": 6.354171796815146e-05, "loss": 0.2243, "step": 10970 }, { "epoch": 2.2107596211968565, "grad_norm": 0.05998251214623451, "learning_rate": 6.352888903791083e-05, "loss": 0.1921, "step": 10972 }, { "epoch": 2.211162603264155, "grad_norm": 0.059841521084308624, "learning_rate": 6.351605914659957e-05, "loss": 0.1814, "step": 10974 }, { "epoch": 2.211565585331453, "grad_norm": 0.05141017213463783, "learning_rate": 6.350322829512908e-05, "loss": 0.1651, "step": 10976 }, { "epoch": 2.211968567398751, "grad_norm": 0.051106277853250504, "learning_rate": 6.349039648441084e-05, "loss": 0.1748, "step": 10978 }, { "epoch": 2.2123715494660487, "grad_norm": 0.046505145728588104, "learning_rate": 6.347756371535642e-05, "loss": 0.1569, "step": 10980 }, { "epoch": 2.2127745315333467, "grad_norm": 0.05843675136566162, "learning_rate": 6.346472998887741e-05, "loss": 0.1889, "step": 10982 }, { "epoch": 2.2131775136006446, "grad_norm": 0.05025404691696167, "learning_rate": 6.345189530588553e-05, "loss": 0.1844, "step": 10984 }, { "epoch": 2.2135804956679426, "grad_norm": 0.06326805055141449, "learning_rate": 6.343905966729251e-05, "loss": 0.2044, "step": 10986 }, { "epoch": 2.213983477735241, "grad_norm": 0.05964503809809685, "learning_rate": 6.342622307401019e-05, "loss": 0.2166, "step": 10988 }, { "epoch": 2.214386459802539, "grad_norm": 0.04110246151685715, "learning_rate": 6.341338552695045e-05, "loss": 0.15, "step": 10990 }, { "epoch": 2.214789441869837, "grad_norm": 0.07898419350385666, "learning_rate": 6.340054702702528e-05, "loss": 0.1853, "step": 10992 }, { "epoch": 2.215192423937135, "grad_norm": 0.057005874812603, "learning_rate": 6.338770757514664e-05, "loss": 0.1539, "step": 10994 }, { "epoch": 2.2155954060044327, "grad_norm": 0.05199525132775307, "learning_rate": 6.337486717222668e-05, "loss": 0.241, "step": 10996 }, { "epoch": 2.2159983880717307, "grad_norm": 0.062137044966220856, "learning_rate": 6.336202581917756e-05, "loss": 0.1732, "step": 10998 }, { "epoch": 2.216401370139029, "grad_norm": 0.0612826943397522, "learning_rate": 6.334918351691149e-05, "loss": 0.1975, "step": 11000 }, { "epoch": 2.216804352206327, "grad_norm": 0.07609107345342636, "learning_rate": 6.333634026634074e-05, "loss": 0.1966, "step": 11002 }, { "epoch": 2.217207334273625, "grad_norm": 0.10069447755813599, "learning_rate": 6.332349606837774e-05, "loss": 0.1884, "step": 11004 }, { "epoch": 2.217610316340923, "grad_norm": 0.054409392178058624, "learning_rate": 6.331065092393487e-05, "loss": 0.1815, "step": 11006 }, { "epoch": 2.218013298408221, "grad_norm": 0.059293653815984726, "learning_rate": 6.329780483392466e-05, "loss": 0.1286, "step": 11008 }, { "epoch": 2.2184162804755188, "grad_norm": 0.039616234600543976, "learning_rate": 6.328495779925966e-05, "loss": 0.1335, "step": 11010 }, { "epoch": 2.2188192625428167, "grad_norm": 0.06314606219530106, "learning_rate": 6.32721098208525e-05, "loss": 0.1814, "step": 11012 }, { "epoch": 2.2192222446101146, "grad_norm": 0.04756368324160576, "learning_rate": 6.325926089961589e-05, "loss": 0.164, "step": 11014 }, { "epoch": 2.219625226677413, "grad_norm": 0.06843490153551102, "learning_rate": 6.324641103646258e-05, "loss": 0.2759, "step": 11016 }, { "epoch": 2.220028208744711, "grad_norm": 0.04619716852903366, "learning_rate": 6.323356023230541e-05, "loss": 0.2057, "step": 11018 }, { "epoch": 2.220431190812009, "grad_norm": 0.05599410831928253, "learning_rate": 6.32207084880573e-05, "loss": 0.141, "step": 11020 }, { "epoch": 2.220834172879307, "grad_norm": 0.056852128356695175, "learning_rate": 6.32078558046312e-05, "loss": 0.1811, "step": 11022 }, { "epoch": 2.221237154946605, "grad_norm": 0.07424285262823105, "learning_rate": 6.319500218294013e-05, "loss": 0.1862, "step": 11024 }, { "epoch": 2.2216401370139027, "grad_norm": 0.05813653767108917, "learning_rate": 6.318214762389723e-05, "loss": 0.1829, "step": 11026 }, { "epoch": 2.222043119081201, "grad_norm": 0.08757123351097107, "learning_rate": 6.316929212841563e-05, "loss": 0.1998, "step": 11028 }, { "epoch": 2.222446101148499, "grad_norm": 0.06450676918029785, "learning_rate": 6.315643569740857e-05, "loss": 0.2525, "step": 11030 }, { "epoch": 2.222849083215797, "grad_norm": 0.04911727085709572, "learning_rate": 6.314357833178939e-05, "loss": 0.2204, "step": 11032 }, { "epoch": 2.223252065283095, "grad_norm": 0.056063055992126465, "learning_rate": 6.31307200324714e-05, "loss": 0.2079, "step": 11034 }, { "epoch": 2.223655047350393, "grad_norm": 0.047666437923908234, "learning_rate": 6.311786080036806e-05, "loss": 0.2186, "step": 11036 }, { "epoch": 2.224058029417691, "grad_norm": 0.05272279307246208, "learning_rate": 6.310500063639289e-05, "loss": 0.1373, "step": 11038 }, { "epoch": 2.2244610114849888, "grad_norm": 0.12969458103179932, "learning_rate": 6.30921395414594e-05, "loss": 0.2269, "step": 11040 }, { "epoch": 2.2248639935522867, "grad_norm": 0.0443669892847538, "learning_rate": 6.307927751648127e-05, "loss": 0.1648, "step": 11042 }, { "epoch": 2.225266975619585, "grad_norm": 0.07922135293483734, "learning_rate": 6.306641456237219e-05, "loss": 0.1777, "step": 11044 }, { "epoch": 2.225669957686883, "grad_norm": 0.04765113815665245, "learning_rate": 6.305355068004591e-05, "loss": 0.1786, "step": 11046 }, { "epoch": 2.226072939754181, "grad_norm": 0.08355677127838135, "learning_rate": 6.304068587041625e-05, "loss": 0.2534, "step": 11048 }, { "epoch": 2.226475921821479, "grad_norm": 0.06936746835708618, "learning_rate": 6.302782013439715e-05, "loss": 0.1977, "step": 11050 }, { "epoch": 2.226878903888777, "grad_norm": 0.07767514139413834, "learning_rate": 6.301495347290252e-05, "loss": 0.2054, "step": 11052 }, { "epoch": 2.227281885956075, "grad_norm": 0.03796292096376419, "learning_rate": 6.300208588684641e-05, "loss": 0.1683, "step": 11054 }, { "epoch": 2.227684868023373, "grad_norm": 0.07153323292732239, "learning_rate": 6.298921737714294e-05, "loss": 0.1856, "step": 11056 }, { "epoch": 2.228087850090671, "grad_norm": 0.051289331167936325, "learning_rate": 6.297634794470621e-05, "loss": 0.1724, "step": 11058 }, { "epoch": 2.228490832157969, "grad_norm": 0.04763934388756752, "learning_rate": 6.296347759045049e-05, "loss": 0.1625, "step": 11060 }, { "epoch": 2.228893814225267, "grad_norm": 0.07084707170724869, "learning_rate": 6.295060631529006e-05, "loss": 0.1415, "step": 11062 }, { "epoch": 2.229296796292565, "grad_norm": 0.08750694990158081, "learning_rate": 6.293773412013926e-05, "loss": 0.2066, "step": 11064 }, { "epoch": 2.229699778359863, "grad_norm": 0.058792464435100555, "learning_rate": 6.29248610059125e-05, "loss": 0.2467, "step": 11066 }, { "epoch": 2.230102760427161, "grad_norm": 0.07559051364660263, "learning_rate": 6.291198697352432e-05, "loss": 0.2182, "step": 11068 }, { "epoch": 2.2305057424944588, "grad_norm": 0.047350767999887466, "learning_rate": 6.289911202388921e-05, "loss": 0.1571, "step": 11070 }, { "epoch": 2.230908724561757, "grad_norm": 0.05143246799707413, "learning_rate": 6.288623615792183e-05, "loss": 0.1774, "step": 11072 }, { "epoch": 2.231311706629055, "grad_norm": 0.037440188229084015, "learning_rate": 6.287335937653682e-05, "loss": 0.141, "step": 11074 }, { "epoch": 2.231714688696353, "grad_norm": 0.06669832766056061, "learning_rate": 6.286048168064896e-05, "loss": 0.2078, "step": 11076 }, { "epoch": 2.232117670763651, "grad_norm": 0.06164056062698364, "learning_rate": 6.284760307117304e-05, "loss": 0.1865, "step": 11078 }, { "epoch": 2.232520652830949, "grad_norm": 0.056018486618995667, "learning_rate": 6.283472354902396e-05, "loss": 0.1628, "step": 11080 }, { "epoch": 2.232923634898247, "grad_norm": 0.0484955869615078, "learning_rate": 6.282184311511664e-05, "loss": 0.1794, "step": 11082 }, { "epoch": 2.2333266169655452, "grad_norm": 0.058489102870225906, "learning_rate": 6.280896177036608e-05, "loss": 0.2127, "step": 11084 }, { "epoch": 2.233729599032843, "grad_norm": 0.051653195172548294, "learning_rate": 6.279607951568737e-05, "loss": 0.1569, "step": 11086 }, { "epoch": 2.234132581100141, "grad_norm": 0.06204577907919884, "learning_rate": 6.278319635199561e-05, "loss": 0.2496, "step": 11088 }, { "epoch": 2.234535563167439, "grad_norm": 0.04957421123981476, "learning_rate": 6.277031228020607e-05, "loss": 0.1664, "step": 11090 }, { "epoch": 2.234938545234737, "grad_norm": 0.07499062269926071, "learning_rate": 6.275742730123394e-05, "loss": 0.2182, "step": 11092 }, { "epoch": 2.235341527302035, "grad_norm": 0.03950365260243416, "learning_rate": 6.274454141599458e-05, "loss": 0.1521, "step": 11094 }, { "epoch": 2.235744509369333, "grad_norm": 0.05986891686916351, "learning_rate": 6.27316546254034e-05, "loss": 0.2178, "step": 11096 }, { "epoch": 2.2361474914366313, "grad_norm": 0.052209652960300446, "learning_rate": 6.27187669303758e-05, "loss": 0.1873, "step": 11098 }, { "epoch": 2.236550473503929, "grad_norm": 0.05092112347483635, "learning_rate": 6.270587833182736e-05, "loss": 0.2084, "step": 11100 }, { "epoch": 2.236953455571227, "grad_norm": 0.06727428734302521, "learning_rate": 6.269298883067365e-05, "loss": 0.1824, "step": 11102 }, { "epoch": 2.237356437638525, "grad_norm": 0.055155668407678604, "learning_rate": 6.26800984278303e-05, "loss": 0.2107, "step": 11104 }, { "epoch": 2.237759419705823, "grad_norm": 0.05623969808220863, "learning_rate": 6.266720712421303e-05, "loss": 0.1845, "step": 11106 }, { "epoch": 2.238162401773121, "grad_norm": 0.04958875849843025, "learning_rate": 6.265431492073765e-05, "loss": 0.1954, "step": 11108 }, { "epoch": 2.238565383840419, "grad_norm": 0.06290893256664276, "learning_rate": 6.264142181831995e-05, "loss": 0.2162, "step": 11110 }, { "epoch": 2.2389683659077173, "grad_norm": 0.04571421071887016, "learning_rate": 6.262852781787587e-05, "loss": 0.1878, "step": 11112 }, { "epoch": 2.2393713479750152, "grad_norm": 0.06730411946773529, "learning_rate": 6.261563292032137e-05, "loss": 0.1651, "step": 11114 }, { "epoch": 2.239774330042313, "grad_norm": 0.045927029103040695, "learning_rate": 6.26027371265725e-05, "loss": 0.2032, "step": 11116 }, { "epoch": 2.240177312109611, "grad_norm": 0.03853674978017807, "learning_rate": 6.258984043754532e-05, "loss": 0.1757, "step": 11118 }, { "epoch": 2.240580294176909, "grad_norm": 0.10684002935886383, "learning_rate": 6.257694285415602e-05, "loss": 0.2282, "step": 11120 }, { "epoch": 2.240983276244207, "grad_norm": 0.06447183340787888, "learning_rate": 6.25640443773208e-05, "loss": 0.2127, "step": 11122 }, { "epoch": 2.241386258311505, "grad_norm": 0.04314524680376053, "learning_rate": 6.255114500795595e-05, "loss": 0.1964, "step": 11124 }, { "epoch": 2.2417892403788033, "grad_norm": 0.0753975510597229, "learning_rate": 6.253824474697787e-05, "loss": 0.1722, "step": 11126 }, { "epoch": 2.2421922224461013, "grad_norm": 0.041436564177274704, "learning_rate": 6.252534359530291e-05, "loss": 0.1355, "step": 11128 }, { "epoch": 2.242595204513399, "grad_norm": 0.07188113033771515, "learning_rate": 6.251244155384758e-05, "loss": 0.1887, "step": 11130 }, { "epoch": 2.242998186580697, "grad_norm": 0.07126448303461075, "learning_rate": 6.249953862352841e-05, "loss": 0.2059, "step": 11132 }, { "epoch": 2.243401168647995, "grad_norm": 0.05256457254290581, "learning_rate": 6.2486634805262e-05, "loss": 0.2111, "step": 11134 }, { "epoch": 2.243804150715293, "grad_norm": 0.07690654695034027, "learning_rate": 6.247373009996502e-05, "loss": 0.228, "step": 11136 }, { "epoch": 2.244207132782591, "grad_norm": 0.04150415584445, "learning_rate": 6.246082450855423e-05, "loss": 0.1777, "step": 11138 }, { "epoch": 2.2446101148498894, "grad_norm": 0.04561259597539902, "learning_rate": 6.244791803194637e-05, "loss": 0.184, "step": 11140 }, { "epoch": 2.2450130969171873, "grad_norm": 0.04242684692144394, "learning_rate": 6.243501067105832e-05, "loss": 0.1289, "step": 11142 }, { "epoch": 2.2454160789844853, "grad_norm": 0.06444307416677475, "learning_rate": 6.242210242680702e-05, "loss": 0.1787, "step": 11144 }, { "epoch": 2.245819061051783, "grad_norm": 0.05252141132950783, "learning_rate": 6.24091933001094e-05, "loss": 0.1534, "step": 11146 }, { "epoch": 2.246222043119081, "grad_norm": 0.046815138310194016, "learning_rate": 6.239628329188256e-05, "loss": 0.1876, "step": 11148 }, { "epoch": 2.246625025186379, "grad_norm": 0.04507233574986458, "learning_rate": 6.238337240304357e-05, "loss": 0.2583, "step": 11150 }, { "epoch": 2.247028007253677, "grad_norm": 0.07129781693220139, "learning_rate": 6.23704606345096e-05, "loss": 0.2266, "step": 11152 }, { "epoch": 2.2474309893209754, "grad_norm": 0.06225457042455673, "learning_rate": 6.235754798719791e-05, "loss": 0.2047, "step": 11154 }, { "epoch": 2.2478339713882733, "grad_norm": 0.04900765046477318, "learning_rate": 6.234463446202575e-05, "loss": 0.2016, "step": 11156 }, { "epoch": 2.2482369534555713, "grad_norm": 0.06152089685201645, "learning_rate": 6.233172005991051e-05, "loss": 0.1688, "step": 11158 }, { "epoch": 2.2486399355228692, "grad_norm": 0.04987475648522377, "learning_rate": 6.231880478176961e-05, "loss": 0.1885, "step": 11160 }, { "epoch": 2.249042917590167, "grad_norm": 0.07317940890789032, "learning_rate": 6.23058886285205e-05, "loss": 0.2245, "step": 11162 }, { "epoch": 2.249445899657465, "grad_norm": 0.08709168434143066, "learning_rate": 6.229297160108075e-05, "loss": 0.1796, "step": 11164 }, { "epoch": 2.249848881724763, "grad_norm": 0.05007123947143555, "learning_rate": 6.228005370036797e-05, "loss": 0.1922, "step": 11166 }, { "epoch": 2.2502518637920614, "grad_norm": 0.06081795319914818, "learning_rate": 6.22671349272998e-05, "loss": 0.1997, "step": 11168 }, { "epoch": 2.2506548458593594, "grad_norm": 0.05448725447058678, "learning_rate": 6.225421528279398e-05, "loss": 0.1809, "step": 11170 }, { "epoch": 2.2510578279266573, "grad_norm": 0.0509071871638298, "learning_rate": 6.224129476776832e-05, "loss": 0.1956, "step": 11172 }, { "epoch": 2.2514608099939553, "grad_norm": 0.04707943648099899, "learning_rate": 6.222837338314065e-05, "loss": 0.1996, "step": 11174 }, { "epoch": 2.251863792061253, "grad_norm": 0.056294072419404984, "learning_rate": 6.221545112982887e-05, "loss": 0.166, "step": 11176 }, { "epoch": 2.252266774128551, "grad_norm": 0.07561463862657547, "learning_rate": 6.220252800875102e-05, "loss": 0.1987, "step": 11178 }, { "epoch": 2.252669756195849, "grad_norm": 0.05711280182003975, "learning_rate": 6.218960402082505e-05, "loss": 0.2061, "step": 11180 }, { "epoch": 2.2530727382631475, "grad_norm": 0.05807644873857498, "learning_rate": 6.217667916696913e-05, "loss": 0.2455, "step": 11182 }, { "epoch": 2.2534757203304454, "grad_norm": 0.04157806560397148, "learning_rate": 6.21637534481014e-05, "loss": 0.1702, "step": 11184 }, { "epoch": 2.2538787023977433, "grad_norm": 0.05336381494998932, "learning_rate": 6.215082686514007e-05, "loss": 0.1849, "step": 11186 }, { "epoch": 2.2542816844650413, "grad_norm": 0.051333993673324585, "learning_rate": 6.213789941900342e-05, "loss": 0.1958, "step": 11188 }, { "epoch": 2.2546846665323392, "grad_norm": 0.08715116232633591, "learning_rate": 6.212497111060983e-05, "loss": 0.2143, "step": 11190 }, { "epoch": 2.255087648599637, "grad_norm": 0.03855302929878235, "learning_rate": 6.211204194087767e-05, "loss": 0.1481, "step": 11192 }, { "epoch": 2.2554906306669356, "grad_norm": 0.04594408720731735, "learning_rate": 6.209911191072541e-05, "loss": 0.1732, "step": 11194 }, { "epoch": 2.2558936127342335, "grad_norm": 0.04794025048613548, "learning_rate": 6.208618102107161e-05, "loss": 0.1607, "step": 11196 }, { "epoch": 2.2562965948015314, "grad_norm": 0.04965617507696152, "learning_rate": 6.207324927283484e-05, "loss": 0.1256, "step": 11198 }, { "epoch": 2.2566995768688294, "grad_norm": 0.05873771011829376, "learning_rate": 6.206031666693372e-05, "loss": 0.1695, "step": 11200 }, { "epoch": 2.2571025589361273, "grad_norm": 0.06736169755458832, "learning_rate": 6.204738320428704e-05, "loss": 0.2153, "step": 11202 }, { "epoch": 2.2575055410034253, "grad_norm": 0.061268240213394165, "learning_rate": 6.203444888581348e-05, "loss": 0.2018, "step": 11204 }, { "epoch": 2.257908523070723, "grad_norm": 0.05184895917773247, "learning_rate": 6.202151371243194e-05, "loss": 0.2235, "step": 11206 }, { "epoch": 2.258311505138021, "grad_norm": 0.04206407815217972, "learning_rate": 6.200857768506129e-05, "loss": 0.2232, "step": 11208 }, { "epoch": 2.2587144872053195, "grad_norm": 0.042239706963300705, "learning_rate": 6.199564080462049e-05, "loss": 0.1615, "step": 11210 }, { "epoch": 2.2591174692726175, "grad_norm": 0.04908422753214836, "learning_rate": 6.198270307202852e-05, "loss": 0.1922, "step": 11212 }, { "epoch": 2.2595204513399154, "grad_norm": 0.05737739056348801, "learning_rate": 6.196976448820453e-05, "loss": 0.1763, "step": 11214 }, { "epoch": 2.2599234334072134, "grad_norm": 0.044318120926618576, "learning_rate": 6.195682505406759e-05, "loss": 0.1983, "step": 11216 }, { "epoch": 2.2603264154745113, "grad_norm": 0.052237652242183685, "learning_rate": 6.194388477053693e-05, "loss": 0.1974, "step": 11218 }, { "epoch": 2.2607293975418092, "grad_norm": 0.04132530465722084, "learning_rate": 6.193094363853179e-05, "loss": 0.1648, "step": 11220 }, { "epoch": 2.2611323796091076, "grad_norm": 0.04706701636314392, "learning_rate": 6.191800165897149e-05, "loss": 0.2216, "step": 11222 }, { "epoch": 2.2615353616764056, "grad_norm": 0.04157636687159538, "learning_rate": 6.190505883277541e-05, "loss": 0.1875, "step": 11224 }, { "epoch": 2.2619383437437035, "grad_norm": 0.049499645829200745, "learning_rate": 6.1892115160863e-05, "loss": 0.2033, "step": 11226 }, { "epoch": 2.2623413258110014, "grad_norm": 0.07546674460172653, "learning_rate": 6.187917064415375e-05, "loss": 0.1983, "step": 11228 }, { "epoch": 2.2627443078782994, "grad_norm": 0.0587388314306736, "learning_rate": 6.186622528356723e-05, "loss": 0.1246, "step": 11230 }, { "epoch": 2.2631472899455973, "grad_norm": 0.038730837404727936, "learning_rate": 6.185327908002301e-05, "loss": 0.143, "step": 11232 }, { "epoch": 2.2635502720128953, "grad_norm": 0.04652130603790283, "learning_rate": 6.184033203444081e-05, "loss": 0.1961, "step": 11234 }, { "epoch": 2.263953254080193, "grad_norm": 0.06499212980270386, "learning_rate": 6.182738414774038e-05, "loss": 0.2313, "step": 11236 }, { "epoch": 2.2643562361474916, "grad_norm": 0.05006347596645355, "learning_rate": 6.181443542084146e-05, "loss": 0.2217, "step": 11238 }, { "epoch": 2.2647592182147895, "grad_norm": 0.05510425940155983, "learning_rate": 6.180148585466397e-05, "loss": 0.1508, "step": 11240 }, { "epoch": 2.2651622002820875, "grad_norm": 0.06103931739926338, "learning_rate": 6.17885354501278e-05, "loss": 0.1751, "step": 11242 }, { "epoch": 2.2655651823493854, "grad_norm": 0.07695218920707703, "learning_rate": 6.177558420815291e-05, "loss": 0.2253, "step": 11244 }, { "epoch": 2.2659681644166834, "grad_norm": 0.05471651628613472, "learning_rate": 6.176263212965935e-05, "loss": 0.1386, "step": 11246 }, { "epoch": 2.2663711464839813, "grad_norm": 0.04918958246707916, "learning_rate": 6.174967921556722e-05, "loss": 0.168, "step": 11248 }, { "epoch": 2.2667741285512797, "grad_norm": 0.06307138502597809, "learning_rate": 6.173672546679667e-05, "loss": 0.1953, "step": 11250 }, { "epoch": 2.2671771106185776, "grad_norm": 0.05643860995769501, "learning_rate": 6.172377088426791e-05, "loss": 0.2656, "step": 11252 }, { "epoch": 2.2675800926858756, "grad_norm": 0.07580362260341644, "learning_rate": 6.171081546890122e-05, "loss": 0.195, "step": 11254 }, { "epoch": 2.2679830747531735, "grad_norm": 0.060459479689598083, "learning_rate": 6.169785922161691e-05, "loss": 0.1836, "step": 11256 }, { "epoch": 2.2683860568204715, "grad_norm": 0.07413048297166824, "learning_rate": 6.16849021433354e-05, "loss": 0.2098, "step": 11258 }, { "epoch": 2.2687890388877694, "grad_norm": 0.06606211513280869, "learning_rate": 6.167194423497715e-05, "loss": 0.2159, "step": 11260 }, { "epoch": 2.2691920209550673, "grad_norm": 0.05219024419784546, "learning_rate": 6.165898549746263e-05, "loss": 0.1579, "step": 11262 }, { "epoch": 2.2695950030223653, "grad_norm": 0.05993516743183136, "learning_rate": 6.164602593171242e-05, "loss": 0.2181, "step": 11264 }, { "epoch": 2.2699979850896637, "grad_norm": 0.06719011068344116, "learning_rate": 6.163306553864717e-05, "loss": 0.2209, "step": 11266 }, { "epoch": 2.2704009671569616, "grad_norm": 0.0497952364385128, "learning_rate": 6.162010431918753e-05, "loss": 0.1758, "step": 11268 }, { "epoch": 2.2708039492242595, "grad_norm": 0.06263022869825363, "learning_rate": 6.160714227425428e-05, "loss": 0.1895, "step": 11270 }, { "epoch": 2.2712069312915575, "grad_norm": 0.07029848545789719, "learning_rate": 6.159417940476819e-05, "loss": 0.2157, "step": 11272 }, { "epoch": 2.2716099133588554, "grad_norm": 0.06518268585205078, "learning_rate": 6.158121571165014e-05, "loss": 0.2074, "step": 11274 }, { "epoch": 2.2720128954261534, "grad_norm": 0.06944593787193298, "learning_rate": 6.156825119582105e-05, "loss": 0.1697, "step": 11276 }, { "epoch": 2.2724158774934518, "grad_norm": 0.054373059421777725, "learning_rate": 6.15552858582019e-05, "loss": 0.1786, "step": 11278 }, { "epoch": 2.2728188595607497, "grad_norm": 0.04644101485610008, "learning_rate": 6.154231969971373e-05, "loss": 0.1821, "step": 11280 }, { "epoch": 2.2732218416280476, "grad_norm": 0.055845990777015686, "learning_rate": 6.152935272127761e-05, "loss": 0.2359, "step": 11282 }, { "epoch": 2.2736248236953456, "grad_norm": 0.0610295832157135, "learning_rate": 6.151638492381473e-05, "loss": 0.1932, "step": 11284 }, { "epoch": 2.2740278057626435, "grad_norm": 0.05903216451406479, "learning_rate": 6.150341630824627e-05, "loss": 0.2192, "step": 11286 }, { "epoch": 2.2744307878299415, "grad_norm": 0.07944760471582413, "learning_rate": 6.149044687549351e-05, "loss": 0.2314, "step": 11288 }, { "epoch": 2.2748337698972394, "grad_norm": 0.04613855481147766, "learning_rate": 6.147747662647777e-05, "loss": 0.213, "step": 11290 }, { "epoch": 2.2752367519645373, "grad_norm": 0.05385150760412216, "learning_rate": 6.146450556212045e-05, "loss": 0.1715, "step": 11292 }, { "epoch": 2.2756397340318357, "grad_norm": 0.057033102959394455, "learning_rate": 6.145153368334302e-05, "loss": 0.194, "step": 11294 }, { "epoch": 2.2760427160991337, "grad_norm": 0.041800301522016525, "learning_rate": 6.143856099106692e-05, "loss": 0.2274, "step": 11296 }, { "epoch": 2.2764456981664316, "grad_norm": 0.0549192801117897, "learning_rate": 6.142558748621376e-05, "loss": 0.203, "step": 11298 }, { "epoch": 2.2768486802337295, "grad_norm": 0.05256705358624458, "learning_rate": 6.141261316970513e-05, "loss": 0.1815, "step": 11300 }, { "epoch": 2.2772516623010275, "grad_norm": 0.06109241768717766, "learning_rate": 6.139963804246271e-05, "loss": 0.1698, "step": 11302 }, { "epoch": 2.2776546443683254, "grad_norm": 0.060304321348667145, "learning_rate": 6.138666210540822e-05, "loss": 0.1746, "step": 11304 }, { "epoch": 2.278057626435624, "grad_norm": 0.05690411105751991, "learning_rate": 6.13736853594635e-05, "loss": 0.1728, "step": 11306 }, { "epoch": 2.2784606085029218, "grad_norm": 0.12367209792137146, "learning_rate": 6.136070780555033e-05, "loss": 0.1808, "step": 11308 }, { "epoch": 2.2788635905702197, "grad_norm": 0.04680660739541054, "learning_rate": 6.134772944459066e-05, "loss": 0.2192, "step": 11310 }, { "epoch": 2.2792665726375176, "grad_norm": 0.0765429437160492, "learning_rate": 6.133475027750644e-05, "loss": 0.2293, "step": 11312 }, { "epoch": 2.2796695547048156, "grad_norm": 0.06817856431007385, "learning_rate": 6.132177030521967e-05, "loss": 0.1625, "step": 11314 }, { "epoch": 2.2800725367721135, "grad_norm": 0.0743875801563263, "learning_rate": 6.130878952865246e-05, "loss": 0.2173, "step": 11316 }, { "epoch": 2.2804755188394115, "grad_norm": 0.08354144543409348, "learning_rate": 6.129580794872694e-05, "loss": 0.1676, "step": 11318 }, { "epoch": 2.2808785009067094, "grad_norm": 0.054814115166664124, "learning_rate": 6.128282556636527e-05, "loss": 0.183, "step": 11320 }, { "epoch": 2.281281482974008, "grad_norm": 0.0482783205807209, "learning_rate": 6.126984238248972e-05, "loss": 0.1679, "step": 11322 }, { "epoch": 2.2816844650413057, "grad_norm": 0.039012711495161057, "learning_rate": 6.125685839802258e-05, "loss": 0.1527, "step": 11324 }, { "epoch": 2.2820874471086037, "grad_norm": 0.06443888694047928, "learning_rate": 6.124387361388624e-05, "loss": 0.2135, "step": 11326 }, { "epoch": 2.2824904291759016, "grad_norm": 0.05576111376285553, "learning_rate": 6.12308880310031e-05, "loss": 0.2194, "step": 11328 }, { "epoch": 2.2828934112431996, "grad_norm": 0.0726291760802269, "learning_rate": 6.121790165029561e-05, "loss": 0.187, "step": 11330 }, { "epoch": 2.2832963933104975, "grad_norm": 0.04712097719311714, "learning_rate": 6.120491447268634e-05, "loss": 0.1997, "step": 11332 }, { "epoch": 2.283699375377796, "grad_norm": 0.04324129596352577, "learning_rate": 6.119192649909788e-05, "loss": 0.168, "step": 11334 }, { "epoch": 2.284102357445094, "grad_norm": 0.07594967633485794, "learning_rate": 6.117893773045286e-05, "loss": 0.2147, "step": 11336 }, { "epoch": 2.2845053395123918, "grad_norm": 0.05412382259964943, "learning_rate": 6.116594816767396e-05, "loss": 0.1807, "step": 11338 }, { "epoch": 2.2849083215796897, "grad_norm": 0.04991452768445015, "learning_rate": 6.115295781168398e-05, "loss": 0.1619, "step": 11340 }, { "epoch": 2.2853113036469876, "grad_norm": 0.05802258849143982, "learning_rate": 6.11399666634057e-05, "loss": 0.1286, "step": 11342 }, { "epoch": 2.2857142857142856, "grad_norm": 0.05308988317847252, "learning_rate": 6.112697472376201e-05, "loss": 0.209, "step": 11344 }, { "epoch": 2.2861172677815835, "grad_norm": 0.0739789754152298, "learning_rate": 6.111398199367584e-05, "loss": 0.161, "step": 11346 }, { "epoch": 2.286520249848882, "grad_norm": 0.04071381315588951, "learning_rate": 6.110098847407014e-05, "loss": 0.1882, "step": 11348 }, { "epoch": 2.28692323191618, "grad_norm": 0.08201833069324493, "learning_rate": 6.108799416586799e-05, "loss": 0.2381, "step": 11350 }, { "epoch": 2.287326213983478, "grad_norm": 0.05134955421090126, "learning_rate": 6.107499906999247e-05, "loss": 0.1813, "step": 11352 }, { "epoch": 2.2877291960507757, "grad_norm": 0.07029873132705688, "learning_rate": 6.106200318736672e-05, "loss": 0.2262, "step": 11354 }, { "epoch": 2.2881321781180737, "grad_norm": 0.057129088789224625, "learning_rate": 6.104900651891394e-05, "loss": 0.1861, "step": 11356 }, { "epoch": 2.2885351601853716, "grad_norm": 0.05836978182196617, "learning_rate": 6.103600906555744e-05, "loss": 0.2227, "step": 11358 }, { "epoch": 2.28893814225267, "grad_norm": 0.06855162978172302, "learning_rate": 6.1023010828220483e-05, "loss": 0.2345, "step": 11360 }, { "epoch": 2.289341124319968, "grad_norm": 0.06893496960401535, "learning_rate": 6.101001180782646e-05, "loss": 0.1785, "step": 11362 }, { "epoch": 2.289744106387266, "grad_norm": 0.05776822194457054, "learning_rate": 6.0997012005298826e-05, "loss": 0.1627, "step": 11364 }, { "epoch": 2.290147088454564, "grad_norm": 0.06768766045570374, "learning_rate": 6.098401142156104e-05, "loss": 0.2323, "step": 11366 }, { "epoch": 2.2905500705218618, "grad_norm": 0.04803290218114853, "learning_rate": 6.0971010057536634e-05, "loss": 0.16, "step": 11368 }, { "epoch": 2.2909530525891597, "grad_norm": 0.044886112213134766, "learning_rate": 6.095800791414924e-05, "loss": 0.1568, "step": 11370 }, { "epoch": 2.2913560346564577, "grad_norm": 0.05759613215923309, "learning_rate": 6.0945004992322473e-05, "loss": 0.1812, "step": 11372 }, { "epoch": 2.2917590167237556, "grad_norm": 0.07767092436552048, "learning_rate": 6.0932001292980065e-05, "loss": 0.1981, "step": 11374 }, { "epoch": 2.292161998791054, "grad_norm": 0.06554151326417923, "learning_rate": 6.091899681704577e-05, "loss": 0.1989, "step": 11376 }, { "epoch": 2.292564980858352, "grad_norm": 0.06701290607452393, "learning_rate": 6.09059915654434e-05, "loss": 0.1745, "step": 11378 }, { "epoch": 2.29296796292565, "grad_norm": 0.07902407646179199, "learning_rate": 6.089298553909684e-05, "loss": 0.28, "step": 11380 }, { "epoch": 2.293370944992948, "grad_norm": 0.07576099038124084, "learning_rate": 6.087997873892999e-05, "loss": 0.1846, "step": 11382 }, { "epoch": 2.2937739270602457, "grad_norm": 0.06893164664506912, "learning_rate": 6.086697116586685e-05, "loss": 0.2181, "step": 11384 }, { "epoch": 2.2941769091275437, "grad_norm": 0.08184853941202164, "learning_rate": 6.085396282083147e-05, "loss": 0.2172, "step": 11386 }, { "epoch": 2.294579891194842, "grad_norm": 0.059146005660295486, "learning_rate": 6.084095370474791e-05, "loss": 0.1481, "step": 11388 }, { "epoch": 2.29498287326214, "grad_norm": 0.060949668288230896, "learning_rate": 6.0827943818540357e-05, "loss": 0.2308, "step": 11390 }, { "epoch": 2.295385855329438, "grad_norm": 0.04430218040943146, "learning_rate": 6.081493316313299e-05, "loss": 0.1673, "step": 11392 }, { "epoch": 2.295788837396736, "grad_norm": 0.055645573884248734, "learning_rate": 6.080192173945006e-05, "loss": 0.1759, "step": 11394 }, { "epoch": 2.296191819464034, "grad_norm": 0.08632339537143707, "learning_rate": 6.078890954841589e-05, "loss": 0.2101, "step": 11396 }, { "epoch": 2.2965948015313318, "grad_norm": 0.1685890555381775, "learning_rate": 6.077589659095484e-05, "loss": 0.1724, "step": 11398 }, { "epoch": 2.2969977835986297, "grad_norm": 0.06395189464092255, "learning_rate": 6.0762882867991325e-05, "loss": 0.1763, "step": 11400 }, { "epoch": 2.2974007656659277, "grad_norm": 0.05895381420850754, "learning_rate": 6.074986838044983e-05, "loss": 0.1772, "step": 11402 }, { "epoch": 2.297803747733226, "grad_norm": 0.07261353731155396, "learning_rate": 6.073685312925488e-05, "loss": 0.1446, "step": 11404 }, { "epoch": 2.298206729800524, "grad_norm": 0.0492069236934185, "learning_rate": 6.072383711533104e-05, "loss": 0.1637, "step": 11406 }, { "epoch": 2.298609711867822, "grad_norm": 0.07343262434005737, "learning_rate": 6.0710820339602955e-05, "loss": 0.1681, "step": 11408 }, { "epoch": 2.29901269393512, "grad_norm": 0.05374588817358017, "learning_rate": 6.069780280299535e-05, "loss": 0.1881, "step": 11410 }, { "epoch": 2.299415676002418, "grad_norm": 0.06401274353265762, "learning_rate": 6.068478450643294e-05, "loss": 0.2119, "step": 11412 }, { "epoch": 2.2998186580697157, "grad_norm": 0.0491844043135643, "learning_rate": 6.06717654508405e-05, "loss": 0.169, "step": 11414 }, { "epoch": 2.300221640137014, "grad_norm": 0.060520414263010025, "learning_rate": 6.065874563714293e-05, "loss": 0.1738, "step": 11416 }, { "epoch": 2.300624622204312, "grad_norm": 0.0507986918091774, "learning_rate": 6.064572506626511e-05, "loss": 0.1753, "step": 11418 }, { "epoch": 2.30102760427161, "grad_norm": 0.045630816370248795, "learning_rate": 6.0632703739132e-05, "loss": 0.1549, "step": 11420 }, { "epoch": 2.301430586338908, "grad_norm": 0.04640620946884155, "learning_rate": 6.061968165666865e-05, "loss": 0.2175, "step": 11422 }, { "epoch": 2.301833568406206, "grad_norm": 0.06813772767782211, "learning_rate": 6.060665881980007e-05, "loss": 0.1738, "step": 11424 }, { "epoch": 2.302236550473504, "grad_norm": 0.049537815153598785, "learning_rate": 6.0593635229451404e-05, "loss": 0.1846, "step": 11426 }, { "epoch": 2.302639532540802, "grad_norm": 0.048957351595163345, "learning_rate": 6.058061088654786e-05, "loss": 0.1928, "step": 11428 }, { "epoch": 2.3030425146080997, "grad_norm": 0.06044716387987137, "learning_rate": 6.0567585792014625e-05, "loss": 0.1538, "step": 11430 }, { "epoch": 2.303445496675398, "grad_norm": 0.07165748625993729, "learning_rate": 6.055455994677699e-05, "loss": 0.2081, "step": 11432 }, { "epoch": 2.303848478742696, "grad_norm": 0.057387061417102814, "learning_rate": 6.0541533351760315e-05, "loss": 0.2362, "step": 11434 }, { "epoch": 2.304251460809994, "grad_norm": 0.051506511867046356, "learning_rate": 6.0528506007889954e-05, "loss": 0.1875, "step": 11436 }, { "epoch": 2.304654442877292, "grad_norm": 0.053086865693330765, "learning_rate": 6.0515477916091365e-05, "loss": 0.1542, "step": 11438 }, { "epoch": 2.30505742494459, "grad_norm": 0.05012943223118782, "learning_rate": 6.050244907729005e-05, "loss": 0.1647, "step": 11440 }, { "epoch": 2.305460407011888, "grad_norm": 0.0971345528960228, "learning_rate": 6.0489419492411534e-05, "loss": 0.1882, "step": 11442 }, { "epoch": 2.305863389079186, "grad_norm": 0.06509329378604889, "learning_rate": 6.047638916238144e-05, "loss": 0.2157, "step": 11444 }, { "epoch": 2.306266371146484, "grad_norm": 0.09002260118722916, "learning_rate": 6.046335808812543e-05, "loss": 0.157, "step": 11446 }, { "epoch": 2.306669353213782, "grad_norm": 0.03628845512866974, "learning_rate": 6.045032627056918e-05, "loss": 0.1655, "step": 11448 }, { "epoch": 2.30707233528108, "grad_norm": 0.0490727573633194, "learning_rate": 6.043729371063846e-05, "loss": 0.2058, "step": 11450 }, { "epoch": 2.307475317348378, "grad_norm": 0.03740858659148216, "learning_rate": 6.04242604092591e-05, "loss": 0.1525, "step": 11452 }, { "epoch": 2.307878299415676, "grad_norm": 0.06834115087985992, "learning_rate": 6.041122636735694e-05, "loss": 0.224, "step": 11454 }, { "epoch": 2.308281281482974, "grad_norm": 0.05891553685069084, "learning_rate": 6.039819158585792e-05, "loss": 0.2065, "step": 11456 }, { "epoch": 2.308684263550272, "grad_norm": 0.04757963865995407, "learning_rate": 6.0385156065687987e-05, "loss": 0.1603, "step": 11458 }, { "epoch": 2.30908724561757, "grad_norm": 0.03673839196562767, "learning_rate": 6.037211980777318e-05, "loss": 0.1729, "step": 11460 }, { "epoch": 2.309490227684868, "grad_norm": 0.06139663606882095, "learning_rate": 6.035908281303958e-05, "loss": 0.1852, "step": 11462 }, { "epoch": 2.309893209752166, "grad_norm": 0.034579966217279434, "learning_rate": 6.0346045082413295e-05, "loss": 0.1267, "step": 11464 }, { "epoch": 2.310296191819464, "grad_norm": 0.0544976070523262, "learning_rate": 6.033300661682051e-05, "loss": 0.2015, "step": 11466 }, { "epoch": 2.310699173886762, "grad_norm": 0.05644477531313896, "learning_rate": 6.031996741718747e-05, "loss": 0.156, "step": 11468 }, { "epoch": 2.31110215595406, "grad_norm": 0.06838435679674149, "learning_rate": 6.0306927484440434e-05, "loss": 0.1519, "step": 11470 }, { "epoch": 2.3115051380213583, "grad_norm": 0.04617998003959656, "learning_rate": 6.029388681950576e-05, "loss": 0.2111, "step": 11472 }, { "epoch": 2.311908120088656, "grad_norm": 0.05172273889183998, "learning_rate": 6.028084542330984e-05, "loss": 0.2078, "step": 11474 }, { "epoch": 2.312311102155954, "grad_norm": 0.07891248166561127, "learning_rate": 6.026780329677909e-05, "loss": 0.2416, "step": 11476 }, { "epoch": 2.312714084223252, "grad_norm": 0.07931914180517197, "learning_rate": 6.025476044084002e-05, "loss": 0.2088, "step": 11478 }, { "epoch": 2.31311706629055, "grad_norm": 0.06278335303068161, "learning_rate": 6.024171685641917e-05, "loss": 0.1975, "step": 11480 }, { "epoch": 2.313520048357848, "grad_norm": 0.059438444674015045, "learning_rate": 6.022867254444313e-05, "loss": 0.1536, "step": 11482 }, { "epoch": 2.313923030425146, "grad_norm": 0.08473483473062515, "learning_rate": 6.021562750583854e-05, "loss": 0.2057, "step": 11484 }, { "epoch": 2.314326012492444, "grad_norm": 0.056876592338085175, "learning_rate": 6.020258174153213e-05, "loss": 0.2491, "step": 11486 }, { "epoch": 2.3147289945597422, "grad_norm": 0.05166206881403923, "learning_rate": 6.0189535252450614e-05, "loss": 0.2241, "step": 11488 }, { "epoch": 2.31513197662704, "grad_norm": 0.17782577872276306, "learning_rate": 6.017648803952082e-05, "loss": 0.2012, "step": 11490 }, { "epoch": 2.315534958694338, "grad_norm": 0.06368952244520187, "learning_rate": 6.0163440103669586e-05, "loss": 0.246, "step": 11492 }, { "epoch": 2.315937940761636, "grad_norm": 0.07152625173330307, "learning_rate": 6.015039144582382e-05, "loss": 0.1911, "step": 11494 }, { "epoch": 2.316340922828934, "grad_norm": 0.061066433787345886, "learning_rate": 6.0137342066910486e-05, "loss": 0.1865, "step": 11496 }, { "epoch": 2.316743904896232, "grad_norm": 0.05115800350904465, "learning_rate": 6.0124291967856585e-05, "loss": 0.1442, "step": 11498 }, { "epoch": 2.3171468869635303, "grad_norm": 0.036835819482803345, "learning_rate": 6.0111241149589156e-05, "loss": 0.1624, "step": 11500 }, { "epoch": 2.3175498690308283, "grad_norm": 0.03765489533543587, "learning_rate": 6.0098189613035335e-05, "loss": 0.1554, "step": 11502 }, { "epoch": 2.317952851098126, "grad_norm": 0.0580357164144516, "learning_rate": 6.008513735912229e-05, "loss": 0.2215, "step": 11504 }, { "epoch": 2.318355833165424, "grad_norm": 0.03914367035031319, "learning_rate": 6.007208438877719e-05, "loss": 0.1123, "step": 11506 }, { "epoch": 2.318758815232722, "grad_norm": 0.05752947926521301, "learning_rate": 6.005903070292733e-05, "loss": 0.199, "step": 11508 }, { "epoch": 2.31916179730002, "grad_norm": 0.06375788152217865, "learning_rate": 6.004597630250003e-05, "loss": 0.1791, "step": 11510 }, { "epoch": 2.319564779367318, "grad_norm": 0.04775358736515045, "learning_rate": 6.003292118842263e-05, "loss": 0.1816, "step": 11512 }, { "epoch": 2.319967761434616, "grad_norm": 0.07813245058059692, "learning_rate": 6.001986536162255e-05, "loss": 0.2239, "step": 11514 }, { "epoch": 2.3203707435019143, "grad_norm": 0.07530572265386581, "learning_rate": 6.000680882302727e-05, "loss": 0.2423, "step": 11516 }, { "epoch": 2.3207737255692122, "grad_norm": 0.05607306584715843, "learning_rate": 5.999375157356428e-05, "loss": 0.1857, "step": 11518 }, { "epoch": 2.32117670763651, "grad_norm": 0.0654715746641159, "learning_rate": 5.9980693614161175e-05, "loss": 0.2488, "step": 11520 }, { "epoch": 2.321579689703808, "grad_norm": 0.06305355578660965, "learning_rate": 5.9967634945745555e-05, "loss": 0.2018, "step": 11522 }, { "epoch": 2.321982671771106, "grad_norm": 0.030810408294200897, "learning_rate": 5.9954575569245086e-05, "loss": 0.159, "step": 11524 }, { "epoch": 2.322385653838404, "grad_norm": 0.07300093024969101, "learning_rate": 5.9941515485587485e-05, "loss": 0.2026, "step": 11526 }, { "epoch": 2.3227886359057024, "grad_norm": 0.05360522121191025, "learning_rate": 5.992845469570053e-05, "loss": 0.2091, "step": 11528 }, { "epoch": 2.3231916179730003, "grad_norm": 0.07325176149606705, "learning_rate": 5.9915393200512024e-05, "loss": 0.1802, "step": 11530 }, { "epoch": 2.3235946000402983, "grad_norm": 0.06028595194220543, "learning_rate": 5.990233100094985e-05, "loss": 0.2081, "step": 11532 }, { "epoch": 2.323997582107596, "grad_norm": 0.08523773401975632, "learning_rate": 5.9889268097941907e-05, "loss": 0.2146, "step": 11534 }, { "epoch": 2.324400564174894, "grad_norm": 0.0636720284819603, "learning_rate": 5.9876204492416185e-05, "loss": 0.2062, "step": 11536 }, { "epoch": 2.324803546242192, "grad_norm": 0.0519365556538105, "learning_rate": 5.986314018530069e-05, "loss": 0.1801, "step": 11538 }, { "epoch": 2.32520652830949, "grad_norm": 0.059190113097429276, "learning_rate": 5.985007517752349e-05, "loss": 0.2202, "step": 11540 }, { "epoch": 2.3256095103767884, "grad_norm": 0.05759743973612785, "learning_rate": 5.9837009470012695e-05, "loss": 0.1759, "step": 11542 }, { "epoch": 2.3260124924440864, "grad_norm": 0.0786181166768074, "learning_rate": 5.9823943063696484e-05, "loss": 0.2165, "step": 11544 }, { "epoch": 2.3264154745113843, "grad_norm": 0.07660158723592758, "learning_rate": 5.9810875959503065e-05, "loss": 0.1728, "step": 11546 }, { "epoch": 2.3268184565786822, "grad_norm": 0.05472486466169357, "learning_rate": 5.979780815836071e-05, "loss": 0.2405, "step": 11548 }, { "epoch": 2.32722143864598, "grad_norm": 0.05465126782655716, "learning_rate": 5.9784739661197744e-05, "loss": 0.185, "step": 11550 }, { "epoch": 2.327624420713278, "grad_norm": 0.07266535609960556, "learning_rate": 5.977167046894251e-05, "loss": 0.2024, "step": 11552 }, { "epoch": 2.328027402780576, "grad_norm": 0.05688047781586647, "learning_rate": 5.975860058252343e-05, "loss": 0.2392, "step": 11554 }, { "epoch": 2.3284303848478745, "grad_norm": 0.05334145203232765, "learning_rate": 5.9745530002868976e-05, "loss": 0.2258, "step": 11556 }, { "epoch": 2.3288333669151724, "grad_norm": 0.056741535663604736, "learning_rate": 5.973245873090766e-05, "loss": 0.1913, "step": 11558 }, { "epoch": 2.3292363489824703, "grad_norm": 0.06352947652339935, "learning_rate": 5.971938676756803e-05, "loss": 0.164, "step": 11560 }, { "epoch": 2.3296393310497683, "grad_norm": 0.054152458906173706, "learning_rate": 5.970631411377872e-05, "loss": 0.1804, "step": 11562 }, { "epoch": 2.330042313117066, "grad_norm": 0.07217392325401306, "learning_rate": 5.969324077046836e-05, "loss": 0.2099, "step": 11564 }, { "epoch": 2.330445295184364, "grad_norm": 0.05785232037305832, "learning_rate": 5.968016673856569e-05, "loss": 0.2028, "step": 11566 }, { "epoch": 2.330848277251662, "grad_norm": 0.03307553008198738, "learning_rate": 5.966709201899947e-05, "loss": 0.1449, "step": 11568 }, { "epoch": 2.3312512593189605, "grad_norm": 0.04345938190817833, "learning_rate": 5.965401661269847e-05, "loss": 0.1264, "step": 11570 }, { "epoch": 2.3316542413862584, "grad_norm": 0.05565191060304642, "learning_rate": 5.964094052059158e-05, "loss": 0.1819, "step": 11572 }, { "epoch": 2.3320572234535564, "grad_norm": 0.05930451303720474, "learning_rate": 5.9627863743607694e-05, "loss": 0.2198, "step": 11574 }, { "epoch": 2.3324602055208543, "grad_norm": 0.051885005086660385, "learning_rate": 5.961478628267576e-05, "loss": 0.1606, "step": 11576 }, { "epoch": 2.3328631875881523, "grad_norm": 0.07921456545591354, "learning_rate": 5.960170813872479e-05, "loss": 0.1337, "step": 11578 }, { "epoch": 2.33326616965545, "grad_norm": 0.06802048534154892, "learning_rate": 5.958862931268383e-05, "loss": 0.2004, "step": 11580 }, { "epoch": 2.3336691517227486, "grad_norm": 0.05401553958654404, "learning_rate": 5.9575549805481976e-05, "loss": 0.1612, "step": 11582 }, { "epoch": 2.3340721337900465, "grad_norm": 0.05115654692053795, "learning_rate": 5.956246961804838e-05, "loss": 0.1779, "step": 11584 }, { "epoch": 2.3344751158573445, "grad_norm": 0.05319082364439964, "learning_rate": 5.954938875131224e-05, "loss": 0.1613, "step": 11586 }, { "epoch": 2.3348780979246424, "grad_norm": 0.058764997869729996, "learning_rate": 5.953630720620278e-05, "loss": 0.1842, "step": 11588 }, { "epoch": 2.3352810799919403, "grad_norm": 0.05345417559146881, "learning_rate": 5.952322498364933e-05, "loss": 0.2541, "step": 11590 }, { "epoch": 2.3356840620592383, "grad_norm": 0.07091548293828964, "learning_rate": 5.951014208458118e-05, "loss": 0.1976, "step": 11592 }, { "epoch": 2.3360870441265362, "grad_norm": 0.048810895532369614, "learning_rate": 5.949705850992775e-05, "loss": 0.175, "step": 11594 }, { "epoch": 2.336490026193834, "grad_norm": 0.042119260877370834, "learning_rate": 5.948397426061849e-05, "loss": 0.1994, "step": 11596 }, { "epoch": 2.3368930082611326, "grad_norm": 0.050638552755117416, "learning_rate": 5.947088933758286e-05, "loss": 0.2191, "step": 11598 }, { "epoch": 2.3372959903284305, "grad_norm": 0.05852693319320679, "learning_rate": 5.9457803741750384e-05, "loss": 0.2201, "step": 11600 }, { "epoch": 2.3376989723957284, "grad_norm": 0.06796692311763763, "learning_rate": 5.944471747405067e-05, "loss": 0.2088, "step": 11602 }, { "epoch": 2.3381019544630264, "grad_norm": 0.051638487726449966, "learning_rate": 5.943163053541333e-05, "loss": 0.181, "step": 11604 }, { "epoch": 2.3385049365303243, "grad_norm": 0.05429547280073166, "learning_rate": 5.941854292676803e-05, "loss": 0.2498, "step": 11606 }, { "epoch": 2.3389079185976223, "grad_norm": 0.041048940271139145, "learning_rate": 5.9405454649044525e-05, "loss": 0.1911, "step": 11608 }, { "epoch": 2.3393109006649206, "grad_norm": 0.057707563042640686, "learning_rate": 5.9392365703172534e-05, "loss": 0.2572, "step": 11610 }, { "epoch": 2.3397138827322186, "grad_norm": 0.037866026163101196, "learning_rate": 5.9379276090081924e-05, "loss": 0.1573, "step": 11612 }, { "epoch": 2.3401168647995165, "grad_norm": 0.04118697717785835, "learning_rate": 5.9366185810702545e-05, "loss": 0.1889, "step": 11614 }, { "epoch": 2.3405198468668145, "grad_norm": 0.06807113438844681, "learning_rate": 5.9353094865964286e-05, "loss": 0.2459, "step": 11616 }, { "epoch": 2.3409228289341124, "grad_norm": 0.06002763658761978, "learning_rate": 5.934000325679714e-05, "loss": 0.1925, "step": 11618 }, { "epoch": 2.3413258110014104, "grad_norm": 0.05739912763237953, "learning_rate": 5.93269109841311e-05, "loss": 0.2382, "step": 11620 }, { "epoch": 2.3417287930687083, "grad_norm": 0.061665866523981094, "learning_rate": 5.931381804889621e-05, "loss": 0.1745, "step": 11622 }, { "epoch": 2.3421317751360062, "grad_norm": 0.04462193325161934, "learning_rate": 5.930072445202258e-05, "loss": 0.1698, "step": 11624 }, { "epoch": 2.3425347572033046, "grad_norm": 0.03827643766999245, "learning_rate": 5.928763019444037e-05, "loss": 0.1476, "step": 11626 }, { "epoch": 2.3429377392706026, "grad_norm": 0.047175608575344086, "learning_rate": 5.9274535277079756e-05, "loss": 0.2098, "step": 11628 }, { "epoch": 2.3433407213379005, "grad_norm": 0.042314786463975906, "learning_rate": 5.926143970087099e-05, "loss": 0.1944, "step": 11630 }, { "epoch": 2.3437437034051984, "grad_norm": 0.03941023349761963, "learning_rate": 5.924834346674437e-05, "loss": 0.1135, "step": 11632 }, { "epoch": 2.3441466854724964, "grad_norm": 0.09028346836566925, "learning_rate": 5.923524657563021e-05, "loss": 0.2228, "step": 11634 }, { "epoch": 2.3445496675397943, "grad_norm": 0.06249230355024338, "learning_rate": 5.922214902845891e-05, "loss": 0.1605, "step": 11636 }, { "epoch": 2.3449526496070927, "grad_norm": 0.051382407546043396, "learning_rate": 5.920905082616088e-05, "loss": 0.15, "step": 11638 }, { "epoch": 2.3453556316743907, "grad_norm": 0.05022916570305824, "learning_rate": 5.919595196966662e-05, "loss": 0.1367, "step": 11640 }, { "epoch": 2.3457586137416886, "grad_norm": 0.07349605113267899, "learning_rate": 5.918285245990662e-05, "loss": 0.244, "step": 11642 }, { "epoch": 2.3461615958089865, "grad_norm": 0.05159524455666542, "learning_rate": 5.9169752297811484e-05, "loss": 0.1832, "step": 11644 }, { "epoch": 2.3465645778762845, "grad_norm": 0.050891030579805374, "learning_rate": 5.915665148431181e-05, "loss": 0.2035, "step": 11646 }, { "epoch": 2.3469675599435824, "grad_norm": 0.06695982813835144, "learning_rate": 5.914355002033825e-05, "loss": 0.2231, "step": 11648 }, { "epoch": 2.3473705420108804, "grad_norm": 0.06656850874423981, "learning_rate": 5.913044790682153e-05, "loss": 0.1957, "step": 11650 }, { "epoch": 2.3477735240781783, "grad_norm": 0.07416415959596634, "learning_rate": 5.9117345144692384e-05, "loss": 0.2301, "step": 11652 }, { "epoch": 2.3481765061454767, "grad_norm": 0.05558010935783386, "learning_rate": 5.9104241734881626e-05, "loss": 0.215, "step": 11654 }, { "epoch": 2.3485794882127746, "grad_norm": 0.04689667001366615, "learning_rate": 5.9091137678320087e-05, "loss": 0.2222, "step": 11656 }, { "epoch": 2.3489824702800726, "grad_norm": 0.05192426219582558, "learning_rate": 5.907803297593867e-05, "loss": 0.1966, "step": 11658 }, { "epoch": 2.3493854523473705, "grad_norm": 0.04759860783815384, "learning_rate": 5.906492762866831e-05, "loss": 0.191, "step": 11660 }, { "epoch": 2.3497884344146684, "grad_norm": 0.071357861161232, "learning_rate": 5.9051821637439984e-05, "loss": 0.1829, "step": 11662 }, { "epoch": 2.3501914164819664, "grad_norm": 0.06483819335699081, "learning_rate": 5.903871500318473e-05, "loss": 0.2303, "step": 11664 }, { "epoch": 2.3505943985492648, "grad_norm": 0.04689923673868179, "learning_rate": 5.902560772683362e-05, "loss": 0.1671, "step": 11666 }, { "epoch": 2.3509973806165627, "grad_norm": 0.07678768038749695, "learning_rate": 5.901249980931777e-05, "loss": 0.202, "step": 11668 }, { "epoch": 2.3514003626838607, "grad_norm": 0.052997030317783356, "learning_rate": 5.8999391251568336e-05, "loss": 0.2071, "step": 11670 }, { "epoch": 2.3518033447511586, "grad_norm": 0.061645928770303726, "learning_rate": 5.898628205451655e-05, "loss": 0.1533, "step": 11672 }, { "epoch": 2.3522063268184565, "grad_norm": 0.047521840780973434, "learning_rate": 5.897317221909367e-05, "loss": 0.1957, "step": 11674 }, { "epoch": 2.3526093088857545, "grad_norm": 0.06109415739774704, "learning_rate": 5.896006174623094e-05, "loss": 0.1608, "step": 11676 }, { "epoch": 2.3530122909530524, "grad_norm": 0.062129903584718704, "learning_rate": 5.89469506368598e-05, "loss": 0.1885, "step": 11678 }, { "epoch": 2.3534152730203504, "grad_norm": 0.06269177794456482, "learning_rate": 5.893383889191158e-05, "loss": 0.2023, "step": 11680 }, { "epoch": 2.3538182550876487, "grad_norm": 0.04957219213247299, "learning_rate": 5.892072651231774e-05, "loss": 0.1842, "step": 11682 }, { "epoch": 2.3542212371549467, "grad_norm": 0.04616183042526245, "learning_rate": 5.890761349900974e-05, "loss": 0.2155, "step": 11684 }, { "epoch": 2.3546242192222446, "grad_norm": 0.042560383677482605, "learning_rate": 5.889449985291913e-05, "loss": 0.2049, "step": 11686 }, { "epoch": 2.3550272012895426, "grad_norm": 0.048294831067323685, "learning_rate": 5.8881385574977485e-05, "loss": 0.2134, "step": 11688 }, { "epoch": 2.3554301833568405, "grad_norm": 0.08637408167123795, "learning_rate": 5.88682706661164e-05, "loss": 0.1966, "step": 11690 }, { "epoch": 2.3558331654241385, "grad_norm": 0.035582225769758224, "learning_rate": 5.885515512726755e-05, "loss": 0.1546, "step": 11692 }, { "epoch": 2.356236147491437, "grad_norm": 0.0642366036772728, "learning_rate": 5.8842038959362656e-05, "loss": 0.2135, "step": 11694 }, { "epoch": 2.356639129558735, "grad_norm": 0.04640725627541542, "learning_rate": 5.882892216333343e-05, "loss": 0.1381, "step": 11696 }, { "epoch": 2.3570421116260327, "grad_norm": 0.046785078942775726, "learning_rate": 5.881580474011171e-05, "loss": 0.2211, "step": 11698 }, { "epoch": 2.3574450936933307, "grad_norm": 0.045676589012145996, "learning_rate": 5.880268669062933e-05, "loss": 0.1763, "step": 11700 }, { "epoch": 2.3578480757606286, "grad_norm": 0.05108589679002762, "learning_rate": 5.878956801581814e-05, "loss": 0.1987, "step": 11702 }, { "epoch": 2.3582510578279265, "grad_norm": 0.06899011135101318, "learning_rate": 5.8776448716610114e-05, "loss": 0.1831, "step": 11704 }, { "epoch": 2.3586540398952245, "grad_norm": 0.05484525114297867, "learning_rate": 5.87633287939372e-05, "loss": 0.217, "step": 11706 }, { "epoch": 2.3590570219625224, "grad_norm": 0.11611830443143845, "learning_rate": 5.875020824873142e-05, "loss": 0.1576, "step": 11708 }, { "epoch": 2.359460004029821, "grad_norm": 0.06362280249595642, "learning_rate": 5.8737087081924845e-05, "loss": 0.1907, "step": 11710 }, { "epoch": 2.3598629860971188, "grad_norm": 0.05416010320186615, "learning_rate": 5.872396529444958e-05, "loss": 0.1945, "step": 11712 }, { "epoch": 2.3602659681644167, "grad_norm": 0.06739898771047592, "learning_rate": 5.871084288723776e-05, "loss": 0.2166, "step": 11714 }, { "epoch": 2.3606689502317146, "grad_norm": 0.05630958080291748, "learning_rate": 5.86977198612216e-05, "loss": 0.1847, "step": 11716 }, { "epoch": 2.3610719322990126, "grad_norm": 0.061415087431669235, "learning_rate": 5.8684596217333346e-05, "loss": 0.2264, "step": 11718 }, { "epoch": 2.3614749143663105, "grad_norm": 0.05699741095304489, "learning_rate": 5.867147195650524e-05, "loss": 0.1604, "step": 11720 }, { "epoch": 2.361877896433609, "grad_norm": 0.05686968192458153, "learning_rate": 5.865834707966964e-05, "loss": 0.1843, "step": 11722 }, { "epoch": 2.362280878500907, "grad_norm": 0.053936101496219635, "learning_rate": 5.864522158775892e-05, "loss": 0.2176, "step": 11724 }, { "epoch": 2.362683860568205, "grad_norm": 0.04711095988750458, "learning_rate": 5.8632095481705486e-05, "loss": 0.1868, "step": 11726 }, { "epoch": 2.3630868426355027, "grad_norm": 0.05970948934555054, "learning_rate": 5.861896876244178e-05, "loss": 0.1718, "step": 11728 }, { "epoch": 2.3634898247028007, "grad_norm": 0.04749145731329918, "learning_rate": 5.860584143090033e-05, "loss": 0.1736, "step": 11730 }, { "epoch": 2.3638928067700986, "grad_norm": 0.042695820331573486, "learning_rate": 5.859271348801366e-05, "loss": 0.1696, "step": 11732 }, { "epoch": 2.3642957888373966, "grad_norm": 0.046593207865953445, "learning_rate": 5.857958493471437e-05, "loss": 0.1662, "step": 11734 }, { "epoch": 2.3646987709046945, "grad_norm": 0.0542333722114563, "learning_rate": 5.8566455771935094e-05, "loss": 0.1947, "step": 11736 }, { "epoch": 2.365101752971993, "grad_norm": 0.04733563959598541, "learning_rate": 5.8553326000608487e-05, "loss": 0.197, "step": 11738 }, { "epoch": 2.365504735039291, "grad_norm": 0.047249022871255875, "learning_rate": 5.854019562166728e-05, "loss": 0.1504, "step": 11740 }, { "epoch": 2.3659077171065888, "grad_norm": 0.05582365393638611, "learning_rate": 5.852706463604425e-05, "loss": 0.1979, "step": 11742 }, { "epoch": 2.3663106991738867, "grad_norm": 0.052134282886981964, "learning_rate": 5.8513933044672164e-05, "loss": 0.2014, "step": 11744 }, { "epoch": 2.3667136812411846, "grad_norm": 0.038980018347501755, "learning_rate": 5.8500800848483895e-05, "loss": 0.149, "step": 11746 }, { "epoch": 2.3671166633084826, "grad_norm": 0.055582620203495026, "learning_rate": 5.848766804841235e-05, "loss": 0.1674, "step": 11748 }, { "epoch": 2.367519645375781, "grad_norm": 0.0617508664727211, "learning_rate": 5.847453464539041e-05, "loss": 0.2204, "step": 11750 }, { "epoch": 2.367922627443079, "grad_norm": 0.08900310844182968, "learning_rate": 5.84614006403511e-05, "loss": 0.1942, "step": 11752 }, { "epoch": 2.368325609510377, "grad_norm": 0.06451424211263657, "learning_rate": 5.844826603422743e-05, "loss": 0.2078, "step": 11754 }, { "epoch": 2.368728591577675, "grad_norm": 0.045638687908649445, "learning_rate": 5.8435130827952433e-05, "loss": 0.2024, "step": 11756 }, { "epoch": 2.3691315736449727, "grad_norm": 0.04450851306319237, "learning_rate": 5.8421995022459245e-05, "loss": 0.1851, "step": 11758 }, { "epoch": 2.3695345557122707, "grad_norm": 0.05700334906578064, "learning_rate": 5.8408858618680984e-05, "loss": 0.1776, "step": 11760 }, { "epoch": 2.3699375377795686, "grad_norm": 0.060567259788513184, "learning_rate": 5.839572161755087e-05, "loss": 0.1426, "step": 11762 }, { "epoch": 2.370340519846867, "grad_norm": 0.07149244099855423, "learning_rate": 5.8382584020002116e-05, "loss": 0.2277, "step": 11764 }, { "epoch": 2.370743501914165, "grad_norm": 0.06128925085067749, "learning_rate": 5.8369445826968e-05, "loss": 0.1954, "step": 11766 }, { "epoch": 2.371146483981463, "grad_norm": 0.05819423869252205, "learning_rate": 5.8356307039381816e-05, "loss": 0.2056, "step": 11768 }, { "epoch": 2.371549466048761, "grad_norm": 0.048833053559064865, "learning_rate": 5.834316765817698e-05, "loss": 0.2106, "step": 11770 }, { "epoch": 2.3719524481160588, "grad_norm": 0.0524032786488533, "learning_rate": 5.833002768428683e-05, "loss": 0.1701, "step": 11772 }, { "epoch": 2.3723554301833567, "grad_norm": 0.05456862598657608, "learning_rate": 5.8316887118644835e-05, "loss": 0.2064, "step": 11774 }, { "epoch": 2.372758412250655, "grad_norm": 0.062148381024599075, "learning_rate": 5.83037459621845e-05, "loss": 0.2047, "step": 11776 }, { "epoch": 2.373161394317953, "grad_norm": 0.06895274668931961, "learning_rate": 5.8290604215839314e-05, "loss": 0.188, "step": 11778 }, { "epoch": 2.373564376385251, "grad_norm": 0.060913410037755966, "learning_rate": 5.8277461880542864e-05, "loss": 0.2027, "step": 11780 }, { "epoch": 2.373967358452549, "grad_norm": 0.06738603115081787, "learning_rate": 5.826431895722877e-05, "loss": 0.1767, "step": 11782 }, { "epoch": 2.374370340519847, "grad_norm": 0.07685627043247223, "learning_rate": 5.8251175446830677e-05, "loss": 0.2092, "step": 11784 }, { "epoch": 2.374773322587145, "grad_norm": 0.10875460505485535, "learning_rate": 5.823803135028226e-05, "loss": 0.2403, "step": 11786 }, { "epoch": 2.3751763046544427, "grad_norm": 0.049758244305849075, "learning_rate": 5.8224886668517285e-05, "loss": 0.2044, "step": 11788 }, { "epoch": 2.3755792867217407, "grad_norm": 0.03598930686712265, "learning_rate": 5.8211741402469496e-05, "loss": 0.1232, "step": 11790 }, { "epoch": 2.375982268789039, "grad_norm": 0.060612574219703674, "learning_rate": 5.8198595553072746e-05, "loss": 0.1988, "step": 11792 }, { "epoch": 2.376385250856337, "grad_norm": 0.047952961176633835, "learning_rate": 5.818544912126089e-05, "loss": 0.1905, "step": 11794 }, { "epoch": 2.376788232923635, "grad_norm": 0.05651276931166649, "learning_rate": 5.8172302107967804e-05, "loss": 0.2082, "step": 11796 }, { "epoch": 2.377191214990933, "grad_norm": 0.05601438134908676, "learning_rate": 5.8159154514127435e-05, "loss": 0.2004, "step": 11798 }, { "epoch": 2.377594197058231, "grad_norm": 0.058857470750808716, "learning_rate": 5.81460063406738e-05, "loss": 0.1537, "step": 11800 }, { "epoch": 2.3779971791255288, "grad_norm": 0.05285648629069328, "learning_rate": 5.813285758854089e-05, "loss": 0.1686, "step": 11802 }, { "epoch": 2.378400161192827, "grad_norm": 0.03570474684238434, "learning_rate": 5.811970825866279e-05, "loss": 0.1495, "step": 11804 }, { "epoch": 2.378803143260125, "grad_norm": 0.05308526009321213, "learning_rate": 5.8106558351973606e-05, "loss": 0.1985, "step": 11806 }, { "epoch": 2.379206125327423, "grad_norm": 0.04530463367700577, "learning_rate": 5.8093407869407466e-05, "loss": 0.1672, "step": 11808 }, { "epoch": 2.379609107394721, "grad_norm": 0.05302588269114494, "learning_rate": 5.808025681189857e-05, "loss": 0.1831, "step": 11810 }, { "epoch": 2.380012089462019, "grad_norm": 0.07352914661169052, "learning_rate": 5.8067105180381174e-05, "loss": 0.2249, "step": 11812 }, { "epoch": 2.380415071529317, "grad_norm": 0.045757245272397995, "learning_rate": 5.8053952975789516e-05, "loss": 0.1529, "step": 11814 }, { "epoch": 2.380818053596615, "grad_norm": 0.06004302576184273, "learning_rate": 5.804080019905792e-05, "loss": 0.2177, "step": 11816 }, { "epoch": 2.3812210356639127, "grad_norm": 0.06710556149482727, "learning_rate": 5.802764685112074e-05, "loss": 0.2256, "step": 11818 }, { "epoch": 2.381624017731211, "grad_norm": 0.057796910405159, "learning_rate": 5.8014492932912354e-05, "loss": 0.2327, "step": 11820 }, { "epoch": 2.382026999798509, "grad_norm": 0.04096395522356033, "learning_rate": 5.800133844536723e-05, "loss": 0.213, "step": 11822 }, { "epoch": 2.382429981865807, "grad_norm": 0.04763369262218475, "learning_rate": 5.79881833894198e-05, "loss": 0.2442, "step": 11824 }, { "epoch": 2.382832963933105, "grad_norm": 0.04647963494062424, "learning_rate": 5.797502776600461e-05, "loss": 0.1827, "step": 11826 }, { "epoch": 2.383235946000403, "grad_norm": 0.05235551670193672, "learning_rate": 5.796187157605619e-05, "loss": 0.1772, "step": 11828 }, { "epoch": 2.383638928067701, "grad_norm": 0.039118025451898575, "learning_rate": 5.7948714820509155e-05, "loss": 0.19, "step": 11830 }, { "epoch": 2.384041910134999, "grad_norm": 0.0516664981842041, "learning_rate": 5.7935557500298124e-05, "loss": 0.1388, "step": 11832 }, { "epoch": 2.384444892202297, "grad_norm": 0.049341436475515366, "learning_rate": 5.792239961635779e-05, "loss": 0.1862, "step": 11834 }, { "epoch": 2.384847874269595, "grad_norm": 0.06240430846810341, "learning_rate": 5.7909241169622844e-05, "loss": 0.2013, "step": 11836 }, { "epoch": 2.385250856336893, "grad_norm": 0.0566558800637722, "learning_rate": 5.789608216102805e-05, "loss": 0.1829, "step": 11838 }, { "epoch": 2.385653838404191, "grad_norm": 0.05318170040845871, "learning_rate": 5.788292259150823e-05, "loss": 0.1576, "step": 11840 }, { "epoch": 2.386056820471489, "grad_norm": 0.04496678337454796, "learning_rate": 5.786976246199818e-05, "loss": 0.1916, "step": 11842 }, { "epoch": 2.386459802538787, "grad_norm": 0.05971567705273628, "learning_rate": 5.78566017734328e-05, "loss": 0.2041, "step": 11844 }, { "epoch": 2.386862784606085, "grad_norm": 0.05461606755852699, "learning_rate": 5.7843440526746986e-05, "loss": 0.2, "step": 11846 }, { "epoch": 2.387265766673383, "grad_norm": 0.046182163059711456, "learning_rate": 5.78302787228757e-05, "loss": 0.1676, "step": 11848 }, { "epoch": 2.387668748740681, "grad_norm": 0.04418657720088959, "learning_rate": 5.781711636275393e-05, "loss": 0.1601, "step": 11850 }, { "epoch": 2.388071730807979, "grad_norm": 0.05287107080221176, "learning_rate": 5.780395344731674e-05, "loss": 0.2134, "step": 11852 }, { "epoch": 2.388474712875277, "grad_norm": 0.049565572291612625, "learning_rate": 5.779078997749916e-05, "loss": 0.2243, "step": 11854 }, { "epoch": 2.388877694942575, "grad_norm": 0.06569905579090118, "learning_rate": 5.777762595423631e-05, "loss": 0.2142, "step": 11856 }, { "epoch": 2.389280677009873, "grad_norm": 0.049043066799640656, "learning_rate": 5.776446137846337e-05, "loss": 0.166, "step": 11858 }, { "epoch": 2.3896836590771713, "grad_norm": 0.05483182892203331, "learning_rate": 5.775129625111551e-05, "loss": 0.2148, "step": 11860 }, { "epoch": 2.3900866411444692, "grad_norm": 0.11051026731729507, "learning_rate": 5.773813057312795e-05, "loss": 0.2218, "step": 11862 }, { "epoch": 2.390489623211767, "grad_norm": 0.08356950432062149, "learning_rate": 5.7724964345435976e-05, "loss": 0.2183, "step": 11864 }, { "epoch": 2.390892605279065, "grad_norm": 0.05710029602050781, "learning_rate": 5.771179756897488e-05, "loss": 0.2078, "step": 11866 }, { "epoch": 2.391295587346363, "grad_norm": 0.052062198519706726, "learning_rate": 5.769863024468002e-05, "loss": 0.1888, "step": 11868 }, { "epoch": 2.391698569413661, "grad_norm": 0.0480102002620697, "learning_rate": 5.7685462373486796e-05, "loss": 0.2023, "step": 11870 }, { "epoch": 2.392101551480959, "grad_norm": 0.05226249620318413, "learning_rate": 5.7672293956330603e-05, "loss": 0.193, "step": 11872 }, { "epoch": 2.392504533548257, "grad_norm": 0.054428581148386, "learning_rate": 5.765912499414691e-05, "loss": 0.1692, "step": 11874 }, { "epoch": 2.3929075156155553, "grad_norm": 0.06447558850049973, "learning_rate": 5.764595548787124e-05, "loss": 0.225, "step": 11876 }, { "epoch": 2.393310497682853, "grad_norm": 0.04479379206895828, "learning_rate": 5.763278543843912e-05, "loss": 0.2317, "step": 11878 }, { "epoch": 2.393713479750151, "grad_norm": 0.057004962116479874, "learning_rate": 5.761961484678612e-05, "loss": 0.2318, "step": 11880 }, { "epoch": 2.394116461817449, "grad_norm": 0.058323029428720474, "learning_rate": 5.760644371384788e-05, "loss": 0.1903, "step": 11882 }, { "epoch": 2.394519443884747, "grad_norm": 0.06854918599128723, "learning_rate": 5.759327204056003e-05, "loss": 0.237, "step": 11884 }, { "epoch": 2.394922425952045, "grad_norm": 0.06595735251903534, "learning_rate": 5.758009982785829e-05, "loss": 0.1988, "step": 11886 }, { "epoch": 2.3953254080193433, "grad_norm": 0.06002083048224449, "learning_rate": 5.756692707667837e-05, "loss": 0.1646, "step": 11888 }, { "epoch": 2.3957283900866413, "grad_norm": 0.04946382716298103, "learning_rate": 5.755375378795604e-05, "loss": 0.2111, "step": 11890 }, { "epoch": 2.3961313721539392, "grad_norm": 0.06664959341287613, "learning_rate": 5.754057996262715e-05, "loss": 0.1618, "step": 11892 }, { "epoch": 2.396534354221237, "grad_norm": 0.06098358705639839, "learning_rate": 5.752740560162751e-05, "loss": 0.2055, "step": 11894 }, { "epoch": 2.396937336288535, "grad_norm": 0.08736960589885712, "learning_rate": 5.7514230705893e-05, "loss": 0.252, "step": 11896 }, { "epoch": 2.397340318355833, "grad_norm": 0.05731016770005226, "learning_rate": 5.750105527635957e-05, "loss": 0.1637, "step": 11898 }, { "epoch": 2.397743300423131, "grad_norm": 0.06296495348215103, "learning_rate": 5.748787931396317e-05, "loss": 0.2017, "step": 11900 }, { "epoch": 2.398146282490429, "grad_norm": 0.04558323696255684, "learning_rate": 5.747470281963979e-05, "loss": 0.1786, "step": 11902 }, { "epoch": 2.3985492645577273, "grad_norm": 0.05776946246623993, "learning_rate": 5.746152579432549e-05, "loss": 0.1729, "step": 11904 }, { "epoch": 2.3989522466250253, "grad_norm": 0.0603502131998539, "learning_rate": 5.744834823895632e-05, "loss": 0.1867, "step": 11906 }, { "epoch": 2.399355228692323, "grad_norm": 0.06661641597747803, "learning_rate": 5.74351701544684e-05, "loss": 0.1869, "step": 11908 }, { "epoch": 2.399758210759621, "grad_norm": 0.058900561183691025, "learning_rate": 5.742199154179789e-05, "loss": 0.166, "step": 11910 }, { "epoch": 2.400161192826919, "grad_norm": 0.059498026967048645, "learning_rate": 5.740881240188097e-05, "loss": 0.1944, "step": 11912 }, { "epoch": 2.400564174894217, "grad_norm": 0.04069728031754494, "learning_rate": 5.739563273565386e-05, "loss": 0.1524, "step": 11914 }, { "epoch": 2.4009671569615154, "grad_norm": 0.057217568159103394, "learning_rate": 5.7382452544052844e-05, "loss": 0.1966, "step": 11916 }, { "epoch": 2.4013701390288134, "grad_norm": 0.08511929214000702, "learning_rate": 5.736927182801419e-05, "loss": 0.228, "step": 11918 }, { "epoch": 2.4017731210961113, "grad_norm": 0.06262311339378357, "learning_rate": 5.7356090588474254e-05, "loss": 0.231, "step": 11920 }, { "epoch": 2.4021761031634092, "grad_norm": 0.04965263605117798, "learning_rate": 5.7342908826369414e-05, "loss": 0.1729, "step": 11922 }, { "epoch": 2.402579085230707, "grad_norm": 0.057901497930288315, "learning_rate": 5.7329726542636064e-05, "loss": 0.1913, "step": 11924 }, { "epoch": 2.402982067298005, "grad_norm": 0.06473572552204132, "learning_rate": 5.731654373821066e-05, "loss": 0.2132, "step": 11926 }, { "epoch": 2.403385049365303, "grad_norm": 0.04945135489106178, "learning_rate": 5.7303360414029706e-05, "loss": 0.1491, "step": 11928 }, { "epoch": 2.403788031432601, "grad_norm": 0.05536174029111862, "learning_rate": 5.72901765710297e-05, "loss": 0.1903, "step": 11930 }, { "epoch": 2.4041910134998994, "grad_norm": 0.061889227479696274, "learning_rate": 5.727699221014719e-05, "loss": 0.2038, "step": 11932 }, { "epoch": 2.4045939955671973, "grad_norm": 0.04592079669237137, "learning_rate": 5.726380733231882e-05, "loss": 0.1755, "step": 11934 }, { "epoch": 2.4049969776344953, "grad_norm": 0.054935865104198456, "learning_rate": 5.725062193848119e-05, "loss": 0.2201, "step": 11936 }, { "epoch": 2.405399959701793, "grad_norm": 0.049928389489650726, "learning_rate": 5.723743602957096e-05, "loss": 0.1442, "step": 11938 }, { "epoch": 2.405802941769091, "grad_norm": 0.05707401782274246, "learning_rate": 5.722424960652486e-05, "loss": 0.1478, "step": 11940 }, { "epoch": 2.406205923836389, "grad_norm": 0.08371180295944214, "learning_rate": 5.7211062670279615e-05, "loss": 0.1762, "step": 11942 }, { "epoch": 2.4066089059036875, "grad_norm": 0.060301292687654495, "learning_rate": 5.7197875221772004e-05, "loss": 0.1517, "step": 11944 }, { "epoch": 2.4070118879709854, "grad_norm": 0.05720106512308121, "learning_rate": 5.718468726193886e-05, "loss": 0.156, "step": 11946 }, { "epoch": 2.4074148700382834, "grad_norm": 0.0571570098400116, "learning_rate": 5.7171498791717014e-05, "loss": 0.1617, "step": 11948 }, { "epoch": 2.4078178521055813, "grad_norm": 0.03223549202084541, "learning_rate": 5.7158309812043374e-05, "loss": 0.1351, "step": 11950 }, { "epoch": 2.4082208341728792, "grad_norm": 0.06487289816141129, "learning_rate": 5.714512032385485e-05, "loss": 0.1887, "step": 11952 }, { "epoch": 2.408623816240177, "grad_norm": 0.6603782773017883, "learning_rate": 5.71319303280884e-05, "loss": 0.2058, "step": 11954 }, { "epoch": 2.409026798307475, "grad_norm": 0.04995949938893318, "learning_rate": 5.7118739825681035e-05, "loss": 0.1826, "step": 11956 }, { "epoch": 2.4094297803747735, "grad_norm": 0.05377180874347687, "learning_rate": 5.710554881756976e-05, "loss": 0.1987, "step": 11958 }, { "epoch": 2.4098327624420715, "grad_norm": 0.048319052904844284, "learning_rate": 5.709235730469168e-05, "loss": 0.1767, "step": 11960 }, { "epoch": 2.4102357445093694, "grad_norm": 0.05564803257584572, "learning_rate": 5.707916528798387e-05, "loss": 0.1956, "step": 11962 }, { "epoch": 2.4106387265766673, "grad_norm": 0.061358992010354996, "learning_rate": 5.706597276838348e-05, "loss": 0.2277, "step": 11964 }, { "epoch": 2.4110417086439653, "grad_norm": 0.04426693171262741, "learning_rate": 5.7052779746827675e-05, "loss": 0.1647, "step": 11966 }, { "epoch": 2.411444690711263, "grad_norm": 0.0576111376285553, "learning_rate": 5.7039586224253704e-05, "loss": 0.179, "step": 11968 }, { "epoch": 2.4118476727785616, "grad_norm": 0.04460928216576576, "learning_rate": 5.7026392201598766e-05, "loss": 0.1597, "step": 11970 }, { "epoch": 2.4122506548458595, "grad_norm": 0.09354057163000107, "learning_rate": 5.701319767980016e-05, "loss": 0.168, "step": 11972 }, { "epoch": 2.4126536369131575, "grad_norm": 0.06683609634637833, "learning_rate": 5.700000265979522e-05, "loss": 0.2157, "step": 11974 }, { "epoch": 2.4130566189804554, "grad_norm": 0.06345374882221222, "learning_rate": 5.698680714252127e-05, "loss": 0.1864, "step": 11976 }, { "epoch": 2.4134596010477534, "grad_norm": 0.041688140481710434, "learning_rate": 5.6973611128915714e-05, "loss": 0.1656, "step": 11978 }, { "epoch": 2.4138625831150513, "grad_norm": 0.09406961500644684, "learning_rate": 5.696041461991599e-05, "loss": 0.1683, "step": 11980 }, { "epoch": 2.4142655651823492, "grad_norm": 0.07213851064443588, "learning_rate": 5.6947217616459536e-05, "loss": 0.181, "step": 11982 }, { "epoch": 2.414668547249647, "grad_norm": 0.05010436475276947, "learning_rate": 5.693402011948385e-05, "loss": 0.167, "step": 11984 }, { "epoch": 2.4150715293169456, "grad_norm": 0.06185581907629967, "learning_rate": 5.692082212992648e-05, "loss": 0.1942, "step": 11986 }, { "epoch": 2.4154745113842435, "grad_norm": 0.04987217113375664, "learning_rate": 5.6907623648724963e-05, "loss": 0.1844, "step": 11988 }, { "epoch": 2.4158774934515415, "grad_norm": 0.0516745001077652, "learning_rate": 5.689442467681691e-05, "loss": 0.2261, "step": 11990 }, { "epoch": 2.4162804755188394, "grad_norm": 0.07256913185119629, "learning_rate": 5.6881225215139947e-05, "loss": 0.1793, "step": 11992 }, { "epoch": 2.4166834575861373, "grad_norm": 0.040182922035455704, "learning_rate": 5.6868025264631755e-05, "loss": 0.1632, "step": 11994 }, { "epoch": 2.4170864396534353, "grad_norm": 0.09456195682287216, "learning_rate": 5.6854824826230024e-05, "loss": 0.2299, "step": 11996 }, { "epoch": 2.4174894217207337, "grad_norm": 0.057920172810554504, "learning_rate": 5.684162390087252e-05, "loss": 0.1902, "step": 11998 }, { "epoch": 2.4178924037880316, "grad_norm": 0.10728804767131805, "learning_rate": 5.682842248949698e-05, "loss": 0.2152, "step": 12000 }, { "epoch": 2.4182953858553295, "grad_norm": 0.04487679898738861, "learning_rate": 5.681522059304123e-05, "loss": 0.1691, "step": 12002 }, { "epoch": 2.4186983679226275, "grad_norm": 0.05849164351820946, "learning_rate": 5.6802018212443105e-05, "loss": 0.1846, "step": 12004 }, { "epoch": 2.4191013499899254, "grad_norm": 0.06814565509557724, "learning_rate": 5.678881534864049e-05, "loss": 0.2089, "step": 12006 }, { "epoch": 2.4195043320572234, "grad_norm": 0.06691406667232513, "learning_rate": 5.6775612002571286e-05, "loss": 0.1781, "step": 12008 }, { "epoch": 2.4199073141245213, "grad_norm": 0.060444775968790054, "learning_rate": 5.676240817517344e-05, "loss": 0.1633, "step": 12010 }, { "epoch": 2.4203102961918193, "grad_norm": 0.06389014422893524, "learning_rate": 5.674920386738494e-05, "loss": 0.1707, "step": 12012 }, { "epoch": 2.4207132782591176, "grad_norm": 0.060476355254650116, "learning_rate": 5.673599908014379e-05, "loss": 0.2348, "step": 12014 }, { "epoch": 2.4211162603264156, "grad_norm": 0.04274730756878853, "learning_rate": 5.672279381438803e-05, "loss": 0.1393, "step": 12016 }, { "epoch": 2.4215192423937135, "grad_norm": 0.06446705758571625, "learning_rate": 5.6709588071055755e-05, "loss": 0.2038, "step": 12018 }, { "epoch": 2.4219222244610115, "grad_norm": 0.06751962006092072, "learning_rate": 5.669638185108507e-05, "loss": 0.2483, "step": 12020 }, { "epoch": 2.4223252065283094, "grad_norm": 0.05737346410751343, "learning_rate": 5.668317515541414e-05, "loss": 0.2671, "step": 12022 }, { "epoch": 2.4227281885956073, "grad_norm": 0.06973695755004883, "learning_rate": 5.666996798498112e-05, "loss": 0.1662, "step": 12024 }, { "epoch": 2.4231311706629057, "grad_norm": 0.06069466099143028, "learning_rate": 5.665676034072425e-05, "loss": 0.1833, "step": 12026 }, { "epoch": 2.4235341527302037, "grad_norm": 0.04661950841546059, "learning_rate": 5.664355222358176e-05, "loss": 0.1636, "step": 12028 }, { "epoch": 2.4239371347975016, "grad_norm": 0.042542293667793274, "learning_rate": 5.6630343634491954e-05, "loss": 0.1768, "step": 12030 }, { "epoch": 2.4243401168647996, "grad_norm": 0.04667328670620918, "learning_rate": 5.661713457439314e-05, "loss": 0.1743, "step": 12032 }, { "epoch": 2.4247430989320975, "grad_norm": 0.05008386820554733, "learning_rate": 5.660392504422366e-05, "loss": 0.1898, "step": 12034 }, { "epoch": 2.4251460809993954, "grad_norm": 0.07631053030490875, "learning_rate": 5.659071504492192e-05, "loss": 0.2055, "step": 12036 }, { "epoch": 2.4255490630666934, "grad_norm": 0.044088345021009445, "learning_rate": 5.657750457742632e-05, "loss": 0.1641, "step": 12038 }, { "epoch": 2.4259520451339913, "grad_norm": 0.0574885718524456, "learning_rate": 5.65642936426753e-05, "loss": 0.1901, "step": 12040 }, { "epoch": 2.4263550272012897, "grad_norm": 0.04501950368285179, "learning_rate": 5.6551082241607365e-05, "loss": 0.1745, "step": 12042 }, { "epoch": 2.4267580092685876, "grad_norm": 0.06143771857023239, "learning_rate": 5.653787037516104e-05, "loss": 0.2342, "step": 12044 }, { "epoch": 2.4271609913358856, "grad_norm": 0.054222654551267624, "learning_rate": 5.6524658044274835e-05, "loss": 0.2231, "step": 12046 }, { "epoch": 2.4275639734031835, "grad_norm": 0.045573096722364426, "learning_rate": 5.6511445249887376e-05, "loss": 0.1726, "step": 12048 }, { "epoch": 2.4279669554704815, "grad_norm": 0.05551764741539955, "learning_rate": 5.649823199293726e-05, "loss": 0.177, "step": 12050 }, { "epoch": 2.4283699375377794, "grad_norm": 0.05046489089727402, "learning_rate": 5.648501827436312e-05, "loss": 0.1782, "step": 12052 }, { "epoch": 2.428772919605078, "grad_norm": 0.05633862689137459, "learning_rate": 5.647180409510366e-05, "loss": 0.1871, "step": 12054 }, { "epoch": 2.4291759016723757, "grad_norm": 0.048323508352041245, "learning_rate": 5.645858945609759e-05, "loss": 0.2305, "step": 12056 }, { "epoch": 2.4295788837396737, "grad_norm": 0.042128656059503555, "learning_rate": 5.6445374358283656e-05, "loss": 0.178, "step": 12058 }, { "epoch": 2.4299818658069716, "grad_norm": 0.05420457571744919, "learning_rate": 5.643215880260062e-05, "loss": 0.1622, "step": 12060 }, { "epoch": 2.4303848478742696, "grad_norm": 0.060230378061532974, "learning_rate": 5.641894278998733e-05, "loss": 0.1957, "step": 12062 }, { "epoch": 2.4307878299415675, "grad_norm": 0.06890761107206345, "learning_rate": 5.640572632138259e-05, "loss": 0.1913, "step": 12064 }, { "epoch": 2.4311908120088654, "grad_norm": 0.06876115500926971, "learning_rate": 5.6392509397725314e-05, "loss": 0.2407, "step": 12066 }, { "epoch": 2.4315937940761634, "grad_norm": 0.05286121740937233, "learning_rate": 5.637929201995439e-05, "loss": 0.1624, "step": 12068 }, { "epoch": 2.4319967761434618, "grad_norm": 0.050193753093481064, "learning_rate": 5.636607418900875e-05, "loss": 0.1676, "step": 12070 }, { "epoch": 2.4323997582107597, "grad_norm": 0.04849841073155403, "learning_rate": 5.6352855905827406e-05, "loss": 0.2098, "step": 12072 }, { "epoch": 2.4328027402780577, "grad_norm": 0.05145376920700073, "learning_rate": 5.633963717134931e-05, "loss": 0.2039, "step": 12074 }, { "epoch": 2.4332057223453556, "grad_norm": 0.04287027567625046, "learning_rate": 5.632641798651355e-05, "loss": 0.2008, "step": 12076 }, { "epoch": 2.4336087044126535, "grad_norm": 0.05822911486029625, "learning_rate": 5.6313198352259166e-05, "loss": 0.1829, "step": 12078 }, { "epoch": 2.4340116864799515, "grad_norm": 0.046748463064432144, "learning_rate": 5.629997826952527e-05, "loss": 0.163, "step": 12080 }, { "epoch": 2.43441466854725, "grad_norm": 0.049855321645736694, "learning_rate": 5.6286757739250987e-05, "loss": 0.2024, "step": 12082 }, { "epoch": 2.434817650614548, "grad_norm": 0.052091654390096664, "learning_rate": 5.627353676237549e-05, "loss": 0.1661, "step": 12084 }, { "epoch": 2.4352206326818457, "grad_norm": 0.056773409247398376, "learning_rate": 5.6260315339837975e-05, "loss": 0.1858, "step": 12086 }, { "epoch": 2.4356236147491437, "grad_norm": 0.06805513799190521, "learning_rate": 5.624709347257767e-05, "loss": 0.1779, "step": 12088 }, { "epoch": 2.4360265968164416, "grad_norm": 0.04931602627038956, "learning_rate": 5.623387116153385e-05, "loss": 0.187, "step": 12090 }, { "epoch": 2.4364295788837396, "grad_norm": 0.057481031864881516, "learning_rate": 5.622064840764577e-05, "loss": 0.1431, "step": 12092 }, { "epoch": 2.4368325609510375, "grad_norm": 0.06208740547299385, "learning_rate": 5.620742521185278e-05, "loss": 0.1878, "step": 12094 }, { "epoch": 2.4372355430183354, "grad_norm": 0.07705783098936081, "learning_rate": 5.619420157509424e-05, "loss": 0.2383, "step": 12096 }, { "epoch": 2.437638525085634, "grad_norm": 0.03305123373866081, "learning_rate": 5.618097749830952e-05, "loss": 0.1221, "step": 12098 }, { "epoch": 2.4380415071529318, "grad_norm": 0.047065626829862595, "learning_rate": 5.616775298243804e-05, "loss": 0.177, "step": 12100 }, { "epoch": 2.4384444892202297, "grad_norm": 0.05077784135937691, "learning_rate": 5.615452802841926e-05, "loss": 0.1877, "step": 12102 }, { "epoch": 2.4388474712875277, "grad_norm": 0.04318971186876297, "learning_rate": 5.6141302637192647e-05, "loss": 0.1949, "step": 12104 }, { "epoch": 2.4392504533548256, "grad_norm": 0.056446243077516556, "learning_rate": 5.612807680969772e-05, "loss": 0.2079, "step": 12106 }, { "epoch": 2.4396534354221235, "grad_norm": 0.036065537482500076, "learning_rate": 5.611485054687402e-05, "loss": 0.1886, "step": 12108 }, { "epoch": 2.440056417489422, "grad_norm": 0.0681459903717041, "learning_rate": 5.61016238496611e-05, "loss": 0.2035, "step": 12110 }, { "epoch": 2.44045939955672, "grad_norm": 0.04323028773069382, "learning_rate": 5.608839671899859e-05, "loss": 0.165, "step": 12112 }, { "epoch": 2.440862381624018, "grad_norm": 0.07516587525606155, "learning_rate": 5.607516915582613e-05, "loss": 0.1722, "step": 12114 }, { "epoch": 2.4412653636913157, "grad_norm": 0.04655295982956886, "learning_rate": 5.6061941161083344e-05, "loss": 0.1984, "step": 12116 }, { "epoch": 2.4416683457586137, "grad_norm": 0.045383911579847336, "learning_rate": 5.6048712735709965e-05, "loss": 0.1924, "step": 12118 }, { "epoch": 2.4420713278259116, "grad_norm": 0.04858412221074104, "learning_rate": 5.60354838806457e-05, "loss": 0.1514, "step": 12120 }, { "epoch": 2.4424743098932096, "grad_norm": 0.049353402107954025, "learning_rate": 5.602225459683031e-05, "loss": 0.1158, "step": 12122 }, { "epoch": 2.4428772919605075, "grad_norm": 0.05863470956683159, "learning_rate": 5.60090248852036e-05, "loss": 0.1913, "step": 12124 }, { "epoch": 2.443280274027806, "grad_norm": 0.04578150436282158, "learning_rate": 5.5995794746705364e-05, "loss": 0.2106, "step": 12126 }, { "epoch": 2.443683256095104, "grad_norm": 0.04799255356192589, "learning_rate": 5.5982564182275456e-05, "loss": 0.2112, "step": 12128 }, { "epoch": 2.444086238162402, "grad_norm": 0.049805283546447754, "learning_rate": 5.596933319285376e-05, "loss": 0.1931, "step": 12130 }, { "epoch": 2.4444892202296997, "grad_norm": 0.043248314410448074, "learning_rate": 5.5956101779380176e-05, "loss": 0.1918, "step": 12132 }, { "epoch": 2.4448922022969977, "grad_norm": 0.05324958264827728, "learning_rate": 5.594286994279464e-05, "loss": 0.2067, "step": 12134 }, { "epoch": 2.4452951843642956, "grad_norm": 0.04655977338552475, "learning_rate": 5.592963768403715e-05, "loss": 0.2068, "step": 12136 }, { "epoch": 2.445698166431594, "grad_norm": 0.070985808968544, "learning_rate": 5.591640500404766e-05, "loss": 0.1831, "step": 12138 }, { "epoch": 2.446101148498892, "grad_norm": 0.04994530230760574, "learning_rate": 5.590317190376623e-05, "loss": 0.1686, "step": 12140 }, { "epoch": 2.44650413056619, "grad_norm": 0.05718931555747986, "learning_rate": 5.588993838413291e-05, "loss": 0.1469, "step": 12142 }, { "epoch": 2.446907112633488, "grad_norm": 0.11910577863454819, "learning_rate": 5.587670444608778e-05, "loss": 0.16, "step": 12144 }, { "epoch": 2.4473100947007858, "grad_norm": 0.05469052866101265, "learning_rate": 5.5863470090570966e-05, "loss": 0.172, "step": 12146 }, { "epoch": 2.4477130767680837, "grad_norm": 0.06286270171403885, "learning_rate": 5.5850235318522625e-05, "loss": 0.2013, "step": 12148 }, { "epoch": 2.4481160588353816, "grad_norm": 0.048199381679296494, "learning_rate": 5.583700013088291e-05, "loss": 0.1626, "step": 12150 }, { "epoch": 2.44851904090268, "grad_norm": 0.035780344158411026, "learning_rate": 5.5823764528592036e-05, "loss": 0.1876, "step": 12152 }, { "epoch": 2.448922022969978, "grad_norm": 0.0799671933054924, "learning_rate": 5.581052851259026e-05, "loss": 0.2456, "step": 12154 }, { "epoch": 2.449325005037276, "grad_norm": 0.06183718889951706, "learning_rate": 5.579729208381782e-05, "loss": 0.2205, "step": 12156 }, { "epoch": 2.449727987104574, "grad_norm": 0.05557161569595337, "learning_rate": 5.5784055243215025e-05, "loss": 0.1684, "step": 12158 }, { "epoch": 2.450130969171872, "grad_norm": 0.07418825477361679, "learning_rate": 5.5770817991722205e-05, "loss": 0.174, "step": 12160 }, { "epoch": 2.4505339512391697, "grad_norm": 0.0453825443983078, "learning_rate": 5.575758033027969e-05, "loss": 0.1369, "step": 12162 }, { "epoch": 2.4509369333064677, "grad_norm": 0.05918452516198158, "learning_rate": 5.5744342259827874e-05, "loss": 0.2572, "step": 12164 }, { "epoch": 2.451339915373766, "grad_norm": 0.04277510941028595, "learning_rate": 5.573110378130719e-05, "loss": 0.2074, "step": 12166 }, { "epoch": 2.451742897441064, "grad_norm": 0.09278752654790878, "learning_rate": 5.5717864895658045e-05, "loss": 0.1807, "step": 12168 }, { "epoch": 2.452145879508362, "grad_norm": 0.06282159686088562, "learning_rate": 5.5704625603820925e-05, "loss": 0.1981, "step": 12170 }, { "epoch": 2.45254886157566, "grad_norm": 0.04509962350130081, "learning_rate": 5.569138590673633e-05, "loss": 0.1393, "step": 12172 }, { "epoch": 2.452951843642958, "grad_norm": 0.052625782787799835, "learning_rate": 5.567814580534477e-05, "loss": 0.1651, "step": 12174 }, { "epoch": 2.4533548257102558, "grad_norm": 0.05318623036146164, "learning_rate": 5.566490530058681e-05, "loss": 0.2167, "step": 12176 }, { "epoch": 2.4537578077775537, "grad_norm": 0.048767492175102234, "learning_rate": 5.565166439340306e-05, "loss": 0.137, "step": 12178 }, { "epoch": 2.454160789844852, "grad_norm": 0.0497177429497242, "learning_rate": 5.5638423084734095e-05, "loss": 0.1915, "step": 12180 }, { "epoch": 2.45456377191215, "grad_norm": 0.05952637642621994, "learning_rate": 5.562518137552056e-05, "loss": 0.1969, "step": 12182 }, { "epoch": 2.454966753979448, "grad_norm": 0.06582552939653397, "learning_rate": 5.561193926670316e-05, "loss": 0.168, "step": 12184 }, { "epoch": 2.455369736046746, "grad_norm": 0.03022056818008423, "learning_rate": 5.5598696759222555e-05, "loss": 0.1544, "step": 12186 }, { "epoch": 2.455772718114044, "grad_norm": 0.09558003395795822, "learning_rate": 5.5585453854019495e-05, "loss": 0.1949, "step": 12188 }, { "epoch": 2.456175700181342, "grad_norm": 0.06677708774805069, "learning_rate": 5.557221055203472e-05, "loss": 0.2189, "step": 12190 }, { "epoch": 2.45657868224864, "grad_norm": 0.058833569288253784, "learning_rate": 5.555896685420902e-05, "loss": 0.1721, "step": 12192 }, { "epoch": 2.456981664315938, "grad_norm": 0.04797518998384476, "learning_rate": 5.554572276148321e-05, "loss": 0.2185, "step": 12194 }, { "epoch": 2.457384646383236, "grad_norm": 0.054031334817409515, "learning_rate": 5.553247827479812e-05, "loss": 0.2034, "step": 12196 }, { "epoch": 2.457787628450534, "grad_norm": 0.045484550297260284, "learning_rate": 5.5519233395094614e-05, "loss": 0.1817, "step": 12198 }, { "epoch": 2.458190610517832, "grad_norm": 0.2897370755672455, "learning_rate": 5.55059881233136e-05, "loss": 0.1539, "step": 12200 }, { "epoch": 2.45859359258513, "grad_norm": 0.05194070562720299, "learning_rate": 5.5492742460395996e-05, "loss": 0.1916, "step": 12202 }, { "epoch": 2.458996574652428, "grad_norm": 0.04825571924448013, "learning_rate": 5.547949640728275e-05, "loss": 0.1901, "step": 12204 }, { "epoch": 2.4593995567197258, "grad_norm": 0.04531543329358101, "learning_rate": 5.546624996491485e-05, "loss": 0.1805, "step": 12206 }, { "epoch": 2.459802538787024, "grad_norm": 0.0695631206035614, "learning_rate": 5.545300313423328e-05, "loss": 0.1507, "step": 12208 }, { "epoch": 2.460205520854322, "grad_norm": 0.15136954188346863, "learning_rate": 5.5439755916179094e-05, "loss": 0.2416, "step": 12210 }, { "epoch": 2.46060850292162, "grad_norm": 0.04831309616565704, "learning_rate": 5.5426508311693356e-05, "loss": 0.1837, "step": 12212 }, { "epoch": 2.461011484988918, "grad_norm": 0.0721033364534378, "learning_rate": 5.5413260321717144e-05, "loss": 0.2002, "step": 12214 }, { "epoch": 2.461414467056216, "grad_norm": 0.048263076692819595, "learning_rate": 5.5400011947191566e-05, "loss": 0.1699, "step": 12216 }, { "epoch": 2.461817449123514, "grad_norm": 0.06317691504955292, "learning_rate": 5.538676318905779e-05, "loss": 0.1929, "step": 12218 }, { "epoch": 2.4622204311908122, "grad_norm": 0.06637461483478546, "learning_rate": 5.537351404825696e-05, "loss": 0.1725, "step": 12220 }, { "epoch": 2.46262341325811, "grad_norm": 0.07199779897928238, "learning_rate": 5.536026452573028e-05, "loss": 0.2299, "step": 12222 }, { "epoch": 2.463026395325408, "grad_norm": 0.05035098269581795, "learning_rate": 5.534701462241899e-05, "loss": 0.1801, "step": 12224 }, { "epoch": 2.463429377392706, "grad_norm": 0.0727388858795166, "learning_rate": 5.533376433926434e-05, "loss": 0.2092, "step": 12226 }, { "epoch": 2.463832359460004, "grad_norm": 0.046067435294389725, "learning_rate": 5.532051367720759e-05, "loss": 0.1696, "step": 12228 }, { "epoch": 2.464235341527302, "grad_norm": 0.07674822956323624, "learning_rate": 5.530726263719006e-05, "loss": 0.1275, "step": 12230 }, { "epoch": 2.4646383235946, "grad_norm": 0.04457063227891922, "learning_rate": 5.529401122015307e-05, "loss": 0.1404, "step": 12232 }, { "epoch": 2.465041305661898, "grad_norm": 0.08931057155132294, "learning_rate": 5.5280759427038e-05, "loss": 0.2401, "step": 12234 }, { "epoch": 2.465444287729196, "grad_norm": 0.042824145406484604, "learning_rate": 5.5267507258786236e-05, "loss": 0.2042, "step": 12236 }, { "epoch": 2.465847269796494, "grad_norm": 0.058367300778627396, "learning_rate": 5.525425471633916e-05, "loss": 0.1814, "step": 12238 }, { "epoch": 2.466250251863792, "grad_norm": 0.07647201418876648, "learning_rate": 5.524100180063825e-05, "loss": 0.2099, "step": 12240 }, { "epoch": 2.46665323393109, "grad_norm": 0.03983471542596817, "learning_rate": 5.522774851262494e-05, "loss": 0.2135, "step": 12242 }, { "epoch": 2.467056215998388, "grad_norm": 0.05607502534985542, "learning_rate": 5.521449485324074e-05, "loss": 0.1735, "step": 12244 }, { "epoch": 2.467459198065686, "grad_norm": 0.042698707431554794, "learning_rate": 5.520124082342717e-05, "loss": 0.1612, "step": 12246 }, { "epoch": 2.4678621801329843, "grad_norm": 0.04548298195004463, "learning_rate": 5.518798642412577e-05, "loss": 0.1962, "step": 12248 }, { "epoch": 2.4682651622002822, "grad_norm": 0.05207012593746185, "learning_rate": 5.51747316562781e-05, "loss": 0.1973, "step": 12250 }, { "epoch": 2.46866814426758, "grad_norm": 0.06228821724653244, "learning_rate": 5.5161476520825785e-05, "loss": 0.2063, "step": 12252 }, { "epoch": 2.469071126334878, "grad_norm": 0.049454256892204285, "learning_rate": 5.514822101871042e-05, "loss": 0.2042, "step": 12254 }, { "epoch": 2.469474108402176, "grad_norm": 0.06387735903263092, "learning_rate": 5.5134965150873675e-05, "loss": 0.1925, "step": 12256 }, { "epoch": 2.469877090469474, "grad_norm": 0.0502970926463604, "learning_rate": 5.512170891825722e-05, "loss": 0.223, "step": 12258 }, { "epoch": 2.470280072536772, "grad_norm": 0.05233887583017349, "learning_rate": 5.510845232180275e-05, "loss": 0.1877, "step": 12260 }, { "epoch": 2.47068305460407, "grad_norm": 0.053190380334854126, "learning_rate": 5.509519536245199e-05, "loss": 0.21, "step": 12262 }, { "epoch": 2.4710860366713683, "grad_norm": 0.05719498172402382, "learning_rate": 5.508193804114671e-05, "loss": 0.1768, "step": 12264 }, { "epoch": 2.471489018738666, "grad_norm": 0.04697343707084656, "learning_rate": 5.506868035882867e-05, "loss": 0.1915, "step": 12266 }, { "epoch": 2.471892000805964, "grad_norm": 0.06328902393579483, "learning_rate": 5.5055422316439686e-05, "loss": 0.2007, "step": 12268 }, { "epoch": 2.472294982873262, "grad_norm": 0.05199269577860832, "learning_rate": 5.504216391492159e-05, "loss": 0.2097, "step": 12270 }, { "epoch": 2.47269796494056, "grad_norm": 0.06375780701637268, "learning_rate": 5.502890515521624e-05, "loss": 0.2188, "step": 12272 }, { "epoch": 2.473100947007858, "grad_norm": 0.047609761357307434, "learning_rate": 5.501564603826549e-05, "loss": 0.1402, "step": 12274 }, { "epoch": 2.4735039290751564, "grad_norm": 0.06725708395242691, "learning_rate": 5.500238656501129e-05, "loss": 0.2134, "step": 12276 }, { "epoch": 2.4739069111424543, "grad_norm": 0.04552186653017998, "learning_rate": 5.4989126736395526e-05, "loss": 0.1678, "step": 12278 }, { "epoch": 2.4743098932097523, "grad_norm": 0.050900984555482864, "learning_rate": 5.497586655336019e-05, "loss": 0.2105, "step": 12280 }, { "epoch": 2.47471287527705, "grad_norm": 0.05463655665516853, "learning_rate": 5.496260601684725e-05, "loss": 0.2081, "step": 12282 }, { "epoch": 2.475115857344348, "grad_norm": 0.04248126596212387, "learning_rate": 5.4949345127798714e-05, "loss": 0.1653, "step": 12284 }, { "epoch": 2.475518839411646, "grad_norm": 0.05193443223834038, "learning_rate": 5.493608388715661e-05, "loss": 0.1912, "step": 12286 }, { "epoch": 2.475921821478944, "grad_norm": 0.053680527955293655, "learning_rate": 5.492282229586302e-05, "loss": 0.1993, "step": 12288 }, { "epoch": 2.476324803546242, "grad_norm": 0.04288513585925102, "learning_rate": 5.490956035485999e-05, "loss": 0.1566, "step": 12290 }, { "epoch": 2.4767277856135403, "grad_norm": 0.07353232055902481, "learning_rate": 5.489629806508964e-05, "loss": 0.2277, "step": 12292 }, { "epoch": 2.4771307676808383, "grad_norm": 0.06713627278804779, "learning_rate": 5.4883035427494125e-05, "loss": 0.2156, "step": 12294 }, { "epoch": 2.4775337497481362, "grad_norm": 0.07119248807430267, "learning_rate": 5.486977244301556e-05, "loss": 0.1849, "step": 12296 }, { "epoch": 2.477936731815434, "grad_norm": 0.05009063705801964, "learning_rate": 5.485650911259617e-05, "loss": 0.228, "step": 12298 }, { "epoch": 2.478339713882732, "grad_norm": 0.03596751391887665, "learning_rate": 5.484324543717814e-05, "loss": 0.1456, "step": 12300 }, { "epoch": 2.47874269595003, "grad_norm": 0.07743962854146957, "learning_rate": 5.482998141770368e-05, "loss": 0.2071, "step": 12302 }, { "epoch": 2.4791456780173284, "grad_norm": 0.058847635984420776, "learning_rate": 5.4816717055115065e-05, "loss": 0.1557, "step": 12304 }, { "epoch": 2.4795486600846264, "grad_norm": 0.053091805428266525, "learning_rate": 5.480345235035459e-05, "loss": 0.1997, "step": 12306 }, { "epoch": 2.4799516421519243, "grad_norm": 0.04882095754146576, "learning_rate": 5.479018730436454e-05, "loss": 0.1748, "step": 12308 }, { "epoch": 2.4803546242192223, "grad_norm": 0.05197981372475624, "learning_rate": 5.477692191808723e-05, "loss": 0.1806, "step": 12310 }, { "epoch": 2.48075760628652, "grad_norm": 0.04549933597445488, "learning_rate": 5.476365619246504e-05, "loss": 0.1784, "step": 12312 }, { "epoch": 2.481160588353818, "grad_norm": 0.07017272710800171, "learning_rate": 5.475039012844033e-05, "loss": 0.1789, "step": 12314 }, { "epoch": 2.481563570421116, "grad_norm": 0.09665434062480927, "learning_rate": 5.4737123726955494e-05, "loss": 0.1972, "step": 12316 }, { "epoch": 2.481966552488414, "grad_norm": 0.053050488233566284, "learning_rate": 5.4723856988952985e-05, "loss": 0.1895, "step": 12318 }, { "epoch": 2.4823695345557124, "grad_norm": 0.05171143636107445, "learning_rate": 5.471058991537521e-05, "loss": 0.1766, "step": 12320 }, { "epoch": 2.4827725166230104, "grad_norm": 0.04598110914230347, "learning_rate": 5.469732250716466e-05, "loss": 0.17, "step": 12322 }, { "epoch": 2.4831754986903083, "grad_norm": 0.030317850410938263, "learning_rate": 5.468405476526385e-05, "loss": 0.1592, "step": 12324 }, { "epoch": 2.4835784807576062, "grad_norm": 0.053349483758211136, "learning_rate": 5.467078669061526e-05, "loss": 0.1976, "step": 12326 }, { "epoch": 2.483981462824904, "grad_norm": 0.059870265424251556, "learning_rate": 5.465751828416147e-05, "loss": 0.1714, "step": 12328 }, { "epoch": 2.484384444892202, "grad_norm": 0.04339151084423065, "learning_rate": 5.4644249546845015e-05, "loss": 0.231, "step": 12330 }, { "epoch": 2.4847874269595005, "grad_norm": 0.06685718894004822, "learning_rate": 5.4630980479608504e-05, "loss": 0.2283, "step": 12332 }, { "epoch": 2.4851904090267984, "grad_norm": 0.056755565106868744, "learning_rate": 5.461771108339456e-05, "loss": 0.1643, "step": 12334 }, { "epoch": 2.4855933910940964, "grad_norm": 0.05903014913201332, "learning_rate": 5.46044413591458e-05, "loss": 0.2265, "step": 12336 }, { "epoch": 2.4859963731613943, "grad_norm": 0.0677526444196701, "learning_rate": 5.459117130780487e-05, "loss": 0.2401, "step": 12338 }, { "epoch": 2.4863993552286923, "grad_norm": 0.04722040891647339, "learning_rate": 5.45779009303145e-05, "loss": 0.1772, "step": 12340 }, { "epoch": 2.48680233729599, "grad_norm": 0.058663852512836456, "learning_rate": 5.4564630227617355e-05, "loss": 0.2054, "step": 12342 }, { "epoch": 2.487205319363288, "grad_norm": 0.08635495603084564, "learning_rate": 5.455135920065617e-05, "loss": 0.1855, "step": 12344 }, { "epoch": 2.4876083014305865, "grad_norm": 0.060355834662914276, "learning_rate": 5.453808785037372e-05, "loss": 0.2347, "step": 12346 }, { "epoch": 2.4880112834978845, "grad_norm": 0.06020810455083847, "learning_rate": 5.452481617771276e-05, "loss": 0.228, "step": 12348 }, { "epoch": 2.4884142655651824, "grad_norm": 0.04620720073580742, "learning_rate": 5.451154418361609e-05, "loss": 0.1407, "step": 12350 }, { "epoch": 2.4888172476324804, "grad_norm": 0.07045585662126541, "learning_rate": 5.449827186902655e-05, "loss": 0.2038, "step": 12352 }, { "epoch": 2.4892202296997783, "grad_norm": 0.041793834418058395, "learning_rate": 5.448499923488697e-05, "loss": 0.1546, "step": 12354 }, { "epoch": 2.4896232117670762, "grad_norm": 0.05781014636158943, "learning_rate": 5.4471726282140203e-05, "loss": 0.2068, "step": 12356 }, { "epoch": 2.490026193834374, "grad_norm": 0.08505477011203766, "learning_rate": 5.445845301172917e-05, "loss": 0.2346, "step": 12358 }, { "epoch": 2.4904291759016726, "grad_norm": 0.06190848723053932, "learning_rate": 5.4445179424596747e-05, "loss": 0.1849, "step": 12360 }, { "epoch": 2.4908321579689705, "grad_norm": 0.03134537488222122, "learning_rate": 5.443190552168589e-05, "loss": 0.1761, "step": 12362 }, { "epoch": 2.4912351400362684, "grad_norm": 0.04573149234056473, "learning_rate": 5.441863130393957e-05, "loss": 0.1969, "step": 12364 }, { "epoch": 2.4916381221035664, "grad_norm": 0.05835878103971481, "learning_rate": 5.4405356772300733e-05, "loss": 0.2151, "step": 12366 }, { "epoch": 2.4920411041708643, "grad_norm": 0.06177271157503128, "learning_rate": 5.4392081927712394e-05, "loss": 0.2706, "step": 12368 }, { "epoch": 2.4924440862381623, "grad_norm": 0.051789190620183945, "learning_rate": 5.43788067711176e-05, "loss": 0.1595, "step": 12370 }, { "epoch": 2.49284706830546, "grad_norm": 0.050115808844566345, "learning_rate": 5.436553130345935e-05, "loss": 0.1806, "step": 12372 }, { "epoch": 2.4932500503727586, "grad_norm": 0.051849812269210815, "learning_rate": 5.435225552568075e-05, "loss": 0.1914, "step": 12374 }, { "epoch": 2.4936530324400565, "grad_norm": 0.06172528490424156, "learning_rate": 5.433897943872488e-05, "loss": 0.18, "step": 12376 }, { "epoch": 2.4940560145073545, "grad_norm": 0.04672456160187721, "learning_rate": 5.432570304353484e-05, "loss": 0.1496, "step": 12378 }, { "epoch": 2.4944589965746524, "grad_norm": 0.07356838136911392, "learning_rate": 5.431242634105378e-05, "loss": 0.194, "step": 12380 }, { "epoch": 2.4948619786419504, "grad_norm": 0.08135899156332016, "learning_rate": 5.429914933222485e-05, "loss": 0.1797, "step": 12382 }, { "epoch": 2.4952649607092483, "grad_norm": 0.04457440972328186, "learning_rate": 5.428587201799122e-05, "loss": 0.1633, "step": 12384 }, { "epoch": 2.4956679427765467, "grad_norm": 0.05523247644305229, "learning_rate": 5.4272594399296105e-05, "loss": 0.236, "step": 12386 }, { "epoch": 2.4960709248438446, "grad_norm": 0.03238510340452194, "learning_rate": 5.425931647708272e-05, "loss": 0.171, "step": 12388 }, { "epoch": 2.4964739069111426, "grad_norm": 0.13140137493610382, "learning_rate": 5.42460382522943e-05, "loss": 0.196, "step": 12390 }, { "epoch": 2.4968768889784405, "grad_norm": 0.06779821962118149, "learning_rate": 5.423275972587411e-05, "loss": 0.1828, "step": 12392 }, { "epoch": 2.4972798710457385, "grad_norm": 0.04135895520448685, "learning_rate": 5.421948089876544e-05, "loss": 0.1577, "step": 12394 }, { "epoch": 2.4976828531130364, "grad_norm": 0.06245134025812149, "learning_rate": 5.420620177191159e-05, "loss": 0.1924, "step": 12396 }, { "epoch": 2.4980858351803343, "grad_norm": 0.04454468935728073, "learning_rate": 5.4192922346255916e-05, "loss": 0.1985, "step": 12398 }, { "epoch": 2.4984888172476323, "grad_norm": 0.07690700143575668, "learning_rate": 5.417964262274171e-05, "loss": 0.1572, "step": 12400 }, { "epoch": 2.4988917993149307, "grad_norm": 0.04643810912966728, "learning_rate": 5.4166362602312396e-05, "loss": 0.1698, "step": 12402 }, { "epoch": 2.4992947813822286, "grad_norm": 0.05322658643126488, "learning_rate": 5.415308228591135e-05, "loss": 0.2312, "step": 12404 }, { "epoch": 2.4996977634495265, "grad_norm": 0.04215513542294502, "learning_rate": 5.413980167448197e-05, "loss": 0.1892, "step": 12406 }, { "epoch": 2.5001007455168245, "grad_norm": 0.06809663027524948, "learning_rate": 5.412652076896769e-05, "loss": 0.2176, "step": 12408 }, { "epoch": 2.5005037275841224, "grad_norm": 0.061162110418081284, "learning_rate": 5.4113239570312e-05, "loss": 0.162, "step": 12410 }, { "epoch": 2.5009067096514204, "grad_norm": 0.04683419689536095, "learning_rate": 5.409995807945834e-05, "loss": 0.1931, "step": 12412 }, { "epoch": 2.5013096917187188, "grad_norm": 0.054280806332826614, "learning_rate": 5.4086676297350204e-05, "loss": 0.2159, "step": 12414 }, { "epoch": 2.5017126737860167, "grad_norm": 0.05468326061964035, "learning_rate": 5.407339422493113e-05, "loss": 0.2176, "step": 12416 }, { "epoch": 2.5021156558533146, "grad_norm": 0.06952318549156189, "learning_rate": 5.4060111863144636e-05, "loss": 0.2025, "step": 12418 }, { "epoch": 2.5025186379206126, "grad_norm": 0.06900619715452194, "learning_rate": 5.404682921293429e-05, "loss": 0.1671, "step": 12420 }, { "epoch": 2.5029216199879105, "grad_norm": 0.037348657846450806, "learning_rate": 5.403354627524367e-05, "loss": 0.1616, "step": 12422 }, { "epoch": 2.5033246020552085, "grad_norm": 0.07734784483909607, "learning_rate": 5.4020263051016375e-05, "loss": 0.2028, "step": 12424 }, { "epoch": 2.5037275841225064, "grad_norm": 0.050108522176742554, "learning_rate": 5.4006979541196024e-05, "loss": 0.1732, "step": 12426 }, { "epoch": 2.5041305661898043, "grad_norm": 0.05922889709472656, "learning_rate": 5.399369574672626e-05, "loss": 0.1576, "step": 12428 }, { "epoch": 2.5045335482571023, "grad_norm": 0.06523427367210388, "learning_rate": 5.3980411668550724e-05, "loss": 0.1894, "step": 12430 }, { "epoch": 2.5049365303244007, "grad_norm": 0.05559534579515457, "learning_rate": 5.396712730761311e-05, "loss": 0.1832, "step": 12432 }, { "epoch": 2.5053395123916986, "grad_norm": 0.06921125203371048, "learning_rate": 5.395384266485713e-05, "loss": 0.1934, "step": 12434 }, { "epoch": 2.5057424944589965, "grad_norm": 0.04383534938097, "learning_rate": 5.394055774122648e-05, "loss": 0.1495, "step": 12436 }, { "epoch": 2.5061454765262945, "grad_norm": 0.044495001435279846, "learning_rate": 5.392727253766491e-05, "loss": 0.1741, "step": 12438 }, { "epoch": 2.5065484585935924, "grad_norm": 0.05062146857380867, "learning_rate": 5.391398705511619e-05, "loss": 0.1648, "step": 12440 }, { "epoch": 2.506951440660891, "grad_norm": 0.05182384327054024, "learning_rate": 5.390070129452407e-05, "loss": 0.1508, "step": 12442 }, { "epoch": 2.5073544227281888, "grad_norm": 0.06176676228642464, "learning_rate": 5.388741525683237e-05, "loss": 0.1624, "step": 12444 }, { "epoch": 2.5077574047954867, "grad_norm": 0.052249081432819366, "learning_rate": 5.387412894298494e-05, "loss": 0.1813, "step": 12446 }, { "epoch": 2.5081603868627846, "grad_norm": 0.05974605679512024, "learning_rate": 5.386084235392555e-05, "loss": 0.1921, "step": 12448 }, { "epoch": 2.5085633689300826, "grad_norm": 0.06526529043912888, "learning_rate": 5.38475554905981e-05, "loss": 0.2365, "step": 12450 }, { "epoch": 2.5089663509973805, "grad_norm": 0.058665867894887924, "learning_rate": 5.383426835394646e-05, "loss": 0.1953, "step": 12452 }, { "epoch": 2.5093693330646785, "grad_norm": 0.05564659833908081, "learning_rate": 5.3820980944914534e-05, "loss": 0.2344, "step": 12454 }, { "epoch": 2.5097723151319764, "grad_norm": 0.06119886040687561, "learning_rate": 5.380769326444624e-05, "loss": 0.2102, "step": 12456 }, { "epoch": 2.510175297199275, "grad_norm": 0.07365282624959946, "learning_rate": 5.37944053134855e-05, "loss": 0.1746, "step": 12458 }, { "epoch": 2.5105782792665727, "grad_norm": 0.043750863522291183, "learning_rate": 5.3781117092976264e-05, "loss": 0.1796, "step": 12460 }, { "epoch": 2.5109812613338707, "grad_norm": 0.05926395207643509, "learning_rate": 5.3767828603862535e-05, "loss": 0.2018, "step": 12462 }, { "epoch": 2.5113842434011686, "grad_norm": 0.07026753574609756, "learning_rate": 5.3754539847088284e-05, "loss": 0.1837, "step": 12464 }, { "epoch": 2.5117872254684666, "grad_norm": 0.03925260528922081, "learning_rate": 5.3741250823597514e-05, "loss": 0.1593, "step": 12466 }, { "epoch": 2.5121902075357645, "grad_norm": 0.06190716475248337, "learning_rate": 5.372796153433428e-05, "loss": 0.1479, "step": 12468 }, { "epoch": 2.512593189603063, "grad_norm": 0.051117755472660065, "learning_rate": 5.371467198024262e-05, "loss": 0.1664, "step": 12470 }, { "epoch": 2.512996171670361, "grad_norm": 0.057007379829883575, "learning_rate": 5.370138216226659e-05, "loss": 0.1731, "step": 12472 }, { "epoch": 2.5133991537376588, "grad_norm": 0.07143572717905045, "learning_rate": 5.368809208135031e-05, "loss": 0.1865, "step": 12474 }, { "epoch": 2.5138021358049567, "grad_norm": 0.09572847932577133, "learning_rate": 5.3674801738437854e-05, "loss": 0.1801, "step": 12476 }, { "epoch": 2.5142051178722546, "grad_norm": 0.05938468873500824, "learning_rate": 5.366151113447336e-05, "loss": 0.1873, "step": 12478 }, { "epoch": 2.5146080999395526, "grad_norm": 0.07035186886787415, "learning_rate": 5.3648220270400985e-05, "loss": 0.2036, "step": 12480 }, { "epoch": 2.5150110820068505, "grad_norm": 0.05732342600822449, "learning_rate": 5.3634929147164856e-05, "loss": 0.1857, "step": 12482 }, { "epoch": 2.5154140640741485, "grad_norm": 0.055962275713682175, "learning_rate": 5.362163776570919e-05, "loss": 0.1447, "step": 12484 }, { "epoch": 2.515817046141447, "grad_norm": 0.04367542266845703, "learning_rate": 5.360834612697816e-05, "loss": 0.1856, "step": 12486 }, { "epoch": 2.516220028208745, "grad_norm": 0.06652933359146118, "learning_rate": 5.3595054231916e-05, "loss": 0.2319, "step": 12488 }, { "epoch": 2.5166230102760427, "grad_norm": 0.05221908167004585, "learning_rate": 5.3581762081466936e-05, "loss": 0.1884, "step": 12490 }, { "epoch": 2.5170259923433407, "grad_norm": 0.04966789111495018, "learning_rate": 5.3568469676575206e-05, "loss": 0.2137, "step": 12492 }, { "epoch": 2.5174289744106386, "grad_norm": 0.056733645498752594, "learning_rate": 5.355517701818511e-05, "loss": 0.2197, "step": 12494 }, { "epoch": 2.517831956477937, "grad_norm": 0.054679885506629944, "learning_rate": 5.354188410724092e-05, "loss": 0.1723, "step": 12496 }, { "epoch": 2.518234938545235, "grad_norm": 0.04557587578892708, "learning_rate": 5.352859094468695e-05, "loss": 0.1544, "step": 12498 }, { "epoch": 2.518637920612533, "grad_norm": 0.055995721369981766, "learning_rate": 5.351529753146752e-05, "loss": 0.1585, "step": 12500 }, { "epoch": 2.519040902679831, "grad_norm": 0.06158788874745369, "learning_rate": 5.350200386852698e-05, "loss": 0.1699, "step": 12502 }, { "epoch": 2.5194438847471288, "grad_norm": 0.07631693035364151, "learning_rate": 5.348870995680969e-05, "loss": 0.2663, "step": 12504 }, { "epoch": 2.5198468668144267, "grad_norm": 0.05153597518801689, "learning_rate": 5.347541579726001e-05, "loss": 0.1706, "step": 12506 }, { "epoch": 2.5202498488817247, "grad_norm": 0.050337210297584534, "learning_rate": 5.346212139082236e-05, "loss": 0.1744, "step": 12508 }, { "epoch": 2.5206528309490226, "grad_norm": 0.051275044679641724, "learning_rate": 5.3448826738441135e-05, "loss": 0.1443, "step": 12510 }, { "epoch": 2.5210558130163205, "grad_norm": 0.0649779736995697, "learning_rate": 5.343553184106078e-05, "loss": 0.2232, "step": 12512 }, { "epoch": 2.521458795083619, "grad_norm": 0.0551101416349411, "learning_rate": 5.342223669962575e-05, "loss": 0.2095, "step": 12514 }, { "epoch": 2.521861777150917, "grad_norm": 0.06402526795864105, "learning_rate": 5.3408941315080476e-05, "loss": 0.1813, "step": 12516 }, { "epoch": 2.522264759218215, "grad_norm": 0.06047520786523819, "learning_rate": 5.3395645688369464e-05, "loss": 0.1938, "step": 12518 }, { "epoch": 2.5226677412855127, "grad_norm": 0.05238273739814758, "learning_rate": 5.338234982043723e-05, "loss": 0.2008, "step": 12520 }, { "epoch": 2.5230707233528107, "grad_norm": 0.05175113305449486, "learning_rate": 5.3369053712228265e-05, "loss": 0.1408, "step": 12522 }, { "epoch": 2.523473705420109, "grad_norm": 0.04698505997657776, "learning_rate": 5.335575736468711e-05, "loss": 0.1922, "step": 12524 }, { "epoch": 2.523876687487407, "grad_norm": 0.05037987604737282, "learning_rate": 5.334246077875833e-05, "loss": 0.1506, "step": 12526 }, { "epoch": 2.524279669554705, "grad_norm": 0.04577264562249184, "learning_rate": 5.332916395538646e-05, "loss": 0.1513, "step": 12528 }, { "epoch": 2.524682651622003, "grad_norm": 0.0762718990445137, "learning_rate": 5.331586689551612e-05, "loss": 0.1935, "step": 12530 }, { "epoch": 2.525085633689301, "grad_norm": 0.05379846692085266, "learning_rate": 5.33025696000919e-05, "loss": 0.2161, "step": 12532 }, { "epoch": 2.5254886157565988, "grad_norm": 0.05114980787038803, "learning_rate": 5.3289272070058415e-05, "loss": 0.1496, "step": 12534 }, { "epoch": 2.5258915978238967, "grad_norm": 0.04115746542811394, "learning_rate": 5.3275974306360296e-05, "loss": 0.1536, "step": 12536 }, { "epoch": 2.5262945798911947, "grad_norm": 0.055138975381851196, "learning_rate": 5.326267630994222e-05, "loss": 0.2359, "step": 12538 }, { "epoch": 2.5266975619584926, "grad_norm": 0.05190563201904297, "learning_rate": 5.3249378081748815e-05, "loss": 0.1924, "step": 12540 }, { "epoch": 2.527100544025791, "grad_norm": 0.055855296552181244, "learning_rate": 5.32360796227248e-05, "loss": 0.1893, "step": 12542 }, { "epoch": 2.527503526093089, "grad_norm": 0.0799691379070282, "learning_rate": 5.322278093381486e-05, "loss": 0.1675, "step": 12544 }, { "epoch": 2.527906508160387, "grad_norm": 0.05231550708413124, "learning_rate": 5.320948201596372e-05, "loss": 0.2105, "step": 12546 }, { "epoch": 2.528309490227685, "grad_norm": 0.06030051410198212, "learning_rate": 5.319618287011611e-05, "loss": 0.1889, "step": 12548 }, { "epoch": 2.5287124722949827, "grad_norm": 0.04509327560663223, "learning_rate": 5.3182883497216785e-05, "loss": 0.1286, "step": 12550 }, { "epoch": 2.529115454362281, "grad_norm": 0.04975868761539459, "learning_rate": 5.3169583898210495e-05, "loss": 0.1533, "step": 12552 }, { "epoch": 2.529518436429579, "grad_norm": 0.04566744342446327, "learning_rate": 5.315628407404203e-05, "loss": 0.1536, "step": 12554 }, { "epoch": 2.529921418496877, "grad_norm": 0.05292901396751404, "learning_rate": 5.314298402565621e-05, "loss": 0.2023, "step": 12556 }, { "epoch": 2.530324400564175, "grad_norm": 0.05594682693481445, "learning_rate": 5.312968375399782e-05, "loss": 0.1589, "step": 12558 }, { "epoch": 2.530727382631473, "grad_norm": 0.051560178399086, "learning_rate": 5.311638326001172e-05, "loss": 0.1737, "step": 12560 }, { "epoch": 2.531130364698771, "grad_norm": 0.050082527101039886, "learning_rate": 5.31030825446427e-05, "loss": 0.216, "step": 12562 }, { "epoch": 2.531533346766069, "grad_norm": 0.0678807944059372, "learning_rate": 5.3089781608835684e-05, "loss": 0.2065, "step": 12564 }, { "epoch": 2.5319363288333667, "grad_norm": 0.04848482087254524, "learning_rate": 5.307648045353553e-05, "loss": 0.1619, "step": 12566 }, { "epoch": 2.5323393109006647, "grad_norm": 0.06144823879003525, "learning_rate": 5.306317907968711e-05, "loss": 0.179, "step": 12568 }, { "epoch": 2.532742292967963, "grad_norm": 0.07615198940038681, "learning_rate": 5.3049877488235346e-05, "loss": 0.202, "step": 12570 }, { "epoch": 2.533145275035261, "grad_norm": 0.058797042816877365, "learning_rate": 5.303657568012518e-05, "loss": 0.1801, "step": 12572 }, { "epoch": 2.533548257102559, "grad_norm": 0.0644260123372078, "learning_rate": 5.302327365630151e-05, "loss": 0.1665, "step": 12574 }, { "epoch": 2.533951239169857, "grad_norm": 0.055937882512807846, "learning_rate": 5.300997141770933e-05, "loss": 0.1723, "step": 12576 }, { "epoch": 2.534354221237155, "grad_norm": 0.061812832951545715, "learning_rate": 5.299666896529359e-05, "loss": 0.2301, "step": 12578 }, { "epoch": 2.534757203304453, "grad_norm": 0.07181499153375626, "learning_rate": 5.298336629999928e-05, "loss": 0.1966, "step": 12580 }, { "epoch": 2.535160185371751, "grad_norm": 0.05782296136021614, "learning_rate": 5.29700634227714e-05, "loss": 0.1752, "step": 12582 }, { "epoch": 2.535563167439049, "grad_norm": 0.06424509733915329, "learning_rate": 5.2956760334554966e-05, "loss": 0.1884, "step": 12584 }, { "epoch": 2.535966149506347, "grad_norm": 0.0681728720664978, "learning_rate": 5.2943457036295e-05, "loss": 0.1922, "step": 12586 }, { "epoch": 2.536369131573645, "grad_norm": 0.06878721714019775, "learning_rate": 5.2930153528936556e-05, "loss": 0.2045, "step": 12588 }, { "epoch": 2.536772113640943, "grad_norm": 0.06963707506656647, "learning_rate": 5.2916849813424694e-05, "loss": 0.192, "step": 12590 }, { "epoch": 2.537175095708241, "grad_norm": 0.05961614102125168, "learning_rate": 5.2903545890704484e-05, "loss": 0.1639, "step": 12592 }, { "epoch": 2.537578077775539, "grad_norm": 0.06052708998322487, "learning_rate": 5.289024176172102e-05, "loss": 0.2408, "step": 12594 }, { "epoch": 2.5379810598428367, "grad_norm": 0.05931401625275612, "learning_rate": 5.28769374274194e-05, "loss": 0.2079, "step": 12596 }, { "epoch": 2.538384041910135, "grad_norm": 0.06881610304117203, "learning_rate": 5.2863632888744753e-05, "loss": 0.1564, "step": 12598 }, { "epoch": 2.538787023977433, "grad_norm": 0.09120440483093262, "learning_rate": 5.2850328146642194e-05, "loss": 0.2046, "step": 12600 }, { "epoch": 2.539190006044731, "grad_norm": 0.06185607239603996, "learning_rate": 5.283702320205689e-05, "loss": 0.1858, "step": 12602 }, { "epoch": 2.539592988112029, "grad_norm": 0.06449907273054123, "learning_rate": 5.282371805593399e-05, "loss": 0.1725, "step": 12604 }, { "epoch": 2.539995970179327, "grad_norm": 0.07941526174545288, "learning_rate": 5.281041270921867e-05, "loss": 0.1761, "step": 12606 }, { "epoch": 2.5403989522466253, "grad_norm": 0.05380752682685852, "learning_rate": 5.2797107162856154e-05, "loss": 0.2393, "step": 12608 }, { "epoch": 2.540801934313923, "grad_norm": 0.06477542966604233, "learning_rate": 5.278380141779159e-05, "loss": 0.2467, "step": 12610 }, { "epoch": 2.541204916381221, "grad_norm": 0.07349900901317596, "learning_rate": 5.277049547497023e-05, "loss": 0.2821, "step": 12612 }, { "epoch": 2.541607898448519, "grad_norm": 0.05324852094054222, "learning_rate": 5.275718933533731e-05, "loss": 0.2045, "step": 12614 }, { "epoch": 2.542010880515817, "grad_norm": 0.052172720432281494, "learning_rate": 5.274388299983807e-05, "loss": 0.1792, "step": 12616 }, { "epoch": 2.542413862583115, "grad_norm": 0.03972654789686203, "learning_rate": 5.273057646941776e-05, "loss": 0.183, "step": 12618 }, { "epoch": 2.542816844650413, "grad_norm": 0.056424275040626526, "learning_rate": 5.271726974502167e-05, "loss": 0.2112, "step": 12620 }, { "epoch": 2.543219826717711, "grad_norm": 0.0493265837430954, "learning_rate": 5.270396282759508e-05, "loss": 0.1641, "step": 12622 }, { "epoch": 2.543622808785009, "grad_norm": 0.04471006616950035, "learning_rate": 5.269065571808329e-05, "loss": 0.1254, "step": 12624 }, { "epoch": 2.544025790852307, "grad_norm": 0.05025114119052887, "learning_rate": 5.2677348417431636e-05, "loss": 0.1767, "step": 12626 }, { "epoch": 2.544428772919605, "grad_norm": 0.07107166945934296, "learning_rate": 5.266404092658542e-05, "loss": 0.1732, "step": 12628 }, { "epoch": 2.544831754986903, "grad_norm": 0.07669036090373993, "learning_rate": 5.2650733246490014e-05, "loss": 0.1624, "step": 12630 }, { "epoch": 2.545234737054201, "grad_norm": 0.05052323639392853, "learning_rate": 5.263742537809074e-05, "loss": 0.1813, "step": 12632 }, { "epoch": 2.545637719121499, "grad_norm": 0.04952344670891762, "learning_rate": 5.262411732233299e-05, "loss": 0.1793, "step": 12634 }, { "epoch": 2.5460407011887973, "grad_norm": 0.06734666973352432, "learning_rate": 5.261080908016215e-05, "loss": 0.2026, "step": 12636 }, { "epoch": 2.5464436832560953, "grad_norm": 0.056185994297266006, "learning_rate": 5.2597500652523594e-05, "loss": 0.1895, "step": 12638 }, { "epoch": 2.546846665323393, "grad_norm": 0.04555206000804901, "learning_rate": 5.258419204036275e-05, "loss": 0.1778, "step": 12640 }, { "epoch": 2.547249647390691, "grad_norm": 0.0584423765540123, "learning_rate": 5.257088324462505e-05, "loss": 0.1854, "step": 12642 }, { "epoch": 2.547652629457989, "grad_norm": 0.05547767132520676, "learning_rate": 5.255757426625589e-05, "loss": 0.2358, "step": 12644 }, { "epoch": 2.548055611525287, "grad_norm": 0.0526818111538887, "learning_rate": 5.254426510620076e-05, "loss": 0.1936, "step": 12646 }, { "epoch": 2.548458593592585, "grad_norm": 0.059681832790374756, "learning_rate": 5.253095576540511e-05, "loss": 0.204, "step": 12648 }, { "epoch": 2.548861575659883, "grad_norm": 0.11047457903623581, "learning_rate": 5.25176462448144e-05, "loss": 0.1929, "step": 12650 }, { "epoch": 2.5492645577271813, "grad_norm": 0.047334007918834686, "learning_rate": 5.250433654537413e-05, "loss": 0.1852, "step": 12652 }, { "epoch": 2.5496675397944792, "grad_norm": 0.061922837048769, "learning_rate": 5.249102666802981e-05, "loss": 0.1463, "step": 12654 }, { "epoch": 2.550070521861777, "grad_norm": 0.07106788456439972, "learning_rate": 5.247771661372692e-05, "loss": 0.1574, "step": 12656 }, { "epoch": 2.550473503929075, "grad_norm": 0.041897907853126526, "learning_rate": 5.2464406383411004e-05, "loss": 0.1597, "step": 12658 }, { "epoch": 2.550876485996373, "grad_norm": 0.036984678357839584, "learning_rate": 5.245109597802762e-05, "loss": 0.167, "step": 12660 }, { "epoch": 2.551279468063671, "grad_norm": 0.049853935837745667, "learning_rate": 5.243778539852228e-05, "loss": 0.2422, "step": 12662 }, { "epoch": 2.5516824501309694, "grad_norm": 0.054746098816394806, "learning_rate": 5.2424474645840574e-05, "loss": 0.1764, "step": 12664 }, { "epoch": 2.5520854321982673, "grad_norm": 0.06157633662223816, "learning_rate": 5.241116372092806e-05, "loss": 0.1968, "step": 12666 }, { "epoch": 2.5524884142655653, "grad_norm": 0.06506279110908508, "learning_rate": 5.2397852624730327e-05, "loss": 0.1789, "step": 12668 }, { "epoch": 2.552891396332863, "grad_norm": 0.03678274154663086, "learning_rate": 5.2384541358192986e-05, "loss": 0.134, "step": 12670 }, { "epoch": 2.553294378400161, "grad_norm": 0.06827437877655029, "learning_rate": 5.237122992226165e-05, "loss": 0.2136, "step": 12672 }, { "epoch": 2.553697360467459, "grad_norm": 0.058229926973581314, "learning_rate": 5.2357918317881915e-05, "loss": 0.2051, "step": 12674 }, { "epoch": 2.554100342534757, "grad_norm": 0.06786002963781357, "learning_rate": 5.2344606545999433e-05, "loss": 0.1732, "step": 12676 }, { "epoch": 2.554503324602055, "grad_norm": 0.06592409312725067, "learning_rate": 5.233129460755987e-05, "loss": 0.183, "step": 12678 }, { "epoch": 2.5549063066693534, "grad_norm": 0.04685007780790329, "learning_rate": 5.2317982503508856e-05, "loss": 0.1344, "step": 12680 }, { "epoch": 2.5553092887366513, "grad_norm": 0.09164592623710632, "learning_rate": 5.230467023479206e-05, "loss": 0.1841, "step": 12682 }, { "epoch": 2.5557122708039492, "grad_norm": 0.07550480961799622, "learning_rate": 5.22913578023552e-05, "loss": 0.2059, "step": 12684 }, { "epoch": 2.556115252871247, "grad_norm": 0.054588042199611664, "learning_rate": 5.227804520714392e-05, "loss": 0.1812, "step": 12686 }, { "epoch": 2.556518234938545, "grad_norm": 0.2929278314113617, "learning_rate": 5.226473245010397e-05, "loss": 0.2214, "step": 12688 }, { "epoch": 2.5569212170058435, "grad_norm": 0.04900304600596428, "learning_rate": 5.2251419532181054e-05, "loss": 0.1693, "step": 12690 }, { "epoch": 2.5573241990731415, "grad_norm": 0.06676102429628372, "learning_rate": 5.223810645432088e-05, "loss": 0.1692, "step": 12692 }, { "epoch": 2.5577271811404394, "grad_norm": 0.06755790114402771, "learning_rate": 5.2224793217469213e-05, "loss": 0.1997, "step": 12694 }, { "epoch": 2.5581301632077373, "grad_norm": 0.4008621871471405, "learning_rate": 5.221147982257178e-05, "loss": 0.2239, "step": 12696 }, { "epoch": 2.5585331452750353, "grad_norm": 0.05779675021767616, "learning_rate": 5.2198166270574366e-05, "loss": 0.2308, "step": 12698 }, { "epoch": 2.5589361273423332, "grad_norm": 0.060293830931186676, "learning_rate": 5.2184852562422746e-05, "loss": 0.2085, "step": 12700 }, { "epoch": 2.559339109409631, "grad_norm": 0.05748559907078743, "learning_rate": 5.2171538699062686e-05, "loss": 0.2141, "step": 12702 }, { "epoch": 2.559742091476929, "grad_norm": 0.04755993187427521, "learning_rate": 5.215822468143998e-05, "loss": 0.151, "step": 12704 }, { "epoch": 2.560145073544227, "grad_norm": 0.06931561976671219, "learning_rate": 5.214491051050045e-05, "loss": 0.2315, "step": 12706 }, { "epoch": 2.5605480556115254, "grad_norm": 0.07574167847633362, "learning_rate": 5.2131596187189914e-05, "loss": 0.2175, "step": 12708 }, { "epoch": 2.5609510376788234, "grad_norm": 0.057953450828790665, "learning_rate": 5.2118281712454184e-05, "loss": 0.2294, "step": 12710 }, { "epoch": 2.5613540197461213, "grad_norm": 0.05987294390797615, "learning_rate": 5.210496708723912e-05, "loss": 0.169, "step": 12712 }, { "epoch": 2.5617570018134193, "grad_norm": 0.05373973026871681, "learning_rate": 5.2091652312490556e-05, "loss": 0.1728, "step": 12714 }, { "epoch": 2.562159983880717, "grad_norm": 0.057644158601760864, "learning_rate": 5.207833738915435e-05, "loss": 0.2014, "step": 12716 }, { "epoch": 2.5625629659480156, "grad_norm": 0.06298521906137466, "learning_rate": 5.206502231817639e-05, "loss": 0.1506, "step": 12718 }, { "epoch": 2.5629659480153135, "grad_norm": 0.06016525253653526, "learning_rate": 5.2051707100502534e-05, "loss": 0.2087, "step": 12720 }, { "epoch": 2.5633689300826115, "grad_norm": 0.08092235773801804, "learning_rate": 5.2038391737078694e-05, "loss": 0.2032, "step": 12722 }, { "epoch": 2.5637719121499094, "grad_norm": 0.053698521107435226, "learning_rate": 5.202507622885078e-05, "loss": 0.178, "step": 12724 }, { "epoch": 2.5641748942172073, "grad_norm": 0.06155625730752945, "learning_rate": 5.201176057676467e-05, "loss": 0.189, "step": 12726 }, { "epoch": 2.5645778762845053, "grad_norm": 0.05709967389702797, "learning_rate": 5.199844478176631e-05, "loss": 0.2007, "step": 12728 }, { "epoch": 2.5649808583518032, "grad_norm": 0.05761351063847542, "learning_rate": 5.1985128844801633e-05, "loss": 0.1464, "step": 12730 }, { "epoch": 2.565383840419101, "grad_norm": 0.05454608052968979, "learning_rate": 5.197181276681657e-05, "loss": 0.2254, "step": 12732 }, { "epoch": 2.565786822486399, "grad_norm": 0.23275476694107056, "learning_rate": 5.195849654875709e-05, "loss": 0.2148, "step": 12734 }, { "epoch": 2.5661898045536975, "grad_norm": 0.09468139708042145, "learning_rate": 5.194518019156914e-05, "loss": 0.1706, "step": 12736 }, { "epoch": 2.5665927866209954, "grad_norm": 0.05750922113656998, "learning_rate": 5.19318636961987e-05, "loss": 0.1874, "step": 12738 }, { "epoch": 2.5669957686882934, "grad_norm": 0.05291645601391792, "learning_rate": 5.191854706359175e-05, "loss": 0.2202, "step": 12740 }, { "epoch": 2.5673987507555913, "grad_norm": 0.04654770344495773, "learning_rate": 5.190523029469431e-05, "loss": 0.2129, "step": 12742 }, { "epoch": 2.5678017328228893, "grad_norm": 0.0438658781349659, "learning_rate": 5.189191339045233e-05, "loss": 0.1831, "step": 12744 }, { "epoch": 2.5682047148901876, "grad_norm": 0.056785158812999725, "learning_rate": 5.1878596351811845e-05, "loss": 0.2295, "step": 12746 }, { "epoch": 2.5686076969574856, "grad_norm": 0.03449772298336029, "learning_rate": 5.1865279179718906e-05, "loss": 0.1447, "step": 12748 }, { "epoch": 2.5690106790247835, "grad_norm": 0.06026627495884895, "learning_rate": 5.1851961875119493e-05, "loss": 0.1828, "step": 12750 }, { "epoch": 2.5694136610920815, "grad_norm": 0.07659012079238892, "learning_rate": 5.183864443895967e-05, "loss": 0.2014, "step": 12752 }, { "epoch": 2.5698166431593794, "grad_norm": 0.05824153125286102, "learning_rate": 5.182532687218551e-05, "loss": 0.2268, "step": 12754 }, { "epoch": 2.5702196252266774, "grad_norm": 0.05987565591931343, "learning_rate": 5.181200917574303e-05, "loss": 0.207, "step": 12756 }, { "epoch": 2.5706226072939753, "grad_norm": 0.06632793694734573, "learning_rate": 5.179869135057831e-05, "loss": 0.151, "step": 12758 }, { "epoch": 2.5710255893612732, "grad_norm": 0.05087270215153694, "learning_rate": 5.178537339763745e-05, "loss": 0.2149, "step": 12760 }, { "epoch": 2.571428571428571, "grad_norm": 0.07185879349708557, "learning_rate": 5.177205531786651e-05, "loss": 0.2331, "step": 12762 }, { "epoch": 2.5718315534958696, "grad_norm": 0.060018185526132584, "learning_rate": 5.175873711221161e-05, "loss": 0.1866, "step": 12764 }, { "epoch": 2.5722345355631675, "grad_norm": 0.08213285356760025, "learning_rate": 5.174541878161881e-05, "loss": 0.2149, "step": 12766 }, { "epoch": 2.5726375176304654, "grad_norm": 0.04798312857747078, "learning_rate": 5.173210032703427e-05, "loss": 0.1771, "step": 12768 }, { "epoch": 2.5730404996977634, "grad_norm": 0.07627954334020615, "learning_rate": 5.171878174940409e-05, "loss": 0.2149, "step": 12770 }, { "epoch": 2.5734434817650613, "grad_norm": 0.07262519747018814, "learning_rate": 5.1705463049674397e-05, "loss": 0.2236, "step": 12772 }, { "epoch": 2.5738464638323597, "grad_norm": 0.05804283544421196, "learning_rate": 5.169214422879134e-05, "loss": 0.2591, "step": 12774 }, { "epoch": 2.5742494458996577, "grad_norm": 0.05936234071850777, "learning_rate": 5.167882528770107e-05, "loss": 0.1525, "step": 12776 }, { "epoch": 2.5746524279669556, "grad_norm": 0.07271191477775574, "learning_rate": 5.1665506227349726e-05, "loss": 0.192, "step": 12778 }, { "epoch": 2.5750554100342535, "grad_norm": 0.07205134630203247, "learning_rate": 5.165218704868349e-05, "loss": 0.182, "step": 12780 }, { "epoch": 2.5754583921015515, "grad_norm": 0.055812589824199677, "learning_rate": 5.1638867752648534e-05, "loss": 0.2042, "step": 12782 }, { "epoch": 2.5758613741688494, "grad_norm": 0.05865110456943512, "learning_rate": 5.1625548340191024e-05, "loss": 0.1843, "step": 12784 }, { "epoch": 2.5762643562361474, "grad_norm": 0.04999213665723801, "learning_rate": 5.161222881225716e-05, "loss": 0.1712, "step": 12786 }, { "epoch": 2.5766673383034453, "grad_norm": 0.05770622566342354, "learning_rate": 5.1598909169793144e-05, "loss": 0.2321, "step": 12788 }, { "epoch": 2.5770703203707432, "grad_norm": 0.04536011442542076, "learning_rate": 5.1585589413745176e-05, "loss": 0.1553, "step": 12790 }, { "epoch": 2.5774733024380416, "grad_norm": 0.07230181246995926, "learning_rate": 5.157226954505946e-05, "loss": 0.2216, "step": 12792 }, { "epoch": 2.5778762845053396, "grad_norm": 0.06430327147245407, "learning_rate": 5.1558949564682245e-05, "loss": 0.2242, "step": 12794 }, { "epoch": 2.5782792665726375, "grad_norm": 0.05737382546067238, "learning_rate": 5.1545629473559745e-05, "loss": 0.1762, "step": 12796 }, { "epoch": 2.5786822486399354, "grad_norm": 0.04715484380722046, "learning_rate": 5.1532309272638194e-05, "loss": 0.1554, "step": 12798 }, { "epoch": 2.5790852307072334, "grad_norm": 0.05664588510990143, "learning_rate": 5.151898896286385e-05, "loss": 0.1658, "step": 12800 }, { "epoch": 2.5794882127745318, "grad_norm": 0.05977484956383705, "learning_rate": 5.150566854518294e-05, "loss": 0.1833, "step": 12802 }, { "epoch": 2.5798911948418297, "grad_norm": 0.04098358750343323, "learning_rate": 5.149234802054176e-05, "loss": 0.2023, "step": 12804 }, { "epoch": 2.5802941769091277, "grad_norm": 0.04994696006178856, "learning_rate": 5.147902738988657e-05, "loss": 0.1946, "step": 12806 }, { "epoch": 2.5806971589764256, "grad_norm": 0.07139338552951813, "learning_rate": 5.146570665416363e-05, "loss": 0.2112, "step": 12808 }, { "epoch": 2.5811001410437235, "grad_norm": 0.058642320334911346, "learning_rate": 5.145238581431923e-05, "loss": 0.21, "step": 12810 }, { "epoch": 2.5815031231110215, "grad_norm": 0.06522581726312637, "learning_rate": 5.143906487129967e-05, "loss": 0.1602, "step": 12812 }, { "epoch": 2.5819061051783194, "grad_norm": 0.05621035769581795, "learning_rate": 5.1425743826051245e-05, "loss": 0.2493, "step": 12814 }, { "epoch": 2.5823090872456174, "grad_norm": 0.04596579074859619, "learning_rate": 5.1412422679520245e-05, "loss": 0.205, "step": 12816 }, { "epoch": 2.5827120693129153, "grad_norm": 0.04868212342262268, "learning_rate": 5.139910143265302e-05, "loss": 0.2341, "step": 12818 }, { "epoch": 2.5831150513802137, "grad_norm": 0.046460337936878204, "learning_rate": 5.1385780086395853e-05, "loss": 0.1865, "step": 12820 }, { "epoch": 2.5835180334475116, "grad_norm": 0.04652968794107437, "learning_rate": 5.137245864169507e-05, "loss": 0.1769, "step": 12822 }, { "epoch": 2.5839210155148096, "grad_norm": 0.05539759248495102, "learning_rate": 5.135913709949706e-05, "loss": 0.1864, "step": 12824 }, { "epoch": 2.5843239975821075, "grad_norm": 0.07780510932207108, "learning_rate": 5.134581546074809e-05, "loss": 0.1687, "step": 12826 }, { "epoch": 2.5847269796494055, "grad_norm": 0.052189409732818604, "learning_rate": 5.133249372639455e-05, "loss": 0.2018, "step": 12828 }, { "epoch": 2.585129961716704, "grad_norm": 0.0710270032286644, "learning_rate": 5.131917189738279e-05, "loss": 0.2054, "step": 12830 }, { "epoch": 2.585532943784002, "grad_norm": 0.04049243405461311, "learning_rate": 5.130584997465917e-05, "loss": 0.2103, "step": 12832 }, { "epoch": 2.5859359258512997, "grad_norm": 0.08410030603408813, "learning_rate": 5.129252795917006e-05, "loss": 0.1936, "step": 12834 }, { "epoch": 2.5863389079185977, "grad_norm": 0.07183802127838135, "learning_rate": 5.127920585186181e-05, "loss": 0.2514, "step": 12836 }, { "epoch": 2.5867418899858956, "grad_norm": 0.046077702194452286, "learning_rate": 5.1265883653680825e-05, "loss": 0.1637, "step": 12838 }, { "epoch": 2.5871448720531935, "grad_norm": 0.06531906127929688, "learning_rate": 5.1252561365573516e-05, "loss": 0.2507, "step": 12840 }, { "epoch": 2.5875478541204915, "grad_norm": 0.03278656303882599, "learning_rate": 5.123923898848623e-05, "loss": 0.1597, "step": 12842 }, { "epoch": 2.5879508361877894, "grad_norm": 0.06488583981990814, "learning_rate": 5.122591652336539e-05, "loss": 0.1736, "step": 12844 }, { "epoch": 2.588353818255088, "grad_norm": 0.04900403320789337, "learning_rate": 5.1212593971157405e-05, "loss": 0.244, "step": 12846 }, { "epoch": 2.5887568003223858, "grad_norm": 0.04153195396065712, "learning_rate": 5.119927133280869e-05, "loss": 0.1476, "step": 12848 }, { "epoch": 2.5891597823896837, "grad_norm": 0.057202406227588654, "learning_rate": 5.118594860926564e-05, "loss": 0.1796, "step": 12850 }, { "epoch": 2.5895627644569816, "grad_norm": 0.06250036507844925, "learning_rate": 5.117262580147472e-05, "loss": 0.1753, "step": 12852 }, { "epoch": 2.5899657465242796, "grad_norm": 0.057463813573122025, "learning_rate": 5.115930291038232e-05, "loss": 0.186, "step": 12854 }, { "epoch": 2.5903687285915775, "grad_norm": 0.049552544951438904, "learning_rate": 5.114597993693491e-05, "loss": 0.1732, "step": 12856 }, { "epoch": 2.590771710658876, "grad_norm": 0.06389881670475006, "learning_rate": 5.11326568820789e-05, "loss": 0.1787, "step": 12858 }, { "epoch": 2.591174692726174, "grad_norm": 0.0628291442990303, "learning_rate": 5.111933374676077e-05, "loss": 0.2285, "step": 12860 }, { "epoch": 2.591577674793472, "grad_norm": 0.0501752570271492, "learning_rate": 5.110601053192696e-05, "loss": 0.1721, "step": 12862 }, { "epoch": 2.5919806568607697, "grad_norm": 0.05748743563890457, "learning_rate": 5.1092687238523926e-05, "loss": 0.2718, "step": 12864 }, { "epoch": 2.5923836389280677, "grad_norm": 0.06448312848806381, "learning_rate": 5.1079363867498134e-05, "loss": 0.2096, "step": 12866 }, { "epoch": 2.5927866209953656, "grad_norm": 0.04302519932389259, "learning_rate": 5.1066040419796066e-05, "loss": 0.2141, "step": 12868 }, { "epoch": 2.5931896030626636, "grad_norm": 0.03750751540064812, "learning_rate": 5.10527168963642e-05, "loss": 0.1829, "step": 12870 }, { "epoch": 2.5935925851299615, "grad_norm": 0.04912559688091278, "learning_rate": 5.103939329814898e-05, "loss": 0.1977, "step": 12872 }, { "epoch": 2.59399556719726, "grad_norm": 0.05096503719687462, "learning_rate": 5.1026069626096964e-05, "loss": 0.1397, "step": 12874 }, { "epoch": 2.594398549264558, "grad_norm": 0.059429194778203964, "learning_rate": 5.101274588115457e-05, "loss": 0.1858, "step": 12876 }, { "epoch": 2.5948015313318558, "grad_norm": 0.05182720348238945, "learning_rate": 5.099942206426833e-05, "loss": 0.2315, "step": 12878 }, { "epoch": 2.5952045133991537, "grad_norm": 0.06778381764888763, "learning_rate": 5.098609817638477e-05, "loss": 0.1854, "step": 12880 }, { "epoch": 2.5956074954664516, "grad_norm": 0.04259805753827095, "learning_rate": 5.097277421845035e-05, "loss": 0.1813, "step": 12882 }, { "epoch": 2.5960104775337496, "grad_norm": 0.06854765862226486, "learning_rate": 5.0959450191411606e-05, "loss": 0.2141, "step": 12884 }, { "epoch": 2.596413459601048, "grad_norm": 0.05548940226435661, "learning_rate": 5.094612609621506e-05, "loss": 0.1481, "step": 12886 }, { "epoch": 2.596816441668346, "grad_norm": 0.06769641488790512, "learning_rate": 5.093280193380723e-05, "loss": 0.1951, "step": 12888 }, { "epoch": 2.597219423735644, "grad_norm": 0.05677637830376625, "learning_rate": 5.0919477705134644e-05, "loss": 0.1935, "step": 12890 }, { "epoch": 2.597622405802942, "grad_norm": 0.0598987378180027, "learning_rate": 5.090615341114383e-05, "loss": 0.2259, "step": 12892 }, { "epoch": 2.5980253878702397, "grad_norm": 0.05017285421490669, "learning_rate": 5.0892829052781334e-05, "loss": 0.2007, "step": 12894 }, { "epoch": 2.5984283699375377, "grad_norm": 0.04577360674738884, "learning_rate": 5.087950463099367e-05, "loss": 0.2345, "step": 12896 }, { "epoch": 2.5988313520048356, "grad_norm": 0.04841199517250061, "learning_rate": 5.086618014672743e-05, "loss": 0.2259, "step": 12898 }, { "epoch": 2.5992343340721336, "grad_norm": 0.061152152717113495, "learning_rate": 5.0852855600929116e-05, "loss": 0.2082, "step": 12900 }, { "epoch": 2.599637316139432, "grad_norm": 0.052234455943107605, "learning_rate": 5.0839530994545316e-05, "loss": 0.1641, "step": 12902 }, { "epoch": 2.60004029820673, "grad_norm": 0.07535497099161148, "learning_rate": 5.082620632852258e-05, "loss": 0.2003, "step": 12904 }, { "epoch": 2.600443280274028, "grad_norm": 0.05371668562293053, "learning_rate": 5.081288160380745e-05, "loss": 0.1998, "step": 12906 }, { "epoch": 2.6008462623413258, "grad_norm": 0.04646865651011467, "learning_rate": 5.079955682134652e-05, "loss": 0.1759, "step": 12908 }, { "epoch": 2.6012492444086237, "grad_norm": 0.05468188226222992, "learning_rate": 5.0786231982086364e-05, "loss": 0.147, "step": 12910 }, { "epoch": 2.601652226475922, "grad_norm": 0.06912878155708313, "learning_rate": 5.077290708697353e-05, "loss": 0.197, "step": 12912 }, { "epoch": 2.60205520854322, "grad_norm": 0.05467282235622406, "learning_rate": 5.075958213695461e-05, "loss": 0.1566, "step": 12914 }, { "epoch": 2.602458190610518, "grad_norm": 0.042261648923158646, "learning_rate": 5.0746257132976205e-05, "loss": 0.1476, "step": 12916 }, { "epoch": 2.602861172677816, "grad_norm": 0.05570102855563164, "learning_rate": 5.073293207598487e-05, "loss": 0.1765, "step": 12918 }, { "epoch": 2.603264154745114, "grad_norm": 0.05489011108875275, "learning_rate": 5.0719606966927226e-05, "loss": 0.171, "step": 12920 }, { "epoch": 2.603667136812412, "grad_norm": 0.052448570728302, "learning_rate": 5.070628180674986e-05, "loss": 0.1551, "step": 12922 }, { "epoch": 2.6040701188797097, "grad_norm": 0.24931637942790985, "learning_rate": 5.0692956596399344e-05, "loss": 0.2118, "step": 12924 }, { "epoch": 2.6044731009470077, "grad_norm": 0.052810054272413254, "learning_rate": 5.067963133682232e-05, "loss": 0.2092, "step": 12926 }, { "epoch": 2.6048760830143056, "grad_norm": 0.051145680248737335, "learning_rate": 5.066630602896536e-05, "loss": 0.2083, "step": 12928 }, { "epoch": 2.605279065081604, "grad_norm": 0.08187511563301086, "learning_rate": 5.0652980673775085e-05, "loss": 0.1843, "step": 12930 }, { "epoch": 2.605682047148902, "grad_norm": 0.08010423928499222, "learning_rate": 5.0639655272198116e-05, "loss": 0.1689, "step": 12932 }, { "epoch": 2.6060850292162, "grad_norm": 0.05526207014918327, "learning_rate": 5.062632982518105e-05, "loss": 0.17, "step": 12934 }, { "epoch": 2.606488011283498, "grad_norm": 0.05141100659966469, "learning_rate": 5.061300433367051e-05, "loss": 0.2417, "step": 12936 }, { "epoch": 2.6068909933507958, "grad_norm": 0.05525074899196625, "learning_rate": 5.059967879861314e-05, "loss": 0.2469, "step": 12938 }, { "epoch": 2.607293975418094, "grad_norm": 0.051896847784519196, "learning_rate": 5.058635322095553e-05, "loss": 0.1749, "step": 12940 }, { "epoch": 2.607696957485392, "grad_norm": 0.04624801501631737, "learning_rate": 5.057302760164433e-05, "loss": 0.2052, "step": 12942 }, { "epoch": 2.60809993955269, "grad_norm": 0.08334264159202576, "learning_rate": 5.055970194162618e-05, "loss": 0.244, "step": 12944 }, { "epoch": 2.608502921619988, "grad_norm": 0.0596526637673378, "learning_rate": 5.054637624184768e-05, "loss": 0.1921, "step": 12946 }, { "epoch": 2.608905903687286, "grad_norm": 0.07964113354682922, "learning_rate": 5.053305050325549e-05, "loss": 0.2158, "step": 12948 }, { "epoch": 2.609308885754584, "grad_norm": 0.06931203603744507, "learning_rate": 5.051972472679626e-05, "loss": 0.1674, "step": 12950 }, { "epoch": 2.609711867821882, "grad_norm": 0.06616196036338806, "learning_rate": 5.0506398913416596e-05, "loss": 0.2039, "step": 12952 }, { "epoch": 2.6101148498891797, "grad_norm": 0.04973366856575012, "learning_rate": 5.049307306406317e-05, "loss": 0.2103, "step": 12954 }, { "epoch": 2.6105178319564777, "grad_norm": 0.07316756248474121, "learning_rate": 5.047974717968262e-05, "loss": 0.2184, "step": 12956 }, { "epoch": 2.610920814023776, "grad_norm": 0.06390600651502609, "learning_rate": 5.0466421261221606e-05, "loss": 0.2159, "step": 12958 }, { "epoch": 2.611323796091074, "grad_norm": 0.07867567986249924, "learning_rate": 5.045309530962676e-05, "loss": 0.2068, "step": 12960 }, { "epoch": 2.611726778158372, "grad_norm": 0.06732230633497238, "learning_rate": 5.0439769325844765e-05, "loss": 0.1552, "step": 12962 }, { "epoch": 2.61212976022567, "grad_norm": 0.05458653345704079, "learning_rate": 5.042644331082225e-05, "loss": 0.2168, "step": 12964 }, { "epoch": 2.612532742292968, "grad_norm": 0.05227232724428177, "learning_rate": 5.041311726550587e-05, "loss": 0.1737, "step": 12966 }, { "epoch": 2.612935724360266, "grad_norm": 0.041917361319065094, "learning_rate": 5.0399791190842324e-05, "loss": 0.1745, "step": 12968 }, { "epoch": 2.613338706427564, "grad_norm": 0.048856884241104126, "learning_rate": 5.0386465087778235e-05, "loss": 0.1913, "step": 12970 }, { "epoch": 2.613741688494862, "grad_norm": 0.060772497206926346, "learning_rate": 5.037313895726029e-05, "loss": 0.1481, "step": 12972 }, { "epoch": 2.61414467056216, "grad_norm": 0.06059327349066734, "learning_rate": 5.035981280023516e-05, "loss": 0.1709, "step": 12974 }, { "epoch": 2.614547652629458, "grad_norm": 0.08554743230342865, "learning_rate": 5.034648661764949e-05, "loss": 0.212, "step": 12976 }, { "epoch": 2.614950634696756, "grad_norm": 0.05296426638960838, "learning_rate": 5.0333160410449966e-05, "loss": 0.1944, "step": 12978 }, { "epoch": 2.615353616764054, "grad_norm": 0.05745255574584007, "learning_rate": 5.031983417958327e-05, "loss": 0.16, "step": 12980 }, { "epoch": 2.615756598831352, "grad_norm": 0.05538434907793999, "learning_rate": 5.030650792599605e-05, "loss": 0.2114, "step": 12982 }, { "epoch": 2.6161595808986498, "grad_norm": 0.04889480769634247, "learning_rate": 5.0293181650635e-05, "loss": 0.2085, "step": 12984 }, { "epoch": 2.616562562965948, "grad_norm": 0.04710153490304947, "learning_rate": 5.0279855354446815e-05, "loss": 0.2168, "step": 12986 }, { "epoch": 2.616965545033246, "grad_norm": 0.061837129294872284, "learning_rate": 5.026652903837813e-05, "loss": 0.1725, "step": 12988 }, { "epoch": 2.617368527100544, "grad_norm": 0.06950433552265167, "learning_rate": 5.025320270337566e-05, "loss": 0.2282, "step": 12990 }, { "epoch": 2.617771509167842, "grad_norm": 0.07739868760108948, "learning_rate": 5.0239876350386076e-05, "loss": 0.229, "step": 12992 }, { "epoch": 2.61817449123514, "grad_norm": 0.04732915386557579, "learning_rate": 5.022654998035604e-05, "loss": 0.144, "step": 12994 }, { "epoch": 2.6185774733024383, "grad_norm": 0.05778880417346954, "learning_rate": 5.021322359423228e-05, "loss": 0.2039, "step": 12996 }, { "epoch": 2.6189804553697362, "grad_norm": 0.05006580054759979, "learning_rate": 5.019989719296144e-05, "loss": 0.1509, "step": 12998 }, { "epoch": 2.619383437437034, "grad_norm": 0.04661744460463524, "learning_rate": 5.018657077749024e-05, "loss": 0.1915, "step": 13000 }, { "epoch": 2.619786419504332, "grad_norm": 0.07048065215349197, "learning_rate": 5.0173244348765345e-05, "loss": 0.1915, "step": 13002 }, { "epoch": 2.62018940157163, "grad_norm": 0.05740850046277046, "learning_rate": 5.0159917907733436e-05, "loss": 0.2505, "step": 13004 }, { "epoch": 2.620592383638928, "grad_norm": 0.06299738585948944, "learning_rate": 5.0146591455341217e-05, "loss": 0.2106, "step": 13006 }, { "epoch": 2.620995365706226, "grad_norm": 0.04687739536166191, "learning_rate": 5.013326499253539e-05, "loss": 0.1793, "step": 13008 }, { "epoch": 2.621398347773524, "grad_norm": 0.05674981698393822, "learning_rate": 5.0119938520262624e-05, "loss": 0.1942, "step": 13010 }, { "epoch": 2.621801329840822, "grad_norm": 0.04868143051862717, "learning_rate": 5.010661203946961e-05, "loss": 0.1878, "step": 13012 }, { "epoch": 2.62220431190812, "grad_norm": 0.037791360169649124, "learning_rate": 5.0093285551103064e-05, "loss": 0.1803, "step": 13014 }, { "epoch": 2.622607293975418, "grad_norm": 0.05020664632320404, "learning_rate": 5.007995905610964e-05, "loss": 0.2101, "step": 13016 }, { "epoch": 2.623010276042716, "grad_norm": 0.044175636023283005, "learning_rate": 5.006663255543607e-05, "loss": 0.1813, "step": 13018 }, { "epoch": 2.623413258110014, "grad_norm": 0.05268344283103943, "learning_rate": 5.0053306050029026e-05, "loss": 0.244, "step": 13020 }, { "epoch": 2.623816240177312, "grad_norm": 0.05456307902932167, "learning_rate": 5.003997954083519e-05, "loss": 0.142, "step": 13022 }, { "epoch": 2.6242192222446104, "grad_norm": 0.05344918742775917, "learning_rate": 5.002665302880129e-05, "loss": 0.1684, "step": 13024 }, { "epoch": 2.6246222043119083, "grad_norm": 0.07235932350158691, "learning_rate": 5.0013326514874e-05, "loss": 0.2067, "step": 13026 }, { "epoch": 2.6250251863792062, "grad_norm": 0.051819127053022385, "learning_rate": 5e-05, "loss": 0.1963, "step": 13028 }, { "epoch": 2.625428168446504, "grad_norm": 0.057188838720321655, "learning_rate": 4.998667348512601e-05, "loss": 0.1831, "step": 13030 }, { "epoch": 2.625831150513802, "grad_norm": 0.05618830397725105, "learning_rate": 4.9973346971198725e-05, "loss": 0.2096, "step": 13032 }, { "epoch": 2.6262341325811, "grad_norm": 0.060924481600522995, "learning_rate": 4.99600204591648e-05, "loss": 0.2214, "step": 13034 }, { "epoch": 2.626637114648398, "grad_norm": 0.05707908049225807, "learning_rate": 4.994669394997099e-05, "loss": 0.2045, "step": 13036 }, { "epoch": 2.627040096715696, "grad_norm": 0.057565122842788696, "learning_rate": 4.9933367444563936e-05, "loss": 0.2445, "step": 13038 }, { "epoch": 2.627443078782994, "grad_norm": 0.046373993158340454, "learning_rate": 4.9920040943890364e-05, "loss": 0.1656, "step": 13040 }, { "epoch": 2.6278460608502923, "grad_norm": 0.04796910285949707, "learning_rate": 4.9906714448896955e-05, "loss": 0.1453, "step": 13042 }, { "epoch": 2.62824904291759, "grad_norm": 0.048241354525089264, "learning_rate": 4.9893387960530406e-05, "loss": 0.1605, "step": 13044 }, { "epoch": 2.628652024984888, "grad_norm": 0.06200138479471207, "learning_rate": 4.9880061479737374e-05, "loss": 0.1776, "step": 13046 }, { "epoch": 2.629055007052186, "grad_norm": 0.058670494705438614, "learning_rate": 4.9866735007464614e-05, "loss": 0.2085, "step": 13048 }, { "epoch": 2.629457989119484, "grad_norm": 0.051949888467788696, "learning_rate": 4.985340854465878e-05, "loss": 0.2077, "step": 13050 }, { "epoch": 2.6298609711867824, "grad_norm": 0.054738037288188934, "learning_rate": 4.984008209226657e-05, "loss": 0.1633, "step": 13052 }, { "epoch": 2.6302639532540804, "grad_norm": 0.058610428124666214, "learning_rate": 4.982675565123467e-05, "loss": 0.2085, "step": 13054 }, { "epoch": 2.6306669353213783, "grad_norm": 0.0664537250995636, "learning_rate": 4.981342922250978e-05, "loss": 0.1794, "step": 13056 }, { "epoch": 2.6310699173886762, "grad_norm": 0.05168154463171959, "learning_rate": 4.980010280703855e-05, "loss": 0.1925, "step": 13058 }, { "epoch": 2.631472899455974, "grad_norm": 0.07177409529685974, "learning_rate": 4.978677640576773e-05, "loss": 0.2115, "step": 13060 }, { "epoch": 2.631875881523272, "grad_norm": 0.05283867195248604, "learning_rate": 4.977345001964395e-05, "loss": 0.1536, "step": 13062 }, { "epoch": 2.63227886359057, "grad_norm": 0.08195855468511581, "learning_rate": 4.9760123649613936e-05, "loss": 0.208, "step": 13064 }, { "epoch": 2.632681845657868, "grad_norm": 0.08787176758050919, "learning_rate": 4.9746797296624346e-05, "loss": 0.1907, "step": 13066 }, { "epoch": 2.6330848277251664, "grad_norm": 0.06067819148302078, "learning_rate": 4.973347096162188e-05, "loss": 0.2148, "step": 13068 }, { "epoch": 2.6334878097924643, "grad_norm": 0.04483300447463989, "learning_rate": 4.972014464555319e-05, "loss": 0.1782, "step": 13070 }, { "epoch": 2.6338907918597623, "grad_norm": 0.057843420654535294, "learning_rate": 4.9706818349365006e-05, "loss": 0.2338, "step": 13072 }, { "epoch": 2.63429377392706, "grad_norm": 0.05431623384356499, "learning_rate": 4.969349207400395e-05, "loss": 0.1761, "step": 13074 }, { "epoch": 2.634696755994358, "grad_norm": 0.045999687165021896, "learning_rate": 4.968016582041674e-05, "loss": 0.1846, "step": 13076 }, { "epoch": 2.635099738061656, "grad_norm": 0.05476114898920059, "learning_rate": 4.966683958955003e-05, "loss": 0.2219, "step": 13078 }, { "epoch": 2.6355027201289545, "grad_norm": 0.06299296766519547, "learning_rate": 4.965351338235053e-05, "loss": 0.1971, "step": 13080 }, { "epoch": 2.6359057021962524, "grad_norm": 0.06087028235197067, "learning_rate": 4.9640187199764844e-05, "loss": 0.1991, "step": 13082 }, { "epoch": 2.6363086842635504, "grad_norm": 0.06715063750743866, "learning_rate": 4.962686104273972e-05, "loss": 0.2434, "step": 13084 }, { "epoch": 2.6367116663308483, "grad_norm": 0.05608596280217171, "learning_rate": 4.9613534912221756e-05, "loss": 0.1879, "step": 13086 }, { "epoch": 2.6371146483981462, "grad_norm": 0.06677883863449097, "learning_rate": 4.960020880915769e-05, "loss": 0.1768, "step": 13088 }, { "epoch": 2.637517630465444, "grad_norm": 0.04541515186429024, "learning_rate": 4.9586882734494126e-05, "loss": 0.1903, "step": 13090 }, { "epoch": 2.637920612532742, "grad_norm": 0.06370842456817627, "learning_rate": 4.957355668917777e-05, "loss": 0.1449, "step": 13092 }, { "epoch": 2.63832359460004, "grad_norm": 0.04287172481417656, "learning_rate": 4.956023067415525e-05, "loss": 0.1806, "step": 13094 }, { "epoch": 2.6387265766673385, "grad_norm": 0.04955010116100311, "learning_rate": 4.954690469037325e-05, "loss": 0.2014, "step": 13096 }, { "epoch": 2.6391295587346364, "grad_norm": 0.047106534242630005, "learning_rate": 4.95335787387784e-05, "loss": 0.1747, "step": 13098 }, { "epoch": 2.6395325408019343, "grad_norm": 0.06430371850728989, "learning_rate": 4.952025282031739e-05, "loss": 0.1857, "step": 13100 }, { "epoch": 2.6399355228692323, "grad_norm": 0.0573105588555336, "learning_rate": 4.950692693593683e-05, "loss": 0.1549, "step": 13102 }, { "epoch": 2.64033850493653, "grad_norm": 0.048013072460889816, "learning_rate": 4.9493601086583416e-05, "loss": 0.1909, "step": 13104 }, { "epoch": 2.6407414870038286, "grad_norm": 0.05720860883593559, "learning_rate": 4.9480275273203755e-05, "loss": 0.2411, "step": 13106 }, { "epoch": 2.6411444690711265, "grad_norm": 0.04672304913401604, "learning_rate": 4.946694949674452e-05, "loss": 0.2303, "step": 13108 }, { "epoch": 2.6415474511384245, "grad_norm": 0.05690852925181389, "learning_rate": 4.9453623758152316e-05, "loss": 0.1842, "step": 13110 }, { "epoch": 2.6419504332057224, "grad_norm": 0.056028977036476135, "learning_rate": 4.9440298058373834e-05, "loss": 0.1571, "step": 13112 }, { "epoch": 2.6423534152730204, "grad_norm": 0.05338013917207718, "learning_rate": 4.942697239835567e-05, "loss": 0.1442, "step": 13114 }, { "epoch": 2.6427563973403183, "grad_norm": 0.06082042679190636, "learning_rate": 4.9413646779044475e-05, "loss": 0.1667, "step": 13116 }, { "epoch": 2.6431593794076162, "grad_norm": 0.06253007054328918, "learning_rate": 4.9400321201386873e-05, "loss": 0.2382, "step": 13118 }, { "epoch": 2.643562361474914, "grad_norm": 0.04853769391775131, "learning_rate": 4.93869956663295e-05, "loss": 0.1721, "step": 13120 }, { "epoch": 2.643965343542212, "grad_norm": 0.09063606709241867, "learning_rate": 4.9373670174818956e-05, "loss": 0.2299, "step": 13122 }, { "epoch": 2.6443683256095105, "grad_norm": 0.07562512904405594, "learning_rate": 4.93603447278019e-05, "loss": 0.2321, "step": 13124 }, { "epoch": 2.6447713076768085, "grad_norm": 0.05161966755986214, "learning_rate": 4.934701932622492e-05, "loss": 0.1779, "step": 13126 }, { "epoch": 2.6451742897441064, "grad_norm": 0.05404038354754448, "learning_rate": 4.933369397103465e-05, "loss": 0.2406, "step": 13128 }, { "epoch": 2.6455772718114043, "grad_norm": 0.0837070494890213, "learning_rate": 4.932036866317769e-05, "loss": 0.2105, "step": 13130 }, { "epoch": 2.6459802538787023, "grad_norm": 0.040656138211488724, "learning_rate": 4.930704340360066e-05, "loss": 0.1896, "step": 13132 }, { "epoch": 2.6463832359460007, "grad_norm": 0.05056383088231087, "learning_rate": 4.929371819325014e-05, "loss": 0.1915, "step": 13134 }, { "epoch": 2.6467862180132986, "grad_norm": 0.07564432919025421, "learning_rate": 4.9280393033072785e-05, "loss": 0.1927, "step": 13136 }, { "epoch": 2.6471892000805965, "grad_norm": 0.0784987360239029, "learning_rate": 4.926706792401512e-05, "loss": 0.2176, "step": 13138 }, { "epoch": 2.6475921821478945, "grad_norm": 0.05027944594621658, "learning_rate": 4.9253742867023806e-05, "loss": 0.2728, "step": 13140 }, { "epoch": 2.6479951642151924, "grad_norm": 0.045518044382333755, "learning_rate": 4.9240417863045384e-05, "loss": 0.1721, "step": 13142 }, { "epoch": 2.6483981462824904, "grad_norm": 0.049447476863861084, "learning_rate": 4.922709291302648e-05, "loss": 0.1765, "step": 13144 }, { "epoch": 2.6488011283497883, "grad_norm": 0.06533786654472351, "learning_rate": 4.9213768017913634e-05, "loss": 0.1861, "step": 13146 }, { "epoch": 2.6492041104170863, "grad_norm": 0.06059259548783302, "learning_rate": 4.920044317865349e-05, "loss": 0.242, "step": 13148 }, { "epoch": 2.649607092484384, "grad_norm": 0.05759195238351822, "learning_rate": 4.918711839619255e-05, "loss": 0.1757, "step": 13150 }, { "epoch": 2.6500100745516826, "grad_norm": 0.05047111213207245, "learning_rate": 4.9173793671477435e-05, "loss": 0.1754, "step": 13152 }, { "epoch": 2.6504130566189805, "grad_norm": 0.06632532179355621, "learning_rate": 4.916046900545469e-05, "loss": 0.2063, "step": 13154 }, { "epoch": 2.6508160386862785, "grad_norm": 0.052938882261514664, "learning_rate": 4.9147144399070896e-05, "loss": 0.2113, "step": 13156 }, { "epoch": 2.6512190207535764, "grad_norm": 0.04302411153912544, "learning_rate": 4.9133819853272584e-05, "loss": 0.1403, "step": 13158 }, { "epoch": 2.6516220028208743, "grad_norm": 0.05520898476243019, "learning_rate": 4.912049536900634e-05, "loss": 0.1963, "step": 13160 }, { "epoch": 2.6520249848881727, "grad_norm": 0.0664575919508934, "learning_rate": 4.910717094721867e-05, "loss": 0.215, "step": 13162 }, { "epoch": 2.6524279669554707, "grad_norm": 0.06888501346111298, "learning_rate": 4.909384658885617e-05, "loss": 0.2019, "step": 13164 }, { "epoch": 2.6528309490227686, "grad_norm": 0.04566109552979469, "learning_rate": 4.9080522294865354e-05, "loss": 0.1602, "step": 13166 }, { "epoch": 2.6532339310900666, "grad_norm": 0.05641119182109833, "learning_rate": 4.906719806619278e-05, "loss": 0.2274, "step": 13168 }, { "epoch": 2.6536369131573645, "grad_norm": 0.0688968226313591, "learning_rate": 4.905387390378494e-05, "loss": 0.1854, "step": 13170 }, { "epoch": 2.6540398952246624, "grad_norm": 0.05073093995451927, "learning_rate": 4.9040549808588405e-05, "loss": 0.2027, "step": 13172 }, { "epoch": 2.6544428772919604, "grad_norm": 0.0423697866499424, "learning_rate": 4.902722578154965e-05, "loss": 0.2197, "step": 13174 }, { "epoch": 2.6548458593592583, "grad_norm": 0.057178203016519547, "learning_rate": 4.901390182361524e-05, "loss": 0.1881, "step": 13176 }, { "epoch": 2.6552488414265563, "grad_norm": 0.04987271502614021, "learning_rate": 4.900057793573166e-05, "loss": 0.2196, "step": 13178 }, { "epoch": 2.6556518234938546, "grad_norm": 0.056287556886672974, "learning_rate": 4.8987254118845436e-05, "loss": 0.1917, "step": 13180 }, { "epoch": 2.6560548055611526, "grad_norm": 0.05609240382909775, "learning_rate": 4.8973930373903054e-05, "loss": 0.1993, "step": 13182 }, { "epoch": 2.6564577876284505, "grad_norm": 0.0506737045943737, "learning_rate": 4.896060670185102e-05, "loss": 0.2618, "step": 13184 }, { "epoch": 2.6568607696957485, "grad_norm": 0.04480299726128578, "learning_rate": 4.894728310363581e-05, "loss": 0.1861, "step": 13186 }, { "epoch": 2.6572637517630464, "grad_norm": 0.05772459879517555, "learning_rate": 4.893395958020394e-05, "loss": 0.194, "step": 13188 }, { "epoch": 2.657666733830345, "grad_norm": 0.05927930027246475, "learning_rate": 4.8920636132501864e-05, "loss": 0.1799, "step": 13190 }, { "epoch": 2.6580697158976427, "grad_norm": 0.06080922484397888, "learning_rate": 4.8907312761476085e-05, "loss": 0.1891, "step": 13192 }, { "epoch": 2.6584726979649407, "grad_norm": 0.036109864711761475, "learning_rate": 4.889398946807305e-05, "loss": 0.1901, "step": 13194 }, { "epoch": 2.6588756800322386, "grad_norm": 0.06764542311429977, "learning_rate": 4.8880666253239244e-05, "loss": 0.2052, "step": 13196 }, { "epoch": 2.6592786620995366, "grad_norm": 0.05238844081759453, "learning_rate": 4.886734311792109e-05, "loss": 0.2018, "step": 13198 }, { "epoch": 2.6596816441668345, "grad_norm": 0.0657360702753067, "learning_rate": 4.8854020063065104e-05, "loss": 0.1742, "step": 13200 }, { "epoch": 2.6600846262341324, "grad_norm": 0.05971973389387131, "learning_rate": 4.8840697089617685e-05, "loss": 0.2342, "step": 13202 }, { "epoch": 2.6604876083014304, "grad_norm": 0.052394554018974304, "learning_rate": 4.8827374198525293e-05, "loss": 0.241, "step": 13204 }, { "epoch": 2.6608905903687283, "grad_norm": 0.060340166091918945, "learning_rate": 4.8814051390734364e-05, "loss": 0.222, "step": 13206 }, { "epoch": 2.6612935724360267, "grad_norm": 0.042948994785547256, "learning_rate": 4.8800728667191324e-05, "loss": 0.161, "step": 13208 }, { "epoch": 2.6616965545033247, "grad_norm": 0.04476882889866829, "learning_rate": 4.878740602884259e-05, "loss": 0.2258, "step": 13210 }, { "epoch": 2.6620995365706226, "grad_norm": 0.05171223357319832, "learning_rate": 4.8774083476634626e-05, "loss": 0.1949, "step": 13212 }, { "epoch": 2.6625025186379205, "grad_norm": 0.07086855173110962, "learning_rate": 4.8760761011513776e-05, "loss": 0.2226, "step": 13214 }, { "epoch": 2.6629055007052185, "grad_norm": 0.05836058035492897, "learning_rate": 4.87474386344265e-05, "loss": 0.1843, "step": 13216 }, { "epoch": 2.663308482772517, "grad_norm": 0.042663928121328354, "learning_rate": 4.873411634631917e-05, "loss": 0.1727, "step": 13218 }, { "epoch": 2.663711464839815, "grad_norm": 0.04874979704618454, "learning_rate": 4.87207941481382e-05, "loss": 0.1892, "step": 13220 }, { "epoch": 2.6641144469071127, "grad_norm": 0.04593397676944733, "learning_rate": 4.8707472040829954e-05, "loss": 0.2116, "step": 13222 }, { "epoch": 2.6645174289744107, "grad_norm": 0.04077121242880821, "learning_rate": 4.8694150025340856e-05, "loss": 0.1798, "step": 13224 }, { "epoch": 2.6649204110417086, "grad_norm": 0.050686176866292953, "learning_rate": 4.8680828102617215e-05, "loss": 0.1842, "step": 13226 }, { "epoch": 2.6653233931090066, "grad_norm": 0.06952465325593948, "learning_rate": 4.866750627360546e-05, "loss": 0.1789, "step": 13228 }, { "epoch": 2.6657263751763045, "grad_norm": 0.07306458055973053, "learning_rate": 4.865418453925192e-05, "loss": 0.1833, "step": 13230 }, { "epoch": 2.6661293572436024, "grad_norm": 0.06378468871116638, "learning_rate": 4.864086290050297e-05, "loss": 0.1782, "step": 13232 }, { "epoch": 2.6665323393109004, "grad_norm": 0.06567275524139404, "learning_rate": 4.862754135830493e-05, "loss": 0.1871, "step": 13234 }, { "epoch": 2.6669353213781988, "grad_norm": 0.055104974657297134, "learning_rate": 4.861421991360418e-05, "loss": 0.1788, "step": 13236 }, { "epoch": 2.6673383034454967, "grad_norm": 0.053641512989997864, "learning_rate": 4.860089856734699e-05, "loss": 0.187, "step": 13238 }, { "epoch": 2.6677412855127947, "grad_norm": 0.045359183102846146, "learning_rate": 4.858757732047976e-05, "loss": 0.1966, "step": 13240 }, { "epoch": 2.6681442675800926, "grad_norm": 0.04768161475658417, "learning_rate": 4.8574256173948766e-05, "loss": 0.2288, "step": 13242 }, { "epoch": 2.6685472496473905, "grad_norm": 0.07475445419549942, "learning_rate": 4.856093512870035e-05, "loss": 0.2402, "step": 13244 }, { "epoch": 2.668950231714689, "grad_norm": 0.0542769655585289, "learning_rate": 4.854761418568078e-05, "loss": 0.2363, "step": 13246 }, { "epoch": 2.669353213781987, "grad_norm": 0.03904513269662857, "learning_rate": 4.85342933458364e-05, "loss": 0.1482, "step": 13248 }, { "epoch": 2.669756195849285, "grad_norm": 0.061630938202142715, "learning_rate": 4.852097261011344e-05, "loss": 0.2125, "step": 13250 }, { "epoch": 2.6701591779165827, "grad_norm": 0.05297626927495003, "learning_rate": 4.850765197945825e-05, "loss": 0.223, "step": 13252 }, { "epoch": 2.6705621599838807, "grad_norm": 0.042190030217170715, "learning_rate": 4.8494331454817064e-05, "loss": 0.1734, "step": 13254 }, { "epoch": 2.6709651420511786, "grad_norm": 0.041881803423166275, "learning_rate": 4.8481011037136176e-05, "loss": 0.1577, "step": 13256 }, { "epoch": 2.6713681241184766, "grad_norm": 0.047158777713775635, "learning_rate": 4.8467690727361825e-05, "loss": 0.1604, "step": 13258 }, { "epoch": 2.6717711061857745, "grad_norm": 0.06598570197820663, "learning_rate": 4.845437052644029e-05, "loss": 0.2118, "step": 13260 }, { "epoch": 2.672174088253073, "grad_norm": 0.05773276835680008, "learning_rate": 4.8441050435317766e-05, "loss": 0.2249, "step": 13262 }, { "epoch": 2.672577070320371, "grad_norm": 0.0460265651345253, "learning_rate": 4.842773045494055e-05, "loss": 0.1632, "step": 13264 }, { "epoch": 2.672980052387669, "grad_norm": 0.053771812468767166, "learning_rate": 4.841441058625484e-05, "loss": 0.2166, "step": 13266 }, { "epoch": 2.6733830344549667, "grad_norm": 0.04382085055112839, "learning_rate": 4.840109083020688e-05, "loss": 0.2236, "step": 13268 }, { "epoch": 2.6737860165222647, "grad_norm": 0.050635140389204025, "learning_rate": 4.838777118774286e-05, "loss": 0.1704, "step": 13270 }, { "epoch": 2.6741889985895626, "grad_norm": 0.051803749054670334, "learning_rate": 4.837445165980901e-05, "loss": 0.2351, "step": 13272 }, { "epoch": 2.674591980656861, "grad_norm": 0.07338876277208328, "learning_rate": 4.8361132247351484e-05, "loss": 0.2134, "step": 13274 }, { "epoch": 2.674994962724159, "grad_norm": 0.0721621885895729, "learning_rate": 4.834781295131654e-05, "loss": 0.1942, "step": 13276 }, { "epoch": 2.675397944791457, "grad_norm": 0.06050800532102585, "learning_rate": 4.833449377265028e-05, "loss": 0.1653, "step": 13278 }, { "epoch": 2.675800926858755, "grad_norm": 0.062214065343141556, "learning_rate": 4.832117471229895e-05, "loss": 0.2049, "step": 13280 }, { "epoch": 2.6762039089260528, "grad_norm": 0.045399442315101624, "learning_rate": 4.8307855771208674e-05, "loss": 0.1932, "step": 13282 }, { "epoch": 2.6766068909933507, "grad_norm": 0.04470831900835037, "learning_rate": 4.829453695032562e-05, "loss": 0.1875, "step": 13284 }, { "epoch": 2.6770098730606486, "grad_norm": 0.050232332199811935, "learning_rate": 4.8281218250595914e-05, "loss": 0.1804, "step": 13286 }, { "epoch": 2.6774128551279466, "grad_norm": 0.05278032645583153, "learning_rate": 4.8267899672965755e-05, "loss": 0.1992, "step": 13288 }, { "epoch": 2.677815837195245, "grad_norm": 0.054559800773859024, "learning_rate": 4.825458121838119e-05, "loss": 0.2348, "step": 13290 }, { "epoch": 2.678218819262543, "grad_norm": 0.07798026502132416, "learning_rate": 4.8241262887788416e-05, "loss": 0.1449, "step": 13292 }, { "epoch": 2.678621801329841, "grad_norm": 0.046620819717645645, "learning_rate": 4.8227944682133495e-05, "loss": 0.1847, "step": 13294 }, { "epoch": 2.679024783397139, "grad_norm": 0.07669510692358017, "learning_rate": 4.821462660236257e-05, "loss": 0.2126, "step": 13296 }, { "epoch": 2.6794277654644367, "grad_norm": 0.05963238328695297, "learning_rate": 4.8201308649421696e-05, "loss": 0.2398, "step": 13298 }, { "epoch": 2.679830747531735, "grad_norm": 0.05043806880712509, "learning_rate": 4.8187990824256996e-05, "loss": 0.2067, "step": 13300 }, { "epoch": 2.680233729599033, "grad_norm": 0.06212414801120758, "learning_rate": 4.8174673127814505e-05, "loss": 0.1356, "step": 13302 }, { "epoch": 2.680636711666331, "grad_norm": 0.07311484962701797, "learning_rate": 4.8161355561040336e-05, "loss": 0.2394, "step": 13304 }, { "epoch": 2.681039693733629, "grad_norm": 0.07517794519662857, "learning_rate": 4.814803812488052e-05, "loss": 0.1514, "step": 13306 }, { "epoch": 2.681442675800927, "grad_norm": 0.056023143231868744, "learning_rate": 4.813472082028112e-05, "loss": 0.177, "step": 13308 }, { "epoch": 2.681845657868225, "grad_norm": 0.05077926069498062, "learning_rate": 4.812140364818816e-05, "loss": 0.1821, "step": 13310 }, { "epoch": 2.6822486399355228, "grad_norm": 0.08374262601137161, "learning_rate": 4.81080866095477e-05, "loss": 0.1925, "step": 13312 }, { "epoch": 2.6826516220028207, "grad_norm": 0.059895146638154984, "learning_rate": 4.809476970530571e-05, "loss": 0.1605, "step": 13314 }, { "epoch": 2.6830546040701186, "grad_norm": 0.06866048276424408, "learning_rate": 4.808145293640826e-05, "loss": 0.2282, "step": 13316 }, { "epoch": 2.683457586137417, "grad_norm": 0.0920555368065834, "learning_rate": 4.806813630380131e-05, "loss": 0.2012, "step": 13318 }, { "epoch": 2.683860568204715, "grad_norm": 0.059407614171504974, "learning_rate": 4.8054819808430876e-05, "loss": 0.1991, "step": 13320 }, { "epoch": 2.684263550272013, "grad_norm": 0.06879781931638718, "learning_rate": 4.804150345124293e-05, "loss": 0.2295, "step": 13322 }, { "epoch": 2.684666532339311, "grad_norm": 0.05245072394609451, "learning_rate": 4.8028187233183454e-05, "loss": 0.2259, "step": 13324 }, { "epoch": 2.685069514406609, "grad_norm": 0.06847406923770905, "learning_rate": 4.8014871155198385e-05, "loss": 0.1739, "step": 13326 }, { "epoch": 2.685472496473907, "grad_norm": 0.06828334182500839, "learning_rate": 4.8001555218233704e-05, "loss": 0.1833, "step": 13328 }, { "epoch": 2.685875478541205, "grad_norm": 0.06589218974113464, "learning_rate": 4.798823942323534e-05, "loss": 0.2163, "step": 13330 }, { "epoch": 2.686278460608503, "grad_norm": 0.05553770810365677, "learning_rate": 4.797492377114925e-05, "loss": 0.2191, "step": 13332 }, { "epoch": 2.686681442675801, "grad_norm": 0.04765209183096886, "learning_rate": 4.796160826292132e-05, "loss": 0.1729, "step": 13334 }, { "epoch": 2.687084424743099, "grad_norm": 0.04598912596702576, "learning_rate": 4.7948292899497485e-05, "loss": 0.2046, "step": 13336 }, { "epoch": 2.687487406810397, "grad_norm": 0.058516427874565125, "learning_rate": 4.793497768182362e-05, "loss": 0.1781, "step": 13338 }, { "epoch": 2.687890388877695, "grad_norm": 0.0651201382279396, "learning_rate": 4.792166261084567e-05, "loss": 0.1859, "step": 13340 }, { "epoch": 2.6882933709449928, "grad_norm": 0.06374579668045044, "learning_rate": 4.7908347687509456e-05, "loss": 0.1804, "step": 13342 }, { "epoch": 2.6886963530122907, "grad_norm": 0.08643309772014618, "learning_rate": 4.7895032912760904e-05, "loss": 0.1678, "step": 13344 }, { "epoch": 2.689099335079589, "grad_norm": 0.08268879354000092, "learning_rate": 4.788171828754583e-05, "loss": 0.1917, "step": 13346 }, { "epoch": 2.689502317146887, "grad_norm": 0.07940013706684113, "learning_rate": 4.786840381281011e-05, "loss": 0.2438, "step": 13348 }, { "epoch": 2.689905299214185, "grad_norm": 0.06752052903175354, "learning_rate": 4.785508948949955e-05, "loss": 0.169, "step": 13350 }, { "epoch": 2.690308281281483, "grad_norm": 0.0565398707985878, "learning_rate": 4.784177531856004e-05, "loss": 0.1829, "step": 13352 }, { "epoch": 2.690711263348781, "grad_norm": 0.04072846844792366, "learning_rate": 4.782846130093733e-05, "loss": 0.1687, "step": 13354 }, { "epoch": 2.6911142454160792, "grad_norm": 0.05021306127309799, "learning_rate": 4.781514743757727e-05, "loss": 0.2042, "step": 13356 }, { "epoch": 2.691517227483377, "grad_norm": 0.057188209146261215, "learning_rate": 4.7801833729425645e-05, "loss": 0.1799, "step": 13358 }, { "epoch": 2.691920209550675, "grad_norm": 0.06615625321865082, "learning_rate": 4.7788520177428235e-05, "loss": 0.1755, "step": 13360 }, { "epoch": 2.692323191617973, "grad_norm": 0.040171921253204346, "learning_rate": 4.77752067825308e-05, "loss": 0.1796, "step": 13362 }, { "epoch": 2.692726173685271, "grad_norm": 0.07169929146766663, "learning_rate": 4.7761893545679145e-05, "loss": 0.2366, "step": 13364 }, { "epoch": 2.693129155752569, "grad_norm": 0.0498359277844429, "learning_rate": 4.774858046781896e-05, "loss": 0.1666, "step": 13366 }, { "epoch": 2.693532137819867, "grad_norm": 0.05385211855173111, "learning_rate": 4.773526754989604e-05, "loss": 0.1815, "step": 13368 }, { "epoch": 2.693935119887165, "grad_norm": 0.04354149475693703, "learning_rate": 4.7721954792856085e-05, "loss": 0.2145, "step": 13370 }, { "epoch": 2.6943381019544628, "grad_norm": 0.05808882415294647, "learning_rate": 4.7708642197644826e-05, "loss": 0.2205, "step": 13372 }, { "epoch": 2.694741084021761, "grad_norm": 0.04748508334159851, "learning_rate": 4.769532976520795e-05, "loss": 0.1858, "step": 13374 }, { "epoch": 2.695144066089059, "grad_norm": 0.06552096456289291, "learning_rate": 4.768201749649117e-05, "loss": 0.1915, "step": 13376 }, { "epoch": 2.695547048156357, "grad_norm": 0.08570721745491028, "learning_rate": 4.766870539244014e-05, "loss": 0.244, "step": 13378 }, { "epoch": 2.695950030223655, "grad_norm": 0.05731480196118355, "learning_rate": 4.765539345400057e-05, "loss": 0.1671, "step": 13380 }, { "epoch": 2.696353012290953, "grad_norm": 0.05735393241047859, "learning_rate": 4.7642081682118096e-05, "loss": 0.2473, "step": 13382 }, { "epoch": 2.6967559943582513, "grad_norm": 0.07119689881801605, "learning_rate": 4.762877007773838e-05, "loss": 0.1961, "step": 13384 }, { "epoch": 2.6971589764255492, "grad_norm": 0.03842030093073845, "learning_rate": 4.7615458641807025e-05, "loss": 0.1277, "step": 13386 }, { "epoch": 2.697561958492847, "grad_norm": 0.04085619002580643, "learning_rate": 4.760214737526969e-05, "loss": 0.1637, "step": 13388 }, { "epoch": 2.697964940560145, "grad_norm": 0.05109267681837082, "learning_rate": 4.7588836279071944e-05, "loss": 0.1744, "step": 13390 }, { "epoch": 2.698367922627443, "grad_norm": 0.07984402775764465, "learning_rate": 4.7575525354159445e-05, "loss": 0.2293, "step": 13392 }, { "epoch": 2.698770904694741, "grad_norm": 0.07411880046129227, "learning_rate": 4.7562214601477725e-05, "loss": 0.2167, "step": 13394 }, { "epoch": 2.699173886762039, "grad_norm": 0.04365954548120499, "learning_rate": 4.75489040219724e-05, "loss": 0.194, "step": 13396 }, { "epoch": 2.699576868829337, "grad_norm": 0.05702051520347595, "learning_rate": 4.7535593616589e-05, "loss": 0.1744, "step": 13398 }, { "epoch": 2.699979850896635, "grad_norm": 0.05207115411758423, "learning_rate": 4.75222833862731e-05, "loss": 0.1674, "step": 13400 }, { "epoch": 2.700382832963933, "grad_norm": 0.04992508888244629, "learning_rate": 4.750897333197021e-05, "loss": 0.226, "step": 13402 }, { "epoch": 2.700785815031231, "grad_norm": 0.0452972836792469, "learning_rate": 4.7495663454625885e-05, "loss": 0.2254, "step": 13404 }, { "epoch": 2.701188797098529, "grad_norm": 0.0995267704129219, "learning_rate": 4.748235375518561e-05, "loss": 0.179, "step": 13406 }, { "epoch": 2.701591779165827, "grad_norm": 0.07648054510354996, "learning_rate": 4.746904423459491e-05, "loss": 0.1982, "step": 13408 }, { "epoch": 2.701994761233125, "grad_norm": 0.05254209414124489, "learning_rate": 4.7455734893799256e-05, "loss": 0.136, "step": 13410 }, { "epoch": 2.7023977433004234, "grad_norm": 0.0762658417224884, "learning_rate": 4.744242573374413e-05, "loss": 0.1972, "step": 13412 }, { "epoch": 2.7028007253677213, "grad_norm": 0.06384989619255066, "learning_rate": 4.742911675537497e-05, "loss": 0.2099, "step": 13414 }, { "epoch": 2.7032037074350193, "grad_norm": 0.04679650813341141, "learning_rate": 4.741580795963726e-05, "loss": 0.153, "step": 13416 }, { "epoch": 2.703606689502317, "grad_norm": 0.05645020306110382, "learning_rate": 4.740249934747642e-05, "loss": 0.217, "step": 13418 }, { "epoch": 2.704009671569615, "grad_norm": 0.07073520869016647, "learning_rate": 4.7389190919837865e-05, "loss": 0.2049, "step": 13420 }, { "epoch": 2.704412653636913, "grad_norm": 0.04735686630010605, "learning_rate": 4.737588267766703e-05, "loss": 0.1856, "step": 13422 }, { "epoch": 2.704815635704211, "grad_norm": 0.05677908658981323, "learning_rate": 4.7362574621909264e-05, "loss": 0.2155, "step": 13424 }, { "epoch": 2.705218617771509, "grad_norm": 0.05154525861144066, "learning_rate": 4.734926675351e-05, "loss": 0.1814, "step": 13426 }, { "epoch": 2.705621599838807, "grad_norm": 0.05443556606769562, "learning_rate": 4.733595907341458e-05, "loss": 0.2147, "step": 13428 }, { "epoch": 2.7060245819061053, "grad_norm": 0.05580776929855347, "learning_rate": 4.732265158256837e-05, "loss": 0.1586, "step": 13430 }, { "epoch": 2.7064275639734032, "grad_norm": 0.05419588088989258, "learning_rate": 4.730934428191671e-05, "loss": 0.1915, "step": 13432 }, { "epoch": 2.706830546040701, "grad_norm": 0.04755214601755142, "learning_rate": 4.7296037172404934e-05, "loss": 0.1544, "step": 13434 }, { "epoch": 2.707233528107999, "grad_norm": 0.06594018638134003, "learning_rate": 4.728273025497833e-05, "loss": 0.18, "step": 13436 }, { "epoch": 2.707636510175297, "grad_norm": 0.05939716100692749, "learning_rate": 4.726942353058226e-05, "loss": 0.2284, "step": 13438 }, { "epoch": 2.7080394922425954, "grad_norm": 0.05221749469637871, "learning_rate": 4.7256117000161935e-05, "loss": 0.2107, "step": 13440 }, { "epoch": 2.7084424743098934, "grad_norm": 0.06709478050470352, "learning_rate": 4.72428106646627e-05, "loss": 0.1697, "step": 13442 }, { "epoch": 2.7088454563771913, "grad_norm": 0.09320972859859467, "learning_rate": 4.722950452502977e-05, "loss": 0.2327, "step": 13444 }, { "epoch": 2.7092484384444893, "grad_norm": 0.04267571121454239, "learning_rate": 4.721619858220842e-05, "loss": 0.1675, "step": 13446 }, { "epoch": 2.709651420511787, "grad_norm": 0.06907591968774796, "learning_rate": 4.720289283714385e-05, "loss": 0.174, "step": 13448 }, { "epoch": 2.710054402579085, "grad_norm": 0.06685389578342438, "learning_rate": 4.718958729078133e-05, "loss": 0.2013, "step": 13450 }, { "epoch": 2.710457384646383, "grad_norm": 0.053654421120882034, "learning_rate": 4.717628194406601e-05, "loss": 0.2289, "step": 13452 }, { "epoch": 2.710860366713681, "grad_norm": 0.0344913974404335, "learning_rate": 4.716297679794312e-05, "loss": 0.1451, "step": 13454 }, { "epoch": 2.7112633487809794, "grad_norm": 0.051283374428749084, "learning_rate": 4.7149671853357804e-05, "loss": 0.173, "step": 13456 }, { "epoch": 2.7116663308482774, "grad_norm": 0.06237954646348953, "learning_rate": 4.7136367111255265e-05, "loss": 0.2347, "step": 13458 }, { "epoch": 2.7120693129155753, "grad_norm": 0.08349774777889252, "learning_rate": 4.7123062572580603e-05, "loss": 0.1609, "step": 13460 }, { "epoch": 2.7124722949828732, "grad_norm": 0.05388723313808441, "learning_rate": 4.7109758238278993e-05, "loss": 0.1892, "step": 13462 }, { "epoch": 2.712875277050171, "grad_norm": 0.05169912055134773, "learning_rate": 4.709645410929552e-05, "loss": 0.1759, "step": 13464 }, { "epoch": 2.713278259117469, "grad_norm": 0.05110298469662666, "learning_rate": 4.708315018657532e-05, "loss": 0.1833, "step": 13466 }, { "epoch": 2.7136812411847675, "grad_norm": 0.07065416872501373, "learning_rate": 4.706984647106345e-05, "loss": 0.2534, "step": 13468 }, { "epoch": 2.7140842232520654, "grad_norm": 0.05724635720252991, "learning_rate": 4.7056542963705014e-05, "loss": 0.203, "step": 13470 }, { "epoch": 2.7144872053193634, "grad_norm": 0.0664157047867775, "learning_rate": 4.704323966544505e-05, "loss": 0.1914, "step": 13472 }, { "epoch": 2.7148901873866613, "grad_norm": 0.0474373884499073, "learning_rate": 4.702993657722862e-05, "loss": 0.1917, "step": 13474 }, { "epoch": 2.7152931694539593, "grad_norm": 0.0590548999607563, "learning_rate": 4.701663370000072e-05, "loss": 0.2429, "step": 13476 }, { "epoch": 2.715696151521257, "grad_norm": 0.06265545636415482, "learning_rate": 4.700333103470642e-05, "loss": 0.1909, "step": 13478 }, { "epoch": 2.716099133588555, "grad_norm": 0.06928554177284241, "learning_rate": 4.699002858229067e-05, "loss": 0.1811, "step": 13480 }, { "epoch": 2.716502115655853, "grad_norm": 0.06219932809472084, "learning_rate": 4.6976726343698504e-05, "loss": 0.1795, "step": 13482 }, { "epoch": 2.7169050977231515, "grad_norm": 0.0587947741150856, "learning_rate": 4.696342431987484e-05, "loss": 0.1673, "step": 13484 }, { "epoch": 2.7173080797904494, "grad_norm": 0.07239782065153122, "learning_rate": 4.6950122511764665e-05, "loss": 0.2172, "step": 13486 }, { "epoch": 2.7177110618577474, "grad_norm": 0.06954152882099152, "learning_rate": 4.6936820920312894e-05, "loss": 0.1926, "step": 13488 }, { "epoch": 2.7181140439250453, "grad_norm": 0.07562306523323059, "learning_rate": 4.692351954646448e-05, "loss": 0.1607, "step": 13490 }, { "epoch": 2.7185170259923432, "grad_norm": 0.062392767518758774, "learning_rate": 4.691021839116432e-05, "loss": 0.2182, "step": 13492 }, { "epoch": 2.718920008059641, "grad_norm": 0.057378824800252914, "learning_rate": 4.6896917455357304e-05, "loss": 0.1561, "step": 13494 }, { "epoch": 2.7193229901269396, "grad_norm": 0.05048033967614174, "learning_rate": 4.68836167399883e-05, "loss": 0.2058, "step": 13496 }, { "epoch": 2.7197259721942375, "grad_norm": 0.06855437904596329, "learning_rate": 4.6870316246002195e-05, "loss": 0.1935, "step": 13498 }, { "epoch": 2.7201289542615354, "grad_norm": 0.039874229580163956, "learning_rate": 4.6857015974343785e-05, "loss": 0.1621, "step": 13500 }, { "epoch": 2.7205319363288334, "grad_norm": 0.054651062935590744, "learning_rate": 4.684371592595798e-05, "loss": 0.181, "step": 13502 }, { "epoch": 2.7209349183961313, "grad_norm": 0.05894327163696289, "learning_rate": 4.683041610178951e-05, "loss": 0.2302, "step": 13504 }, { "epoch": 2.7213379004634293, "grad_norm": 0.09319541603326797, "learning_rate": 4.681711650278323e-05, "loss": 0.1972, "step": 13506 }, { "epoch": 2.721740882530727, "grad_norm": 0.07073593139648438, "learning_rate": 4.68038171298839e-05, "loss": 0.2044, "step": 13508 }, { "epoch": 2.722143864598025, "grad_norm": 0.061639729887247086, "learning_rate": 4.679051798403629e-05, "loss": 0.1904, "step": 13510 }, { "epoch": 2.7225468466653235, "grad_norm": 0.057687997817993164, "learning_rate": 4.677721906618514e-05, "loss": 0.1751, "step": 13512 }, { "epoch": 2.7229498287326215, "grad_norm": 0.06481876969337463, "learning_rate": 4.676392037727522e-05, "loss": 0.2169, "step": 13514 }, { "epoch": 2.7233528107999194, "grad_norm": 0.05692235007882118, "learning_rate": 4.675062191825118e-05, "loss": 0.21, "step": 13516 }, { "epoch": 2.7237557928672174, "grad_norm": 0.06150791794061661, "learning_rate": 4.673732369005779e-05, "loss": 0.1686, "step": 13518 }, { "epoch": 2.7241587749345153, "grad_norm": 0.05073976516723633, "learning_rate": 4.672402569363971e-05, "loss": 0.1632, "step": 13520 }, { "epoch": 2.7245617570018137, "grad_norm": 0.0635070875287056, "learning_rate": 4.67107279299416e-05, "loss": 0.2368, "step": 13522 }, { "epoch": 2.7249647390691116, "grad_norm": 0.07709597796201706, "learning_rate": 4.66974303999081e-05, "loss": 0.1828, "step": 13524 }, { "epoch": 2.7253677211364096, "grad_norm": 0.06766793131828308, "learning_rate": 4.66841331044839e-05, "loss": 0.18, "step": 13526 }, { "epoch": 2.7257707032037075, "grad_norm": 0.04549749195575714, "learning_rate": 4.6670836044613536e-05, "loss": 0.1761, "step": 13528 }, { "epoch": 2.7261736852710055, "grad_norm": 0.07432392239570618, "learning_rate": 4.6657539221241684e-05, "loss": 0.2038, "step": 13530 }, { "epoch": 2.7265766673383034, "grad_norm": 0.05852292478084564, "learning_rate": 4.664424263531289e-05, "loss": 0.2055, "step": 13532 }, { "epoch": 2.7269796494056013, "grad_norm": 0.07217149436473846, "learning_rate": 4.6630946287771746e-05, "loss": 0.192, "step": 13534 }, { "epoch": 2.7273826314728993, "grad_norm": 0.054228655993938446, "learning_rate": 4.6617650179562774e-05, "loss": 0.1559, "step": 13536 }, { "epoch": 2.727785613540197, "grad_norm": 0.04575591906905174, "learning_rate": 4.660435431163054e-05, "loss": 0.1644, "step": 13538 }, { "epoch": 2.7281885956074956, "grad_norm": 0.05871587619185448, "learning_rate": 4.659105868491952e-05, "loss": 0.2315, "step": 13540 }, { "epoch": 2.7285915776747935, "grad_norm": 0.061794646084308624, "learning_rate": 4.657776330037427e-05, "loss": 0.1802, "step": 13542 }, { "epoch": 2.7289945597420915, "grad_norm": 0.053640998899936676, "learning_rate": 4.656446815893922e-05, "loss": 0.206, "step": 13544 }, { "epoch": 2.7293975418093894, "grad_norm": 0.1432095468044281, "learning_rate": 4.655117326155887e-05, "loss": 0.2125, "step": 13546 }, { "epoch": 2.7298005238766874, "grad_norm": 0.04829771816730499, "learning_rate": 4.6537878609177646e-05, "loss": 0.1351, "step": 13548 }, { "epoch": 2.7302035059439858, "grad_norm": 0.05013980343937874, "learning_rate": 4.652458420274e-05, "loss": 0.2334, "step": 13550 }, { "epoch": 2.7306064880112837, "grad_norm": 0.06168575957417488, "learning_rate": 4.6511290043190314e-05, "loss": 0.1966, "step": 13552 }, { "epoch": 2.7310094700785816, "grad_norm": 0.06013885512948036, "learning_rate": 4.649799613147303e-05, "loss": 0.1846, "step": 13554 }, { "epoch": 2.7314124521458796, "grad_norm": 0.0718698501586914, "learning_rate": 4.648470246853248e-05, "loss": 0.2349, "step": 13556 }, { "epoch": 2.7318154342131775, "grad_norm": 0.04924318194389343, "learning_rate": 4.6471409055313056e-05, "loss": 0.1665, "step": 13558 }, { "epoch": 2.7322184162804755, "grad_norm": 0.07484610378742218, "learning_rate": 4.645811589275909e-05, "loss": 0.1914, "step": 13560 }, { "epoch": 2.7326213983477734, "grad_norm": 0.055900052189826965, "learning_rate": 4.64448229818149e-05, "loss": 0.2099, "step": 13562 }, { "epoch": 2.7330243804150713, "grad_norm": 0.05515626445412636, "learning_rate": 4.643153032342479e-05, "loss": 0.1421, "step": 13564 }, { "epoch": 2.7334273624823693, "grad_norm": 0.06733749061822891, "learning_rate": 4.641823791853308e-05, "loss": 0.2094, "step": 13566 }, { "epoch": 2.7338303445496677, "grad_norm": 0.06128699705004692, "learning_rate": 4.6404945768084005e-05, "loss": 0.2177, "step": 13568 }, { "epoch": 2.7342333266169656, "grad_norm": 0.05260147899389267, "learning_rate": 4.639165387302185e-05, "loss": 0.1178, "step": 13570 }, { "epoch": 2.7346363086842636, "grad_norm": 0.05988286808133125, "learning_rate": 4.6378362234290817e-05, "loss": 0.2061, "step": 13572 }, { "epoch": 2.7350392907515615, "grad_norm": 0.06819775700569153, "learning_rate": 4.636507085283515e-05, "loss": 0.2071, "step": 13574 }, { "epoch": 2.7354422728188594, "grad_norm": 0.04776353761553764, "learning_rate": 4.635177972959902e-05, "loss": 0.1735, "step": 13576 }, { "epoch": 2.735845254886158, "grad_norm": 0.05012471228837967, "learning_rate": 4.6338488865526655e-05, "loss": 0.1976, "step": 13578 }, { "epoch": 2.7362482369534558, "grad_norm": 0.07344962656497955, "learning_rate": 4.6325198261562144e-05, "loss": 0.1867, "step": 13580 }, { "epoch": 2.7366512190207537, "grad_norm": 0.05135316029191017, "learning_rate": 4.63119079186497e-05, "loss": 0.2513, "step": 13582 }, { "epoch": 2.7370542010880516, "grad_norm": 0.05327500030398369, "learning_rate": 4.629861783773341e-05, "loss": 0.1704, "step": 13584 }, { "epoch": 2.7374571831553496, "grad_norm": 0.05641722306609154, "learning_rate": 4.6285328019757395e-05, "loss": 0.1633, "step": 13586 }, { "epoch": 2.7378601652226475, "grad_norm": 0.08201052248477936, "learning_rate": 4.627203846566572e-05, "loss": 0.1811, "step": 13588 }, { "epoch": 2.7382631472899455, "grad_norm": 0.04866538196802139, "learning_rate": 4.62587491764025e-05, "loss": 0.1921, "step": 13590 }, { "epoch": 2.7386661293572434, "grad_norm": 0.0483061820268631, "learning_rate": 4.624546015291172e-05, "loss": 0.2044, "step": 13592 }, { "epoch": 2.7390691114245413, "grad_norm": 0.06678812950849533, "learning_rate": 4.623217139613748e-05, "loss": 0.2455, "step": 13594 }, { "epoch": 2.7394720934918397, "grad_norm": 0.053607940673828125, "learning_rate": 4.6218882907023734e-05, "loss": 0.1809, "step": 13596 }, { "epoch": 2.7398750755591377, "grad_norm": 0.049875251948833466, "learning_rate": 4.620559468651451e-05, "loss": 0.2261, "step": 13598 }, { "epoch": 2.7402780576264356, "grad_norm": 0.061627864837646484, "learning_rate": 4.619230673555377e-05, "loss": 0.1984, "step": 13600 }, { "epoch": 2.7406810396937336, "grad_norm": 0.05893751606345177, "learning_rate": 4.617901905508548e-05, "loss": 0.2013, "step": 13602 }, { "epoch": 2.7410840217610315, "grad_norm": 0.05172676220536232, "learning_rate": 4.616573164605354e-05, "loss": 0.18, "step": 13604 }, { "epoch": 2.74148700382833, "grad_norm": 0.057852406054735184, "learning_rate": 4.615244450940191e-05, "loss": 0.218, "step": 13606 }, { "epoch": 2.741889985895628, "grad_norm": 0.059165455400943756, "learning_rate": 4.613915764607446e-05, "loss": 0.2052, "step": 13608 }, { "epoch": 2.7422929679629258, "grad_norm": 0.046519551426172256, "learning_rate": 4.612587105701509e-05, "loss": 0.2126, "step": 13610 }, { "epoch": 2.7426959500302237, "grad_norm": 0.05582638457417488, "learning_rate": 4.611258474316764e-05, "loss": 0.1854, "step": 13612 }, { "epoch": 2.7430989320975216, "grad_norm": 0.05454739183187485, "learning_rate": 4.609929870547595e-05, "loss": 0.19, "step": 13614 }, { "epoch": 2.7435019141648196, "grad_norm": 0.03613164275884628, "learning_rate": 4.6086012944883825e-05, "loss": 0.17, "step": 13616 }, { "epoch": 2.7439048962321175, "grad_norm": 0.05004088953137398, "learning_rate": 4.60727274623351e-05, "loss": 0.1799, "step": 13618 }, { "epoch": 2.7443078782994155, "grad_norm": 0.06082676351070404, "learning_rate": 4.6059442258773536e-05, "loss": 0.2265, "step": 13620 }, { "epoch": 2.7447108603667134, "grad_norm": 0.06850215792655945, "learning_rate": 4.604615733514289e-05, "loss": 0.2378, "step": 13622 }, { "epoch": 2.745113842434012, "grad_norm": 0.047571759670972824, "learning_rate": 4.60328726923869e-05, "loss": 0.1445, "step": 13624 }, { "epoch": 2.7455168245013097, "grad_norm": 0.039432961493730545, "learning_rate": 4.60195883314493e-05, "loss": 0.1593, "step": 13626 }, { "epoch": 2.7459198065686077, "grad_norm": 0.04747028276324272, "learning_rate": 4.600630425327375e-05, "loss": 0.1961, "step": 13628 }, { "epoch": 2.7463227886359056, "grad_norm": 0.051651448011398315, "learning_rate": 4.599302045880399e-05, "loss": 0.1608, "step": 13630 }, { "epoch": 2.7467257707032036, "grad_norm": 0.050929635763168335, "learning_rate": 4.597973694898363e-05, "loss": 0.1845, "step": 13632 }, { "epoch": 2.747128752770502, "grad_norm": 0.06035885959863663, "learning_rate": 4.596645372475634e-05, "loss": 0.2094, "step": 13634 }, { "epoch": 2.7475317348378, "grad_norm": 0.04276600480079651, "learning_rate": 4.595317078706572e-05, "loss": 0.1647, "step": 13636 }, { "epoch": 2.747934716905098, "grad_norm": 0.07479345053434372, "learning_rate": 4.593988813685539e-05, "loss": 0.2003, "step": 13638 }, { "epoch": 2.7483376989723958, "grad_norm": 0.050298575311899185, "learning_rate": 4.592660577506888e-05, "loss": 0.1719, "step": 13640 }, { "epoch": 2.7487406810396937, "grad_norm": 0.05687572807073593, "learning_rate": 4.591332370264982e-05, "loss": 0.2393, "step": 13642 }, { "epoch": 2.7491436631069917, "grad_norm": 0.07088617235422134, "learning_rate": 4.590004192054168e-05, "loss": 0.189, "step": 13644 }, { "epoch": 2.7495466451742896, "grad_norm": 0.03902342915534973, "learning_rate": 4.5886760429688016e-05, "loss": 0.1422, "step": 13646 }, { "epoch": 2.7499496272415875, "grad_norm": 0.050030291080474854, "learning_rate": 4.587347923103231e-05, "loss": 0.1976, "step": 13648 }, { "epoch": 2.750352609308886, "grad_norm": 0.05007876083254814, "learning_rate": 4.5860198325518055e-05, "loss": 0.179, "step": 13650 }, { "epoch": 2.750755591376184, "grad_norm": 0.050599128007888794, "learning_rate": 4.584691771408866e-05, "loss": 0.1626, "step": 13652 }, { "epoch": 2.751158573443482, "grad_norm": 0.051085565239191055, "learning_rate": 4.583363739768763e-05, "loss": 0.1839, "step": 13654 }, { "epoch": 2.7515615555107797, "grad_norm": 0.07226687669754028, "learning_rate": 4.582035737725829e-05, "loss": 0.2334, "step": 13656 }, { "epoch": 2.7519645375780777, "grad_norm": 0.06962387263774872, "learning_rate": 4.5807077653744116e-05, "loss": 0.2079, "step": 13658 }, { "epoch": 2.7523675196453756, "grad_norm": 0.08981406688690186, "learning_rate": 4.579379822808841e-05, "loss": 0.1935, "step": 13660 }, { "epoch": 2.752770501712674, "grad_norm": 0.05534931644797325, "learning_rate": 4.578051910123458e-05, "loss": 0.1722, "step": 13662 }, { "epoch": 2.753173483779972, "grad_norm": 0.07691693305969238, "learning_rate": 4.5767240274125904e-05, "loss": 0.165, "step": 13664 }, { "epoch": 2.75357646584727, "grad_norm": 0.05576300621032715, "learning_rate": 4.5753961747705726e-05, "loss": 0.2217, "step": 13666 }, { "epoch": 2.753979447914568, "grad_norm": 0.05548671633005142, "learning_rate": 4.574068352291729e-05, "loss": 0.2706, "step": 13668 }, { "epoch": 2.7543824299818658, "grad_norm": 0.04379713162779808, "learning_rate": 4.572740560070391e-05, "loss": 0.157, "step": 13670 }, { "epoch": 2.7547854120491637, "grad_norm": 0.052931223064661026, "learning_rate": 4.571412798200878e-05, "loss": 0.2155, "step": 13672 }, { "epoch": 2.7551883941164617, "grad_norm": 0.05508912354707718, "learning_rate": 4.5700850667775166e-05, "loss": 0.2049, "step": 13674 }, { "epoch": 2.7555913761837596, "grad_norm": 0.048319194465875626, "learning_rate": 4.568757365894623e-05, "loss": 0.1545, "step": 13676 }, { "epoch": 2.755994358251058, "grad_norm": 0.05623358115553856, "learning_rate": 4.567429695646518e-05, "loss": 0.1878, "step": 13678 }, { "epoch": 2.756397340318356, "grad_norm": 0.04395512863993645, "learning_rate": 4.566102056127513e-05, "loss": 0.1332, "step": 13680 }, { "epoch": 2.756800322385654, "grad_norm": 0.0522671602666378, "learning_rate": 4.564774447431927e-05, "loss": 0.2048, "step": 13682 }, { "epoch": 2.757203304452952, "grad_norm": 0.05952902510762215, "learning_rate": 4.563446869654066e-05, "loss": 0.1795, "step": 13684 }, { "epoch": 2.7576062865202497, "grad_norm": 0.04310224950313568, "learning_rate": 4.562119322888243e-05, "loss": 0.1426, "step": 13686 }, { "epoch": 2.7580092685875477, "grad_norm": 0.06073429435491562, "learning_rate": 4.560791807228761e-05, "loss": 0.1593, "step": 13688 }, { "epoch": 2.758412250654846, "grad_norm": 0.07317265123128891, "learning_rate": 4.559464322769929e-05, "loss": 0.1992, "step": 13690 }, { "epoch": 2.758815232722144, "grad_norm": 0.07108186930418015, "learning_rate": 4.558136869606045e-05, "loss": 0.1988, "step": 13692 }, { "epoch": 2.759218214789442, "grad_norm": 0.054733678698539734, "learning_rate": 4.556809447831412e-05, "loss": 0.1376, "step": 13694 }, { "epoch": 2.75962119685674, "grad_norm": 0.06446659564971924, "learning_rate": 4.5554820575403265e-05, "loss": 0.1946, "step": 13696 }, { "epoch": 2.760024178924038, "grad_norm": 0.05727505311369896, "learning_rate": 4.5541546988270856e-05, "loss": 0.1917, "step": 13698 }, { "epoch": 2.760427160991336, "grad_norm": 0.05168221890926361, "learning_rate": 4.552827371785981e-05, "loss": 0.2005, "step": 13700 }, { "epoch": 2.7608301430586337, "grad_norm": 0.06143752112984657, "learning_rate": 4.551500076511306e-05, "loss": 0.2234, "step": 13702 }, { "epoch": 2.7612331251259317, "grad_norm": 0.05393059179186821, "learning_rate": 4.550172813097346e-05, "loss": 0.1632, "step": 13704 }, { "epoch": 2.76163610719323, "grad_norm": 0.05636855214834213, "learning_rate": 4.548845581638392e-05, "loss": 0.1491, "step": 13706 }, { "epoch": 2.762039089260528, "grad_norm": 0.04566177725791931, "learning_rate": 4.547518382228725e-05, "loss": 0.1682, "step": 13708 }, { "epoch": 2.762442071327826, "grad_norm": 0.05374148488044739, "learning_rate": 4.54619121496263e-05, "loss": 0.2025, "step": 13710 }, { "epoch": 2.762845053395124, "grad_norm": 0.05850568413734436, "learning_rate": 4.544864079934385e-05, "loss": 0.2037, "step": 13712 }, { "epoch": 2.763248035462422, "grad_norm": 0.04666848108172417, "learning_rate": 4.543536977238268e-05, "loss": 0.1904, "step": 13714 }, { "epoch": 2.76365101752972, "grad_norm": 0.0654207244515419, "learning_rate": 4.542209906968551e-05, "loss": 0.197, "step": 13716 }, { "epoch": 2.764053999597018, "grad_norm": 0.050538092851638794, "learning_rate": 4.540882869219515e-05, "loss": 0.1964, "step": 13718 }, { "epoch": 2.764456981664316, "grad_norm": 0.10446475446224213, "learning_rate": 4.539555864085422e-05, "loss": 0.2004, "step": 13720 }, { "epoch": 2.764859963731614, "grad_norm": 0.046087007969617844, "learning_rate": 4.538228891660546e-05, "loss": 0.1674, "step": 13722 }, { "epoch": 2.765262945798912, "grad_norm": 0.05671662464737892, "learning_rate": 4.53690195203915e-05, "loss": 0.2174, "step": 13724 }, { "epoch": 2.76566592786621, "grad_norm": 0.0580022819340229, "learning_rate": 4.5355750453155e-05, "loss": 0.1821, "step": 13726 }, { "epoch": 2.766068909933508, "grad_norm": 0.05730949342250824, "learning_rate": 4.534248171583854e-05, "loss": 0.2026, "step": 13728 }, { "epoch": 2.766471892000806, "grad_norm": 0.04864330217242241, "learning_rate": 4.532921330938476e-05, "loss": 0.2253, "step": 13730 }, { "epoch": 2.7668748740681037, "grad_norm": 0.06228374317288399, "learning_rate": 4.531594523473616e-05, "loss": 0.1941, "step": 13732 }, { "epoch": 2.767277856135402, "grad_norm": 0.08437447249889374, "learning_rate": 4.530267749283535e-05, "loss": 0.2316, "step": 13734 }, { "epoch": 2.7676808382027, "grad_norm": 0.06000857055187225, "learning_rate": 4.52894100846248e-05, "loss": 0.1896, "step": 13736 }, { "epoch": 2.768083820269998, "grad_norm": 0.062360458076000214, "learning_rate": 4.527614301104704e-05, "loss": 0.1581, "step": 13738 }, { "epoch": 2.768486802337296, "grad_norm": 0.04638027027249336, "learning_rate": 4.526287627304451e-05, "loss": 0.1657, "step": 13740 }, { "epoch": 2.768889784404594, "grad_norm": 0.06518687307834625, "learning_rate": 4.5249609871559693e-05, "loss": 0.205, "step": 13742 }, { "epoch": 2.7692927664718923, "grad_norm": 0.04653553292155266, "learning_rate": 4.5236343807534964e-05, "loss": 0.1813, "step": 13744 }, { "epoch": 2.76969574853919, "grad_norm": 0.04413139447569847, "learning_rate": 4.522307808191278e-05, "loss": 0.1976, "step": 13746 }, { "epoch": 2.770098730606488, "grad_norm": 0.07281026244163513, "learning_rate": 4.520981269563548e-05, "loss": 0.2496, "step": 13748 }, { "epoch": 2.770501712673786, "grad_norm": 0.03691798821091652, "learning_rate": 4.5196547649645426e-05, "loss": 0.1752, "step": 13750 }, { "epoch": 2.770904694741084, "grad_norm": 0.055397164076566696, "learning_rate": 4.518328294488494e-05, "loss": 0.1883, "step": 13752 }, { "epoch": 2.771307676808382, "grad_norm": 0.08555757254362106, "learning_rate": 4.517001858229634e-05, "loss": 0.1912, "step": 13754 }, { "epoch": 2.77171065887568, "grad_norm": 0.04360605776309967, "learning_rate": 4.515675456282188e-05, "loss": 0.1671, "step": 13756 }, { "epoch": 2.772113640942978, "grad_norm": 0.06162875518202782, "learning_rate": 4.5143490887403844e-05, "loss": 0.189, "step": 13758 }, { "epoch": 2.772516623010276, "grad_norm": 0.08789025247097015, "learning_rate": 4.513022755698444e-05, "loss": 0.1658, "step": 13760 }, { "epoch": 2.772919605077574, "grad_norm": 0.05798583850264549, "learning_rate": 4.51169645725059e-05, "loss": 0.209, "step": 13762 }, { "epoch": 2.773322587144872, "grad_norm": 0.07589727640151978, "learning_rate": 4.510370193491037e-05, "loss": 0.1956, "step": 13764 }, { "epoch": 2.77372556921217, "grad_norm": 0.05734777823090553, "learning_rate": 4.509043964514003e-05, "loss": 0.1661, "step": 13766 }, { "epoch": 2.774128551279468, "grad_norm": 0.07661935687065125, "learning_rate": 4.507717770413699e-05, "loss": 0.1956, "step": 13768 }, { "epoch": 2.774531533346766, "grad_norm": 0.07800666242837906, "learning_rate": 4.5063916112843394e-05, "loss": 0.2079, "step": 13770 }, { "epoch": 2.7749345154140643, "grad_norm": 0.05529240146279335, "learning_rate": 4.50506548722013e-05, "loss": 0.1785, "step": 13772 }, { "epoch": 2.7753374974813623, "grad_norm": 0.052781157195568085, "learning_rate": 4.503739398315277e-05, "loss": 0.1843, "step": 13774 }, { "epoch": 2.77574047954866, "grad_norm": 0.04509377107024193, "learning_rate": 4.502413344663983e-05, "loss": 0.1793, "step": 13776 }, { "epoch": 2.776143461615958, "grad_norm": 0.04552818834781647, "learning_rate": 4.501087326360449e-05, "loss": 0.1808, "step": 13778 }, { "epoch": 2.776546443683256, "grad_norm": 0.07951189577579498, "learning_rate": 4.499761343498873e-05, "loss": 0.2475, "step": 13780 }, { "epoch": 2.776949425750554, "grad_norm": 0.06761814653873444, "learning_rate": 4.498435396173453e-05, "loss": 0.2362, "step": 13782 }, { "epoch": 2.777352407817852, "grad_norm": 0.0560927577316761, "learning_rate": 4.497109484478378e-05, "loss": 0.1828, "step": 13784 }, { "epoch": 2.77775538988515, "grad_norm": 0.05709867179393768, "learning_rate": 4.4957836085078426e-05, "loss": 0.1846, "step": 13786 }, { "epoch": 2.778158371952448, "grad_norm": 0.07907452434301376, "learning_rate": 4.4944577683560325e-05, "loss": 0.1647, "step": 13788 }, { "epoch": 2.7785613540197462, "grad_norm": 0.05839816480875015, "learning_rate": 4.493131964117135e-05, "loss": 0.149, "step": 13790 }, { "epoch": 2.778964336087044, "grad_norm": 0.06229160353541374, "learning_rate": 4.49180619588533e-05, "loss": 0.2122, "step": 13792 }, { "epoch": 2.779367318154342, "grad_norm": 0.04715651273727417, "learning_rate": 4.490480463754804e-05, "loss": 0.1707, "step": 13794 }, { "epoch": 2.77977030022164, "grad_norm": 0.06249184161424637, "learning_rate": 4.489154767819727e-05, "loss": 0.2341, "step": 13796 }, { "epoch": 2.780173282288938, "grad_norm": 0.05317969247698784, "learning_rate": 4.48782910817428e-05, "loss": 0.1957, "step": 13798 }, { "epoch": 2.7805762643562364, "grad_norm": 0.04029626026749611, "learning_rate": 4.4865034849126336e-05, "loss": 0.165, "step": 13800 }, { "epoch": 2.7809792464235343, "grad_norm": 0.04745176434516907, "learning_rate": 4.485177898128957e-05, "loss": 0.1561, "step": 13802 }, { "epoch": 2.7813822284908323, "grad_norm": 0.04458378627896309, "learning_rate": 4.483852347917423e-05, "loss": 0.1586, "step": 13804 }, { "epoch": 2.78178521055813, "grad_norm": 0.06560066342353821, "learning_rate": 4.4825268343721896e-05, "loss": 0.2025, "step": 13806 }, { "epoch": 2.782188192625428, "grad_norm": 0.055546555668115616, "learning_rate": 4.481201357587424e-05, "loss": 0.1803, "step": 13808 }, { "epoch": 2.782591174692726, "grad_norm": 0.05878676101565361, "learning_rate": 4.479875917657284e-05, "loss": 0.2401, "step": 13810 }, { "epoch": 2.782994156760024, "grad_norm": 0.053903158754110336, "learning_rate": 4.478550514675927e-05, "loss": 0.1834, "step": 13812 }, { "epoch": 2.783397138827322, "grad_norm": 0.05280616879463196, "learning_rate": 4.477225148737506e-05, "loss": 0.1777, "step": 13814 }, { "epoch": 2.78380012089462, "grad_norm": 0.05329929664731026, "learning_rate": 4.4758998199361765e-05, "loss": 0.1804, "step": 13816 }, { "epoch": 2.7842031029619183, "grad_norm": 0.06433535367250443, "learning_rate": 4.4745745283660835e-05, "loss": 0.2252, "step": 13818 }, { "epoch": 2.7846060850292162, "grad_norm": 0.030351370573043823, "learning_rate": 4.4732492741213776e-05, "loss": 0.152, "step": 13820 }, { "epoch": 2.785009067096514, "grad_norm": 0.055486053228378296, "learning_rate": 4.471924057296199e-05, "loss": 0.1758, "step": 13822 }, { "epoch": 2.785412049163812, "grad_norm": 0.05568544939160347, "learning_rate": 4.470598877984693e-05, "loss": 0.1725, "step": 13824 }, { "epoch": 2.78581503123111, "grad_norm": 0.06160859763622284, "learning_rate": 4.469273736280994e-05, "loss": 0.2058, "step": 13826 }, { "epoch": 2.7862180132984085, "grad_norm": 0.0535859577357769, "learning_rate": 4.467948632279243e-05, "loss": 0.203, "step": 13828 }, { "epoch": 2.7866209953657064, "grad_norm": 0.06398488581180573, "learning_rate": 4.4666235660735665e-05, "loss": 0.2098, "step": 13830 }, { "epoch": 2.7870239774330043, "grad_norm": 0.06315203756093979, "learning_rate": 4.4652985377581016e-05, "loss": 0.188, "step": 13832 }, { "epoch": 2.7874269595003023, "grad_norm": 0.045979224145412445, "learning_rate": 4.463973547426972e-05, "loss": 0.1747, "step": 13834 }, { "epoch": 2.7878299415676002, "grad_norm": 0.04362760856747627, "learning_rate": 4.4626485951743055e-05, "loss": 0.1939, "step": 13836 }, { "epoch": 2.788232923634898, "grad_norm": 0.04942842200398445, "learning_rate": 4.461323681094223e-05, "loss": 0.2127, "step": 13838 }, { "epoch": 2.788635905702196, "grad_norm": 0.06796036660671234, "learning_rate": 4.459998805280845e-05, "loss": 0.1717, "step": 13840 }, { "epoch": 2.789038887769494, "grad_norm": 0.05861689895391464, "learning_rate": 4.458673967828286e-05, "loss": 0.1831, "step": 13842 }, { "epoch": 2.789441869836792, "grad_norm": 0.06883051246404648, "learning_rate": 4.457349168830665e-05, "loss": 0.1663, "step": 13844 }, { "epoch": 2.7898448519040904, "grad_norm": 0.05223101004958153, "learning_rate": 4.4560244083820904e-05, "loss": 0.1898, "step": 13846 }, { "epoch": 2.7902478339713883, "grad_norm": 0.0574951246380806, "learning_rate": 4.454699686576673e-05, "loss": 0.2478, "step": 13848 }, { "epoch": 2.7906508160386863, "grad_norm": 0.05982831120491028, "learning_rate": 4.453375003508516e-05, "loss": 0.1523, "step": 13850 }, { "epoch": 2.791053798105984, "grad_norm": 0.059536661952733994, "learning_rate": 4.452050359271726e-05, "loss": 0.1827, "step": 13852 }, { "epoch": 2.791456780173282, "grad_norm": 0.06197162717580795, "learning_rate": 4.4507257539604e-05, "loss": 0.1867, "step": 13854 }, { "epoch": 2.7918597622405805, "grad_norm": 0.057462189346551895, "learning_rate": 4.4494011876686407e-05, "loss": 0.1587, "step": 13856 }, { "epoch": 2.7922627443078785, "grad_norm": 0.044567789882421494, "learning_rate": 4.448076660490539e-05, "loss": 0.2258, "step": 13858 }, { "epoch": 2.7926657263751764, "grad_norm": 0.06285153329372406, "learning_rate": 4.446752172520189e-05, "loss": 0.1995, "step": 13860 }, { "epoch": 2.7930687084424743, "grad_norm": 0.059338536113500595, "learning_rate": 4.44542772385168e-05, "loss": 0.2036, "step": 13862 }, { "epoch": 2.7934716905097723, "grad_norm": 0.041714541614055634, "learning_rate": 4.4441033145790986e-05, "loss": 0.1538, "step": 13864 }, { "epoch": 2.7938746725770702, "grad_norm": 0.04737304151058197, "learning_rate": 4.442778944796527e-05, "loss": 0.174, "step": 13866 }, { "epoch": 2.794277654644368, "grad_norm": 0.07135643810033798, "learning_rate": 4.441454614598051e-05, "loss": 0.2362, "step": 13868 }, { "epoch": 2.794680636711666, "grad_norm": 0.05102970823645592, "learning_rate": 4.440130324077744e-05, "loss": 0.1533, "step": 13870 }, { "epoch": 2.7950836187789645, "grad_norm": 0.06961221992969513, "learning_rate": 4.4388060733296846e-05, "loss": 0.2393, "step": 13872 }, { "epoch": 2.7954866008462624, "grad_norm": 0.07168768346309662, "learning_rate": 4.437481862447943e-05, "loss": 0.2357, "step": 13874 }, { "epoch": 2.7958895829135604, "grad_norm": 0.05309915915131569, "learning_rate": 4.436157691526592e-05, "loss": 0.1884, "step": 13876 }, { "epoch": 2.7962925649808583, "grad_norm": 0.07483027130365372, "learning_rate": 4.434833560659694e-05, "loss": 0.219, "step": 13878 }, { "epoch": 2.7966955470481563, "grad_norm": 0.06671711802482605, "learning_rate": 4.4335094699413196e-05, "loss": 0.2298, "step": 13880 }, { "epoch": 2.797098529115454, "grad_norm": 0.050368938595056534, "learning_rate": 4.432185419465523e-05, "loss": 0.1488, "step": 13882 }, { "epoch": 2.7975015111827526, "grad_norm": 0.0435597226023674, "learning_rate": 4.4308614093263684e-05, "loss": 0.1721, "step": 13884 }, { "epoch": 2.7979044932500505, "grad_norm": 0.06164487451314926, "learning_rate": 4.429537439617908e-05, "loss": 0.1786, "step": 13886 }, { "epoch": 2.7983074753173485, "grad_norm": 0.07364491373300552, "learning_rate": 4.428213510434197e-05, "loss": 0.2253, "step": 13888 }, { "epoch": 2.7987104573846464, "grad_norm": 0.04716651514172554, "learning_rate": 4.426889621869281e-05, "loss": 0.2165, "step": 13890 }, { "epoch": 2.7991134394519444, "grad_norm": 0.046748001128435135, "learning_rate": 4.425565774017213e-05, "loss": 0.1213, "step": 13892 }, { "epoch": 2.7995164215192423, "grad_norm": 0.055533722043037415, "learning_rate": 4.424241966972031e-05, "loss": 0.177, "step": 13894 }, { "epoch": 2.7999194035865402, "grad_norm": 0.09642328321933746, "learning_rate": 4.42291820082778e-05, "loss": 0.2075, "step": 13896 }, { "epoch": 2.800322385653838, "grad_norm": 0.04621008038520813, "learning_rate": 4.421594475678497e-05, "loss": 0.1543, "step": 13898 }, { "epoch": 2.8007253677211366, "grad_norm": 0.0577290914952755, "learning_rate": 4.4202707916182185e-05, "loss": 0.1797, "step": 13900 }, { "epoch": 2.8011283497884345, "grad_norm": 0.056990884244441986, "learning_rate": 4.418947148740974e-05, "loss": 0.1922, "step": 13902 }, { "epoch": 2.8015313318557324, "grad_norm": 0.0492350235581398, "learning_rate": 4.417623547140797e-05, "loss": 0.1559, "step": 13904 }, { "epoch": 2.8019343139230304, "grad_norm": 0.05712248757481575, "learning_rate": 4.416299986911709e-05, "loss": 0.1663, "step": 13906 }, { "epoch": 2.8023372959903283, "grad_norm": 0.062008894979953766, "learning_rate": 4.414976468147739e-05, "loss": 0.1751, "step": 13908 }, { "epoch": 2.8027402780576267, "grad_norm": 0.05482635647058487, "learning_rate": 4.413652990942904e-05, "loss": 0.1928, "step": 13910 }, { "epoch": 2.8031432601249247, "grad_norm": 0.05846606567502022, "learning_rate": 4.4123295553912233e-05, "loss": 0.1886, "step": 13912 }, { "epoch": 2.8035462421922226, "grad_norm": 0.039809294044971466, "learning_rate": 4.41100616158671e-05, "loss": 0.1723, "step": 13914 }, { "epoch": 2.8039492242595205, "grad_norm": 0.0680151879787445, "learning_rate": 4.409682809623379e-05, "loss": 0.2079, "step": 13916 }, { "epoch": 2.8043522063268185, "grad_norm": 0.05849350243806839, "learning_rate": 4.408359499595234e-05, "loss": 0.2057, "step": 13918 }, { "epoch": 2.8047551883941164, "grad_norm": 0.07199438661336899, "learning_rate": 4.4070362315962866e-05, "loss": 0.1936, "step": 13920 }, { "epoch": 2.8051581704614144, "grad_norm": 0.0518622063100338, "learning_rate": 4.405713005720536e-05, "loss": 0.1634, "step": 13922 }, { "epoch": 2.8055611525287123, "grad_norm": 0.09157627820968628, "learning_rate": 4.4043898220619836e-05, "loss": 0.1812, "step": 13924 }, { "epoch": 2.8059641345960102, "grad_norm": 0.06180949881672859, "learning_rate": 4.403066680714625e-05, "loss": 0.2044, "step": 13926 }, { "epoch": 2.8063671166633086, "grad_norm": 0.062134623527526855, "learning_rate": 4.401743581772456e-05, "loss": 0.2088, "step": 13928 }, { "epoch": 2.8067700987306066, "grad_norm": 0.06650009751319885, "learning_rate": 4.400420525329464e-05, "loss": 0.1917, "step": 13930 }, { "epoch": 2.8071730807979045, "grad_norm": 0.07102461159229279, "learning_rate": 4.399097511479641e-05, "loss": 0.2143, "step": 13932 }, { "epoch": 2.8075760628652024, "grad_norm": 0.058678820729255676, "learning_rate": 4.3977745403169686e-05, "loss": 0.2062, "step": 13934 }, { "epoch": 2.8079790449325004, "grad_norm": 0.04873026907444, "learning_rate": 4.396451611935431e-05, "loss": 0.1855, "step": 13936 }, { "epoch": 2.8083820269997988, "grad_norm": 0.051642045378685, "learning_rate": 4.395128726429004e-05, "loss": 0.151, "step": 13938 }, { "epoch": 2.8087850090670967, "grad_norm": 0.06375691294670105, "learning_rate": 4.393805883891667e-05, "loss": 0.222, "step": 13940 }, { "epoch": 2.8091879911343947, "grad_norm": 0.08424566686153412, "learning_rate": 4.392483084417388e-05, "loss": 0.1753, "step": 13942 }, { "epoch": 2.8095909732016926, "grad_norm": 0.060530856251716614, "learning_rate": 4.391160328100142e-05, "loss": 0.1772, "step": 13944 }, { "epoch": 2.8099939552689905, "grad_norm": 0.06497149169445038, "learning_rate": 4.3898376150338896e-05, "loss": 0.1854, "step": 13946 }, { "epoch": 2.8103969373362885, "grad_norm": 0.12235381454229355, "learning_rate": 4.388514945312599e-05, "loss": 0.1871, "step": 13948 }, { "epoch": 2.8107999194035864, "grad_norm": 0.05921998620033264, "learning_rate": 4.387192319030229e-05, "loss": 0.1796, "step": 13950 }, { "epoch": 2.8112029014708844, "grad_norm": 0.04586871340870857, "learning_rate": 4.3858697362807365e-05, "loss": 0.152, "step": 13952 }, { "epoch": 2.8116058835381823, "grad_norm": 0.06341541558504105, "learning_rate": 4.384547197158074e-05, "loss": 0.1648, "step": 13954 }, { "epoch": 2.8120088656054807, "grad_norm": 0.08245648443698883, "learning_rate": 4.383224701756197e-05, "loss": 0.2385, "step": 13956 }, { "epoch": 2.8124118476727786, "grad_norm": 0.04495750740170479, "learning_rate": 4.381902250169048e-05, "loss": 0.218, "step": 13958 }, { "epoch": 2.8128148297400766, "grad_norm": 0.03344697132706642, "learning_rate": 4.380579842490577e-05, "loss": 0.1628, "step": 13960 }, { "epoch": 2.8132178118073745, "grad_norm": 0.05339306592941284, "learning_rate": 4.3792574788147224e-05, "loss": 0.1534, "step": 13962 }, { "epoch": 2.8136207938746725, "grad_norm": 0.06429272145032883, "learning_rate": 4.3779351592354246e-05, "loss": 0.2016, "step": 13964 }, { "epoch": 2.814023775941971, "grad_norm": 0.06380239874124527, "learning_rate": 4.376612883846617e-05, "loss": 0.2009, "step": 13966 }, { "epoch": 2.814426758009269, "grad_norm": 0.05741169676184654, "learning_rate": 4.3752906527422346e-05, "loss": 0.2213, "step": 13968 }, { "epoch": 2.8148297400765667, "grad_norm": 0.052648428827524185, "learning_rate": 4.373968466016202e-05, "loss": 0.1926, "step": 13970 }, { "epoch": 2.8152327221438647, "grad_norm": 0.053499605506658554, "learning_rate": 4.3726463237624517e-05, "loss": 0.2153, "step": 13972 }, { "epoch": 2.8156357042111626, "grad_norm": 0.06318365782499313, "learning_rate": 4.371324226074902e-05, "loss": 0.166, "step": 13974 }, { "epoch": 2.8160386862784605, "grad_norm": 0.07373753190040588, "learning_rate": 4.3700021730474745e-05, "loss": 0.1996, "step": 13976 }, { "epoch": 2.8164416683457585, "grad_norm": 0.05533366650342941, "learning_rate": 4.3686801647740846e-05, "loss": 0.154, "step": 13978 }, { "epoch": 2.8168446504130564, "grad_norm": 0.06208839640021324, "learning_rate": 4.367358201348647e-05, "loss": 0.1526, "step": 13980 }, { "epoch": 2.8172476324803544, "grad_norm": 0.056060247123241425, "learning_rate": 4.366036282865068e-05, "loss": 0.1649, "step": 13982 }, { "epoch": 2.8176506145476528, "grad_norm": 0.07038706541061401, "learning_rate": 4.364714409417261e-05, "loss": 0.2252, "step": 13984 }, { "epoch": 2.8180535966149507, "grad_norm": 0.055569134652614594, "learning_rate": 4.363392581099125e-05, "loss": 0.1669, "step": 13986 }, { "epoch": 2.8184565786822486, "grad_norm": 0.06313623487949371, "learning_rate": 4.362070798004563e-05, "loss": 0.1651, "step": 13988 }, { "epoch": 2.8188595607495466, "grad_norm": 0.08299045264720917, "learning_rate": 4.360749060227469e-05, "loss": 0.1541, "step": 13990 }, { "epoch": 2.8192625428168445, "grad_norm": 0.06302808970212936, "learning_rate": 4.359427367861742e-05, "loss": 0.1875, "step": 13992 }, { "epoch": 2.819665524884143, "grad_norm": 0.0563676692545414, "learning_rate": 4.3581057210012676e-05, "loss": 0.2123, "step": 13994 }, { "epoch": 2.820068506951441, "grad_norm": 0.06105897203087807, "learning_rate": 4.356784119739939e-05, "loss": 0.2102, "step": 13996 }, { "epoch": 2.820471489018739, "grad_norm": 0.06385830044746399, "learning_rate": 4.3554625641716355e-05, "loss": 0.1786, "step": 13998 }, { "epoch": 2.8208744710860367, "grad_norm": 0.04783019796013832, "learning_rate": 4.354141054390243e-05, "loss": 0.1909, "step": 14000 }, { "epoch": 2.8212774531533347, "grad_norm": 0.06092913821339607, "learning_rate": 4.352819590489635e-05, "loss": 0.16, "step": 14002 }, { "epoch": 2.8216804352206326, "grad_norm": 0.06487409770488739, "learning_rate": 4.35149817256369e-05, "loss": 0.2304, "step": 14004 }, { "epoch": 2.8220834172879306, "grad_norm": 0.060342274606227875, "learning_rate": 4.3501768007062754e-05, "loss": 0.202, "step": 14006 }, { "epoch": 2.8224863993552285, "grad_norm": 0.05749595910310745, "learning_rate": 4.348855475011264e-05, "loss": 0.1563, "step": 14008 }, { "epoch": 2.8228893814225264, "grad_norm": 0.06227300316095352, "learning_rate": 4.347534195572517e-05, "loss": 0.1511, "step": 14010 }, { "epoch": 2.823292363489825, "grad_norm": 0.07745243608951569, "learning_rate": 4.3462129624838984e-05, "loss": 0.183, "step": 14012 }, { "epoch": 2.8236953455571228, "grad_norm": 0.06026535481214523, "learning_rate": 4.344891775839264e-05, "loss": 0.1633, "step": 14014 }, { "epoch": 2.8240983276244207, "grad_norm": 0.08717188239097595, "learning_rate": 4.3435706357324716e-05, "loss": 0.1903, "step": 14016 }, { "epoch": 2.8245013096917186, "grad_norm": 0.0764845758676529, "learning_rate": 4.342249542257369e-05, "loss": 0.2251, "step": 14018 }, { "epoch": 2.8249042917590166, "grad_norm": 0.07623816281557083, "learning_rate": 4.340928495507811e-05, "loss": 0.184, "step": 14020 }, { "epoch": 2.825307273826315, "grad_norm": 0.05312751606106758, "learning_rate": 4.339607495577634e-05, "loss": 0.21, "step": 14022 }, { "epoch": 2.825710255893613, "grad_norm": 0.05700031667947769, "learning_rate": 4.3382865425606875e-05, "loss": 0.215, "step": 14024 }, { "epoch": 2.826113237960911, "grad_norm": 0.07716810703277588, "learning_rate": 4.336965636550806e-05, "loss": 0.1729, "step": 14026 }, { "epoch": 2.826516220028209, "grad_norm": 0.10202040523290634, "learning_rate": 4.335644777641826e-05, "loss": 0.1678, "step": 14028 }, { "epoch": 2.8269192020955067, "grad_norm": 0.0468902550637722, "learning_rate": 4.3343239659275764e-05, "loss": 0.1965, "step": 14030 }, { "epoch": 2.8273221841628047, "grad_norm": 0.06173230707645416, "learning_rate": 4.3330032015018905e-05, "loss": 0.1859, "step": 14032 }, { "epoch": 2.8277251662301026, "grad_norm": 0.06326699256896973, "learning_rate": 4.331682484458588e-05, "loss": 0.176, "step": 14034 }, { "epoch": 2.8281281482974006, "grad_norm": 0.058440111577510834, "learning_rate": 4.3303618148914944e-05, "loss": 0.1853, "step": 14036 }, { "epoch": 2.8285311303646985, "grad_norm": 0.05458078905940056, "learning_rate": 4.329041192894426e-05, "loss": 0.1649, "step": 14038 }, { "epoch": 2.828934112431997, "grad_norm": 0.046770691871643066, "learning_rate": 4.3277206185611986e-05, "loss": 0.1496, "step": 14040 }, { "epoch": 2.829337094499295, "grad_norm": 0.057294707745313644, "learning_rate": 4.326400091985623e-05, "loss": 0.1922, "step": 14042 }, { "epoch": 2.8297400765665928, "grad_norm": 0.06126628443598747, "learning_rate": 4.325079613261508e-05, "loss": 0.1622, "step": 14044 }, { "epoch": 2.8301430586338907, "grad_norm": 0.047013264149427414, "learning_rate": 4.3237591824826565e-05, "loss": 0.1449, "step": 14046 }, { "epoch": 2.8305460407011886, "grad_norm": 0.086411252617836, "learning_rate": 4.3224387997428726e-05, "loss": 0.1789, "step": 14048 }, { "epoch": 2.830949022768487, "grad_norm": 0.049013204872608185, "learning_rate": 4.321118465135952e-05, "loss": 0.1821, "step": 14050 }, { "epoch": 2.831352004835785, "grad_norm": 0.048979319632053375, "learning_rate": 4.319798178755691e-05, "loss": 0.1656, "step": 14052 }, { "epoch": 2.831754986903083, "grad_norm": 0.05120906978845596, "learning_rate": 4.3184779406958785e-05, "loss": 0.1742, "step": 14054 }, { "epoch": 2.832157968970381, "grad_norm": 0.05259276553988457, "learning_rate": 4.3171577510503046e-05, "loss": 0.1767, "step": 14056 }, { "epoch": 2.832560951037679, "grad_norm": 0.06700893491506577, "learning_rate": 4.31583760991275e-05, "loss": 0.2277, "step": 14058 }, { "epoch": 2.8329639331049767, "grad_norm": 0.05070505291223526, "learning_rate": 4.314517517376999e-05, "loss": 0.1628, "step": 14060 }, { "epoch": 2.8333669151722747, "grad_norm": 0.03246789425611496, "learning_rate": 4.313197473536826e-05, "loss": 0.185, "step": 14062 }, { "epoch": 2.8337698972395726, "grad_norm": 0.08361760526895523, "learning_rate": 4.311877478486007e-05, "loss": 0.1882, "step": 14064 }, { "epoch": 2.834172879306871, "grad_norm": 0.04620158672332764, "learning_rate": 4.310557532318311e-05, "loss": 0.1459, "step": 14066 }, { "epoch": 2.834575861374169, "grad_norm": 0.05384652316570282, "learning_rate": 4.309237635127507e-05, "loss": 0.2281, "step": 14068 }, { "epoch": 2.834978843441467, "grad_norm": 0.06559676676988602, "learning_rate": 4.307917787007353e-05, "loss": 0.1756, "step": 14070 }, { "epoch": 2.835381825508765, "grad_norm": 0.05740681663155556, "learning_rate": 4.306597988051615e-05, "loss": 0.2154, "step": 14072 }, { "epoch": 2.8357848075760628, "grad_norm": 0.05628305301070213, "learning_rate": 4.305278238354047e-05, "loss": 0.2091, "step": 14074 }, { "epoch": 2.8361877896433607, "grad_norm": 0.032225411385297775, "learning_rate": 4.3039585380084025e-05, "loss": 0.1544, "step": 14076 }, { "epoch": 2.836590771710659, "grad_norm": 0.056562524288892746, "learning_rate": 4.302638887108429e-05, "loss": 0.1389, "step": 14078 }, { "epoch": 2.836993753777957, "grad_norm": 0.07340873777866364, "learning_rate": 4.301319285747875e-05, "loss": 0.2385, "step": 14080 }, { "epoch": 2.837396735845255, "grad_norm": 0.058820270001888275, "learning_rate": 4.299999734020479e-05, "loss": 0.1836, "step": 14082 }, { "epoch": 2.837799717912553, "grad_norm": 0.08579188585281372, "learning_rate": 4.2986802320199866e-05, "loss": 0.1922, "step": 14084 }, { "epoch": 2.838202699979851, "grad_norm": 0.05140161141753197, "learning_rate": 4.297360779840125e-05, "loss": 0.1919, "step": 14086 }, { "epoch": 2.838605682047149, "grad_norm": 0.05750875547528267, "learning_rate": 4.296041377574632e-05, "loss": 0.1504, "step": 14088 }, { "epoch": 2.8390086641144467, "grad_norm": 0.06645703315734863, "learning_rate": 4.294722025317233e-05, "loss": 0.1537, "step": 14090 }, { "epoch": 2.8394116461817447, "grad_norm": 0.05591237172484398, "learning_rate": 4.2934027231616545e-05, "loss": 0.2056, "step": 14092 }, { "epoch": 2.839814628249043, "grad_norm": 0.05866971239447594, "learning_rate": 4.2920834712016136e-05, "loss": 0.1648, "step": 14094 }, { "epoch": 2.840217610316341, "grad_norm": 0.05585728958249092, "learning_rate": 4.290764269530835e-05, "loss": 0.2098, "step": 14096 }, { "epoch": 2.840620592383639, "grad_norm": 0.05240754410624504, "learning_rate": 4.289445118243024e-05, "loss": 0.1762, "step": 14098 }, { "epoch": 2.841023574450937, "grad_norm": 0.06093365326523781, "learning_rate": 4.2881260174318984e-05, "loss": 0.1604, "step": 14100 }, { "epoch": 2.841426556518235, "grad_norm": 0.053919680416584015, "learning_rate": 4.286806967191161e-05, "loss": 0.2471, "step": 14102 }, { "epoch": 2.8418295385855328, "grad_norm": 0.054130956530570984, "learning_rate": 4.2854879676145166e-05, "loss": 0.1734, "step": 14104 }, { "epoch": 2.842232520652831, "grad_norm": 0.055720653384923935, "learning_rate": 4.284169018795664e-05, "loss": 0.1855, "step": 14106 }, { "epoch": 2.842635502720129, "grad_norm": 0.06489083170890808, "learning_rate": 4.2828501208283e-05, "loss": 0.1979, "step": 14108 }, { "epoch": 2.843038484787427, "grad_norm": 0.043121401220560074, "learning_rate": 4.2815312738061145e-05, "loss": 0.1592, "step": 14110 }, { "epoch": 2.843441466854725, "grad_norm": 0.06269125640392303, "learning_rate": 4.2802124778228e-05, "loss": 0.2043, "step": 14112 }, { "epoch": 2.843844448922023, "grad_norm": 0.04860043525695801, "learning_rate": 4.27889373297204e-05, "loss": 0.1608, "step": 14114 }, { "epoch": 2.844247430989321, "grad_norm": 0.05572217330336571, "learning_rate": 4.2775750393475164e-05, "loss": 0.2031, "step": 14116 }, { "epoch": 2.844650413056619, "grad_norm": 0.0398903451859951, "learning_rate": 4.2762563970429054e-05, "loss": 0.1928, "step": 14118 }, { "epoch": 2.8450533951239168, "grad_norm": 0.06042037159204483, "learning_rate": 4.274937806151884e-05, "loss": 0.1717, "step": 14120 }, { "epoch": 2.845456377191215, "grad_norm": 0.042064543813467026, "learning_rate": 4.2736192667681185e-05, "loss": 0.143, "step": 14122 }, { "epoch": 2.845859359258513, "grad_norm": 0.03717980161309242, "learning_rate": 4.272300778985281e-05, "loss": 0.1289, "step": 14124 }, { "epoch": 2.846262341325811, "grad_norm": 0.08025442063808441, "learning_rate": 4.270982342897032e-05, "loss": 0.1962, "step": 14126 }, { "epoch": 2.846665323393109, "grad_norm": 0.061862844973802567, "learning_rate": 4.269663958597032e-05, "loss": 0.229, "step": 14128 }, { "epoch": 2.847068305460407, "grad_norm": 0.057234328240156174, "learning_rate": 4.268345626178935e-05, "loss": 0.2134, "step": 14130 }, { "epoch": 2.8474712875277053, "grad_norm": 0.0573008768260479, "learning_rate": 4.267027345736396e-05, "loss": 0.1752, "step": 14132 }, { "epoch": 2.8478742695950032, "grad_norm": 0.05297653749585152, "learning_rate": 4.26570911736306e-05, "loss": 0.1613, "step": 14134 }, { "epoch": 2.848277251662301, "grad_norm": 0.06393546611070633, "learning_rate": 4.2643909411525765e-05, "loss": 0.1694, "step": 14136 }, { "epoch": 2.848680233729599, "grad_norm": 0.06238080561161041, "learning_rate": 4.263072817198582e-05, "loss": 0.1931, "step": 14138 }, { "epoch": 2.849083215796897, "grad_norm": 0.051683563739061356, "learning_rate": 4.261754745594718e-05, "loss": 0.1634, "step": 14140 }, { "epoch": 2.849486197864195, "grad_norm": 0.054612595587968826, "learning_rate": 4.2604367264346147e-05, "loss": 0.1973, "step": 14142 }, { "epoch": 2.849889179931493, "grad_norm": 0.06870435923337936, "learning_rate": 4.259118759811905e-05, "loss": 0.2024, "step": 14144 }, { "epoch": 2.850292161998791, "grad_norm": 0.052692923694849014, "learning_rate": 4.2578008458202113e-05, "loss": 0.1674, "step": 14146 }, { "epoch": 2.850695144066089, "grad_norm": 0.05360223725438118, "learning_rate": 4.256482984553162e-05, "loss": 0.2175, "step": 14148 }, { "epoch": 2.851098126133387, "grad_norm": 0.05867795646190643, "learning_rate": 4.2551651761043694e-05, "loss": 0.1892, "step": 14150 }, { "epoch": 2.851501108200685, "grad_norm": 0.06195264682173729, "learning_rate": 4.253847420567453e-05, "loss": 0.2264, "step": 14152 }, { "epoch": 2.851904090267983, "grad_norm": 0.07267355918884277, "learning_rate": 4.252529718036022e-05, "loss": 0.2061, "step": 14154 }, { "epoch": 2.852307072335281, "grad_norm": 0.058362722396850586, "learning_rate": 4.251212068603685e-05, "loss": 0.1994, "step": 14156 }, { "epoch": 2.852710054402579, "grad_norm": 0.048904869705438614, "learning_rate": 4.2498944723640434e-05, "loss": 0.1776, "step": 14158 }, { "epoch": 2.8531130364698774, "grad_norm": 0.06460296362638474, "learning_rate": 4.248576929410702e-05, "loss": 0.2103, "step": 14160 }, { "epoch": 2.8535160185371753, "grad_norm": 0.05771844834089279, "learning_rate": 4.2472594398372505e-05, "loss": 0.2026, "step": 14162 }, { "epoch": 2.8539190006044732, "grad_norm": 0.051594674587249756, "learning_rate": 4.245942003737287e-05, "loss": 0.1841, "step": 14164 }, { "epoch": 2.854321982671771, "grad_norm": 0.06020747125148773, "learning_rate": 4.2446246212043964e-05, "loss": 0.1857, "step": 14166 }, { "epoch": 2.854724964739069, "grad_norm": 0.06190628930926323, "learning_rate": 4.2433072923321656e-05, "loss": 0.1678, "step": 14168 }, { "epoch": 2.855127946806367, "grad_norm": 0.07561472803354263, "learning_rate": 4.2419900172141723e-05, "loss": 0.2247, "step": 14170 }, { "epoch": 2.855530928873665, "grad_norm": 0.062340203672647476, "learning_rate": 4.2406727959439995e-05, "loss": 0.2042, "step": 14172 }, { "epoch": 2.855933910940963, "grad_norm": 0.05466499179601669, "learning_rate": 4.239355628615214e-05, "loss": 0.1949, "step": 14174 }, { "epoch": 2.856336893008261, "grad_norm": 0.06361190229654312, "learning_rate": 4.23803851532139e-05, "loss": 0.1932, "step": 14176 }, { "epoch": 2.8567398750755593, "grad_norm": 0.057557422667741776, "learning_rate": 4.23672145615609e-05, "loss": 0.2181, "step": 14178 }, { "epoch": 2.857142857142857, "grad_norm": 0.05499812588095665, "learning_rate": 4.235404451212878e-05, "loss": 0.1839, "step": 14180 }, { "epoch": 2.857545839210155, "grad_norm": 0.06741025298833847, "learning_rate": 4.23408750058531e-05, "loss": 0.1744, "step": 14182 }, { "epoch": 2.857948821277453, "grad_norm": 0.06582369655370712, "learning_rate": 4.232770604366942e-05, "loss": 0.2026, "step": 14184 }, { "epoch": 2.858351803344751, "grad_norm": 0.04338719695806503, "learning_rate": 4.2314537626513216e-05, "loss": 0.156, "step": 14186 }, { "epoch": 2.8587547854120494, "grad_norm": 0.05401334539055824, "learning_rate": 4.230136975531998e-05, "loss": 0.1697, "step": 14188 }, { "epoch": 2.8591577674793474, "grad_norm": 0.05770457908511162, "learning_rate": 4.228820243102513e-05, "loss": 0.1832, "step": 14190 }, { "epoch": 2.8595607495466453, "grad_norm": 0.058899931609630585, "learning_rate": 4.227503565456403e-05, "loss": 0.1979, "step": 14192 }, { "epoch": 2.8599637316139432, "grad_norm": 0.06733313202857971, "learning_rate": 4.226186942687207e-05, "loss": 0.211, "step": 14194 }, { "epoch": 2.860366713681241, "grad_norm": 0.05753978341817856, "learning_rate": 4.22487037488845e-05, "loss": 0.1866, "step": 14196 }, { "epoch": 2.860769695748539, "grad_norm": 0.0664329007267952, "learning_rate": 4.223553862153664e-05, "loss": 0.2111, "step": 14198 }, { "epoch": 2.861172677815837, "grad_norm": 0.04582129046320915, "learning_rate": 4.2222374045763686e-05, "loss": 0.1512, "step": 14200 }, { "epoch": 2.861575659883135, "grad_norm": 0.05810026824474335, "learning_rate": 4.220921002250086e-05, "loss": 0.1722, "step": 14202 }, { "epoch": 2.861978641950433, "grad_norm": 0.056344956159591675, "learning_rate": 4.219604655268328e-05, "loss": 0.1742, "step": 14204 }, { "epoch": 2.8623816240177313, "grad_norm": 0.05658912658691406, "learning_rate": 4.2182883637246074e-05, "loss": 0.2023, "step": 14206 }, { "epoch": 2.8627846060850293, "grad_norm": 0.04890948161482811, "learning_rate": 4.21697212771243e-05, "loss": 0.1886, "step": 14208 }, { "epoch": 2.863187588152327, "grad_norm": 0.06561271846294403, "learning_rate": 4.2156559473253025e-05, "loss": 0.1962, "step": 14210 }, { "epoch": 2.863590570219625, "grad_norm": 0.0460575707256794, "learning_rate": 4.214339822656721e-05, "loss": 0.1798, "step": 14212 }, { "epoch": 2.863993552286923, "grad_norm": 0.05941230431199074, "learning_rate": 4.213023753800183e-05, "loss": 0.2593, "step": 14214 }, { "epoch": 2.8643965343542215, "grad_norm": 0.059036318212747574, "learning_rate": 4.211707740849178e-05, "loss": 0.2102, "step": 14216 }, { "epoch": 2.8647995164215194, "grad_norm": 0.06053904816508293, "learning_rate": 4.210391783897196e-05, "loss": 0.1604, "step": 14218 }, { "epoch": 2.8652024984888174, "grad_norm": 0.05623185262084007, "learning_rate": 4.209075883037716e-05, "loss": 0.235, "step": 14220 }, { "epoch": 2.8656054805561153, "grad_norm": 0.048137541860342026, "learning_rate": 4.207760038364223e-05, "loss": 0.1749, "step": 14222 }, { "epoch": 2.8660084626234132, "grad_norm": 0.0612863153219223, "learning_rate": 4.206444249970188e-05, "loss": 0.1812, "step": 14224 }, { "epoch": 2.866411444690711, "grad_norm": 0.05674326792359352, "learning_rate": 4.205128517949086e-05, "loss": 0.1889, "step": 14226 }, { "epoch": 2.866814426758009, "grad_norm": 0.05260700732469559, "learning_rate": 4.2038128423943815e-05, "loss": 0.1802, "step": 14228 }, { "epoch": 2.867217408825307, "grad_norm": 0.056204576045274734, "learning_rate": 4.202497223399541e-05, "loss": 0.1513, "step": 14230 }, { "epoch": 2.867620390892605, "grad_norm": 0.052276674658060074, "learning_rate": 4.20118166105802e-05, "loss": 0.1623, "step": 14232 }, { "epoch": 2.8680233729599034, "grad_norm": 0.06562013179063797, "learning_rate": 4.199866155463278e-05, "loss": 0.1922, "step": 14234 }, { "epoch": 2.8684263550272013, "grad_norm": 0.052156250923871994, "learning_rate": 4.198550706708764e-05, "loss": 0.1618, "step": 14236 }, { "epoch": 2.8688293370944993, "grad_norm": 0.06406504660844803, "learning_rate": 4.197235314887927e-05, "loss": 0.2082, "step": 14238 }, { "epoch": 2.869232319161797, "grad_norm": 0.052224770188331604, "learning_rate": 4.1959199800942083e-05, "loss": 0.1636, "step": 14240 }, { "epoch": 2.869635301229095, "grad_norm": 0.051645223051309586, "learning_rate": 4.1946047024210495e-05, "loss": 0.1757, "step": 14242 }, { "epoch": 2.8700382832963935, "grad_norm": 0.06716208904981613, "learning_rate": 4.1932894819618824e-05, "loss": 0.1585, "step": 14244 }, { "epoch": 2.8704412653636915, "grad_norm": 0.06000441685318947, "learning_rate": 4.1919743188101435e-05, "loss": 0.1799, "step": 14246 }, { "epoch": 2.8708442474309894, "grad_norm": 0.05317326635122299, "learning_rate": 4.190659213059254e-05, "loss": 0.1669, "step": 14248 }, { "epoch": 2.8712472294982874, "grad_norm": 0.04350145906209946, "learning_rate": 4.189344164802641e-05, "loss": 0.2012, "step": 14250 }, { "epoch": 2.8716502115655853, "grad_norm": 0.050959598273038864, "learning_rate": 4.1880291741337216e-05, "loss": 0.2116, "step": 14252 }, { "epoch": 2.8720531936328833, "grad_norm": 0.05060587078332901, "learning_rate": 4.1867142411459115e-05, "loss": 0.1911, "step": 14254 }, { "epoch": 2.872456175700181, "grad_norm": 0.05303170904517174, "learning_rate": 4.1853993659326194e-05, "loss": 0.1733, "step": 14256 }, { "epoch": 2.872859157767479, "grad_norm": 0.06755878776311874, "learning_rate": 4.184084548587257e-05, "loss": 0.1934, "step": 14258 }, { "epoch": 2.8732621398347775, "grad_norm": 0.059417106211185455, "learning_rate": 4.18276978920322e-05, "loss": 0.2123, "step": 14260 }, { "epoch": 2.8736651219020755, "grad_norm": 0.05868794769048691, "learning_rate": 4.181455087873912e-05, "loss": 0.2412, "step": 14262 }, { "epoch": 2.8740681039693734, "grad_norm": 0.05174780637025833, "learning_rate": 4.180140444692725e-05, "loss": 0.1739, "step": 14264 }, { "epoch": 2.8744710860366713, "grad_norm": 0.0744800716638565, "learning_rate": 4.178825859753051e-05, "loss": 0.2051, "step": 14266 }, { "epoch": 2.8748740681039693, "grad_norm": 0.0438925176858902, "learning_rate": 4.177511333148273e-05, "loss": 0.1976, "step": 14268 }, { "epoch": 2.8752770501712672, "grad_norm": 0.053939249366521835, "learning_rate": 4.176196864971775e-05, "loss": 0.196, "step": 14270 }, { "epoch": 2.8756800322385656, "grad_norm": 0.07452262192964554, "learning_rate": 4.174882455316933e-05, "loss": 0.1775, "step": 14272 }, { "epoch": 2.8760830143058635, "grad_norm": 0.06098518148064613, "learning_rate": 4.1735681042771236e-05, "loss": 0.1944, "step": 14274 }, { "epoch": 2.8764859963731615, "grad_norm": 0.046656448394060135, "learning_rate": 4.1722538119457134e-05, "loss": 0.1812, "step": 14276 }, { "epoch": 2.8768889784404594, "grad_norm": 0.07880371063947678, "learning_rate": 4.17093957841607e-05, "loss": 0.1714, "step": 14278 }, { "epoch": 2.8772919605077574, "grad_norm": 0.050928760319948196, "learning_rate": 4.169625403781551e-05, "loss": 0.1708, "step": 14280 }, { "epoch": 2.8776949425750553, "grad_norm": 0.06175197288393974, "learning_rate": 4.1683112881355177e-05, "loss": 0.2171, "step": 14282 }, { "epoch": 2.8780979246423533, "grad_norm": 0.11151041835546494, "learning_rate": 4.166997231571317e-05, "loss": 0.2139, "step": 14284 }, { "epoch": 2.878500906709651, "grad_norm": 0.06008008494973183, "learning_rate": 4.165683234182304e-05, "loss": 0.1806, "step": 14286 }, { "epoch": 2.8789038887769496, "grad_norm": 0.044566765427589417, "learning_rate": 4.164369296061818e-05, "loss": 0.1506, "step": 14288 }, { "epoch": 2.8793068708442475, "grad_norm": 0.08086936175823212, "learning_rate": 4.163055417303202e-05, "loss": 0.1846, "step": 14290 }, { "epoch": 2.8797098529115455, "grad_norm": 0.09690766781568527, "learning_rate": 4.1617415979997896e-05, "loss": 0.2048, "step": 14292 }, { "epoch": 2.8801128349788434, "grad_norm": 0.05995067209005356, "learning_rate": 4.160427838244915e-05, "loss": 0.1783, "step": 14294 }, { "epoch": 2.8805158170461413, "grad_norm": 0.058783888816833496, "learning_rate": 4.159114138131901e-05, "loss": 0.1628, "step": 14296 }, { "epoch": 2.8809187991134393, "grad_norm": 0.04987872391939163, "learning_rate": 4.1578004977540767e-05, "loss": 0.1664, "step": 14298 }, { "epoch": 2.8813217811807377, "grad_norm": 0.05188801884651184, "learning_rate": 4.156486917204757e-05, "loss": 0.1897, "step": 14300 }, { "epoch": 2.8817247632480356, "grad_norm": 0.050275251269340515, "learning_rate": 4.155173396577259e-05, "loss": 0.1994, "step": 14302 }, { "epoch": 2.8821277453153336, "grad_norm": 0.05531008914113045, "learning_rate": 4.15385993596489e-05, "loss": 0.1803, "step": 14304 }, { "epoch": 2.8825307273826315, "grad_norm": 0.0625590980052948, "learning_rate": 4.1525465354609596e-05, "loss": 0.2059, "step": 14306 }, { "epoch": 2.8829337094499294, "grad_norm": 0.0541142039000988, "learning_rate": 4.1512331951587656e-05, "loss": 0.1838, "step": 14308 }, { "epoch": 2.8833366915172274, "grad_norm": 0.0530061237514019, "learning_rate": 4.149919915151611e-05, "loss": 0.2092, "step": 14310 }, { "epoch": 2.8837396735845253, "grad_norm": 0.0603446289896965, "learning_rate": 4.1486066955327834e-05, "loss": 0.1828, "step": 14312 }, { "epoch": 2.8841426556518233, "grad_norm": 0.08570460975170135, "learning_rate": 4.147293536395577e-05, "loss": 0.2143, "step": 14314 }, { "epoch": 2.8845456377191216, "grad_norm": 0.04139425978064537, "learning_rate": 4.1459804378332724e-05, "loss": 0.1668, "step": 14316 }, { "epoch": 2.8849486197864196, "grad_norm": 0.08652174472808838, "learning_rate": 4.1446673999391525e-05, "loss": 0.2002, "step": 14318 }, { "epoch": 2.8853516018537175, "grad_norm": 0.06046774983406067, "learning_rate": 4.143354422806491e-05, "loss": 0.1494, "step": 14320 }, { "epoch": 2.8857545839210155, "grad_norm": 0.06893909722566605, "learning_rate": 4.142041506528564e-05, "loss": 0.2392, "step": 14322 }, { "epoch": 2.8861575659883134, "grad_norm": 0.05902134254574776, "learning_rate": 4.1407286511986335e-05, "loss": 0.2174, "step": 14324 }, { "epoch": 2.886560548055612, "grad_norm": 0.07994405180215836, "learning_rate": 4.139415856909968e-05, "loss": 0.2721, "step": 14326 }, { "epoch": 2.8869635301229097, "grad_norm": 0.04702121019363403, "learning_rate": 4.1381031237558224e-05, "loss": 0.2053, "step": 14328 }, { "epoch": 2.8873665121902077, "grad_norm": 0.07409130036830902, "learning_rate": 4.136790451829453e-05, "loss": 0.2046, "step": 14330 }, { "epoch": 2.8877694942575056, "grad_norm": 0.05629954859614372, "learning_rate": 4.1354778412241075e-05, "loss": 0.211, "step": 14332 }, { "epoch": 2.8881724763248036, "grad_norm": 0.04987116530537605, "learning_rate": 4.134165292033037e-05, "loss": 0.1781, "step": 14334 }, { "epoch": 2.8885754583921015, "grad_norm": 0.06818079948425293, "learning_rate": 4.1328528043494755e-05, "loss": 0.2116, "step": 14336 }, { "epoch": 2.8889784404593994, "grad_norm": 0.07187522947788239, "learning_rate": 4.131540378266667e-05, "loss": 0.2035, "step": 14338 }, { "epoch": 2.8893814225266974, "grad_norm": 0.056888628751039505, "learning_rate": 4.13022801387784e-05, "loss": 0.2228, "step": 14340 }, { "epoch": 2.8897844045939953, "grad_norm": 0.06543140858411789, "learning_rate": 4.1289157112762244e-05, "loss": 0.223, "step": 14342 }, { "epoch": 2.8901873866612937, "grad_norm": 0.05908466503024101, "learning_rate": 4.1276034705550434e-05, "loss": 0.1893, "step": 14344 }, { "epoch": 2.8905903687285917, "grad_norm": 0.05234837904572487, "learning_rate": 4.1262912918075166e-05, "loss": 0.2027, "step": 14346 }, { "epoch": 2.8909933507958896, "grad_norm": 0.05392058193683624, "learning_rate": 4.124979175126858e-05, "loss": 0.1785, "step": 14348 }, { "epoch": 2.8913963328631875, "grad_norm": 0.0729118064045906, "learning_rate": 4.123667120606281e-05, "loss": 0.1821, "step": 14350 }, { "epoch": 2.8917993149304855, "grad_norm": 0.04530632868409157, "learning_rate": 4.122355128338989e-05, "loss": 0.1548, "step": 14352 }, { "epoch": 2.892202296997784, "grad_norm": 0.06139715015888214, "learning_rate": 4.121043198418187e-05, "loss": 0.1948, "step": 14354 }, { "epoch": 2.892605279065082, "grad_norm": 0.04849937930703163, "learning_rate": 4.119731330937069e-05, "loss": 0.1738, "step": 14356 }, { "epoch": 2.8930082611323797, "grad_norm": 0.0831499695777893, "learning_rate": 4.11841952598883e-05, "loss": 0.2124, "step": 14358 }, { "epoch": 2.8934112431996777, "grad_norm": 0.144696444272995, "learning_rate": 4.117107783666656e-05, "loss": 0.2026, "step": 14360 }, { "epoch": 2.8938142252669756, "grad_norm": 0.051758021116256714, "learning_rate": 4.115796104063736e-05, "loss": 0.1871, "step": 14362 }, { "epoch": 2.8942172073342736, "grad_norm": 0.08509272336959839, "learning_rate": 4.1144844872732455e-05, "loss": 0.253, "step": 14364 }, { "epoch": 2.8946201894015715, "grad_norm": 0.045031026005744934, "learning_rate": 4.1131729333883614e-05, "loss": 0.1607, "step": 14366 }, { "epoch": 2.8950231714688694, "grad_norm": 0.05014210566878319, "learning_rate": 4.111861442502253e-05, "loss": 0.1378, "step": 14368 }, { "epoch": 2.8954261535361674, "grad_norm": 0.06437412649393082, "learning_rate": 4.1105500147080876e-05, "loss": 0.2153, "step": 14370 }, { "epoch": 2.8958291356034658, "grad_norm": 0.04823167249560356, "learning_rate": 4.1092386500990256e-05, "loss": 0.1839, "step": 14372 }, { "epoch": 2.8962321176707637, "grad_norm": 0.06474374979734421, "learning_rate": 4.107927348768227e-05, "loss": 0.209, "step": 14374 }, { "epoch": 2.8966350997380617, "grad_norm": 0.047769974917173386, "learning_rate": 4.106616110808843e-05, "loss": 0.1931, "step": 14376 }, { "epoch": 2.8970380818053596, "grad_norm": 0.04988821968436241, "learning_rate": 4.105304936314021e-05, "loss": 0.1862, "step": 14378 }, { "epoch": 2.8974410638726575, "grad_norm": 0.06389687210321426, "learning_rate": 4.103993825376905e-05, "loss": 0.1765, "step": 14380 }, { "epoch": 2.897844045939956, "grad_norm": 0.06107024848461151, "learning_rate": 4.1026827780906365e-05, "loss": 0.1516, "step": 14382 }, { "epoch": 2.898247028007254, "grad_norm": 0.05711763724684715, "learning_rate": 4.1013717945483454e-05, "loss": 0.1581, "step": 14384 }, { "epoch": 2.898650010074552, "grad_norm": 0.06959626078605652, "learning_rate": 4.100060874843168e-05, "loss": 0.1497, "step": 14386 }, { "epoch": 2.8990529921418497, "grad_norm": 0.051988981664180756, "learning_rate": 4.098750019068225e-05, "loss": 0.2156, "step": 14388 }, { "epoch": 2.8994559742091477, "grad_norm": 0.05507201701402664, "learning_rate": 4.09743922731664e-05, "loss": 0.1684, "step": 14390 }, { "epoch": 2.8998589562764456, "grad_norm": 0.05257626995444298, "learning_rate": 4.096128499681529e-05, "loss": 0.1745, "step": 14392 }, { "epoch": 2.9002619383437436, "grad_norm": 0.05004867911338806, "learning_rate": 4.0948178362560034e-05, "loss": 0.1921, "step": 14394 }, { "epoch": 2.9006649204110415, "grad_norm": 0.06818536669015884, "learning_rate": 4.093507237133169e-05, "loss": 0.2584, "step": 14396 }, { "epoch": 2.9010679024783395, "grad_norm": 0.06977367401123047, "learning_rate": 4.0921967024061355e-05, "loss": 0.2516, "step": 14398 }, { "epoch": 2.901470884545638, "grad_norm": 0.05165844410657883, "learning_rate": 4.0908862321679925e-05, "loss": 0.1735, "step": 14400 }, { "epoch": 2.901873866612936, "grad_norm": 0.0612303651869297, "learning_rate": 4.089575826511839e-05, "loss": 0.2446, "step": 14402 }, { "epoch": 2.9022768486802337, "grad_norm": 0.06506997346878052, "learning_rate": 4.088265485530763e-05, "loss": 0.1762, "step": 14404 }, { "epoch": 2.9026798307475317, "grad_norm": 0.08582379668951035, "learning_rate": 4.086955209317849e-05, "loss": 0.1823, "step": 14406 }, { "epoch": 2.9030828128148296, "grad_norm": 0.09465978294610977, "learning_rate": 4.085644997966176e-05, "loss": 0.2538, "step": 14408 }, { "epoch": 2.903485794882128, "grad_norm": 0.06913874298334122, "learning_rate": 4.0843348515688214e-05, "loss": 0.2471, "step": 14410 }, { "epoch": 2.903888776949426, "grad_norm": 0.06045586243271828, "learning_rate": 4.083024770218852e-05, "loss": 0.2089, "step": 14412 }, { "epoch": 2.904291759016724, "grad_norm": 0.07784318923950195, "learning_rate": 4.081714754009339e-05, "loss": 0.1892, "step": 14414 }, { "epoch": 2.904694741084022, "grad_norm": 0.06447883695363998, "learning_rate": 4.08040480303334e-05, "loss": 0.1931, "step": 14416 }, { "epoch": 2.9050977231513198, "grad_norm": 0.0589727982878685, "learning_rate": 4.079094917383914e-05, "loss": 0.2088, "step": 14418 }, { "epoch": 2.9055007052186177, "grad_norm": 0.06887009739875793, "learning_rate": 4.077785097154111e-05, "loss": 0.1967, "step": 14420 }, { "epoch": 2.9059036872859156, "grad_norm": 0.06360602378845215, "learning_rate": 4.076475342436981e-05, "loss": 0.2037, "step": 14422 }, { "epoch": 2.9063066693532136, "grad_norm": 0.05032625421881676, "learning_rate": 4.075165653325564e-05, "loss": 0.1425, "step": 14424 }, { "epoch": 2.9067096514205115, "grad_norm": 0.059502359479665756, "learning_rate": 4.073856029912902e-05, "loss": 0.2147, "step": 14426 }, { "epoch": 2.90711263348781, "grad_norm": 0.054628755897283554, "learning_rate": 4.072546472292025e-05, "loss": 0.1848, "step": 14428 }, { "epoch": 2.907515615555108, "grad_norm": 0.051899537444114685, "learning_rate": 4.071236980555965e-05, "loss": 0.1782, "step": 14430 }, { "epoch": 2.907918597622406, "grad_norm": 0.038006119430065155, "learning_rate": 4.0699275547977425e-05, "loss": 0.1577, "step": 14432 }, { "epoch": 2.9083215796897037, "grad_norm": 0.05724431946873665, "learning_rate": 4.068618195110381e-05, "loss": 0.1784, "step": 14434 }, { "epoch": 2.9087245617570017, "grad_norm": 0.05458924174308777, "learning_rate": 4.067308901586892e-05, "loss": 0.1425, "step": 14436 }, { "epoch": 2.9091275438243, "grad_norm": 0.04887452721595764, "learning_rate": 4.065999674320288e-05, "loss": 0.1658, "step": 14438 }, { "epoch": 2.909530525891598, "grad_norm": 0.05824177339673042, "learning_rate": 4.0646905134035726e-05, "loss": 0.1837, "step": 14440 }, { "epoch": 2.909933507958896, "grad_norm": 0.045779384672641754, "learning_rate": 4.063381418929748e-05, "loss": 0.1702, "step": 14442 }, { "epoch": 2.910336490026194, "grad_norm": 0.06375889480113983, "learning_rate": 4.062072390991809e-05, "loss": 0.2236, "step": 14444 }, { "epoch": 2.910739472093492, "grad_norm": 0.06309327483177185, "learning_rate": 4.060763429682748e-05, "loss": 0.2208, "step": 14446 }, { "epoch": 2.9111424541607898, "grad_norm": 0.0643438994884491, "learning_rate": 4.059454535095549e-05, "loss": 0.1983, "step": 14448 }, { "epoch": 2.9115454362280877, "grad_norm": 0.0457322783768177, "learning_rate": 4.058145707323199e-05, "loss": 0.1695, "step": 14450 }, { "epoch": 2.9119484182953856, "grad_norm": 0.07760033011436462, "learning_rate": 4.056836946458668e-05, "loss": 0.262, "step": 14452 }, { "epoch": 2.9123514003626836, "grad_norm": 0.06333133578300476, "learning_rate": 4.0555282525949346e-05, "loss": 0.1888, "step": 14454 }, { "epoch": 2.912754382429982, "grad_norm": 0.0646766647696495, "learning_rate": 4.054219625824963e-05, "loss": 0.2099, "step": 14456 }, { "epoch": 2.91315736449728, "grad_norm": 0.06069457530975342, "learning_rate": 4.052911066241717e-05, "loss": 0.2131, "step": 14458 }, { "epoch": 2.913560346564578, "grad_norm": 0.07926452159881592, "learning_rate": 4.051602573938152e-05, "loss": 0.1968, "step": 14460 }, { "epoch": 2.913963328631876, "grad_norm": 0.060611922293901443, "learning_rate": 4.0502941490072264e-05, "loss": 0.199, "step": 14462 }, { "epoch": 2.9143663106991737, "grad_norm": 0.0660584568977356, "learning_rate": 4.0489857915418826e-05, "loss": 0.2008, "step": 14464 }, { "epoch": 2.914769292766472, "grad_norm": 0.07786539942026138, "learning_rate": 4.04767750163507e-05, "loss": 0.2281, "step": 14466 }, { "epoch": 2.91517227483377, "grad_norm": 0.057906147092580795, "learning_rate": 4.046369279379723e-05, "loss": 0.1753, "step": 14468 }, { "epoch": 2.915575256901068, "grad_norm": 0.04037892818450928, "learning_rate": 4.045061124868779e-05, "loss": 0.1498, "step": 14470 }, { "epoch": 2.915978238968366, "grad_norm": 0.048774946480989456, "learning_rate": 4.043753038195164e-05, "loss": 0.1925, "step": 14472 }, { "epoch": 2.916381221035664, "grad_norm": 0.04842628538608551, "learning_rate": 4.042445019451805e-05, "loss": 0.2065, "step": 14474 }, { "epoch": 2.916784203102962, "grad_norm": 0.058236975222826004, "learning_rate": 4.041137068731617e-05, "loss": 0.2216, "step": 14476 }, { "epoch": 2.9171871851702598, "grad_norm": 0.06977251917123795, "learning_rate": 4.039829186127522e-05, "loss": 0.2252, "step": 14478 }, { "epoch": 2.9175901672375577, "grad_norm": 0.0626988634467125, "learning_rate": 4.038521371732425e-05, "loss": 0.2011, "step": 14480 }, { "epoch": 2.917993149304856, "grad_norm": 0.07548090070486069, "learning_rate": 4.0372136256392324e-05, "loss": 0.1634, "step": 14482 }, { "epoch": 2.918396131372154, "grad_norm": 0.05160561203956604, "learning_rate": 4.0359059479408436e-05, "loss": 0.203, "step": 14484 }, { "epoch": 2.918799113439452, "grad_norm": 0.06201513111591339, "learning_rate": 4.034598338730155e-05, "loss": 0.2376, "step": 14486 }, { "epoch": 2.91920209550675, "grad_norm": 0.06060298904776573, "learning_rate": 4.0332907981000546e-05, "loss": 0.1541, "step": 14488 }, { "epoch": 2.919605077574048, "grad_norm": 0.060880377888679504, "learning_rate": 4.031983326143432e-05, "loss": 0.2237, "step": 14490 }, { "epoch": 2.920008059641346, "grad_norm": 0.045542385429143906, "learning_rate": 4.0306759229531644e-05, "loss": 0.1644, "step": 14492 }, { "epoch": 2.920411041708644, "grad_norm": 0.06396988779306412, "learning_rate": 4.029368588622131e-05, "loss": 0.2005, "step": 14494 }, { "epoch": 2.920814023775942, "grad_norm": 0.06875386089086533, "learning_rate": 4.0280613232431984e-05, "loss": 0.1812, "step": 14496 }, { "epoch": 2.92121700584324, "grad_norm": 0.06327049434185028, "learning_rate": 4.026754126909237e-05, "loss": 0.1504, "step": 14498 }, { "epoch": 2.921619987910538, "grad_norm": 0.04754214361310005, "learning_rate": 4.0254469997131035e-05, "loss": 0.1549, "step": 14500 }, { "epoch": 2.922022969977836, "grad_norm": 0.05176509916782379, "learning_rate": 4.024139941747658e-05, "loss": 0.1864, "step": 14502 }, { "epoch": 2.922425952045134, "grad_norm": 0.06160098686814308, "learning_rate": 4.0228329531057506e-05, "loss": 0.1441, "step": 14504 }, { "epoch": 2.922828934112432, "grad_norm": 0.050542235374450684, "learning_rate": 4.021526033880228e-05, "loss": 0.1799, "step": 14506 }, { "epoch": 2.9232319161797298, "grad_norm": 0.04303761571645737, "learning_rate": 4.02021918416393e-05, "loss": 0.1321, "step": 14508 }, { "epoch": 2.923634898247028, "grad_norm": 0.05790780112147331, "learning_rate": 4.0189124040496954e-05, "loss": 0.189, "step": 14510 }, { "epoch": 2.924037880314326, "grad_norm": 0.07991109043359756, "learning_rate": 4.017605693630353e-05, "loss": 0.2345, "step": 14512 }, { "epoch": 2.924440862381624, "grad_norm": 0.06370232254266739, "learning_rate": 4.016299052998732e-05, "loss": 0.1756, "step": 14514 }, { "epoch": 2.924843844448922, "grad_norm": 0.07485152781009674, "learning_rate": 4.0149924822476526e-05, "loss": 0.201, "step": 14516 }, { "epoch": 2.92524682651622, "grad_norm": 0.05705023184418678, "learning_rate": 4.013685981469933e-05, "loss": 0.1849, "step": 14518 }, { "epoch": 2.9256498085835183, "grad_norm": 0.08263441175222397, "learning_rate": 4.0123795507583826e-05, "loss": 0.2508, "step": 14520 }, { "epoch": 2.9260527906508162, "grad_norm": 0.05221550539135933, "learning_rate": 4.0110731902058105e-05, "loss": 0.1852, "step": 14522 }, { "epoch": 2.926455772718114, "grad_norm": 0.058928411453962326, "learning_rate": 4.009766899905016e-05, "loss": 0.1796, "step": 14524 }, { "epoch": 2.926858754785412, "grad_norm": 0.06618046760559082, "learning_rate": 4.0084606799488e-05, "loss": 0.2088, "step": 14526 }, { "epoch": 2.92726173685271, "grad_norm": 0.05029602348804474, "learning_rate": 4.007154530429949e-05, "loss": 0.1485, "step": 14528 }, { "epoch": 2.927664718920008, "grad_norm": 0.05263345316052437, "learning_rate": 4.0058484514412534e-05, "loss": 0.1941, "step": 14530 }, { "epoch": 2.928067700987306, "grad_norm": 0.049153879284858704, "learning_rate": 4.004542443075493e-05, "loss": 0.1857, "step": 14532 }, { "epoch": 2.928470683054604, "grad_norm": 0.05516686663031578, "learning_rate": 4.003236505425447e-05, "loss": 0.1917, "step": 14534 }, { "epoch": 2.928873665121902, "grad_norm": 0.0750420019030571, "learning_rate": 4.001930638583883e-05, "loss": 0.1709, "step": 14536 }, { "epoch": 2.9292766471892002, "grad_norm": 0.050966911017894745, "learning_rate": 4.000624842643574e-05, "loss": 0.2332, "step": 14538 }, { "epoch": 2.929679629256498, "grad_norm": 0.05441434308886528, "learning_rate": 3.9993191176972746e-05, "loss": 0.2048, "step": 14540 }, { "epoch": 2.930082611323796, "grad_norm": 0.06468924880027771, "learning_rate": 3.998013463837747e-05, "loss": 0.1602, "step": 14542 }, { "epoch": 2.930485593391094, "grad_norm": 0.047111447900533676, "learning_rate": 3.996707881157739e-05, "loss": 0.185, "step": 14544 }, { "epoch": 2.930888575458392, "grad_norm": 0.06189405545592308, "learning_rate": 3.995402369749999e-05, "loss": 0.1489, "step": 14546 }, { "epoch": 2.9312915575256904, "grad_norm": 0.0858076959848404, "learning_rate": 3.994096929707268e-05, "loss": 0.2368, "step": 14548 }, { "epoch": 2.9316945395929883, "grad_norm": 0.05392390489578247, "learning_rate": 3.992791561122283e-05, "loss": 0.2169, "step": 14550 }, { "epoch": 2.9320975216602863, "grad_norm": 0.05944026634097099, "learning_rate": 3.991486264087773e-05, "loss": 0.1943, "step": 14552 }, { "epoch": 2.932500503727584, "grad_norm": 0.06575492769479752, "learning_rate": 3.9901810386964676e-05, "loss": 0.234, "step": 14554 }, { "epoch": 2.932903485794882, "grad_norm": 0.04996848106384277, "learning_rate": 3.988875885041085e-05, "loss": 0.2108, "step": 14556 }, { "epoch": 2.93330646786218, "grad_norm": 0.04437585547566414, "learning_rate": 3.987570803214345e-05, "loss": 0.1708, "step": 14558 }, { "epoch": 2.933709449929478, "grad_norm": 0.05040000379085541, "learning_rate": 3.986265793308953e-05, "loss": 0.2091, "step": 14560 }, { "epoch": 2.934112431996776, "grad_norm": 0.05723320692777634, "learning_rate": 3.98496085541762e-05, "loss": 0.1863, "step": 14562 }, { "epoch": 2.934515414064074, "grad_norm": 0.06218833848834038, "learning_rate": 3.983655989633042e-05, "loss": 0.2182, "step": 14564 }, { "epoch": 2.9349183961313723, "grad_norm": 0.05402584373950958, "learning_rate": 3.982351196047919e-05, "loss": 0.1882, "step": 14566 }, { "epoch": 2.9353213781986702, "grad_norm": 0.04149405285716057, "learning_rate": 3.981046474754939e-05, "loss": 0.2108, "step": 14568 }, { "epoch": 2.935724360265968, "grad_norm": 0.05765628442168236, "learning_rate": 3.979741825846789e-05, "loss": 0.1995, "step": 14570 }, { "epoch": 2.936127342333266, "grad_norm": 0.06953881680965424, "learning_rate": 3.978437249416146e-05, "loss": 0.2073, "step": 14572 }, { "epoch": 2.936530324400564, "grad_norm": 0.06293442100286484, "learning_rate": 3.9771327455556874e-05, "loss": 0.227, "step": 14574 }, { "epoch": 2.9369333064678624, "grad_norm": 0.03527409955859184, "learning_rate": 3.975828314358084e-05, "loss": 0.1351, "step": 14576 }, { "epoch": 2.9373362885351604, "grad_norm": 0.0683702901005745, "learning_rate": 3.9745239559159984e-05, "loss": 0.1517, "step": 14578 }, { "epoch": 2.9377392706024583, "grad_norm": 0.0447571687400341, "learning_rate": 3.9732196703220916e-05, "loss": 0.1898, "step": 14580 }, { "epoch": 2.9381422526697563, "grad_norm": 0.06839301437139511, "learning_rate": 3.971915457669017e-05, "loss": 0.2024, "step": 14582 }, { "epoch": 2.938545234737054, "grad_norm": 0.08592922985553741, "learning_rate": 3.970611318049425e-05, "loss": 0.2404, "step": 14584 }, { "epoch": 2.938948216804352, "grad_norm": 0.05581474304199219, "learning_rate": 3.969307251555956e-05, "loss": 0.1685, "step": 14586 }, { "epoch": 2.93935119887165, "grad_norm": 0.057193271815776825, "learning_rate": 3.9680032582812546e-05, "loss": 0.1826, "step": 14588 }, { "epoch": 2.939754180938948, "grad_norm": 0.06702768057584763, "learning_rate": 3.966699338317949e-05, "loss": 0.1922, "step": 14590 }, { "epoch": 2.940157163006246, "grad_norm": 0.0606420673429966, "learning_rate": 3.965395491758672e-05, "loss": 0.2224, "step": 14592 }, { "epoch": 2.9405601450735444, "grad_norm": 0.0733923390507698, "learning_rate": 3.964091718696043e-05, "loss": 0.2127, "step": 14594 }, { "epoch": 2.9409631271408423, "grad_norm": 0.06314688920974731, "learning_rate": 3.962788019222683e-05, "loss": 0.1973, "step": 14596 }, { "epoch": 2.9413661092081402, "grad_norm": 0.056110117584466934, "learning_rate": 3.9614843934312005e-05, "loss": 0.2015, "step": 14598 }, { "epoch": 2.941769091275438, "grad_norm": 0.05174252390861511, "learning_rate": 3.960180841414209e-05, "loss": 0.1905, "step": 14600 }, { "epoch": 2.942172073342736, "grad_norm": 0.05452876538038254, "learning_rate": 3.958877363264306e-05, "loss": 0.1852, "step": 14602 }, { "epoch": 2.9425750554100345, "grad_norm": 0.09879107773303986, "learning_rate": 3.957573959074091e-05, "loss": 0.2238, "step": 14604 }, { "epoch": 2.9429780374773324, "grad_norm": 0.03528512269258499, "learning_rate": 3.956270628936154e-05, "loss": 0.166, "step": 14606 }, { "epoch": 2.9433810195446304, "grad_norm": 0.08624977618455887, "learning_rate": 3.9549673729430837e-05, "loss": 0.2206, "step": 14608 }, { "epoch": 2.9437840016119283, "grad_norm": 0.04483683779835701, "learning_rate": 3.9536641911874575e-05, "loss": 0.1809, "step": 14610 }, { "epoch": 2.9441869836792263, "grad_norm": 0.05532107129693031, "learning_rate": 3.9523610837618565e-05, "loss": 0.1945, "step": 14612 }, { "epoch": 2.944589965746524, "grad_norm": 0.04578416794538498, "learning_rate": 3.951058050758846e-05, "loss": 0.1713, "step": 14614 }, { "epoch": 2.944992947813822, "grad_norm": 0.06979301571846008, "learning_rate": 3.949755092270996e-05, "loss": 0.1939, "step": 14616 }, { "epoch": 2.94539592988112, "grad_norm": 0.05871470272541046, "learning_rate": 3.948452208390864e-05, "loss": 0.1924, "step": 14618 }, { "epoch": 2.945798911948418, "grad_norm": 0.07979889214038849, "learning_rate": 3.947149399211006e-05, "loss": 0.1894, "step": 14620 }, { "epoch": 2.9462018940157164, "grad_norm": 0.0501592755317688, "learning_rate": 3.945846664823969e-05, "loss": 0.1665, "step": 14622 }, { "epoch": 2.9466048760830144, "grad_norm": 0.05370146036148071, "learning_rate": 3.944544005322301e-05, "loss": 0.1336, "step": 14624 }, { "epoch": 2.9470078581503123, "grad_norm": 0.07011505961418152, "learning_rate": 3.943241420798538e-05, "loss": 0.1929, "step": 14626 }, { "epoch": 2.9474108402176102, "grad_norm": 0.06304550915956497, "learning_rate": 3.941938911345215e-05, "loss": 0.1602, "step": 14628 }, { "epoch": 2.947813822284908, "grad_norm": 0.05381280928850174, "learning_rate": 3.940636477054859e-05, "loss": 0.1834, "step": 14630 }, { "epoch": 2.9482168043522066, "grad_norm": 0.08575271815061569, "learning_rate": 3.9393341180199944e-05, "loss": 0.2101, "step": 14632 }, { "epoch": 2.9486197864195045, "grad_norm": 0.060505758970975876, "learning_rate": 3.9380318343331357e-05, "loss": 0.1701, "step": 14634 }, { "epoch": 2.9490227684868024, "grad_norm": 0.05807434022426605, "learning_rate": 3.9367296260868e-05, "loss": 0.1929, "step": 14636 }, { "epoch": 2.9494257505541004, "grad_norm": 0.07592857629060745, "learning_rate": 3.935427493373489e-05, "loss": 0.2111, "step": 14638 }, { "epoch": 2.9498287326213983, "grad_norm": 0.04870220646262169, "learning_rate": 3.934125436285708e-05, "loss": 0.167, "step": 14640 }, { "epoch": 2.9502317146886963, "grad_norm": 0.09737106412649155, "learning_rate": 3.93282345491595e-05, "loss": 0.1648, "step": 14642 }, { "epoch": 2.950634696755994, "grad_norm": 0.06507037580013275, "learning_rate": 3.931521549356708e-05, "loss": 0.1759, "step": 14644 }, { "epoch": 2.951037678823292, "grad_norm": 0.04871666431427002, "learning_rate": 3.930219719700466e-05, "loss": 0.1689, "step": 14646 }, { "epoch": 2.95144066089059, "grad_norm": 0.07162641733884811, "learning_rate": 3.928917966039705e-05, "loss": 0.2247, "step": 14648 }, { "epoch": 2.9518436429578885, "grad_norm": 0.04025919735431671, "learning_rate": 3.927616288466896e-05, "loss": 0.1745, "step": 14650 }, { "epoch": 2.9522466250251864, "grad_norm": 0.05081998184323311, "learning_rate": 3.926314687074514e-05, "loss": 0.218, "step": 14652 }, { "epoch": 2.9526496070924844, "grad_norm": 0.05820939689874649, "learning_rate": 3.925013161955018e-05, "loss": 0.1842, "step": 14654 }, { "epoch": 2.9530525891597823, "grad_norm": 0.06399496644735336, "learning_rate": 3.923711713200868e-05, "loss": 0.2142, "step": 14656 }, { "epoch": 2.9534555712270802, "grad_norm": 0.04385112598538399, "learning_rate": 3.9224103409045164e-05, "loss": 0.1394, "step": 14658 }, { "epoch": 2.9538585532943786, "grad_norm": 0.053807783871889114, "learning_rate": 3.921109045158412e-05, "loss": 0.2103, "step": 14660 }, { "epoch": 2.9542615353616766, "grad_norm": 0.05360929295420647, "learning_rate": 3.9198078260549936e-05, "loss": 0.2018, "step": 14662 }, { "epoch": 2.9546645174289745, "grad_norm": 0.074669249355793, "learning_rate": 3.918506683686702e-05, "loss": 0.1858, "step": 14664 }, { "epoch": 2.9550674994962725, "grad_norm": 0.06290856748819351, "learning_rate": 3.917205618145964e-05, "loss": 0.2006, "step": 14666 }, { "epoch": 2.9554704815635704, "grad_norm": 0.06198323518037796, "learning_rate": 3.915904629525209e-05, "loss": 0.214, "step": 14668 }, { "epoch": 2.9558734636308683, "grad_norm": 0.05055319517850876, "learning_rate": 3.914603717916854e-05, "loss": 0.1624, "step": 14670 }, { "epoch": 2.9562764456981663, "grad_norm": 0.08773225545883179, "learning_rate": 3.913302883413316e-05, "loss": 0.1928, "step": 14672 }, { "epoch": 2.956679427765464, "grad_norm": 0.053192343562841415, "learning_rate": 3.912002126107002e-05, "loss": 0.1594, "step": 14674 }, { "epoch": 2.9570824098327626, "grad_norm": 0.04311097785830498, "learning_rate": 3.910701446090318e-05, "loss": 0.1447, "step": 14676 }, { "epoch": 2.9574853919000605, "grad_norm": 0.048903919756412506, "learning_rate": 3.9094008434556603e-05, "loss": 0.1815, "step": 14678 }, { "epoch": 2.9578883739673585, "grad_norm": 0.0562017560005188, "learning_rate": 3.908100318295424e-05, "loss": 0.2002, "step": 14680 }, { "epoch": 2.9582913560346564, "grad_norm": 0.05847121402621269, "learning_rate": 3.906799870701994e-05, "loss": 0.1946, "step": 14682 }, { "epoch": 2.9586943381019544, "grad_norm": 0.05873006582260132, "learning_rate": 3.905499500767753e-05, "loss": 0.1812, "step": 14684 }, { "epoch": 2.9590973201692523, "grad_norm": 0.08028661459684372, "learning_rate": 3.904199208585076e-05, "loss": 0.2255, "step": 14686 }, { "epoch": 2.9595003022365507, "grad_norm": 0.08308486640453339, "learning_rate": 3.902898994246337e-05, "loss": 0.1943, "step": 14688 }, { "epoch": 2.9599032843038486, "grad_norm": 0.044617436826229095, "learning_rate": 3.901598857843896e-05, "loss": 0.1629, "step": 14690 }, { "epoch": 2.9603062663711466, "grad_norm": 0.040800608694553375, "learning_rate": 3.900298799470118e-05, "loss": 0.1431, "step": 14692 }, { "epoch": 2.9607092484384445, "grad_norm": 0.08411987870931625, "learning_rate": 3.898998819217353e-05, "loss": 0.2145, "step": 14694 }, { "epoch": 2.9611122305057425, "grad_norm": 0.06618909537792206, "learning_rate": 3.8976989171779535e-05, "loss": 0.1955, "step": 14696 }, { "epoch": 2.9615152125730404, "grad_norm": 0.06884243339300156, "learning_rate": 3.896399093444256e-05, "loss": 0.209, "step": 14698 }, { "epoch": 2.9619181946403383, "grad_norm": 0.07660902291536331, "learning_rate": 3.8950993481086065e-05, "loss": 0.2158, "step": 14700 }, { "epoch": 2.9623211767076363, "grad_norm": 0.09313687682151794, "learning_rate": 3.893799681263328e-05, "loss": 0.2272, "step": 14702 }, { "epoch": 2.9627241587749347, "grad_norm": 0.05195392668247223, "learning_rate": 3.892500093000755e-05, "loss": 0.1562, "step": 14704 }, { "epoch": 2.9631271408422326, "grad_norm": 0.09103037416934967, "learning_rate": 3.891200583413201e-05, "loss": 0.1454, "step": 14706 }, { "epoch": 2.9635301229095306, "grad_norm": 0.056169308722019196, "learning_rate": 3.8899011525929863e-05, "loss": 0.2122, "step": 14708 }, { "epoch": 2.9639331049768285, "grad_norm": 0.049981966614723206, "learning_rate": 3.8886018006324174e-05, "loss": 0.1725, "step": 14710 }, { "epoch": 2.9643360870441264, "grad_norm": 0.03693721070885658, "learning_rate": 3.8873025276238004e-05, "loss": 0.1361, "step": 14712 }, { "epoch": 2.964739069111425, "grad_norm": 0.06162644922733307, "learning_rate": 3.88600333365943e-05, "loss": 0.1606, "step": 14714 }, { "epoch": 2.9651420511787228, "grad_norm": 0.0703161209821701, "learning_rate": 3.884704218831603e-05, "loss": 0.1939, "step": 14716 }, { "epoch": 2.9655450332460207, "grad_norm": 0.03999608755111694, "learning_rate": 3.883405183232604e-05, "loss": 0.1514, "step": 14718 }, { "epoch": 2.9659480153133186, "grad_norm": 0.06137223541736603, "learning_rate": 3.882106226954716e-05, "loss": 0.1938, "step": 14720 }, { "epoch": 2.9663509973806166, "grad_norm": 0.061834823340177536, "learning_rate": 3.880807350090213e-05, "loss": 0.1996, "step": 14722 }, { "epoch": 2.9667539794479145, "grad_norm": 0.07341071218252182, "learning_rate": 3.879508552731366e-05, "loss": 0.1703, "step": 14724 }, { "epoch": 2.9671569615152125, "grad_norm": 0.04776541888713837, "learning_rate": 3.878209834970438e-05, "loss": 0.1597, "step": 14726 }, { "epoch": 2.9675599435825104, "grad_norm": 0.04152734950184822, "learning_rate": 3.876911196899692e-05, "loss": 0.1617, "step": 14728 }, { "epoch": 2.9679629256498083, "grad_norm": 0.05166812613606453, "learning_rate": 3.8756126386113766e-05, "loss": 0.1923, "step": 14730 }, { "epoch": 2.9683659077171067, "grad_norm": 0.058748915791511536, "learning_rate": 3.874314160197743e-05, "loss": 0.1995, "step": 14732 }, { "epoch": 2.9687688897844047, "grad_norm": 0.07406263053417206, "learning_rate": 3.8730157617510295e-05, "loss": 0.2137, "step": 14734 }, { "epoch": 2.9691718718517026, "grad_norm": 0.06733974814414978, "learning_rate": 3.871717443363475e-05, "loss": 0.1814, "step": 14736 }, { "epoch": 2.9695748539190006, "grad_norm": 0.0653805136680603, "learning_rate": 3.870419205127307e-05, "loss": 0.1673, "step": 14738 }, { "epoch": 2.9699778359862985, "grad_norm": 0.06307506561279297, "learning_rate": 3.869121047134754e-05, "loss": 0.2257, "step": 14740 }, { "epoch": 2.970380818053597, "grad_norm": 0.0746563971042633, "learning_rate": 3.8678229694780324e-05, "loss": 0.169, "step": 14742 }, { "epoch": 2.970783800120895, "grad_norm": 0.06383204460144043, "learning_rate": 3.8665249722493576e-05, "loss": 0.2629, "step": 14744 }, { "epoch": 2.9711867821881928, "grad_norm": 0.0686652734875679, "learning_rate": 3.865227055540935e-05, "loss": 0.1795, "step": 14746 }, { "epoch": 2.9715897642554907, "grad_norm": 0.07736478000879288, "learning_rate": 3.863929219444968e-05, "loss": 0.1945, "step": 14748 }, { "epoch": 2.9719927463227886, "grad_norm": 0.0649065375328064, "learning_rate": 3.862631464053651e-05, "loss": 0.1543, "step": 14750 }, { "epoch": 2.9723957283900866, "grad_norm": 0.06521575897932053, "learning_rate": 3.861333789459178e-05, "loss": 0.2298, "step": 14752 }, { "epoch": 2.9727987104573845, "grad_norm": 0.057148393243551254, "learning_rate": 3.8600361957537296e-05, "loss": 0.2069, "step": 14754 }, { "epoch": 2.9732016925246825, "grad_norm": 0.07252558320760727, "learning_rate": 3.858738683029489e-05, "loss": 0.2158, "step": 14756 }, { "epoch": 2.9736046745919804, "grad_norm": 0.055180639028549194, "learning_rate": 3.857441251378625e-05, "loss": 0.224, "step": 14758 }, { "epoch": 2.974007656659279, "grad_norm": 0.05758465826511383, "learning_rate": 3.856143900893309e-05, "loss": 0.1997, "step": 14760 }, { "epoch": 2.9744106387265767, "grad_norm": 0.0825326144695282, "learning_rate": 3.8548466316656985e-05, "loss": 0.1782, "step": 14762 }, { "epoch": 2.9748136207938747, "grad_norm": 0.0534152127802372, "learning_rate": 3.853549443787955e-05, "loss": 0.1905, "step": 14764 }, { "epoch": 2.9752166028611726, "grad_norm": 0.049097124487161636, "learning_rate": 3.852252337352223e-05, "loss": 0.2191, "step": 14766 }, { "epoch": 2.9756195849284706, "grad_norm": 0.04427768290042877, "learning_rate": 3.850955312450651e-05, "loss": 0.1987, "step": 14768 }, { "epoch": 2.976022566995769, "grad_norm": 0.05539549142122269, "learning_rate": 3.849658369175375e-05, "loss": 0.1895, "step": 14770 }, { "epoch": 2.976425549063067, "grad_norm": 0.05405467003583908, "learning_rate": 3.84836150761853e-05, "loss": 0.1869, "step": 14772 }, { "epoch": 2.976828531130365, "grad_norm": 0.05085950717329979, "learning_rate": 3.8470647278722404e-05, "loss": 0.1758, "step": 14774 }, { "epoch": 2.9772315131976628, "grad_norm": 0.061146095395088196, "learning_rate": 3.84576803002863e-05, "loss": 0.1606, "step": 14776 }, { "epoch": 2.9776344952649607, "grad_norm": 0.046381138265132904, "learning_rate": 3.8444714141798106e-05, "loss": 0.1816, "step": 14778 }, { "epoch": 2.9780374773322587, "grad_norm": 0.06234044209122658, "learning_rate": 3.843174880417896e-05, "loss": 0.199, "step": 14780 }, { "epoch": 2.9784404593995566, "grad_norm": 0.05754299834370613, "learning_rate": 3.8418784288349865e-05, "loss": 0.1671, "step": 14782 }, { "epoch": 2.9788434414668545, "grad_norm": 0.07325414568185806, "learning_rate": 3.840582059523182e-05, "loss": 0.2082, "step": 14784 }, { "epoch": 2.9792464235341525, "grad_norm": 0.041347935795784, "learning_rate": 3.839285772574574e-05, "loss": 0.1734, "step": 14786 }, { "epoch": 2.979649405601451, "grad_norm": 0.07269836962223053, "learning_rate": 3.837989568081249e-05, "loss": 0.2257, "step": 14788 }, { "epoch": 2.980052387668749, "grad_norm": 0.060071539133787155, "learning_rate": 3.8366934461352846e-05, "loss": 0.1823, "step": 14790 }, { "epoch": 2.9804553697360467, "grad_norm": 0.0773068517446518, "learning_rate": 3.835397406828759e-05, "loss": 0.1749, "step": 14792 }, { "epoch": 2.9808583518033447, "grad_norm": 0.048480074852705, "learning_rate": 3.834101450253738e-05, "loss": 0.1994, "step": 14794 }, { "epoch": 2.9812613338706426, "grad_norm": 0.05097859725356102, "learning_rate": 3.832805576502287e-05, "loss": 0.2009, "step": 14796 }, { "epoch": 2.981664315937941, "grad_norm": 0.05600469186902046, "learning_rate": 3.83150978566646e-05, "loss": 0.2094, "step": 14798 }, { "epoch": 2.982067298005239, "grad_norm": 0.05589543282985687, "learning_rate": 3.83021407783831e-05, "loss": 0.1738, "step": 14800 }, { "epoch": 2.982470280072537, "grad_norm": 0.04236073046922684, "learning_rate": 3.8289184531098795e-05, "loss": 0.1333, "step": 14802 }, { "epoch": 2.982873262139835, "grad_norm": 0.04489387571811676, "learning_rate": 3.82762291157321e-05, "loss": 0.1528, "step": 14804 }, { "epoch": 2.9832762442071328, "grad_norm": 0.05915519595146179, "learning_rate": 3.826327453320334e-05, "loss": 0.2527, "step": 14806 }, { "epoch": 2.9836792262744307, "grad_norm": 0.0606289803981781, "learning_rate": 3.8250320784432805e-05, "loss": 0.1475, "step": 14808 }, { "epoch": 2.9840822083417287, "grad_norm": 0.06469710171222687, "learning_rate": 3.823736787034067e-05, "loss": 0.2019, "step": 14810 }, { "epoch": 2.9844851904090266, "grad_norm": 0.058138661086559296, "learning_rate": 3.822441579184712e-05, "loss": 0.2144, "step": 14812 }, { "epoch": 2.9848881724763245, "grad_norm": 0.05327381566166878, "learning_rate": 3.8211464549872214e-05, "loss": 0.1871, "step": 14814 }, { "epoch": 2.985291154543623, "grad_norm": 0.05329843983054161, "learning_rate": 3.819851414533604e-05, "loss": 0.2019, "step": 14816 }, { "epoch": 2.985694136610921, "grad_norm": 0.07764440774917603, "learning_rate": 3.818556457915854e-05, "loss": 0.1886, "step": 14818 }, { "epoch": 2.986097118678219, "grad_norm": 0.05114993825554848, "learning_rate": 3.8172615852259644e-05, "loss": 0.225, "step": 14820 }, { "epoch": 2.9865001007455168, "grad_norm": 0.059108905494213104, "learning_rate": 3.81596679655592e-05, "loss": 0.2137, "step": 14822 }, { "epoch": 2.9869030828128147, "grad_norm": 0.060795605182647705, "learning_rate": 3.8146720919977005e-05, "loss": 0.2168, "step": 14824 }, { "epoch": 2.987306064880113, "grad_norm": 0.0726071447134018, "learning_rate": 3.813377471643279e-05, "loss": 0.1937, "step": 14826 }, { "epoch": 2.987709046947411, "grad_norm": 0.06462717056274414, "learning_rate": 3.812082935584627e-05, "loss": 0.2359, "step": 14828 }, { "epoch": 2.988112029014709, "grad_norm": 0.0547826811671257, "learning_rate": 3.8107884839137e-05, "loss": 0.1894, "step": 14830 }, { "epoch": 2.988515011082007, "grad_norm": 0.047724831849336624, "learning_rate": 3.80949411672246e-05, "loss": 0.2132, "step": 14832 }, { "epoch": 2.988917993149305, "grad_norm": 0.04862813651561737, "learning_rate": 3.808199834102852e-05, "loss": 0.2272, "step": 14834 }, { "epoch": 2.989320975216603, "grad_norm": 0.05972544103860855, "learning_rate": 3.806905636146824e-05, "loss": 0.2171, "step": 14836 }, { "epoch": 2.9897239572839007, "grad_norm": 0.06059863418340683, "learning_rate": 3.8056115229463086e-05, "loss": 0.2395, "step": 14838 }, { "epoch": 2.9901269393511987, "grad_norm": 0.061624523252248764, "learning_rate": 3.804317494593244e-05, "loss": 0.2317, "step": 14840 }, { "epoch": 2.9905299214184966, "grad_norm": 0.05651069059967995, "learning_rate": 3.8030235511795484e-05, "loss": 0.1882, "step": 14842 }, { "epoch": 2.990932903485795, "grad_norm": 0.047117821872234344, "learning_rate": 3.801729692797149e-05, "loss": 0.1767, "step": 14844 }, { "epoch": 2.991335885553093, "grad_norm": 0.058522630482912064, "learning_rate": 3.800435919537953e-05, "loss": 0.227, "step": 14846 }, { "epoch": 2.991738867620391, "grad_norm": 0.06430003046989441, "learning_rate": 3.799142231493873e-05, "loss": 0.2436, "step": 14848 }, { "epoch": 2.992141849687689, "grad_norm": 0.03316927328705788, "learning_rate": 3.7978486287568076e-05, "loss": 0.1492, "step": 14850 }, { "epoch": 2.9925448317549868, "grad_norm": 0.07100309431552887, "learning_rate": 3.796555111418654e-05, "loss": 0.1636, "step": 14852 }, { "epoch": 2.992947813822285, "grad_norm": 0.05562622845172882, "learning_rate": 3.795261679571298e-05, "loss": 0.2164, "step": 14854 }, { "epoch": 2.993350795889583, "grad_norm": 0.06413529068231583, "learning_rate": 3.7939683333066276e-05, "loss": 0.1957, "step": 14856 }, { "epoch": 2.993753777956881, "grad_norm": 0.07106629014015198, "learning_rate": 3.7926750727165175e-05, "loss": 0.1811, "step": 14858 }, { "epoch": 2.994156760024179, "grad_norm": 0.06832096725702286, "learning_rate": 3.7913818978928403e-05, "loss": 0.1592, "step": 14860 }, { "epoch": 2.994559742091477, "grad_norm": 0.04877585172653198, "learning_rate": 3.790088808927459e-05, "loss": 0.2011, "step": 14862 }, { "epoch": 2.994962724158775, "grad_norm": 0.08021704107522964, "learning_rate": 3.788795805912235e-05, "loss": 0.2259, "step": 14864 }, { "epoch": 2.995365706226073, "grad_norm": 0.05000881478190422, "learning_rate": 3.787502888939019e-05, "loss": 0.1734, "step": 14866 }, { "epoch": 2.9957686882933707, "grad_norm": 0.06632080674171448, "learning_rate": 3.786210058099659e-05, "loss": 0.179, "step": 14868 }, { "epoch": 2.996171670360669, "grad_norm": 0.05244016274809837, "learning_rate": 3.784917313485995e-05, "loss": 0.2026, "step": 14870 }, { "epoch": 2.996574652427967, "grad_norm": 0.07437470555305481, "learning_rate": 3.783624655189862e-05, "loss": 0.2714, "step": 14872 }, { "epoch": 2.996977634495265, "grad_norm": 0.053574662655591965, "learning_rate": 3.7823320833030885e-05, "loss": 0.1724, "step": 14874 }, { "epoch": 2.997380616562563, "grad_norm": 0.055552784353494644, "learning_rate": 3.781039597917496e-05, "loss": 0.1951, "step": 14876 }, { "epoch": 2.997783598629861, "grad_norm": 0.04839596152305603, "learning_rate": 3.7797471991249e-05, "loss": 0.1626, "step": 14878 }, { "epoch": 2.998186580697159, "grad_norm": 0.06522393971681595, "learning_rate": 3.778454887017113e-05, "loss": 0.2036, "step": 14880 }, { "epoch": 2.998589562764457, "grad_norm": 0.07289113104343414, "learning_rate": 3.777162661685937e-05, "loss": 0.2254, "step": 14882 }, { "epoch": 2.998992544831755, "grad_norm": 0.05435548722743988, "learning_rate": 3.77587052322317e-05, "loss": 0.166, "step": 14884 }, { "epoch": 2.999395526899053, "grad_norm": 0.062487825751304626, "learning_rate": 3.774578471720603e-05, "loss": 0.2084, "step": 14886 }, { "epoch": 2.999798508966351, "grad_norm": 0.05912657454609871, "learning_rate": 3.7732865072700225e-05, "loss": 0.1823, "step": 14888 } ], "logging_steps": 2, "max_steps": 24815, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.266637847050322e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }