diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,52129 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 14889, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00040298206729800525, + "grad_norm": 0.26894956827163696, + "learning_rate": 1.6116035455278e-07, + "loss": 0.3796, + "step": 2 + }, + { + "epoch": 0.0008059641345960105, + "grad_norm": 0.26727548241615295, + "learning_rate": 3.2232070910556e-07, + "loss": 0.4024, + "step": 4 + }, + { + "epoch": 0.0012089462018940156, + "grad_norm": 0.3290499448776245, + "learning_rate": 4.834810636583401e-07, + "loss": 0.3773, + "step": 6 + }, + { + "epoch": 0.001611928269192021, + "grad_norm": 0.2747355103492737, + "learning_rate": 6.4464141821112e-07, + "loss": 0.4127, + "step": 8 + }, + { + "epoch": 0.0020149103364900263, + "grad_norm": 0.19865649938583374, + "learning_rate": 8.058017727639e-07, + "loss": 0.3967, + "step": 10 + }, + { + "epoch": 0.0024178924037880313, + "grad_norm": 0.20322462916374207, + "learning_rate": 9.669621273166802e-07, + "loss": 0.4543, + "step": 12 + }, + { + "epoch": 0.0028208744710860366, + "grad_norm": 0.19949519634246826, + "learning_rate": 1.1281224818694602e-06, + "loss": 0.3594, + "step": 14 + }, + { + "epoch": 0.003223856538384042, + "grad_norm": 0.20132654905319214, + "learning_rate": 1.28928283642224e-06, + "loss": 0.418, + "step": 16 + }, + { + "epoch": 0.0036268386056820473, + "grad_norm": 0.22182698547840118, + "learning_rate": 1.4504431909750204e-06, + "loss": 0.4144, + "step": 18 + }, + { + "epoch": 0.004029820672980053, + "grad_norm": 0.2903820872306824, + "learning_rate": 1.6116035455278e-06, + "loss": 0.3906, + "step": 20 + }, + { + "epoch": 0.004432802740278058, + "grad_norm": 0.20233897864818573, + "learning_rate": 1.7727639000805805e-06, + "loss": 0.3694, + "step": 22 + }, + { + "epoch": 0.0048357848075760625, + "grad_norm": 0.21239183843135834, + "learning_rate": 1.9339242546333603e-06, + "loss": 0.4169, + "step": 24 + }, + { + "epoch": 0.005238766874874068, + "grad_norm": 0.3125857710838318, + "learning_rate": 2.09508460918614e-06, + "loss": 0.4125, + "step": 26 + }, + { + "epoch": 0.005641748942172073, + "grad_norm": 0.1859792172908783, + "learning_rate": 2.2562449637389205e-06, + "loss": 0.3919, + "step": 28 + }, + { + "epoch": 0.006044731009470079, + "grad_norm": 0.38755470514297485, + "learning_rate": 2.4174053182917003e-06, + "loss": 0.4378, + "step": 30 + }, + { + "epoch": 0.006447713076768084, + "grad_norm": 0.2895958125591278, + "learning_rate": 2.57856567284448e-06, + "loss": 0.3866, + "step": 32 + }, + { + "epoch": 0.006850695144066089, + "grad_norm": 0.20762863755226135, + "learning_rate": 2.7397260273972604e-06, + "loss": 0.4852, + "step": 34 + }, + { + "epoch": 0.007253677211364095, + "grad_norm": 0.1820192039012909, + "learning_rate": 2.9008863819500407e-06, + "loss": 0.3758, + "step": 36 + }, + { + "epoch": 0.007656659278662099, + "grad_norm": 0.21095845103263855, + "learning_rate": 3.0620467365028206e-06, + "loss": 0.3388, + "step": 38 + }, + { + "epoch": 0.008059641345960105, + "grad_norm": 0.17223376035690308, + "learning_rate": 3.2232070910556e-06, + "loss": 0.3846, + "step": 40 + }, + { + "epoch": 0.00846262341325811, + "grad_norm": 0.28751063346862793, + "learning_rate": 3.3843674456083807e-06, + "loss": 0.424, + "step": 42 + }, + { + "epoch": 0.008865605480556116, + "grad_norm": 0.22931885719299316, + "learning_rate": 3.545527800161161e-06, + "loss": 0.3927, + "step": 44 + }, + { + "epoch": 0.00926858754785412, + "grad_norm": 0.17054371535778046, + "learning_rate": 3.706688154713941e-06, + "loss": 0.3698, + "step": 46 + }, + { + "epoch": 0.009671569615152125, + "grad_norm": 0.1970645934343338, + "learning_rate": 3.867848509266721e-06, + "loss": 0.3825, + "step": 48 + }, + { + "epoch": 0.01007455168245013, + "grad_norm": 0.226581409573555, + "learning_rate": 4.0290088638195005e-06, + "loss": 0.388, + "step": 50 + }, + { + "epoch": 0.010477533749748136, + "grad_norm": 0.23808637261390686, + "learning_rate": 4.19016921837228e-06, + "loss": 0.3802, + "step": 52 + }, + { + "epoch": 0.010880515817046141, + "grad_norm": 0.23428556323051453, + "learning_rate": 4.351329572925061e-06, + "loss": 0.4011, + "step": 54 + }, + { + "epoch": 0.011283497884344146, + "grad_norm": 0.22363170981407166, + "learning_rate": 4.512489927477841e-06, + "loss": 0.388, + "step": 56 + }, + { + "epoch": 0.011686479951642152, + "grad_norm": 0.19610151648521423, + "learning_rate": 4.673650282030621e-06, + "loss": 0.3456, + "step": 58 + }, + { + "epoch": 0.012089462018940157, + "grad_norm": 0.32007136940956116, + "learning_rate": 4.834810636583401e-06, + "loss": 0.4075, + "step": 60 + }, + { + "epoch": 0.012492444086238163, + "grad_norm": 0.2625230550765991, + "learning_rate": 4.9959709911361805e-06, + "loss": 0.3101, + "step": 62 + }, + { + "epoch": 0.012895426153536168, + "grad_norm": 0.25986766815185547, + "learning_rate": 5.15713134568896e-06, + "loss": 0.3616, + "step": 64 + }, + { + "epoch": 0.013298408220834173, + "grad_norm": 0.3018483817577362, + "learning_rate": 5.31829170024174e-06, + "loss": 0.3482, + "step": 66 + }, + { + "epoch": 0.013701390288132179, + "grad_norm": 0.23337864875793457, + "learning_rate": 5.479452054794521e-06, + "loss": 0.3467, + "step": 68 + }, + { + "epoch": 0.014104372355430184, + "grad_norm": 0.19600503146648407, + "learning_rate": 5.6406124093473016e-06, + "loss": 0.3617, + "step": 70 + }, + { + "epoch": 0.01450735442272819, + "grad_norm": 0.24549278616905212, + "learning_rate": 5.801772763900081e-06, + "loss": 0.355, + "step": 72 + }, + { + "epoch": 0.014910336490026195, + "grad_norm": 0.2724650502204895, + "learning_rate": 5.962933118452861e-06, + "loss": 0.366, + "step": 74 + }, + { + "epoch": 0.015313318557324198, + "grad_norm": 0.179411381483078, + "learning_rate": 6.124093473005641e-06, + "loss": 0.3963, + "step": 76 + }, + { + "epoch": 0.015716300624622204, + "grad_norm": 0.2288782000541687, + "learning_rate": 6.285253827558421e-06, + "loss": 0.3481, + "step": 78 + }, + { + "epoch": 0.01611928269192021, + "grad_norm": 0.27673980593681335, + "learning_rate": 6.4464141821112e-06, + "loss": 0.3715, + "step": 80 + }, + { + "epoch": 0.016522264759218214, + "grad_norm": 0.19291406869888306, + "learning_rate": 6.607574536663981e-06, + "loss": 0.2958, + "step": 82 + }, + { + "epoch": 0.01692524682651622, + "grad_norm": 0.2432788461446762, + "learning_rate": 6.768734891216761e-06, + "loss": 0.3135, + "step": 84 + }, + { + "epoch": 0.017328228893814225, + "grad_norm": 0.35042449831962585, + "learning_rate": 6.929895245769541e-06, + "loss": 0.289, + "step": 86 + }, + { + "epoch": 0.017731210961112232, + "grad_norm": 0.2282453328371048, + "learning_rate": 7.091055600322322e-06, + "loss": 0.3255, + "step": 88 + }, + { + "epoch": 0.018134193028410236, + "grad_norm": 0.2614617347717285, + "learning_rate": 7.252215954875101e-06, + "loss": 0.3164, + "step": 90 + }, + { + "epoch": 0.01853717509570824, + "grad_norm": 0.20971183478832245, + "learning_rate": 7.413376309427882e-06, + "loss": 0.2639, + "step": 92 + }, + { + "epoch": 0.018940157163006247, + "grad_norm": 0.2980507016181946, + "learning_rate": 7.574536663980661e-06, + "loss": 0.2453, + "step": 94 + }, + { + "epoch": 0.01934313923030425, + "grad_norm": 0.27161258459091187, + "learning_rate": 7.735697018533441e-06, + "loss": 0.3087, + "step": 96 + }, + { + "epoch": 0.019746121297602257, + "grad_norm": 0.2935725450515747, + "learning_rate": 7.89685737308622e-06, + "loss": 0.2627, + "step": 98 + }, + { + "epoch": 0.02014910336490026, + "grad_norm": 0.20042681694030762, + "learning_rate": 8.058017727639001e-06, + "loss": 0.3023, + "step": 100 + }, + { + "epoch": 0.020552085432198268, + "grad_norm": 0.2027212679386139, + "learning_rate": 8.21917808219178e-06, + "loss": 0.263, + "step": 102 + }, + { + "epoch": 0.02095506749949627, + "grad_norm": 0.23784109950065613, + "learning_rate": 8.38033843674456e-06, + "loss": 0.2263, + "step": 104 + }, + { + "epoch": 0.02135804956679428, + "grad_norm": 0.2175307422876358, + "learning_rate": 8.541498791297341e-06, + "loss": 0.2717, + "step": 106 + }, + { + "epoch": 0.021761031634092282, + "grad_norm": 0.38441169261932373, + "learning_rate": 8.702659145850122e-06, + "loss": 0.253, + "step": 108 + }, + { + "epoch": 0.02216401370139029, + "grad_norm": 0.32977694272994995, + "learning_rate": 8.863819500402901e-06, + "loss": 0.2557, + "step": 110 + }, + { + "epoch": 0.022566995768688293, + "grad_norm": 0.35294121503829956, + "learning_rate": 9.024979854955682e-06, + "loss": 0.2767, + "step": 112 + }, + { + "epoch": 0.0229699778359863, + "grad_norm": 0.2736724019050598, + "learning_rate": 9.186140209508463e-06, + "loss": 0.2394, + "step": 114 + }, + { + "epoch": 0.023372959903284304, + "grad_norm": 0.48598694801330566, + "learning_rate": 9.347300564061242e-06, + "loss": 0.2349, + "step": 116 + }, + { + "epoch": 0.02377594197058231, + "grad_norm": 0.2282780408859253, + "learning_rate": 9.508460918614022e-06, + "loss": 0.284, + "step": 118 + }, + { + "epoch": 0.024178924037880314, + "grad_norm": 0.297885537147522, + "learning_rate": 9.669621273166801e-06, + "loss": 0.2087, + "step": 120 + }, + { + "epoch": 0.024581906105178318, + "grad_norm": 0.1727965772151947, + "learning_rate": 9.830781627719582e-06, + "loss": 0.2552, + "step": 122 + }, + { + "epoch": 0.024984888172476325, + "grad_norm": 0.2835056781768799, + "learning_rate": 9.991941982272361e-06, + "loss": 0.2698, + "step": 124 + }, + { + "epoch": 0.02538787023977433, + "grad_norm": 0.29209110140800476, + "learning_rate": 1.0153102336825142e-05, + "loss": 0.1992, + "step": 126 + }, + { + "epoch": 0.025790852307072336, + "grad_norm": 0.31140977144241333, + "learning_rate": 1.031426269137792e-05, + "loss": 0.224, + "step": 128 + }, + { + "epoch": 0.02619383437437034, + "grad_norm": 0.3317878842353821, + "learning_rate": 1.0475423045930701e-05, + "loss": 0.216, + "step": 130 + }, + { + "epoch": 0.026596816441668347, + "grad_norm": 0.22915571928024292, + "learning_rate": 1.063658340048348e-05, + "loss": 0.2123, + "step": 132 + }, + { + "epoch": 0.02699979850896635, + "grad_norm": 0.2686645984649658, + "learning_rate": 1.0797743755036261e-05, + "loss": 0.2238, + "step": 134 + }, + { + "epoch": 0.027402780576264357, + "grad_norm": 0.22468291223049164, + "learning_rate": 1.0958904109589042e-05, + "loss": 0.251, + "step": 136 + }, + { + "epoch": 0.02780576264356236, + "grad_norm": 0.5836780667304993, + "learning_rate": 1.1120064464141822e-05, + "loss": 0.2242, + "step": 138 + }, + { + "epoch": 0.028208744710860368, + "grad_norm": 0.6434018611907959, + "learning_rate": 1.1281224818694603e-05, + "loss": 0.251, + "step": 140 + }, + { + "epoch": 0.02861172677815837, + "grad_norm": 0.2891378700733185, + "learning_rate": 1.1442385173247382e-05, + "loss": 0.2184, + "step": 142 + }, + { + "epoch": 0.02901470884545638, + "grad_norm": 0.3141033351421356, + "learning_rate": 1.1603545527800163e-05, + "loss": 0.2608, + "step": 144 + }, + { + "epoch": 0.029417690912754382, + "grad_norm": 0.20865300297737122, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.2162, + "step": 146 + }, + { + "epoch": 0.02982067298005239, + "grad_norm": 0.2990216016769409, + "learning_rate": 1.1925866236905723e-05, + "loss": 0.2296, + "step": 148 + }, + { + "epoch": 0.030223655047350393, + "grad_norm": 0.28841957449913025, + "learning_rate": 1.2087026591458502e-05, + "loss": 0.2101, + "step": 150 + }, + { + "epoch": 0.030626637114648397, + "grad_norm": 0.22070175409317017, + "learning_rate": 1.2248186946011282e-05, + "loss": 0.2492, + "step": 152 + }, + { + "epoch": 0.031029619181946404, + "grad_norm": 0.26178812980651855, + "learning_rate": 1.2409347300564061e-05, + "loss": 0.1963, + "step": 154 + }, + { + "epoch": 0.03143260124924441, + "grad_norm": 0.5203900933265686, + "learning_rate": 1.2570507655116842e-05, + "loss": 0.2219, + "step": 156 + }, + { + "epoch": 0.03183558331654241, + "grad_norm": 0.1924479454755783, + "learning_rate": 1.2731668009669623e-05, + "loss": 0.2366, + "step": 158 + }, + { + "epoch": 0.03223856538384042, + "grad_norm": 0.4090641140937805, + "learning_rate": 1.28928283642224e-05, + "loss": 0.2283, + "step": 160 + }, + { + "epoch": 0.032641547451138425, + "grad_norm": 0.25200772285461426, + "learning_rate": 1.305398871877518e-05, + "loss": 0.2175, + "step": 162 + }, + { + "epoch": 0.03304452951843643, + "grad_norm": 0.2138717919588089, + "learning_rate": 1.3215149073327961e-05, + "loss": 0.2178, + "step": 164 + }, + { + "epoch": 0.03344751158573443, + "grad_norm": 0.23985524475574493, + "learning_rate": 1.3376309427880742e-05, + "loss": 0.2733, + "step": 166 + }, + { + "epoch": 0.03385049365303244, + "grad_norm": 0.2478920966386795, + "learning_rate": 1.3537469782433523e-05, + "loss": 0.2489, + "step": 168 + }, + { + "epoch": 0.03425347572033045, + "grad_norm": 0.3181647062301636, + "learning_rate": 1.3698630136986302e-05, + "loss": 0.2129, + "step": 170 + }, + { + "epoch": 0.03465645778762845, + "grad_norm": 0.3164592683315277, + "learning_rate": 1.3859790491539082e-05, + "loss": 0.238, + "step": 172 + }, + { + "epoch": 0.035059439854926454, + "grad_norm": 0.2163039594888687, + "learning_rate": 1.4020950846091863e-05, + "loss": 0.1853, + "step": 174 + }, + { + "epoch": 0.035462421922224464, + "grad_norm": 0.2521723508834839, + "learning_rate": 1.4182111200644644e-05, + "loss": 0.2292, + "step": 176 + }, + { + "epoch": 0.03586540398952247, + "grad_norm": 0.22221054136753082, + "learning_rate": 1.4343271555197421e-05, + "loss": 0.216, + "step": 178 + }, + { + "epoch": 0.03626838605682047, + "grad_norm": 0.7538085579872131, + "learning_rate": 1.4504431909750202e-05, + "loss": 0.2608, + "step": 180 + }, + { + "epoch": 0.036671368124118475, + "grad_norm": 0.20844882726669312, + "learning_rate": 1.4665592264302983e-05, + "loss": 0.2367, + "step": 182 + }, + { + "epoch": 0.03707435019141648, + "grad_norm": 0.2020358443260193, + "learning_rate": 1.4826752618855763e-05, + "loss": 0.2516, + "step": 184 + }, + { + "epoch": 0.03747733225871449, + "grad_norm": 0.3234182298183441, + "learning_rate": 1.498791297340854e-05, + "loss": 0.1662, + "step": 186 + }, + { + "epoch": 0.03788031432601249, + "grad_norm": 0.18517746031284332, + "learning_rate": 1.5149073327961321e-05, + "loss": 0.2732, + "step": 188 + }, + { + "epoch": 0.0382832963933105, + "grad_norm": 0.2105470597743988, + "learning_rate": 1.5310233682514102e-05, + "loss": 0.2293, + "step": 190 + }, + { + "epoch": 0.0386862784606085, + "grad_norm": 0.29840391874313354, + "learning_rate": 1.5471394037066883e-05, + "loss": 0.2547, + "step": 192 + }, + { + "epoch": 0.03908926052790651, + "grad_norm": 0.22715766727924347, + "learning_rate": 1.563255439161966e-05, + "loss": 0.1969, + "step": 194 + }, + { + "epoch": 0.039492242595204514, + "grad_norm": 0.36480122804641724, + "learning_rate": 1.579371474617244e-05, + "loss": 0.2212, + "step": 196 + }, + { + "epoch": 0.03989522466250252, + "grad_norm": 0.15495972335338593, + "learning_rate": 1.595487510072522e-05, + "loss": 0.2534, + "step": 198 + }, + { + "epoch": 0.04029820672980052, + "grad_norm": 0.2647131383419037, + "learning_rate": 1.6116035455278002e-05, + "loss": 0.1396, + "step": 200 + }, + { + "epoch": 0.04070118879709853, + "grad_norm": 0.430426687002182, + "learning_rate": 1.6277195809830783e-05, + "loss": 0.2301, + "step": 202 + }, + { + "epoch": 0.041104170864396536, + "grad_norm": 0.19295406341552734, + "learning_rate": 1.643835616438356e-05, + "loss": 0.1858, + "step": 204 + }, + { + "epoch": 0.04150715293169454, + "grad_norm": 0.20830343663692474, + "learning_rate": 1.659951651893634e-05, + "loss": 0.2315, + "step": 206 + }, + { + "epoch": 0.04191013499899254, + "grad_norm": 0.4402204155921936, + "learning_rate": 1.676067687348912e-05, + "loss": 0.1993, + "step": 208 + }, + { + "epoch": 0.04231311706629055, + "grad_norm": 0.40240907669067383, + "learning_rate": 1.6921837228041902e-05, + "loss": 0.1841, + "step": 210 + }, + { + "epoch": 0.04271609913358856, + "grad_norm": 0.21966539323329926, + "learning_rate": 1.7082997582594683e-05, + "loss": 0.193, + "step": 212 + }, + { + "epoch": 0.04311908120088656, + "grad_norm": 0.2012406885623932, + "learning_rate": 1.7244157937147464e-05, + "loss": 0.1679, + "step": 214 + }, + { + "epoch": 0.043522063268184565, + "grad_norm": 0.2646319270133972, + "learning_rate": 1.7405318291700244e-05, + "loss": 0.2292, + "step": 216 + }, + { + "epoch": 0.04392504533548257, + "grad_norm": 0.25195521116256714, + "learning_rate": 1.7566478646253025e-05, + "loss": 0.1845, + "step": 218 + }, + { + "epoch": 0.04432802740278058, + "grad_norm": 0.24844960868358612, + "learning_rate": 1.7727639000805802e-05, + "loss": 0.2392, + "step": 220 + }, + { + "epoch": 0.04473100947007858, + "grad_norm": 0.21988102793693542, + "learning_rate": 1.7888799355358583e-05, + "loss": 0.227, + "step": 222 + }, + { + "epoch": 0.045133991537376586, + "grad_norm": 0.34720098972320557, + "learning_rate": 1.8049959709911364e-05, + "loss": 0.2223, + "step": 224 + }, + { + "epoch": 0.04553697360467459, + "grad_norm": 0.2286384403705597, + "learning_rate": 1.8211120064464144e-05, + "loss": 0.195, + "step": 226 + }, + { + "epoch": 0.0459399556719726, + "grad_norm": 0.5426890850067139, + "learning_rate": 1.8372280419016925e-05, + "loss": 0.1991, + "step": 228 + }, + { + "epoch": 0.046342937739270604, + "grad_norm": 0.2465570718050003, + "learning_rate": 1.8533440773569702e-05, + "loss": 0.2399, + "step": 230 + }, + { + "epoch": 0.04674591980656861, + "grad_norm": 0.25233685970306396, + "learning_rate": 1.8694601128122483e-05, + "loss": 0.1794, + "step": 232 + }, + { + "epoch": 0.04714890187386661, + "grad_norm": 0.29172617197036743, + "learning_rate": 1.8855761482675264e-05, + "loss": 0.2746, + "step": 234 + }, + { + "epoch": 0.04755188394116462, + "grad_norm": 0.22605657577514648, + "learning_rate": 1.9016921837228044e-05, + "loss": 0.2205, + "step": 236 + }, + { + "epoch": 0.047954866008462625, + "grad_norm": 0.22374406456947327, + "learning_rate": 1.9178082191780822e-05, + "loss": 0.1858, + "step": 238 + }, + { + "epoch": 0.04835784807576063, + "grad_norm": 0.3003512918949127, + "learning_rate": 1.9339242546333602e-05, + "loss": 0.1881, + "step": 240 + }, + { + "epoch": 0.04876083014305863, + "grad_norm": 0.24595895409584045, + "learning_rate": 1.9500402900886383e-05, + "loss": 0.2389, + "step": 242 + }, + { + "epoch": 0.049163812210356636, + "grad_norm": 0.1715419590473175, + "learning_rate": 1.9661563255439164e-05, + "loss": 0.2205, + "step": 244 + }, + { + "epoch": 0.04956679427765465, + "grad_norm": 0.1657872498035431, + "learning_rate": 1.982272360999194e-05, + "loss": 0.2125, + "step": 246 + }, + { + "epoch": 0.04996977634495265, + "grad_norm": 0.2673228085041046, + "learning_rate": 1.9983883964544722e-05, + "loss": 0.2688, + "step": 248 + }, + { + "epoch": 0.050372758412250654, + "grad_norm": 0.2491181194782257, + "learning_rate": 2.0145044319097503e-05, + "loss": 0.2558, + "step": 250 + }, + { + "epoch": 0.05077574047954866, + "grad_norm": 0.1648625135421753, + "learning_rate": 2.0306204673650283e-05, + "loss": 0.2304, + "step": 252 + }, + { + "epoch": 0.05117872254684667, + "grad_norm": 0.21671715378761292, + "learning_rate": 2.0467365028203064e-05, + "loss": 0.1811, + "step": 254 + }, + { + "epoch": 0.05158170461414467, + "grad_norm": 0.19184859097003937, + "learning_rate": 2.062852538275584e-05, + "loss": 0.1923, + "step": 256 + }, + { + "epoch": 0.051984686681442675, + "grad_norm": 0.3803746998310089, + "learning_rate": 2.0789685737308622e-05, + "loss": 0.1985, + "step": 258 + }, + { + "epoch": 0.05238766874874068, + "grad_norm": 0.2140813171863556, + "learning_rate": 2.0950846091861403e-05, + "loss": 0.2185, + "step": 260 + }, + { + "epoch": 0.05279065081603869, + "grad_norm": 0.16832232475280762, + "learning_rate": 2.1112006446414183e-05, + "loss": 0.2087, + "step": 262 + }, + { + "epoch": 0.05319363288333669, + "grad_norm": 0.21901485323905945, + "learning_rate": 2.127316680096696e-05, + "loss": 0.1855, + "step": 264 + }, + { + "epoch": 0.0535966149506347, + "grad_norm": 0.2979806959629059, + "learning_rate": 2.143432715551974e-05, + "loss": 0.2444, + "step": 266 + }, + { + "epoch": 0.0539995970179327, + "grad_norm": 0.2841276228427887, + "learning_rate": 2.1595487510072522e-05, + "loss": 0.2148, + "step": 268 + }, + { + "epoch": 0.054402579085230704, + "grad_norm": 0.1896170824766159, + "learning_rate": 2.1756647864625303e-05, + "loss": 0.2656, + "step": 270 + }, + { + "epoch": 0.054805561152528715, + "grad_norm": 0.24286943674087524, + "learning_rate": 2.1917808219178083e-05, + "loss": 0.2079, + "step": 272 + }, + { + "epoch": 0.05520854321982672, + "grad_norm": 0.41237831115722656, + "learning_rate": 2.2078968573730864e-05, + "loss": 0.135, + "step": 274 + }, + { + "epoch": 0.05561152528712472, + "grad_norm": 0.17708082497119904, + "learning_rate": 2.2240128928283645e-05, + "loss": 0.2, + "step": 276 + }, + { + "epoch": 0.056014507354422725, + "grad_norm": 0.1603289693593979, + "learning_rate": 2.2401289282836426e-05, + "loss": 0.1929, + "step": 278 + }, + { + "epoch": 0.056417489421720736, + "grad_norm": 0.25106561183929443, + "learning_rate": 2.2562449637389206e-05, + "loss": 0.1673, + "step": 280 + }, + { + "epoch": 0.05682047148901874, + "grad_norm": 0.19034595787525177, + "learning_rate": 2.2723609991941984e-05, + "loss": 0.2536, + "step": 282 + }, + { + "epoch": 0.05722345355631674, + "grad_norm": 0.2286035567522049, + "learning_rate": 2.2884770346494764e-05, + "loss": 0.1878, + "step": 284 + }, + { + "epoch": 0.05762643562361475, + "grad_norm": 0.21165654063224792, + "learning_rate": 2.3045930701047545e-05, + "loss": 0.1909, + "step": 286 + }, + { + "epoch": 0.05802941769091276, + "grad_norm": 0.18575643002986908, + "learning_rate": 2.3207091055600326e-05, + "loss": 0.2177, + "step": 288 + }, + { + "epoch": 0.05843239975821076, + "grad_norm": 0.23574145138263702, + "learning_rate": 2.3368251410153103e-05, + "loss": 0.1526, + "step": 290 + }, + { + "epoch": 0.058835381825508765, + "grad_norm": 0.25691771507263184, + "learning_rate": 2.3529411764705884e-05, + "loss": 0.2041, + "step": 292 + }, + { + "epoch": 0.05923836389280677, + "grad_norm": 0.16939234733581543, + "learning_rate": 2.3690572119258664e-05, + "loss": 0.1546, + "step": 294 + }, + { + "epoch": 0.05964134596010478, + "grad_norm": 0.15170009434223175, + "learning_rate": 2.3851732473811445e-05, + "loss": 0.2195, + "step": 296 + }, + { + "epoch": 0.06004432802740278, + "grad_norm": 0.22137551009655, + "learning_rate": 2.4012892828364222e-05, + "loss": 0.2026, + "step": 298 + }, + { + "epoch": 0.060447310094700786, + "grad_norm": 0.16329768300056458, + "learning_rate": 2.4174053182917003e-05, + "loss": 0.1763, + "step": 300 + }, + { + "epoch": 0.06085029216199879, + "grad_norm": 0.17110270261764526, + "learning_rate": 2.4335213537469784e-05, + "loss": 0.2008, + "step": 302 + }, + { + "epoch": 0.06125327422929679, + "grad_norm": 0.2618560492992401, + "learning_rate": 2.4496373892022564e-05, + "loss": 0.2408, + "step": 304 + }, + { + "epoch": 0.061656256296594804, + "grad_norm": 0.14731261134147644, + "learning_rate": 2.4657534246575342e-05, + "loss": 0.2269, + "step": 306 + }, + { + "epoch": 0.06205923836389281, + "grad_norm": 0.19489218294620514, + "learning_rate": 2.4818694601128122e-05, + "loss": 0.2106, + "step": 308 + }, + { + "epoch": 0.06246222043119081, + "grad_norm": 0.18849453330039978, + "learning_rate": 2.4979854955680903e-05, + "loss": 0.1856, + "step": 310 + }, + { + "epoch": 0.06286520249848881, + "grad_norm": 0.83469557762146, + "learning_rate": 2.5141015310233684e-05, + "loss": 0.2558, + "step": 312 + }, + { + "epoch": 0.06326818456578683, + "grad_norm": 0.17429494857788086, + "learning_rate": 2.5302175664786465e-05, + "loss": 0.2025, + "step": 314 + }, + { + "epoch": 0.06367116663308482, + "grad_norm": 0.17523562908172607, + "learning_rate": 2.5463336019339245e-05, + "loss": 0.2015, + "step": 316 + }, + { + "epoch": 0.06407414870038283, + "grad_norm": 0.1575326770544052, + "learning_rate": 2.5624496373892026e-05, + "loss": 0.2234, + "step": 318 + }, + { + "epoch": 0.06447713076768084, + "grad_norm": 0.19641879200935364, + "learning_rate": 2.57856567284448e-05, + "loss": 0.1846, + "step": 320 + }, + { + "epoch": 0.06488011283497884, + "grad_norm": 0.2497224360704422, + "learning_rate": 2.594681708299758e-05, + "loss": 0.1492, + "step": 322 + }, + { + "epoch": 0.06528309490227685, + "grad_norm": 0.36354440450668335, + "learning_rate": 2.610797743755036e-05, + "loss": 0.1787, + "step": 324 + }, + { + "epoch": 0.06568607696957485, + "grad_norm": 0.16666480898857117, + "learning_rate": 2.6269137792103142e-05, + "loss": 0.2055, + "step": 326 + }, + { + "epoch": 0.06608905903687286, + "grad_norm": 0.20041917264461517, + "learning_rate": 2.6430298146655923e-05, + "loss": 0.2166, + "step": 328 + }, + { + "epoch": 0.06649204110417087, + "grad_norm": 0.3149804472923279, + "learning_rate": 2.6591458501208703e-05, + "loss": 0.1834, + "step": 330 + }, + { + "epoch": 0.06689502317146886, + "grad_norm": 0.17761199176311493, + "learning_rate": 2.6752618855761484e-05, + "loss": 0.174, + "step": 332 + }, + { + "epoch": 0.06729800523876688, + "grad_norm": 0.1416761577129364, + "learning_rate": 2.6913779210314265e-05, + "loss": 0.2131, + "step": 334 + }, + { + "epoch": 0.06770098730606489, + "grad_norm": 0.207278773188591, + "learning_rate": 2.7074939564867045e-05, + "loss": 0.265, + "step": 336 + }, + { + "epoch": 0.06810396937336288, + "grad_norm": 0.3748891055583954, + "learning_rate": 2.7236099919419823e-05, + "loss": 0.1741, + "step": 338 + }, + { + "epoch": 0.0685069514406609, + "grad_norm": 0.38350576162338257, + "learning_rate": 2.7397260273972603e-05, + "loss": 0.165, + "step": 340 + }, + { + "epoch": 0.06890993350795889, + "grad_norm": 0.2217901349067688, + "learning_rate": 2.7558420628525384e-05, + "loss": 0.1614, + "step": 342 + }, + { + "epoch": 0.0693129155752569, + "grad_norm": 0.16407828032970428, + "learning_rate": 2.7719580983078165e-05, + "loss": 0.1877, + "step": 344 + }, + { + "epoch": 0.06971589764255491, + "grad_norm": 0.19459925591945648, + "learning_rate": 2.7880741337630946e-05, + "loss": 0.1603, + "step": 346 + }, + { + "epoch": 0.07011887970985291, + "grad_norm": 0.1655324548482895, + "learning_rate": 2.8041901692183726e-05, + "loss": 0.2015, + "step": 348 + }, + { + "epoch": 0.07052186177715092, + "grad_norm": 0.2289515882730484, + "learning_rate": 2.8203062046736507e-05, + "loss": 0.2403, + "step": 350 + }, + { + "epoch": 0.07092484384444893, + "grad_norm": 0.19211649894714355, + "learning_rate": 2.8364222401289288e-05, + "loss": 0.2097, + "step": 352 + }, + { + "epoch": 0.07132782591174693, + "grad_norm": 0.17989704012870789, + "learning_rate": 2.852538275584206e-05, + "loss": 0.19, + "step": 354 + }, + { + "epoch": 0.07173080797904494, + "grad_norm": 0.1994090974330902, + "learning_rate": 2.8686543110394842e-05, + "loss": 0.1731, + "step": 356 + }, + { + "epoch": 0.07213379004634293, + "grad_norm": 0.20826664566993713, + "learning_rate": 2.8847703464947623e-05, + "loss": 0.1895, + "step": 358 + }, + { + "epoch": 0.07253677211364094, + "grad_norm": 0.28034508228302, + "learning_rate": 2.9008863819500404e-05, + "loss": 0.2372, + "step": 360 + }, + { + "epoch": 0.07293975418093895, + "grad_norm": 0.24738198518753052, + "learning_rate": 2.9170024174053184e-05, + "loss": 0.1983, + "step": 362 + }, + { + "epoch": 0.07334273624823695, + "grad_norm": 0.2001752257347107, + "learning_rate": 2.9331184528605965e-05, + "loss": 0.1969, + "step": 364 + }, + { + "epoch": 0.07374571831553496, + "grad_norm": 0.2115003913640976, + "learning_rate": 2.9492344883158746e-05, + "loss": 0.2637, + "step": 366 + }, + { + "epoch": 0.07414870038283296, + "grad_norm": 0.1739916056394577, + "learning_rate": 2.9653505237711526e-05, + "loss": 0.2119, + "step": 368 + }, + { + "epoch": 0.07455168245013097, + "grad_norm": 0.19847404956817627, + "learning_rate": 2.9814665592264307e-05, + "loss": 0.178, + "step": 370 + }, + { + "epoch": 0.07495466451742898, + "grad_norm": 0.27303946018218994, + "learning_rate": 2.997582594681708e-05, + "loss": 0.203, + "step": 372 + }, + { + "epoch": 0.07535764658472698, + "grad_norm": 0.20624373853206635, + "learning_rate": 3.0136986301369862e-05, + "loss": 0.2282, + "step": 374 + }, + { + "epoch": 0.07576062865202499, + "grad_norm": 0.15636469423770905, + "learning_rate": 3.0298146655922643e-05, + "loss": 0.1499, + "step": 376 + }, + { + "epoch": 0.076163610719323, + "grad_norm": 0.15241625905036926, + "learning_rate": 3.0459307010475423e-05, + "loss": 0.234, + "step": 378 + }, + { + "epoch": 0.076566592786621, + "grad_norm": 0.2701398432254791, + "learning_rate": 3.0620467365028204e-05, + "loss": 0.232, + "step": 380 + }, + { + "epoch": 0.076969574853919, + "grad_norm": 0.23230019211769104, + "learning_rate": 3.078162771958099e-05, + "loss": 0.1942, + "step": 382 + }, + { + "epoch": 0.077372556921217, + "grad_norm": 0.16999439895153046, + "learning_rate": 3.0942788074133765e-05, + "loss": 0.1608, + "step": 384 + }, + { + "epoch": 0.07777553898851501, + "grad_norm": 0.2917148470878601, + "learning_rate": 3.110394842868655e-05, + "loss": 0.2297, + "step": 386 + }, + { + "epoch": 0.07817852105581302, + "grad_norm": 0.2013273537158966, + "learning_rate": 3.126510878323932e-05, + "loss": 0.193, + "step": 388 + }, + { + "epoch": 0.07858150312311102, + "grad_norm": 0.2821531295776367, + "learning_rate": 3.1426269137792104e-05, + "loss": 0.1884, + "step": 390 + }, + { + "epoch": 0.07898448519040903, + "grad_norm": 0.17071853578090668, + "learning_rate": 3.158742949234488e-05, + "loss": 0.2326, + "step": 392 + }, + { + "epoch": 0.07938746725770703, + "grad_norm": 0.2293412685394287, + "learning_rate": 3.1748589846897665e-05, + "loss": 0.2068, + "step": 394 + }, + { + "epoch": 0.07979044932500504, + "grad_norm": 0.15788160264492035, + "learning_rate": 3.190975020145044e-05, + "loss": 0.2345, + "step": 396 + }, + { + "epoch": 0.08019343139230305, + "grad_norm": 0.16741882264614105, + "learning_rate": 3.207091055600323e-05, + "loss": 0.2019, + "step": 398 + }, + { + "epoch": 0.08059641345960104, + "grad_norm": 0.17850607633590698, + "learning_rate": 3.2232070910556004e-05, + "loss": 0.2225, + "step": 400 + }, + { + "epoch": 0.08099939552689905, + "grad_norm": 0.19540104269981384, + "learning_rate": 3.239323126510879e-05, + "loss": 0.2101, + "step": 402 + }, + { + "epoch": 0.08140237759419706, + "grad_norm": 0.26288026571273804, + "learning_rate": 3.2554391619661566e-05, + "loss": 0.2099, + "step": 404 + }, + { + "epoch": 0.08180535966149506, + "grad_norm": 0.16300523281097412, + "learning_rate": 3.271555197421434e-05, + "loss": 0.2115, + "step": 406 + }, + { + "epoch": 0.08220834172879307, + "grad_norm": 0.14380481839179993, + "learning_rate": 3.287671232876712e-05, + "loss": 0.1825, + "step": 408 + }, + { + "epoch": 0.08261132379609107, + "grad_norm": 0.1476619839668274, + "learning_rate": 3.3037872683319904e-05, + "loss": 0.1802, + "step": 410 + }, + { + "epoch": 0.08301430586338908, + "grad_norm": 0.2590979337692261, + "learning_rate": 3.319903303787268e-05, + "loss": 0.2175, + "step": 412 + }, + { + "epoch": 0.08341728793068709, + "grad_norm": 0.20722797513008118, + "learning_rate": 3.3360193392425466e-05, + "loss": 0.184, + "step": 414 + }, + { + "epoch": 0.08382026999798509, + "grad_norm": 0.17478062212467194, + "learning_rate": 3.352135374697824e-05, + "loss": 0.1962, + "step": 416 + }, + { + "epoch": 0.0842232520652831, + "grad_norm": 0.18867933750152588, + "learning_rate": 3.368251410153103e-05, + "loss": 0.2151, + "step": 418 + }, + { + "epoch": 0.0846262341325811, + "grad_norm": 0.24660547077655792, + "learning_rate": 3.3843674456083804e-05, + "loss": 0.2336, + "step": 420 + }, + { + "epoch": 0.0850292161998791, + "grad_norm": 0.26829826831817627, + "learning_rate": 3.400483481063659e-05, + "loss": 0.2369, + "step": 422 + }, + { + "epoch": 0.08543219826717711, + "grad_norm": 0.1370697319507599, + "learning_rate": 3.4165995165189366e-05, + "loss": 0.1818, + "step": 424 + }, + { + "epoch": 0.08583518033447511, + "grad_norm": 1.2252482175827026, + "learning_rate": 3.432715551974214e-05, + "loss": 0.2081, + "step": 426 + }, + { + "epoch": 0.08623816240177312, + "grad_norm": 0.14920452237129211, + "learning_rate": 3.448831587429493e-05, + "loss": 0.2401, + "step": 428 + }, + { + "epoch": 0.08664114446907113, + "grad_norm": 0.1496407687664032, + "learning_rate": 3.4649476228847704e-05, + "loss": 0.2269, + "step": 430 + }, + { + "epoch": 0.08704412653636913, + "grad_norm": 0.12217120826244354, + "learning_rate": 3.481063658340049e-05, + "loss": 0.1497, + "step": 432 + }, + { + "epoch": 0.08744710860366714, + "grad_norm": 0.21569272875785828, + "learning_rate": 3.4971796937953266e-05, + "loss": 0.2101, + "step": 434 + }, + { + "epoch": 0.08785009067096514, + "grad_norm": 0.1281503438949585, + "learning_rate": 3.513295729250605e-05, + "loss": 0.2057, + "step": 436 + }, + { + "epoch": 0.08825307273826315, + "grad_norm": 0.14296643435955048, + "learning_rate": 3.529411764705883e-05, + "loss": 0.2512, + "step": 438 + }, + { + "epoch": 0.08865605480556116, + "grad_norm": 0.12831254303455353, + "learning_rate": 3.5455278001611605e-05, + "loss": 0.1635, + "step": 440 + }, + { + "epoch": 0.08905903687285915, + "grad_norm": 0.17410136759281158, + "learning_rate": 3.561643835616438e-05, + "loss": 0.2307, + "step": 442 + }, + { + "epoch": 0.08946201894015716, + "grad_norm": 0.12199216336011887, + "learning_rate": 3.5777598710717166e-05, + "loss": 0.2194, + "step": 444 + }, + { + "epoch": 0.08986500100745516, + "grad_norm": 0.17704665660858154, + "learning_rate": 3.593875906526994e-05, + "loss": 0.1931, + "step": 446 + }, + { + "epoch": 0.09026798307475317, + "grad_norm": 0.20914296805858612, + "learning_rate": 3.609991941982273e-05, + "loss": 0.256, + "step": 448 + }, + { + "epoch": 0.09067096514205118, + "grad_norm": 0.14003607630729675, + "learning_rate": 3.6261079774375505e-05, + "loss": 0.1724, + "step": 450 + }, + { + "epoch": 0.09107394720934918, + "grad_norm": 0.23989439010620117, + "learning_rate": 3.642224012892829e-05, + "loss": 0.1871, + "step": 452 + }, + { + "epoch": 0.09147692927664719, + "grad_norm": 0.17189793288707733, + "learning_rate": 3.6583400483481066e-05, + "loss": 0.1911, + "step": 454 + }, + { + "epoch": 0.0918799113439452, + "grad_norm": 0.18356207013130188, + "learning_rate": 3.674456083803385e-05, + "loss": 0.2036, + "step": 456 + }, + { + "epoch": 0.0922828934112432, + "grad_norm": 0.19528169929981232, + "learning_rate": 3.690572119258662e-05, + "loss": 0.2428, + "step": 458 + }, + { + "epoch": 0.09268587547854121, + "grad_norm": 0.12380246818065643, + "learning_rate": 3.7066881547139405e-05, + "loss": 0.1746, + "step": 460 + }, + { + "epoch": 0.0930888575458392, + "grad_norm": 0.20087508857250214, + "learning_rate": 3.722804190169218e-05, + "loss": 0.257, + "step": 462 + }, + { + "epoch": 0.09349183961313721, + "grad_norm": 0.22255493700504303, + "learning_rate": 3.7389202256244966e-05, + "loss": 0.1897, + "step": 464 + }, + { + "epoch": 0.09389482168043523, + "grad_norm": 0.2199143022298813, + "learning_rate": 3.7550362610797743e-05, + "loss": 0.208, + "step": 466 + }, + { + "epoch": 0.09429780374773322, + "grad_norm": 0.12534235417842865, + "learning_rate": 3.771152296535053e-05, + "loss": 0.2261, + "step": 468 + }, + { + "epoch": 0.09470078581503123, + "grad_norm": 0.14178584516048431, + "learning_rate": 3.7872683319903305e-05, + "loss": 0.1724, + "step": 470 + }, + { + "epoch": 0.09510376788232924, + "grad_norm": 0.192081019282341, + "learning_rate": 3.803384367445609e-05, + "loss": 0.235, + "step": 472 + }, + { + "epoch": 0.09550674994962724, + "grad_norm": 0.24138332903385162, + "learning_rate": 3.8195004029008866e-05, + "loss": 0.187, + "step": 474 + }, + { + "epoch": 0.09590973201692525, + "grad_norm": 0.20738311111927032, + "learning_rate": 3.8356164383561644e-05, + "loss": 0.1986, + "step": 476 + }, + { + "epoch": 0.09631271408422325, + "grad_norm": 0.16928641498088837, + "learning_rate": 3.851732473811442e-05, + "loss": 0.1959, + "step": 478 + }, + { + "epoch": 0.09671569615152126, + "grad_norm": 0.1994764506816864, + "learning_rate": 3.8678485092667205e-05, + "loss": 0.2657, + "step": 480 + }, + { + "epoch": 0.09711867821881927, + "grad_norm": 0.12018430978059769, + "learning_rate": 3.883964544721998e-05, + "loss": 0.2247, + "step": 482 + }, + { + "epoch": 0.09752166028611726, + "grad_norm": 0.12260066717863083, + "learning_rate": 3.9000805801772766e-05, + "loss": 0.2436, + "step": 484 + }, + { + "epoch": 0.09792464235341528, + "grad_norm": 0.20381806790828705, + "learning_rate": 3.9161966156325544e-05, + "loss": 0.2528, + "step": 486 + }, + { + "epoch": 0.09832762442071327, + "grad_norm": 0.1553252935409546, + "learning_rate": 3.932312651087833e-05, + "loss": 0.1424, + "step": 488 + }, + { + "epoch": 0.09873060648801128, + "grad_norm": 0.12027744948863983, + "learning_rate": 3.9484286865431105e-05, + "loss": 0.1758, + "step": 490 + }, + { + "epoch": 0.0991335885553093, + "grad_norm": 0.19894269108772278, + "learning_rate": 3.964544721998388e-05, + "loss": 0.2018, + "step": 492 + }, + { + "epoch": 0.09953657062260729, + "grad_norm": 0.23655803501605988, + "learning_rate": 3.9806607574536666e-05, + "loss": 0.2014, + "step": 494 + }, + { + "epoch": 0.0999395526899053, + "grad_norm": 0.14982983469963074, + "learning_rate": 3.9967767929089444e-05, + "loss": 0.1603, + "step": 496 + }, + { + "epoch": 0.10034253475720331, + "grad_norm": 0.17792493104934692, + "learning_rate": 4.012892828364223e-05, + "loss": 0.2296, + "step": 498 + }, + { + "epoch": 0.10074551682450131, + "grad_norm": 0.13077042996883392, + "learning_rate": 4.0290088638195005e-05, + "loss": 0.2162, + "step": 500 + }, + { + "epoch": 0.10114849889179932, + "grad_norm": 0.15423962473869324, + "learning_rate": 4.045124899274779e-05, + "loss": 0.2, + "step": 502 + }, + { + "epoch": 0.10155148095909731, + "grad_norm": 0.1420324593782425, + "learning_rate": 4.0612409347300567e-05, + "loss": 0.1833, + "step": 504 + }, + { + "epoch": 0.10195446302639533, + "grad_norm": 0.20684389770030975, + "learning_rate": 4.077356970185335e-05, + "loss": 0.211, + "step": 506 + }, + { + "epoch": 0.10235744509369334, + "grad_norm": 0.1461561918258667, + "learning_rate": 4.093473005640613e-05, + "loss": 0.1594, + "step": 508 + }, + { + "epoch": 0.10276042716099133, + "grad_norm": 0.34511980414390564, + "learning_rate": 4.1095890410958905e-05, + "loss": 0.1807, + "step": 510 + }, + { + "epoch": 0.10316340922828934, + "grad_norm": 0.35092246532440186, + "learning_rate": 4.125705076551168e-05, + "loss": 0.1953, + "step": 512 + }, + { + "epoch": 0.10356639129558734, + "grad_norm": 0.1539539396762848, + "learning_rate": 4.141821112006447e-05, + "loss": 0.2323, + "step": 514 + }, + { + "epoch": 0.10396937336288535, + "grad_norm": 0.11350340396165848, + "learning_rate": 4.1579371474617244e-05, + "loss": 0.1686, + "step": 516 + }, + { + "epoch": 0.10437235543018336, + "grad_norm": 0.21042321622371674, + "learning_rate": 4.174053182917003e-05, + "loss": 0.1826, + "step": 518 + }, + { + "epoch": 0.10477533749748136, + "grad_norm": 0.15128977596759796, + "learning_rate": 4.1901692183722805e-05, + "loss": 0.1553, + "step": 520 + }, + { + "epoch": 0.10517831956477937, + "grad_norm": 0.13588109612464905, + "learning_rate": 4.206285253827559e-05, + "loss": 0.1567, + "step": 522 + }, + { + "epoch": 0.10558130163207738, + "grad_norm": 0.3547620475292206, + "learning_rate": 4.222401289282837e-05, + "loss": 0.1821, + "step": 524 + }, + { + "epoch": 0.10598428369937538, + "grad_norm": 0.2869426906108856, + "learning_rate": 4.2385173247381144e-05, + "loss": 0.209, + "step": 526 + }, + { + "epoch": 0.10638726576667339, + "grad_norm": 0.2165244221687317, + "learning_rate": 4.254633360193392e-05, + "loss": 0.1807, + "step": 528 + }, + { + "epoch": 0.10679024783397138, + "grad_norm": 0.13855288922786713, + "learning_rate": 4.2707493956486705e-05, + "loss": 0.2015, + "step": 530 + }, + { + "epoch": 0.1071932299012694, + "grad_norm": 0.16502977907657623, + "learning_rate": 4.286865431103948e-05, + "loss": 0.2467, + "step": 532 + }, + { + "epoch": 0.1075962119685674, + "grad_norm": 0.21364782750606537, + "learning_rate": 4.302981466559227e-05, + "loss": 0.2445, + "step": 534 + }, + { + "epoch": 0.1079991940358654, + "grad_norm": 0.1428193747997284, + "learning_rate": 4.3190975020145044e-05, + "loss": 0.2289, + "step": 536 + }, + { + "epoch": 0.10840217610316341, + "grad_norm": 0.16634321212768555, + "learning_rate": 4.335213537469783e-05, + "loss": 0.1819, + "step": 538 + }, + { + "epoch": 0.10880515817046141, + "grad_norm": 0.15989083051681519, + "learning_rate": 4.3513295729250606e-05, + "loss": 0.1597, + "step": 540 + }, + { + "epoch": 0.10920814023775942, + "grad_norm": 0.10890760272741318, + "learning_rate": 4.367445608380339e-05, + "loss": 0.1679, + "step": 542 + }, + { + "epoch": 0.10961112230505743, + "grad_norm": 0.1206541359424591, + "learning_rate": 4.383561643835617e-05, + "loss": 0.2062, + "step": 544 + }, + { + "epoch": 0.11001410437235543, + "grad_norm": 0.18289783596992493, + "learning_rate": 4.3996776792908944e-05, + "loss": 0.2238, + "step": 546 + }, + { + "epoch": 0.11041708643965344, + "grad_norm": 0.24548190832138062, + "learning_rate": 4.415793714746173e-05, + "loss": 0.2025, + "step": 548 + }, + { + "epoch": 0.11082006850695145, + "grad_norm": 0.14362087845802307, + "learning_rate": 4.4319097502014506e-05, + "loss": 0.1806, + "step": 550 + }, + { + "epoch": 0.11122305057424944, + "grad_norm": 0.12100233882665634, + "learning_rate": 4.448025785656729e-05, + "loss": 0.2202, + "step": 552 + }, + { + "epoch": 0.11162603264154745, + "grad_norm": 0.12502926588058472, + "learning_rate": 4.464141821112007e-05, + "loss": 0.2509, + "step": 554 + }, + { + "epoch": 0.11202901470884545, + "grad_norm": 0.4697296619415283, + "learning_rate": 4.480257856567285e-05, + "loss": 0.2091, + "step": 556 + }, + { + "epoch": 0.11243199677614346, + "grad_norm": 0.13163422048091888, + "learning_rate": 4.496373892022563e-05, + "loss": 0.1593, + "step": 558 + }, + { + "epoch": 0.11283497884344147, + "grad_norm": 0.11262835562229156, + "learning_rate": 4.512489927477841e-05, + "loss": 0.2005, + "step": 560 + }, + { + "epoch": 0.11323796091073947, + "grad_norm": 0.13250380754470825, + "learning_rate": 4.528605962933118e-05, + "loss": 0.2237, + "step": 562 + }, + { + "epoch": 0.11364094297803748, + "grad_norm": 0.17639359831809998, + "learning_rate": 4.544721998388397e-05, + "loss": 0.2141, + "step": 564 + }, + { + "epoch": 0.11404392504533548, + "grad_norm": 0.16560794413089752, + "learning_rate": 4.5608380338436744e-05, + "loss": 0.1954, + "step": 566 + }, + { + "epoch": 0.11444690711263349, + "grad_norm": 0.20894743502140045, + "learning_rate": 4.576954069298953e-05, + "loss": 0.1817, + "step": 568 + }, + { + "epoch": 0.1148498891799315, + "grad_norm": 0.17747287452220917, + "learning_rate": 4.5930701047542306e-05, + "loss": 0.1519, + "step": 570 + }, + { + "epoch": 0.1152528712472295, + "grad_norm": 0.32621023058891296, + "learning_rate": 4.609186140209509e-05, + "loss": 0.1936, + "step": 572 + }, + { + "epoch": 0.1156558533145275, + "grad_norm": 0.169255793094635, + "learning_rate": 4.625302175664787e-05, + "loss": 0.2114, + "step": 574 + }, + { + "epoch": 0.11605883538182551, + "grad_norm": 0.11003939807415009, + "learning_rate": 4.641418211120065e-05, + "loss": 0.1852, + "step": 576 + }, + { + "epoch": 0.11646181744912351, + "grad_norm": 0.10437988489866257, + "learning_rate": 4.657534246575342e-05, + "loss": 0.1732, + "step": 578 + }, + { + "epoch": 0.11686479951642152, + "grad_norm": 0.1323920041322708, + "learning_rate": 4.6736502820306206e-05, + "loss": 0.1749, + "step": 580 + }, + { + "epoch": 0.11726778158371952, + "grad_norm": 0.16257719695568085, + "learning_rate": 4.689766317485898e-05, + "loss": 0.2036, + "step": 582 + }, + { + "epoch": 0.11767076365101753, + "grad_norm": 0.13824671506881714, + "learning_rate": 4.705882352941177e-05, + "loss": 0.2065, + "step": 584 + }, + { + "epoch": 0.11807374571831554, + "grad_norm": 0.14059747755527496, + "learning_rate": 4.7219983883964545e-05, + "loss": 0.2077, + "step": 586 + }, + { + "epoch": 0.11847672778561354, + "grad_norm": 0.1127152293920517, + "learning_rate": 4.738114423851733e-05, + "loss": 0.2296, + "step": 588 + }, + { + "epoch": 0.11887970985291155, + "grad_norm": 0.1408272385597229, + "learning_rate": 4.7542304593070106e-05, + "loss": 0.2205, + "step": 590 + }, + { + "epoch": 0.11928269192020956, + "grad_norm": 0.14287517964839935, + "learning_rate": 4.770346494762289e-05, + "loss": 0.2082, + "step": 592 + }, + { + "epoch": 0.11968567398750755, + "grad_norm": 0.11590491980314255, + "learning_rate": 4.786462530217567e-05, + "loss": 0.1657, + "step": 594 + }, + { + "epoch": 0.12008865605480556, + "grad_norm": 0.13355225324630737, + "learning_rate": 4.8025785656728445e-05, + "loss": 0.2237, + "step": 596 + }, + { + "epoch": 0.12049163812210356, + "grad_norm": 0.3162963092327118, + "learning_rate": 4.818694601128122e-05, + "loss": 0.197, + "step": 598 + }, + { + "epoch": 0.12089462018940157, + "grad_norm": 0.19958584010601044, + "learning_rate": 4.8348106365834006e-05, + "loss": 0.1635, + "step": 600 + }, + { + "epoch": 0.12129760225669958, + "grad_norm": 0.13679653406143188, + "learning_rate": 4.8509266720386783e-05, + "loss": 0.2212, + "step": 602 + }, + { + "epoch": 0.12170058432399758, + "grad_norm": 0.1415790170431137, + "learning_rate": 4.867042707493957e-05, + "loss": 0.1708, + "step": 604 + }, + { + "epoch": 0.12210356639129559, + "grad_norm": 0.1621982902288437, + "learning_rate": 4.8831587429492345e-05, + "loss": 0.2567, + "step": 606 + }, + { + "epoch": 0.12250654845859359, + "grad_norm": 0.19178320467472076, + "learning_rate": 4.899274778404513e-05, + "loss": 0.2501, + "step": 608 + }, + { + "epoch": 0.1229095305258916, + "grad_norm": 0.3538917005062103, + "learning_rate": 4.9153908138597906e-05, + "loss": 0.1876, + "step": 610 + }, + { + "epoch": 0.12331251259318961, + "grad_norm": 0.17144201695919037, + "learning_rate": 4.9315068493150684e-05, + "loss": 0.2218, + "step": 612 + }, + { + "epoch": 0.1237154946604876, + "grad_norm": 0.12202147394418716, + "learning_rate": 4.947622884770347e-05, + "loss": 0.2277, + "step": 614 + }, + { + "epoch": 0.12411847672778561, + "grad_norm": 0.4998626708984375, + "learning_rate": 4.9637389202256245e-05, + "loss": 0.1924, + "step": 616 + }, + { + "epoch": 0.12452145879508363, + "grad_norm": 0.15225987136363983, + "learning_rate": 4.979854955680903e-05, + "loss": 0.1767, + "step": 618 + }, + { + "epoch": 0.12492444086238162, + "grad_norm": 0.11958550661802292, + "learning_rate": 4.9959709911361806e-05, + "loss": 0.1952, + "step": 620 + }, + { + "epoch": 0.12532742292967963, + "grad_norm": 0.1595647633075714, + "learning_rate": 5.012087026591459e-05, + "loss": 0.2142, + "step": 622 + }, + { + "epoch": 0.12573040499697763, + "grad_norm": 0.09010529518127441, + "learning_rate": 5.028203062046737e-05, + "loss": 0.183, + "step": 624 + }, + { + "epoch": 0.12613338706427563, + "grad_norm": 0.15077313780784607, + "learning_rate": 5.044319097502015e-05, + "loss": 0.2067, + "step": 626 + }, + { + "epoch": 0.12653636913157365, + "grad_norm": 0.10020453482866287, + "learning_rate": 5.060435132957293e-05, + "loss": 0.1862, + "step": 628 + }, + { + "epoch": 0.12693935119887165, + "grad_norm": 0.11209502071142197, + "learning_rate": 5.076551168412571e-05, + "loss": 0.238, + "step": 630 + }, + { + "epoch": 0.12734233326616964, + "grad_norm": 0.38250023126602173, + "learning_rate": 5.092667203867849e-05, + "loss": 0.2162, + "step": 632 + }, + { + "epoch": 0.12774531533346767, + "grad_norm": 0.0977146178483963, + "learning_rate": 5.1087832393231275e-05, + "loss": 0.2049, + "step": 634 + }, + { + "epoch": 0.12814829740076566, + "grad_norm": 0.49452531337738037, + "learning_rate": 5.124899274778405e-05, + "loss": 0.1717, + "step": 636 + }, + { + "epoch": 0.12855127946806366, + "grad_norm": 0.15723511576652527, + "learning_rate": 5.141015310233682e-05, + "loss": 0.207, + "step": 638 + }, + { + "epoch": 0.12895426153536169, + "grad_norm": 0.15820923447608948, + "learning_rate": 5.15713134568896e-05, + "loss": 0.17, + "step": 640 + }, + { + "epoch": 0.12935724360265968, + "grad_norm": 0.2272777259349823, + "learning_rate": 5.1732473811442384e-05, + "loss": 0.2383, + "step": 642 + }, + { + "epoch": 0.12976022566995768, + "grad_norm": 0.18713967502117157, + "learning_rate": 5.189363416599516e-05, + "loss": 0.2485, + "step": 644 + }, + { + "epoch": 0.1301632077372557, + "grad_norm": 0.18062381446361542, + "learning_rate": 5.2054794520547945e-05, + "loss": 0.1667, + "step": 646 + }, + { + "epoch": 0.1305661898045537, + "grad_norm": 0.1531069129705429, + "learning_rate": 5.221595487510072e-05, + "loss": 0.2006, + "step": 648 + }, + { + "epoch": 0.1309691718718517, + "grad_norm": 0.1526888906955719, + "learning_rate": 5.237711522965351e-05, + "loss": 0.2549, + "step": 650 + }, + { + "epoch": 0.1313721539391497, + "grad_norm": 0.21363383531570435, + "learning_rate": 5.2538275584206284e-05, + "loss": 0.1518, + "step": 652 + }, + { + "epoch": 0.13177513600644772, + "grad_norm": 0.10798677057027817, + "learning_rate": 5.269943593875907e-05, + "loss": 0.1979, + "step": 654 + }, + { + "epoch": 0.13217811807374572, + "grad_norm": 0.15461646020412445, + "learning_rate": 5.2860596293311845e-05, + "loss": 0.1976, + "step": 656 + }, + { + "epoch": 0.1325811001410437, + "grad_norm": 0.14786763489246368, + "learning_rate": 5.302175664786463e-05, + "loss": 0.1827, + "step": 658 + }, + { + "epoch": 0.13298408220834174, + "grad_norm": 0.14497677981853485, + "learning_rate": 5.318291700241741e-05, + "loss": 0.2326, + "step": 660 + }, + { + "epoch": 0.13338706427563973, + "grad_norm": 0.13996298611164093, + "learning_rate": 5.334407735697019e-05, + "loss": 0.174, + "step": 662 + }, + { + "epoch": 0.13379004634293773, + "grad_norm": 0.13437992334365845, + "learning_rate": 5.350523771152297e-05, + "loss": 0.191, + "step": 664 + }, + { + "epoch": 0.13419302841023575, + "grad_norm": 0.08243107795715332, + "learning_rate": 5.366639806607575e-05, + "loss": 0.2212, + "step": 666 + }, + { + "epoch": 0.13459601047753375, + "grad_norm": 0.18093906342983246, + "learning_rate": 5.382755842062853e-05, + "loss": 0.2098, + "step": 668 + }, + { + "epoch": 0.13499899254483175, + "grad_norm": 0.15489110350608826, + "learning_rate": 5.3988718775181314e-05, + "loss": 0.2106, + "step": 670 + }, + { + "epoch": 0.13540197461212977, + "grad_norm": 0.30413106083869934, + "learning_rate": 5.414987912973409e-05, + "loss": 0.1632, + "step": 672 + }, + { + "epoch": 0.13580495667942777, + "grad_norm": 0.11697816103696823, + "learning_rate": 5.431103948428686e-05, + "loss": 0.2011, + "step": 674 + }, + { + "epoch": 0.13620793874672577, + "grad_norm": 0.15263471007347107, + "learning_rate": 5.4472199838839646e-05, + "loss": 0.2353, + "step": 676 + }, + { + "epoch": 0.1366109208140238, + "grad_norm": 0.1420680284500122, + "learning_rate": 5.463336019339242e-05, + "loss": 0.1445, + "step": 678 + }, + { + "epoch": 0.1370139028813218, + "grad_norm": 0.1247248575091362, + "learning_rate": 5.479452054794521e-05, + "loss": 0.2184, + "step": 680 + }, + { + "epoch": 0.13741688494861978, + "grad_norm": 0.1049036830663681, + "learning_rate": 5.4955680902497984e-05, + "loss": 0.2056, + "step": 682 + }, + { + "epoch": 0.13781986701591778, + "grad_norm": 0.15257203578948975, + "learning_rate": 5.511684125705077e-05, + "loss": 0.2059, + "step": 684 + }, + { + "epoch": 0.1382228490832158, + "grad_norm": 0.14933043718338013, + "learning_rate": 5.5278001611603546e-05, + "loss": 0.1938, + "step": 686 + }, + { + "epoch": 0.1386258311505138, + "grad_norm": 0.12864787876605988, + "learning_rate": 5.543916196615633e-05, + "loss": 0.169, + "step": 688 + }, + { + "epoch": 0.1390288132178118, + "grad_norm": 0.1254183053970337, + "learning_rate": 5.560032232070911e-05, + "loss": 0.1997, + "step": 690 + }, + { + "epoch": 0.13943179528510982, + "grad_norm": 0.12011466920375824, + "learning_rate": 5.576148267526189e-05, + "loss": 0.2001, + "step": 692 + }, + { + "epoch": 0.13983477735240782, + "grad_norm": 0.09766723215579987, + "learning_rate": 5.592264302981467e-05, + "loss": 0.2324, + "step": 694 + }, + { + "epoch": 0.14023775941970582, + "grad_norm": 0.14080072939395905, + "learning_rate": 5.608380338436745e-05, + "loss": 0.1545, + "step": 696 + }, + { + "epoch": 0.14064074148700384, + "grad_norm": 0.1432671695947647, + "learning_rate": 5.624496373892023e-05, + "loss": 0.2256, + "step": 698 + }, + { + "epoch": 0.14104372355430184, + "grad_norm": 0.11519324779510498, + "learning_rate": 5.6406124093473014e-05, + "loss": 0.2022, + "step": 700 + }, + { + "epoch": 0.14144670562159983, + "grad_norm": 0.12213943898677826, + "learning_rate": 5.656728444802579e-05, + "loss": 0.1849, + "step": 702 + }, + { + "epoch": 0.14184968768889786, + "grad_norm": 0.13887788355350494, + "learning_rate": 5.6728444802578575e-05, + "loss": 0.2355, + "step": 704 + }, + { + "epoch": 0.14225266975619585, + "grad_norm": 0.15707039833068848, + "learning_rate": 5.688960515713135e-05, + "loss": 0.2055, + "step": 706 + }, + { + "epoch": 0.14265565182349385, + "grad_norm": 0.09798435121774673, + "learning_rate": 5.705076551168412e-05, + "loss": 0.2247, + "step": 708 + }, + { + "epoch": 0.14305863389079185, + "grad_norm": 0.12091681361198425, + "learning_rate": 5.721192586623691e-05, + "loss": 0.2108, + "step": 710 + }, + { + "epoch": 0.14346161595808987, + "grad_norm": 0.11818325519561768, + "learning_rate": 5.7373086220789685e-05, + "loss": 0.2348, + "step": 712 + }, + { + "epoch": 0.14386459802538787, + "grad_norm": 0.11795388907194138, + "learning_rate": 5.753424657534247e-05, + "loss": 0.2175, + "step": 714 + }, + { + "epoch": 0.14426758009268587, + "grad_norm": 0.08712694048881531, + "learning_rate": 5.7695406929895246e-05, + "loss": 0.1859, + "step": 716 + }, + { + "epoch": 0.1446705621599839, + "grad_norm": 0.10926694422960281, + "learning_rate": 5.785656728444802e-05, + "loss": 0.2296, + "step": 718 + }, + { + "epoch": 0.1450735442272819, + "grad_norm": 0.12363690137863159, + "learning_rate": 5.801772763900081e-05, + "loss": 0.2363, + "step": 720 + }, + { + "epoch": 0.14547652629457988, + "grad_norm": 0.10204677283763885, + "learning_rate": 5.8178887993553585e-05, + "loss": 0.1557, + "step": 722 + }, + { + "epoch": 0.1458795083618779, + "grad_norm": 0.08687115460634232, + "learning_rate": 5.834004834810637e-05, + "loss": 0.2098, + "step": 724 + }, + { + "epoch": 0.1462824904291759, + "grad_norm": 0.12376438081264496, + "learning_rate": 5.8501208702659146e-05, + "loss": 0.1462, + "step": 726 + }, + { + "epoch": 0.1466854724964739, + "grad_norm": 0.08748535066843033, + "learning_rate": 5.866236905721193e-05, + "loss": 0.2304, + "step": 728 + }, + { + "epoch": 0.14708845456377193, + "grad_norm": 0.09505739063024521, + "learning_rate": 5.882352941176471e-05, + "loss": 0.1962, + "step": 730 + }, + { + "epoch": 0.14749143663106992, + "grad_norm": 0.11972901225090027, + "learning_rate": 5.898468976631749e-05, + "loss": 0.1907, + "step": 732 + }, + { + "epoch": 0.14789441869836792, + "grad_norm": 0.08803921192884445, + "learning_rate": 5.914585012087027e-05, + "loss": 0.2302, + "step": 734 + }, + { + "epoch": 0.14829740076566592, + "grad_norm": 0.18843410909175873, + "learning_rate": 5.930701047542305e-05, + "loss": 0.2544, + "step": 736 + }, + { + "epoch": 0.14870038283296394, + "grad_norm": 0.1302308738231659, + "learning_rate": 5.946817082997583e-05, + "loss": 0.231, + "step": 738 + }, + { + "epoch": 0.14910336490026194, + "grad_norm": 0.10559621453285217, + "learning_rate": 5.9629331184528614e-05, + "loss": 0.1879, + "step": 740 + }, + { + "epoch": 0.14950634696755993, + "grad_norm": 0.11322048306465149, + "learning_rate": 5.9790491539081385e-05, + "loss": 0.2098, + "step": 742 + }, + { + "epoch": 0.14990932903485796, + "grad_norm": 0.10071557015180588, + "learning_rate": 5.995165189363416e-05, + "loss": 0.182, + "step": 744 + }, + { + "epoch": 0.15031231110215595, + "grad_norm": 0.10590151697397232, + "learning_rate": 6.0112812248186946e-05, + "loss": 0.2738, + "step": 746 + }, + { + "epoch": 0.15071529316945395, + "grad_norm": 0.11812455207109451, + "learning_rate": 6.0273972602739724e-05, + "loss": 0.1983, + "step": 748 + }, + { + "epoch": 0.15111827523675198, + "grad_norm": 0.15133807063102722, + "learning_rate": 6.043513295729251e-05, + "loss": 0.2795, + "step": 750 + }, + { + "epoch": 0.15152125730404997, + "grad_norm": 0.0995708629488945, + "learning_rate": 6.0596293311845285e-05, + "loss": 0.178, + "step": 752 + }, + { + "epoch": 0.15192423937134797, + "grad_norm": 0.0896756649017334, + "learning_rate": 6.075745366639807e-05, + "loss": 0.2164, + "step": 754 + }, + { + "epoch": 0.152327221438646, + "grad_norm": 0.10863078385591507, + "learning_rate": 6.0918614020950846e-05, + "loss": 0.2283, + "step": 756 + }, + { + "epoch": 0.152730203505944, + "grad_norm": 0.18915948271751404, + "learning_rate": 6.107977437550362e-05, + "loss": 0.1713, + "step": 758 + }, + { + "epoch": 0.153133185573242, + "grad_norm": 0.2530260682106018, + "learning_rate": 6.124093473005641e-05, + "loss": 0.1929, + "step": 760 + }, + { + "epoch": 0.15353616764053998, + "grad_norm": 0.091359943151474, + "learning_rate": 6.140209508460919e-05, + "loss": 0.1789, + "step": 762 + }, + { + "epoch": 0.153939149707838, + "grad_norm": 0.08231671154499054, + "learning_rate": 6.156325543916198e-05, + "loss": 0.2092, + "step": 764 + }, + { + "epoch": 0.154342131775136, + "grad_norm": 0.10517023503780365, + "learning_rate": 6.172441579371475e-05, + "loss": 0.1662, + "step": 766 + }, + { + "epoch": 0.154745113842434, + "grad_norm": 0.31121233105659485, + "learning_rate": 6.188557614826753e-05, + "loss": 0.1885, + "step": 768 + }, + { + "epoch": 0.15514809590973203, + "grad_norm": 0.12587322294712067, + "learning_rate": 6.204673650282031e-05, + "loss": 0.1954, + "step": 770 + }, + { + "epoch": 0.15555107797703002, + "grad_norm": 0.11300528049468994, + "learning_rate": 6.22078968573731e-05, + "loss": 0.2143, + "step": 772 + }, + { + "epoch": 0.15595406004432802, + "grad_norm": 0.12358961999416351, + "learning_rate": 6.236905721192587e-05, + "loss": 0.2489, + "step": 774 + }, + { + "epoch": 0.15635704211162604, + "grad_norm": 0.07826519012451172, + "learning_rate": 6.253021756647864e-05, + "loss": 0.1567, + "step": 776 + }, + { + "epoch": 0.15676002417892404, + "grad_norm": 0.14281342923641205, + "learning_rate": 6.269137792103142e-05, + "loss": 0.25, + "step": 778 + }, + { + "epoch": 0.15716300624622204, + "grad_norm": 0.11091539263725281, + "learning_rate": 6.285253827558421e-05, + "loss": 0.202, + "step": 780 + }, + { + "epoch": 0.15756598831352006, + "grad_norm": 0.10273347795009613, + "learning_rate": 6.301369863013699e-05, + "loss": 0.238, + "step": 782 + }, + { + "epoch": 0.15796897038081806, + "grad_norm": 0.12522001564502716, + "learning_rate": 6.317485898468976e-05, + "loss": 0.1256, + "step": 784 + }, + { + "epoch": 0.15837195244811605, + "grad_norm": 0.13455836474895477, + "learning_rate": 6.333601933924255e-05, + "loss": 0.2008, + "step": 786 + }, + { + "epoch": 0.15877493451541405, + "grad_norm": 0.1040363609790802, + "learning_rate": 6.349717969379533e-05, + "loss": 0.2343, + "step": 788 + }, + { + "epoch": 0.15917791658271208, + "grad_norm": 0.13617351651191711, + "learning_rate": 6.365834004834811e-05, + "loss": 0.1752, + "step": 790 + }, + { + "epoch": 0.15958089865001007, + "grad_norm": 0.11176435649394989, + "learning_rate": 6.381950040290089e-05, + "loss": 0.207, + "step": 792 + }, + { + "epoch": 0.15998388071730807, + "grad_norm": 0.11505793035030365, + "learning_rate": 6.398066075745367e-05, + "loss": 0.1928, + "step": 794 + }, + { + "epoch": 0.1603868627846061, + "grad_norm": 0.10995722562074661, + "learning_rate": 6.414182111200645e-05, + "loss": 0.2455, + "step": 796 + }, + { + "epoch": 0.1607898448519041, + "grad_norm": 0.09231299161911011, + "learning_rate": 6.430298146655924e-05, + "loss": 0.1634, + "step": 798 + }, + { + "epoch": 0.1611928269192021, + "grad_norm": 0.09691483527421951, + "learning_rate": 6.446414182111201e-05, + "loss": 0.1466, + "step": 800 + }, + { + "epoch": 0.1615958089865001, + "grad_norm": 0.11957940459251404, + "learning_rate": 6.462530217566479e-05, + "loss": 0.1902, + "step": 802 + }, + { + "epoch": 0.1619987910537981, + "grad_norm": 0.11081273853778839, + "learning_rate": 6.478646253021758e-05, + "loss": 0.1711, + "step": 804 + }, + { + "epoch": 0.1624017731210961, + "grad_norm": 0.10039583593606949, + "learning_rate": 6.494762288477036e-05, + "loss": 0.1578, + "step": 806 + }, + { + "epoch": 0.16280475518839413, + "grad_norm": 0.08877286314964294, + "learning_rate": 6.510878323932313e-05, + "loss": 0.1832, + "step": 808 + }, + { + "epoch": 0.16320773725569213, + "grad_norm": 0.1771819144487381, + "learning_rate": 6.526994359387592e-05, + "loss": 0.1901, + "step": 810 + }, + { + "epoch": 0.16361071932299012, + "grad_norm": 0.13575054705142975, + "learning_rate": 6.543110394842869e-05, + "loss": 0.2324, + "step": 812 + }, + { + "epoch": 0.16401370139028812, + "grad_norm": 0.10554531216621399, + "learning_rate": 6.559226430298147e-05, + "loss": 0.2033, + "step": 814 + }, + { + "epoch": 0.16441668345758614, + "grad_norm": 0.09214624017477036, + "learning_rate": 6.575342465753424e-05, + "loss": 0.2308, + "step": 816 + }, + { + "epoch": 0.16481966552488414, + "grad_norm": 0.09753014147281647, + "learning_rate": 6.591458501208702e-05, + "loss": 0.2118, + "step": 818 + }, + { + "epoch": 0.16522264759218214, + "grad_norm": 0.10157620906829834, + "learning_rate": 6.607574536663981e-05, + "loss": 0.2282, + "step": 820 + }, + { + "epoch": 0.16562562965948016, + "grad_norm": 0.09195137768983841, + "learning_rate": 6.623690572119259e-05, + "loss": 0.2375, + "step": 822 + }, + { + "epoch": 0.16602861172677816, + "grad_norm": 0.09710432589054108, + "learning_rate": 6.639806607574536e-05, + "loss": 0.1578, + "step": 824 + }, + { + "epoch": 0.16643159379407615, + "grad_norm": 0.0950450599193573, + "learning_rate": 6.655922643029815e-05, + "loss": 0.1785, + "step": 826 + }, + { + "epoch": 0.16683457586137418, + "grad_norm": 0.09225623309612274, + "learning_rate": 6.672038678485093e-05, + "loss": 0.1755, + "step": 828 + }, + { + "epoch": 0.16723755792867218, + "grad_norm": 0.10279329121112823, + "learning_rate": 6.688154713940372e-05, + "loss": 0.1872, + "step": 830 + }, + { + "epoch": 0.16764053999597017, + "grad_norm": 0.08554810285568237, + "learning_rate": 6.704270749395649e-05, + "loss": 0.2222, + "step": 832 + }, + { + "epoch": 0.1680435220632682, + "grad_norm": 0.08733980357646942, + "learning_rate": 6.720386784850927e-05, + "loss": 0.139, + "step": 834 + }, + { + "epoch": 0.1684465041305662, + "grad_norm": 0.09240876138210297, + "learning_rate": 6.736502820306205e-05, + "loss": 0.2171, + "step": 836 + }, + { + "epoch": 0.1688494861978642, + "grad_norm": 0.08311144262552261, + "learning_rate": 6.752618855761484e-05, + "loss": 0.1865, + "step": 838 + }, + { + "epoch": 0.1692524682651622, + "grad_norm": 0.11759477853775024, + "learning_rate": 6.768734891216761e-05, + "loss": 0.2025, + "step": 840 + }, + { + "epoch": 0.1696554503324602, + "grad_norm": 0.15229858458042145, + "learning_rate": 6.784850926672039e-05, + "loss": 0.1846, + "step": 842 + }, + { + "epoch": 0.1700584323997582, + "grad_norm": 0.08780387789011002, + "learning_rate": 6.800966962127318e-05, + "loss": 0.1839, + "step": 844 + }, + { + "epoch": 0.1704614144670562, + "grad_norm": 0.11263580620288849, + "learning_rate": 6.817082997582595e-05, + "loss": 0.1762, + "step": 846 + }, + { + "epoch": 0.17086439653435423, + "grad_norm": 0.0929393470287323, + "learning_rate": 6.833199033037873e-05, + "loss": 0.1633, + "step": 848 + }, + { + "epoch": 0.17126737860165223, + "grad_norm": 0.09778440743684769, + "learning_rate": 6.84931506849315e-05, + "loss": 0.1693, + "step": 850 + }, + { + "epoch": 0.17167036066895022, + "grad_norm": 0.12297005206346512, + "learning_rate": 6.865431103948429e-05, + "loss": 0.1497, + "step": 852 + }, + { + "epoch": 0.17207334273624825, + "grad_norm": 0.17671999335289001, + "learning_rate": 6.881547139403707e-05, + "loss": 0.2439, + "step": 854 + }, + { + "epoch": 0.17247632480354624, + "grad_norm": 0.08522593975067139, + "learning_rate": 6.897663174858985e-05, + "loss": 0.1595, + "step": 856 + }, + { + "epoch": 0.17287930687084424, + "grad_norm": 0.1255025416612625, + "learning_rate": 6.913779210314262e-05, + "loss": 0.2527, + "step": 858 + }, + { + "epoch": 0.17328228893814226, + "grad_norm": 0.3059910535812378, + "learning_rate": 6.929895245769541e-05, + "loss": 0.2162, + "step": 860 + }, + { + "epoch": 0.17368527100544026, + "grad_norm": 0.1549808382987976, + "learning_rate": 6.946011281224819e-05, + "loss": 0.2101, + "step": 862 + }, + { + "epoch": 0.17408825307273826, + "grad_norm": 0.08645348250865936, + "learning_rate": 6.962127316680098e-05, + "loss": 0.1709, + "step": 864 + }, + { + "epoch": 0.17449123514003625, + "grad_norm": 0.09869391471147537, + "learning_rate": 6.978243352135375e-05, + "loss": 0.2026, + "step": 866 + }, + { + "epoch": 0.17489421720733428, + "grad_norm": 0.08920720964670181, + "learning_rate": 6.994359387590653e-05, + "loss": 0.2328, + "step": 868 + }, + { + "epoch": 0.17529719927463228, + "grad_norm": 0.10059194266796112, + "learning_rate": 7.010475423045932e-05, + "loss": 0.2471, + "step": 870 + }, + { + "epoch": 0.17570018134193027, + "grad_norm": 0.13767802715301514, + "learning_rate": 7.02659145850121e-05, + "loss": 0.2062, + "step": 872 + }, + { + "epoch": 0.1761031634092283, + "grad_norm": 0.11204895377159119, + "learning_rate": 7.042707493956487e-05, + "loss": 0.1811, + "step": 874 + }, + { + "epoch": 0.1765061454765263, + "grad_norm": 0.08391435444355011, + "learning_rate": 7.058823529411765e-05, + "loss": 0.1874, + "step": 876 + }, + { + "epoch": 0.1769091275438243, + "grad_norm": 0.09149591624736786, + "learning_rate": 7.074939564867044e-05, + "loss": 0.2454, + "step": 878 + }, + { + "epoch": 0.17731210961112231, + "grad_norm": 0.09233218431472778, + "learning_rate": 7.091055600322321e-05, + "loss": 0.2473, + "step": 880 + }, + { + "epoch": 0.1777150916784203, + "grad_norm": 0.08432731032371521, + "learning_rate": 7.107171635777598e-05, + "loss": 0.1843, + "step": 882 + }, + { + "epoch": 0.1781180737457183, + "grad_norm": 0.12103287875652313, + "learning_rate": 7.123287671232876e-05, + "loss": 0.2372, + "step": 884 + }, + { + "epoch": 0.17852105581301633, + "grad_norm": 0.08081512898206711, + "learning_rate": 7.139403706688155e-05, + "loss": 0.2181, + "step": 886 + }, + { + "epoch": 0.17892403788031433, + "grad_norm": 0.09800492227077484, + "learning_rate": 7.155519742143433e-05, + "loss": 0.1669, + "step": 888 + }, + { + "epoch": 0.17932701994761233, + "grad_norm": 0.0976727157831192, + "learning_rate": 7.17163577759871e-05, + "loss": 0.2072, + "step": 890 + }, + { + "epoch": 0.17973000201491032, + "grad_norm": 0.1146702691912651, + "learning_rate": 7.187751813053989e-05, + "loss": 0.1986, + "step": 892 + }, + { + "epoch": 0.18013298408220835, + "grad_norm": 0.10681789368391037, + "learning_rate": 7.203867848509267e-05, + "loss": 0.1632, + "step": 894 + }, + { + "epoch": 0.18053596614950634, + "grad_norm": 0.10094150900840759, + "learning_rate": 7.219983883964545e-05, + "loss": 0.1643, + "step": 896 + }, + { + "epoch": 0.18093894821680434, + "grad_norm": 0.09761274605989456, + "learning_rate": 7.236099919419823e-05, + "loss": 0.1779, + "step": 898 + }, + { + "epoch": 0.18134193028410237, + "grad_norm": 0.1324063241481781, + "learning_rate": 7.252215954875101e-05, + "loss": 0.2365, + "step": 900 + }, + { + "epoch": 0.18174491235140036, + "grad_norm": 0.11601895093917847, + "learning_rate": 7.26833199033038e-05, + "loss": 0.1712, + "step": 902 + }, + { + "epoch": 0.18214789441869836, + "grad_norm": 0.14145302772521973, + "learning_rate": 7.284448025785658e-05, + "loss": 0.2013, + "step": 904 + }, + { + "epoch": 0.18255087648599638, + "grad_norm": 0.09013397246599197, + "learning_rate": 7.300564061240935e-05, + "loss": 0.2259, + "step": 906 + }, + { + "epoch": 0.18295385855329438, + "grad_norm": 0.09207538515329361, + "learning_rate": 7.316680096696213e-05, + "loss": 0.2087, + "step": 908 + }, + { + "epoch": 0.18335684062059238, + "grad_norm": 0.07779651135206223, + "learning_rate": 7.332796132151492e-05, + "loss": 0.234, + "step": 910 + }, + { + "epoch": 0.1837598226878904, + "grad_norm": 0.08593969792127609, + "learning_rate": 7.34891216760677e-05, + "loss": 0.1854, + "step": 912 + }, + { + "epoch": 0.1841628047551884, + "grad_norm": 0.09124486148357391, + "learning_rate": 7.365028203062047e-05, + "loss": 0.2279, + "step": 914 + }, + { + "epoch": 0.1845657868224864, + "grad_norm": 0.11255534738302231, + "learning_rate": 7.381144238517324e-05, + "loss": 0.1733, + "step": 916 + }, + { + "epoch": 0.18496876888978442, + "grad_norm": 0.1038624569773674, + "learning_rate": 7.397260273972603e-05, + "loss": 0.2392, + "step": 918 + }, + { + "epoch": 0.18537175095708242, + "grad_norm": 0.10044854134321213, + "learning_rate": 7.413376309427881e-05, + "loss": 0.195, + "step": 920 + }, + { + "epoch": 0.1857747330243804, + "grad_norm": 0.0884871855378151, + "learning_rate": 7.42949234488316e-05, + "loss": 0.2164, + "step": 922 + }, + { + "epoch": 0.1861777150916784, + "grad_norm": 0.1108056977391243, + "learning_rate": 7.445608380338436e-05, + "loss": 0.1694, + "step": 924 + }, + { + "epoch": 0.18658069715897643, + "grad_norm": 0.07165519148111343, + "learning_rate": 7.461724415793715e-05, + "loss": 0.1398, + "step": 926 + }, + { + "epoch": 0.18698367922627443, + "grad_norm": 0.09175916761159897, + "learning_rate": 7.477840451248993e-05, + "loss": 0.1955, + "step": 928 + }, + { + "epoch": 0.18738666129357243, + "grad_norm": 0.07176446169614792, + "learning_rate": 7.493956486704272e-05, + "loss": 0.1766, + "step": 930 + }, + { + "epoch": 0.18778964336087045, + "grad_norm": 0.11476302146911621, + "learning_rate": 7.510072522159549e-05, + "loss": 0.1848, + "step": 932 + }, + { + "epoch": 0.18819262542816845, + "grad_norm": 0.107746422290802, + "learning_rate": 7.526188557614827e-05, + "loss": 0.2264, + "step": 934 + }, + { + "epoch": 0.18859560749546644, + "grad_norm": 0.10922015458345413, + "learning_rate": 7.542304593070106e-05, + "loss": 0.1803, + "step": 936 + }, + { + "epoch": 0.18899858956276447, + "grad_norm": 0.08000432699918747, + "learning_rate": 7.558420628525384e-05, + "loss": 0.1601, + "step": 938 + }, + { + "epoch": 0.18940157163006247, + "grad_norm": 0.07894396036863327, + "learning_rate": 7.574536663980661e-05, + "loss": 0.1968, + "step": 940 + }, + { + "epoch": 0.18980455369736046, + "grad_norm": 0.13226218521595, + "learning_rate": 7.59065269943594e-05, + "loss": 0.1964, + "step": 942 + }, + { + "epoch": 0.1902075357646585, + "grad_norm": 0.13322897255420685, + "learning_rate": 7.606768734891218e-05, + "loss": 0.2813, + "step": 944 + }, + { + "epoch": 0.19061051783195648, + "grad_norm": 0.07467541843652725, + "learning_rate": 7.622884770346496e-05, + "loss": 0.2153, + "step": 946 + }, + { + "epoch": 0.19101349989925448, + "grad_norm": 0.1104121133685112, + "learning_rate": 7.639000805801773e-05, + "loss": 0.2025, + "step": 948 + }, + { + "epoch": 0.19141648196655248, + "grad_norm": 0.06779658049345016, + "learning_rate": 7.65511684125705e-05, + "loss": 0.2228, + "step": 950 + }, + { + "epoch": 0.1918194640338505, + "grad_norm": 0.19166550040245056, + "learning_rate": 7.671232876712329e-05, + "loss": 0.1907, + "step": 952 + }, + { + "epoch": 0.1922224461011485, + "grad_norm": 0.06244197115302086, + "learning_rate": 7.687348912167607e-05, + "loss": 0.1687, + "step": 954 + }, + { + "epoch": 0.1926254281684465, + "grad_norm": 0.07573673874139786, + "learning_rate": 7.703464947622884e-05, + "loss": 0.1598, + "step": 956 + }, + { + "epoch": 0.19302841023574452, + "grad_norm": 0.0870039090514183, + "learning_rate": 7.719580983078163e-05, + "loss": 0.2082, + "step": 958 + }, + { + "epoch": 0.19343139230304252, + "grad_norm": 0.08709016442298889, + "learning_rate": 7.735697018533441e-05, + "loss": 0.1739, + "step": 960 + }, + { + "epoch": 0.1938343743703405, + "grad_norm": 0.2350974977016449, + "learning_rate": 7.75181305398872e-05, + "loss": 0.2275, + "step": 962 + }, + { + "epoch": 0.19423735643763854, + "grad_norm": 0.1652485728263855, + "learning_rate": 7.767929089443996e-05, + "loss": 0.2102, + "step": 964 + }, + { + "epoch": 0.19464033850493653, + "grad_norm": 0.087095707654953, + "learning_rate": 7.784045124899275e-05, + "loss": 0.2238, + "step": 966 + }, + { + "epoch": 0.19504332057223453, + "grad_norm": 0.11548943817615509, + "learning_rate": 7.800161160354553e-05, + "loss": 0.2168, + "step": 968 + }, + { + "epoch": 0.19544630263953255, + "grad_norm": 0.0807507336139679, + "learning_rate": 7.816277195809832e-05, + "loss": 0.1806, + "step": 970 + }, + { + "epoch": 0.19584928470683055, + "grad_norm": 0.393595814704895, + "learning_rate": 7.832393231265109e-05, + "loss": 0.2387, + "step": 972 + }, + { + "epoch": 0.19625226677412855, + "grad_norm": 0.08075542002916336, + "learning_rate": 7.848509266720387e-05, + "loss": 0.2201, + "step": 974 + }, + { + "epoch": 0.19665524884142654, + "grad_norm": 0.09349818527698517, + "learning_rate": 7.864625302175666e-05, + "loss": 0.1415, + "step": 976 + }, + { + "epoch": 0.19705823090872457, + "grad_norm": 0.21948008239269257, + "learning_rate": 7.880741337630944e-05, + "loss": 0.2032, + "step": 978 + }, + { + "epoch": 0.19746121297602257, + "grad_norm": 0.09178763628005981, + "learning_rate": 7.896857373086221e-05, + "loss": 0.2024, + "step": 980 + }, + { + "epoch": 0.19786419504332056, + "grad_norm": 0.09847205132246017, + "learning_rate": 7.9129734085415e-05, + "loss": 0.226, + "step": 982 + }, + { + "epoch": 0.1982671771106186, + "grad_norm": 0.15902641415596008, + "learning_rate": 7.929089443996776e-05, + "loss": 0.2055, + "step": 984 + }, + { + "epoch": 0.19867015917791658, + "grad_norm": 0.15558022260665894, + "learning_rate": 7.945205479452055e-05, + "loss": 0.1926, + "step": 986 + }, + { + "epoch": 0.19907314124521458, + "grad_norm": 0.09379275888204575, + "learning_rate": 7.961321514907333e-05, + "loss": 0.1761, + "step": 988 + }, + { + "epoch": 0.1994761233125126, + "grad_norm": 0.17286166548728943, + "learning_rate": 7.97743755036261e-05, + "loss": 0.2303, + "step": 990 + }, + { + "epoch": 0.1998791053798106, + "grad_norm": 0.11570542305707932, + "learning_rate": 7.993553585817889e-05, + "loss": 0.2176, + "step": 992 + }, + { + "epoch": 0.2002820874471086, + "grad_norm": 0.13672104477882385, + "learning_rate": 8.009669621273167e-05, + "loss": 0.2171, + "step": 994 + }, + { + "epoch": 0.20068506951440662, + "grad_norm": 0.12963563203811646, + "learning_rate": 8.025785656728446e-05, + "loss": 0.2296, + "step": 996 + }, + { + "epoch": 0.20108805158170462, + "grad_norm": 0.13109353184700012, + "learning_rate": 8.041901692183723e-05, + "loss": 0.2, + "step": 998 + }, + { + "epoch": 0.20149103364900262, + "grad_norm": 0.08438586443662643, + "learning_rate": 8.058017727639001e-05, + "loss": 0.1921, + "step": 1000 + }, + { + "epoch": 0.2018940157163006, + "grad_norm": 0.08162175118923187, + "learning_rate": 8.07413376309428e-05, + "loss": 0.2028, + "step": 1002 + }, + { + "epoch": 0.20229699778359864, + "grad_norm": 0.08619034290313721, + "learning_rate": 8.090249798549558e-05, + "loss": 0.1999, + "step": 1004 + }, + { + "epoch": 0.20269997985089663, + "grad_norm": 0.07941418886184692, + "learning_rate": 8.106365834004835e-05, + "loss": 0.2118, + "step": 1006 + }, + { + "epoch": 0.20310296191819463, + "grad_norm": 0.12020314484834671, + "learning_rate": 8.122481869460113e-05, + "loss": 0.223, + "step": 1008 + }, + { + "epoch": 0.20350594398549265, + "grad_norm": 0.08442337810993195, + "learning_rate": 8.138597904915392e-05, + "loss": 0.2103, + "step": 1010 + }, + { + "epoch": 0.20390892605279065, + "grad_norm": 0.1368478238582611, + "learning_rate": 8.15471394037067e-05, + "loss": 0.2011, + "step": 1012 + }, + { + "epoch": 0.20431190812008865, + "grad_norm": 0.12291720509529114, + "learning_rate": 8.170829975825947e-05, + "loss": 0.2377, + "step": 1014 + }, + { + "epoch": 0.20471489018738667, + "grad_norm": 0.09744734317064285, + "learning_rate": 8.186946011281226e-05, + "loss": 0.1922, + "step": 1016 + }, + { + "epoch": 0.20511787225468467, + "grad_norm": 0.08120467513799667, + "learning_rate": 8.203062046736503e-05, + "loss": 0.2241, + "step": 1018 + }, + { + "epoch": 0.20552085432198267, + "grad_norm": 0.10533369332551956, + "learning_rate": 8.219178082191781e-05, + "loss": 0.2189, + "step": 1020 + }, + { + "epoch": 0.2059238363892807, + "grad_norm": 0.10071130096912384, + "learning_rate": 8.23529411764706e-05, + "loss": 0.1639, + "step": 1022 + }, + { + "epoch": 0.2063268184565787, + "grad_norm": 0.1534520983695984, + "learning_rate": 8.251410153102337e-05, + "loss": 0.1691, + "step": 1024 + }, + { + "epoch": 0.20672980052387668, + "grad_norm": 0.08435958623886108, + "learning_rate": 8.267526188557615e-05, + "loss": 0.2177, + "step": 1026 + }, + { + "epoch": 0.20713278259117468, + "grad_norm": 0.11280474066734314, + "learning_rate": 8.283642224012893e-05, + "loss": 0.1767, + "step": 1028 + }, + { + "epoch": 0.2075357646584727, + "grad_norm": 0.09684017300605774, + "learning_rate": 8.299758259468172e-05, + "loss": 0.2139, + "step": 1030 + }, + { + "epoch": 0.2079387467257707, + "grad_norm": 0.08194670081138611, + "learning_rate": 8.315874294923449e-05, + "loss": 0.1917, + "step": 1032 + }, + { + "epoch": 0.2083417287930687, + "grad_norm": 0.15235085785388947, + "learning_rate": 8.331990330378727e-05, + "loss": 0.2436, + "step": 1034 + }, + { + "epoch": 0.20874471086036672, + "grad_norm": 0.08844275772571564, + "learning_rate": 8.348106365834006e-05, + "loss": 0.1862, + "step": 1036 + }, + { + "epoch": 0.20914769292766472, + "grad_norm": 0.1334722340106964, + "learning_rate": 8.364222401289284e-05, + "loss": 0.1837, + "step": 1038 + }, + { + "epoch": 0.20955067499496272, + "grad_norm": 0.08106778562068939, + "learning_rate": 8.380338436744561e-05, + "loss": 0.2101, + "step": 1040 + }, + { + "epoch": 0.20995365706226074, + "grad_norm": 0.0860428661108017, + "learning_rate": 8.39645447219984e-05, + "loss": 0.1754, + "step": 1042 + }, + { + "epoch": 0.21035663912955874, + "grad_norm": 0.0777168869972229, + "learning_rate": 8.412570507655118e-05, + "loss": 0.2105, + "step": 1044 + }, + { + "epoch": 0.21075962119685673, + "grad_norm": 0.08495823293924332, + "learning_rate": 8.428686543110396e-05, + "loss": 0.187, + "step": 1046 + }, + { + "epoch": 0.21116260326415476, + "grad_norm": 0.07410518079996109, + "learning_rate": 8.444802578565673e-05, + "loss": 0.1962, + "step": 1048 + }, + { + "epoch": 0.21156558533145275, + "grad_norm": 0.0910082459449768, + "learning_rate": 8.460918614020952e-05, + "loss": 0.2341, + "step": 1050 + }, + { + "epoch": 0.21196856739875075, + "grad_norm": 0.11832420527935028, + "learning_rate": 8.477034649476229e-05, + "loss": 0.2532, + "step": 1052 + }, + { + "epoch": 0.21237154946604875, + "grad_norm": 0.09605500847101212, + "learning_rate": 8.493150684931507e-05, + "loss": 0.1657, + "step": 1054 + }, + { + "epoch": 0.21277453153334677, + "grad_norm": 0.07742031663656235, + "learning_rate": 8.509266720386784e-05, + "loss": 0.1792, + "step": 1056 + }, + { + "epoch": 0.21317751360064477, + "grad_norm": 0.07660829275846481, + "learning_rate": 8.525382755842063e-05, + "loss": 0.1488, + "step": 1058 + }, + { + "epoch": 0.21358049566794277, + "grad_norm": 0.09640536457300186, + "learning_rate": 8.541498791297341e-05, + "loss": 0.2233, + "step": 1060 + }, + { + "epoch": 0.2139834777352408, + "grad_norm": 0.12653407454490662, + "learning_rate": 8.55761482675262e-05, + "loss": 0.2007, + "step": 1062 + }, + { + "epoch": 0.2143864598025388, + "grad_norm": 0.09995963424444199, + "learning_rate": 8.573730862207897e-05, + "loss": 0.1933, + "step": 1064 + }, + { + "epoch": 0.21478944186983678, + "grad_norm": 0.08510065078735352, + "learning_rate": 8.589846897663175e-05, + "loss": 0.2423, + "step": 1066 + }, + { + "epoch": 0.2151924239371348, + "grad_norm": 0.09552331268787384, + "learning_rate": 8.605962933118453e-05, + "loss": 0.2171, + "step": 1068 + }, + { + "epoch": 0.2155954060044328, + "grad_norm": 0.09067709743976593, + "learning_rate": 8.622078968573732e-05, + "loss": 0.2328, + "step": 1070 + }, + { + "epoch": 0.2159983880717308, + "grad_norm": 0.09756525605916977, + "learning_rate": 8.638195004029009e-05, + "loss": 0.1938, + "step": 1072 + }, + { + "epoch": 0.21640137013902883, + "grad_norm": 0.10737069696187973, + "learning_rate": 8.654311039484287e-05, + "loss": 0.1905, + "step": 1074 + }, + { + "epoch": 0.21680435220632682, + "grad_norm": 0.08812405914068222, + "learning_rate": 8.670427074939566e-05, + "loss": 0.2276, + "step": 1076 + }, + { + "epoch": 0.21720733427362482, + "grad_norm": 0.08840040117502213, + "learning_rate": 8.686543110394844e-05, + "loss": 0.1989, + "step": 1078 + }, + { + "epoch": 0.21761031634092282, + "grad_norm": 0.1236484944820404, + "learning_rate": 8.702659145850121e-05, + "loss": 0.1772, + "step": 1080 + }, + { + "epoch": 0.21801329840822084, + "grad_norm": 0.0713542103767395, + "learning_rate": 8.7187751813054e-05, + "loss": 0.1878, + "step": 1082 + }, + { + "epoch": 0.21841628047551884, + "grad_norm": 0.0982968658208847, + "learning_rate": 8.734891216760678e-05, + "loss": 0.1925, + "step": 1084 + }, + { + "epoch": 0.21881926254281683, + "grad_norm": 0.07813037931919098, + "learning_rate": 8.751007252215955e-05, + "loss": 0.2141, + "step": 1086 + }, + { + "epoch": 0.21922224461011486, + "grad_norm": 0.10197921842336655, + "learning_rate": 8.767123287671233e-05, + "loss": 0.2662, + "step": 1088 + }, + { + "epoch": 0.21962522667741285, + "grad_norm": 0.0717720240354538, + "learning_rate": 8.78323932312651e-05, + "loss": 0.1753, + "step": 1090 + }, + { + "epoch": 0.22002820874471085, + "grad_norm": 0.08220771700143814, + "learning_rate": 8.799355358581789e-05, + "loss": 0.2293, + "step": 1092 + }, + { + "epoch": 0.22043119081200888, + "grad_norm": 0.10889850556850433, + "learning_rate": 8.815471394037067e-05, + "loss": 0.1917, + "step": 1094 + }, + { + "epoch": 0.22083417287930687, + "grad_norm": 0.06890220940113068, + "learning_rate": 8.831587429492346e-05, + "loss": 0.184, + "step": 1096 + }, + { + "epoch": 0.22123715494660487, + "grad_norm": 0.11168145388364792, + "learning_rate": 8.847703464947623e-05, + "loss": 0.2341, + "step": 1098 + }, + { + "epoch": 0.2216401370139029, + "grad_norm": 0.06456907838582993, + "learning_rate": 8.863819500402901e-05, + "loss": 0.1529, + "step": 1100 + }, + { + "epoch": 0.2220431190812009, + "grad_norm": 0.07093362510204315, + "learning_rate": 8.87993553585818e-05, + "loss": 0.2238, + "step": 1102 + }, + { + "epoch": 0.2224461011484989, + "grad_norm": 0.08005674928426743, + "learning_rate": 8.896051571313458e-05, + "loss": 0.183, + "step": 1104 + }, + { + "epoch": 0.22284908321579688, + "grad_norm": 0.09400587528944016, + "learning_rate": 8.912167606768735e-05, + "loss": 0.1935, + "step": 1106 + }, + { + "epoch": 0.2232520652830949, + "grad_norm": 0.09655874222517014, + "learning_rate": 8.928283642224013e-05, + "loss": 0.1628, + "step": 1108 + }, + { + "epoch": 0.2236550473503929, + "grad_norm": 0.10121942311525345, + "learning_rate": 8.944399677679292e-05, + "loss": 0.1718, + "step": 1110 + }, + { + "epoch": 0.2240580294176909, + "grad_norm": 0.1059177815914154, + "learning_rate": 8.96051571313457e-05, + "loss": 0.2144, + "step": 1112 + }, + { + "epoch": 0.22446101148498893, + "grad_norm": 0.07645639777183533, + "learning_rate": 8.976631748589847e-05, + "loss": 0.212, + "step": 1114 + }, + { + "epoch": 0.22486399355228692, + "grad_norm": 0.07680249214172363, + "learning_rate": 8.992747784045126e-05, + "loss": 0.184, + "step": 1116 + }, + { + "epoch": 0.22526697561958492, + "grad_norm": 0.10838435590267181, + "learning_rate": 9.008863819500404e-05, + "loss": 0.2153, + "step": 1118 + }, + { + "epoch": 0.22566995768688294, + "grad_norm": 0.0947759747505188, + "learning_rate": 9.024979854955683e-05, + "loss": 0.1972, + "step": 1120 + }, + { + "epoch": 0.22607293975418094, + "grad_norm": 0.12324242293834686, + "learning_rate": 9.041095890410958e-05, + "loss": 0.2042, + "step": 1122 + }, + { + "epoch": 0.22647592182147894, + "grad_norm": 0.09450756758451462, + "learning_rate": 9.057211925866237e-05, + "loss": 0.2614, + "step": 1124 + }, + { + "epoch": 0.22687890388877696, + "grad_norm": 0.092324398458004, + "learning_rate": 9.073327961321515e-05, + "loss": 0.1664, + "step": 1126 + }, + { + "epoch": 0.22728188595607496, + "grad_norm": 0.12351454794406891, + "learning_rate": 9.089443996776793e-05, + "loss": 0.2146, + "step": 1128 + }, + { + "epoch": 0.22768486802337295, + "grad_norm": 0.07259409874677658, + "learning_rate": 9.10556003223207e-05, + "loss": 0.1875, + "step": 1130 + }, + { + "epoch": 0.22808785009067095, + "grad_norm": 0.060035668313503265, + "learning_rate": 9.121676067687349e-05, + "loss": 0.2133, + "step": 1132 + }, + { + "epoch": 0.22849083215796898, + "grad_norm": 0.06675513088703156, + "learning_rate": 9.137792103142627e-05, + "loss": 0.1959, + "step": 1134 + }, + { + "epoch": 0.22889381422526697, + "grad_norm": 0.10324272513389587, + "learning_rate": 9.153908138597906e-05, + "loss": 0.2426, + "step": 1136 + }, + { + "epoch": 0.22929679629256497, + "grad_norm": 0.06724414229393005, + "learning_rate": 9.170024174053183e-05, + "loss": 0.1731, + "step": 1138 + }, + { + "epoch": 0.229699778359863, + "grad_norm": 0.07515553385019302, + "learning_rate": 9.186140209508461e-05, + "loss": 0.1696, + "step": 1140 + }, + { + "epoch": 0.230102760427161, + "grad_norm": 0.08454802632331848, + "learning_rate": 9.20225624496374e-05, + "loss": 0.2387, + "step": 1142 + }, + { + "epoch": 0.230505742494459, + "grad_norm": 0.06945478171110153, + "learning_rate": 9.218372280419018e-05, + "loss": 0.2107, + "step": 1144 + }, + { + "epoch": 0.230908724561757, + "grad_norm": 0.06837344914674759, + "learning_rate": 9.234488315874295e-05, + "loss": 0.2317, + "step": 1146 + }, + { + "epoch": 0.231311706629055, + "grad_norm": 0.07493479549884796, + "learning_rate": 9.250604351329573e-05, + "loss": 0.2259, + "step": 1148 + }, + { + "epoch": 0.231714688696353, + "grad_norm": 0.08560243993997574, + "learning_rate": 9.266720386784852e-05, + "loss": 0.212, + "step": 1150 + }, + { + "epoch": 0.23211767076365103, + "grad_norm": 0.07562673836946487, + "learning_rate": 9.28283642224013e-05, + "loss": 0.221, + "step": 1152 + }, + { + "epoch": 0.23252065283094903, + "grad_norm": 0.10255958139896393, + "learning_rate": 9.298952457695407e-05, + "loss": 0.2069, + "step": 1154 + }, + { + "epoch": 0.23292363489824702, + "grad_norm": 0.06924106925725937, + "learning_rate": 9.315068493150684e-05, + "loss": 0.1942, + "step": 1156 + }, + { + "epoch": 0.23332661696554502, + "grad_norm": 0.08090320974588394, + "learning_rate": 9.331184528605963e-05, + "loss": 0.1634, + "step": 1158 + }, + { + "epoch": 0.23372959903284304, + "grad_norm": 0.05619840696454048, + "learning_rate": 9.347300564061241e-05, + "loss": 0.2432, + "step": 1160 + }, + { + "epoch": 0.23413258110014104, + "grad_norm": 0.0675722137093544, + "learning_rate": 9.36341659951652e-05, + "loss": 0.2034, + "step": 1162 + }, + { + "epoch": 0.23453556316743904, + "grad_norm": 0.07722295820713043, + "learning_rate": 9.379532634971797e-05, + "loss": 0.1589, + "step": 1164 + }, + { + "epoch": 0.23493854523473706, + "grad_norm": 0.06578662246465683, + "learning_rate": 9.395648670427075e-05, + "loss": 0.1686, + "step": 1166 + }, + { + "epoch": 0.23534152730203506, + "grad_norm": 0.08277074992656708, + "learning_rate": 9.411764705882353e-05, + "loss": 0.1611, + "step": 1168 + }, + { + "epoch": 0.23574450936933306, + "grad_norm": 0.07715737074613571, + "learning_rate": 9.427880741337632e-05, + "loss": 0.1441, + "step": 1170 + }, + { + "epoch": 0.23614749143663108, + "grad_norm": 0.08344750106334686, + "learning_rate": 9.443996776792909e-05, + "loss": 0.2076, + "step": 1172 + }, + { + "epoch": 0.23655047350392908, + "grad_norm": 0.07293462008237839, + "learning_rate": 9.460112812248187e-05, + "loss": 0.1415, + "step": 1174 + }, + { + "epoch": 0.23695345557122707, + "grad_norm": 0.08313830941915512, + "learning_rate": 9.476228847703466e-05, + "loss": 0.2245, + "step": 1176 + }, + { + "epoch": 0.2373564376385251, + "grad_norm": 0.08135011047124863, + "learning_rate": 9.492344883158744e-05, + "loss": 0.2603, + "step": 1178 + }, + { + "epoch": 0.2377594197058231, + "grad_norm": 0.090848907828331, + "learning_rate": 9.508460918614021e-05, + "loss": 0.2008, + "step": 1180 + }, + { + "epoch": 0.2381624017731211, + "grad_norm": 0.05730780214071274, + "learning_rate": 9.5245769540693e-05, + "loss": 0.2229, + "step": 1182 + }, + { + "epoch": 0.23856538384041912, + "grad_norm": 0.10784997791051865, + "learning_rate": 9.540692989524578e-05, + "loss": 0.2067, + "step": 1184 + }, + { + "epoch": 0.2389683659077171, + "grad_norm": 0.07142709195613861, + "learning_rate": 9.556809024979856e-05, + "loss": 0.2081, + "step": 1186 + }, + { + "epoch": 0.2393713479750151, + "grad_norm": 0.08427638560533524, + "learning_rate": 9.572925060435133e-05, + "loss": 0.1884, + "step": 1188 + }, + { + "epoch": 0.2397743300423131, + "grad_norm": 0.06093582883477211, + "learning_rate": 9.58904109589041e-05, + "loss": 0.2077, + "step": 1190 + }, + { + "epoch": 0.24017731210961113, + "grad_norm": 0.12947604060173035, + "learning_rate": 9.605157131345689e-05, + "loss": 0.2013, + "step": 1192 + }, + { + "epoch": 0.24058029417690913, + "grad_norm": 0.07346334308385849, + "learning_rate": 9.621273166800967e-05, + "loss": 0.2296, + "step": 1194 + }, + { + "epoch": 0.24098327624420712, + "grad_norm": 0.07245267927646637, + "learning_rate": 9.637389202256244e-05, + "loss": 0.2416, + "step": 1196 + }, + { + "epoch": 0.24138625831150515, + "grad_norm": 0.0768049955368042, + "learning_rate": 9.653505237711523e-05, + "loss": 0.1814, + "step": 1198 + }, + { + "epoch": 0.24178924037880314, + "grad_norm": 0.09695810824632645, + "learning_rate": 9.669621273166801e-05, + "loss": 0.2085, + "step": 1200 + }, + { + "epoch": 0.24219222244610114, + "grad_norm": 0.10410469025373459, + "learning_rate": 9.68573730862208e-05, + "loss": 0.1793, + "step": 1202 + }, + { + "epoch": 0.24259520451339917, + "grad_norm": 0.08499378710985184, + "learning_rate": 9.701853344077357e-05, + "loss": 0.1548, + "step": 1204 + }, + { + "epoch": 0.24299818658069716, + "grad_norm": 0.07553906738758087, + "learning_rate": 9.717969379532635e-05, + "loss": 0.1562, + "step": 1206 + }, + { + "epoch": 0.24340116864799516, + "grad_norm": 0.06801648437976837, + "learning_rate": 9.734085414987914e-05, + "loss": 0.1601, + "step": 1208 + }, + { + "epoch": 0.24380415071529318, + "grad_norm": 0.07008855044841766, + "learning_rate": 9.750201450443192e-05, + "loss": 0.2134, + "step": 1210 + }, + { + "epoch": 0.24420713278259118, + "grad_norm": 0.0757250189781189, + "learning_rate": 9.766317485898469e-05, + "loss": 0.2255, + "step": 1212 + }, + { + "epoch": 0.24461011484988918, + "grad_norm": 0.07179669290781021, + "learning_rate": 9.782433521353747e-05, + "loss": 0.1668, + "step": 1214 + }, + { + "epoch": 0.24501309691718717, + "grad_norm": 0.08034947514533997, + "learning_rate": 9.798549556809026e-05, + "loss": 0.2016, + "step": 1216 + }, + { + "epoch": 0.2454160789844852, + "grad_norm": 0.07972761243581772, + "learning_rate": 9.814665592264304e-05, + "loss": 0.2055, + "step": 1218 + }, + { + "epoch": 0.2458190610517832, + "grad_norm": 0.09259269386529922, + "learning_rate": 9.830781627719581e-05, + "loss": 0.2188, + "step": 1220 + }, + { + "epoch": 0.2462220431190812, + "grad_norm": 0.09266602993011475, + "learning_rate": 9.84689766317486e-05, + "loss": 0.1846, + "step": 1222 + }, + { + "epoch": 0.24662502518637922, + "grad_norm": 0.09400223940610886, + "learning_rate": 9.863013698630137e-05, + "loss": 0.2265, + "step": 1224 + }, + { + "epoch": 0.2470280072536772, + "grad_norm": 0.11411723494529724, + "learning_rate": 9.879129734085415e-05, + "loss": 0.2147, + "step": 1226 + }, + { + "epoch": 0.2474309893209752, + "grad_norm": 0.08758651465177536, + "learning_rate": 9.895245769540694e-05, + "loss": 0.1507, + "step": 1228 + }, + { + "epoch": 0.24783397138827323, + "grad_norm": 0.09494465589523315, + "learning_rate": 9.91136180499597e-05, + "loss": 0.2167, + "step": 1230 + }, + { + "epoch": 0.24823695345557123, + "grad_norm": 0.0853755846619606, + "learning_rate": 9.927477840451249e-05, + "loss": 0.2135, + "step": 1232 + }, + { + "epoch": 0.24863993552286923, + "grad_norm": 0.1234959214925766, + "learning_rate": 9.943593875906527e-05, + "loss": 0.2236, + "step": 1234 + }, + { + "epoch": 0.24904291759016725, + "grad_norm": 0.0904574766755104, + "learning_rate": 9.959709911361806e-05, + "loss": 0.2291, + "step": 1236 + }, + { + "epoch": 0.24944589965746525, + "grad_norm": 0.06450655311346054, + "learning_rate": 9.975825946817083e-05, + "loss": 0.2311, + "step": 1238 + }, + { + "epoch": 0.24984888172476324, + "grad_norm": 0.08350057154893875, + "learning_rate": 9.991941982272361e-05, + "loss": 0.2108, + "step": 1240 + }, + { + "epoch": 0.25025186379206127, + "grad_norm": 0.06631229817867279, + "learning_rate": 9.999999955601e-05, + "loss": 0.1971, + "step": 1242 + }, + { + "epoch": 0.25065484585935927, + "grad_norm": 0.09298577904701233, + "learning_rate": 9.999999600409e-05, + "loss": 0.2115, + "step": 1244 + }, + { + "epoch": 0.25105782792665726, + "grad_norm": 0.0637964978814125, + "learning_rate": 9.999998890025024e-05, + "loss": 0.2547, + "step": 1246 + }, + { + "epoch": 0.25146080999395526, + "grad_norm": 0.08004257082939148, + "learning_rate": 9.999997824449123e-05, + "loss": 0.2039, + "step": 1248 + }, + { + "epoch": 0.25186379206125326, + "grad_norm": 0.06306260079145432, + "learning_rate": 9.999996403681373e-05, + "loss": 0.2126, + "step": 1250 + }, + { + "epoch": 0.25226677412855125, + "grad_norm": 0.06148292124271393, + "learning_rate": 9.999994627721875e-05, + "loss": 0.1845, + "step": 1252 + }, + { + "epoch": 0.2526697561958493, + "grad_norm": 0.06277811527252197, + "learning_rate": 9.999992496570755e-05, + "loss": 0.1993, + "step": 1254 + }, + { + "epoch": 0.2530727382631473, + "grad_norm": 0.06535515934228897, + "learning_rate": 9.999990010228164e-05, + "loss": 0.1837, + "step": 1256 + }, + { + "epoch": 0.2534757203304453, + "grad_norm": 0.07780063897371292, + "learning_rate": 9.99998716869428e-05, + "loss": 0.1825, + "step": 1258 + }, + { + "epoch": 0.2538787023977433, + "grad_norm": 0.0873618796467781, + "learning_rate": 9.999983971969302e-05, + "loss": 0.2225, + "step": 1260 + }, + { + "epoch": 0.2542816844650413, + "grad_norm": 0.08165138959884644, + "learning_rate": 9.99998042005346e-05, + "loss": 0.2264, + "step": 1262 + }, + { + "epoch": 0.2546846665323393, + "grad_norm": 0.07608946412801743, + "learning_rate": 9.999976512947007e-05, + "loss": 0.2499, + "step": 1264 + }, + { + "epoch": 0.25508764859963734, + "grad_norm": 0.08392094075679779, + "learning_rate": 9.999972250650215e-05, + "loss": 0.1677, + "step": 1266 + }, + { + "epoch": 0.25549063066693534, + "grad_norm": 0.2400495409965515, + "learning_rate": 9.999967633163394e-05, + "loss": 0.1398, + "step": 1268 + }, + { + "epoch": 0.25589361273423333, + "grad_norm": 0.09449879825115204, + "learning_rate": 9.999962660486868e-05, + "loss": 0.2168, + "step": 1270 + }, + { + "epoch": 0.25629659480153133, + "grad_norm": 0.063988097012043, + "learning_rate": 9.999957332620989e-05, + "loss": 0.2091, + "step": 1272 + }, + { + "epoch": 0.2566995768688293, + "grad_norm": 0.27040895819664, + "learning_rate": 9.999951649566139e-05, + "loss": 0.1982, + "step": 1274 + }, + { + "epoch": 0.2571025589361273, + "grad_norm": 0.07855580747127533, + "learning_rate": 9.999945611322719e-05, + "loss": 0.2506, + "step": 1276 + }, + { + "epoch": 0.2575055410034253, + "grad_norm": 0.2895398437976837, + "learning_rate": 9.99993921789116e-05, + "loss": 0.2104, + "step": 1278 + }, + { + "epoch": 0.25790852307072337, + "grad_norm": 0.06772362440824509, + "learning_rate": 9.999932469271915e-05, + "loss": 0.2338, + "step": 1280 + }, + { + "epoch": 0.25831150513802137, + "grad_norm": 0.14966493844985962, + "learning_rate": 9.999925365465463e-05, + "loss": 0.2066, + "step": 1282 + }, + { + "epoch": 0.25871448720531937, + "grad_norm": 0.06578774750232697, + "learning_rate": 9.99991790647231e-05, + "loss": 0.1908, + "step": 1284 + }, + { + "epoch": 0.25911746927261736, + "grad_norm": 0.17319980263710022, + "learning_rate": 9.999910092292985e-05, + "loss": 0.1874, + "step": 1286 + }, + { + "epoch": 0.25952045133991536, + "grad_norm": 0.06658493727445602, + "learning_rate": 9.999901922928042e-05, + "loss": 0.2024, + "step": 1288 + }, + { + "epoch": 0.25992343340721336, + "grad_norm": 0.08135207742452621, + "learning_rate": 9.999893398378064e-05, + "loss": 0.2109, + "step": 1290 + }, + { + "epoch": 0.2603264154745114, + "grad_norm": 0.061190057545900345, + "learning_rate": 9.999884518643654e-05, + "loss": 0.1343, + "step": 1292 + }, + { + "epoch": 0.2607293975418094, + "grad_norm": 0.08722230792045593, + "learning_rate": 9.999875283725444e-05, + "loss": 0.2028, + "step": 1294 + }, + { + "epoch": 0.2611323796091074, + "grad_norm": 0.09891603142023087, + "learning_rate": 9.999865693624091e-05, + "loss": 0.1758, + "step": 1296 + }, + { + "epoch": 0.2615353616764054, + "grad_norm": 0.10503659397363663, + "learning_rate": 9.999855748340274e-05, + "loss": 0.1831, + "step": 1298 + }, + { + "epoch": 0.2619383437437034, + "grad_norm": 0.08791584521532059, + "learning_rate": 9.999845447874702e-05, + "loss": 0.2349, + "step": 1300 + }, + { + "epoch": 0.2623413258110014, + "grad_norm": 0.10015831142663956, + "learning_rate": 9.999834792228105e-05, + "loss": 0.2115, + "step": 1302 + }, + { + "epoch": 0.2627443078782994, + "grad_norm": 0.08647844940423965, + "learning_rate": 9.99982378140124e-05, + "loss": 0.1913, + "step": 1304 + }, + { + "epoch": 0.26314728994559744, + "grad_norm": 0.15039680898189545, + "learning_rate": 9.999812415394891e-05, + "loss": 0.2008, + "step": 1306 + }, + { + "epoch": 0.26355027201289544, + "grad_norm": 0.08669546246528625, + "learning_rate": 9.999800694209862e-05, + "loss": 0.2544, + "step": 1308 + }, + { + "epoch": 0.26395325408019343, + "grad_norm": 0.07216177880764008, + "learning_rate": 9.999788617846989e-05, + "loss": 0.1602, + "step": 1310 + }, + { + "epoch": 0.26435623614749143, + "grad_norm": 0.1571531444787979, + "learning_rate": 9.999776186307129e-05, + "loss": 0.2214, + "step": 1312 + }, + { + "epoch": 0.2647592182147894, + "grad_norm": 0.07496760785579681, + "learning_rate": 9.999763399591162e-05, + "loss": 0.2304, + "step": 1314 + }, + { + "epoch": 0.2651622002820874, + "grad_norm": 0.06391184777021408, + "learning_rate": 9.999750257700002e-05, + "loss": 0.2287, + "step": 1316 + }, + { + "epoch": 0.2655651823493855, + "grad_norm": 0.08557723462581635, + "learning_rate": 9.999736760634578e-05, + "loss": 0.2596, + "step": 1318 + }, + { + "epoch": 0.2659681644166835, + "grad_norm": 0.0653078481554985, + "learning_rate": 9.999722908395851e-05, + "loss": 0.193, + "step": 1320 + }, + { + "epoch": 0.26637114648398147, + "grad_norm": 0.07708454132080078, + "learning_rate": 9.999708700984804e-05, + "loss": 0.2117, + "step": 1322 + }, + { + "epoch": 0.26677412855127947, + "grad_norm": 0.07978586852550507, + "learning_rate": 9.999694138402448e-05, + "loss": 0.1499, + "step": 1324 + }, + { + "epoch": 0.26717711061857746, + "grad_norm": 0.18177969753742218, + "learning_rate": 9.999679220649815e-05, + "loss": 0.1992, + "step": 1326 + }, + { + "epoch": 0.26758009268587546, + "grad_norm": 0.08391742408275604, + "learning_rate": 9.999663947727966e-05, + "loss": 0.1551, + "step": 1328 + }, + { + "epoch": 0.2679830747531735, + "grad_norm": 0.06364039331674576, + "learning_rate": 9.999648319637986e-05, + "loss": 0.2108, + "step": 1330 + }, + { + "epoch": 0.2683860568204715, + "grad_norm": 0.08180107921361923, + "learning_rate": 9.999632336380986e-05, + "loss": 0.2298, + "step": 1332 + }, + { + "epoch": 0.2687890388877695, + "grad_norm": 0.08764016628265381, + "learning_rate": 9.999615997958101e-05, + "loss": 0.2359, + "step": 1334 + }, + { + "epoch": 0.2691920209550675, + "grad_norm": 0.07102248817682266, + "learning_rate": 9.999599304370489e-05, + "loss": 0.2487, + "step": 1336 + }, + { + "epoch": 0.2695950030223655, + "grad_norm": 0.07571367919445038, + "learning_rate": 9.99958225561934e-05, + "loss": 0.2138, + "step": 1338 + }, + { + "epoch": 0.2699979850896635, + "grad_norm": 0.09333647787570953, + "learning_rate": 9.999564851705862e-05, + "loss": 0.1944, + "step": 1340 + }, + { + "epoch": 0.2704009671569615, + "grad_norm": 0.06818639487028122, + "learning_rate": 9.999547092631293e-05, + "loss": 0.1859, + "step": 1342 + }, + { + "epoch": 0.27080394922425954, + "grad_norm": 0.07819613069295883, + "learning_rate": 9.999528978396895e-05, + "loss": 0.2134, + "step": 1344 + }, + { + "epoch": 0.27120693129155754, + "grad_norm": 0.07632673531770706, + "learning_rate": 9.999510509003953e-05, + "loss": 0.1645, + "step": 1346 + }, + { + "epoch": 0.27160991335885554, + "grad_norm": 0.08994800597429276, + "learning_rate": 9.99949168445378e-05, + "loss": 0.2005, + "step": 1348 + }, + { + "epoch": 0.27201289542615353, + "grad_norm": 0.07124733179807663, + "learning_rate": 9.999472504747714e-05, + "loss": 0.215, + "step": 1350 + }, + { + "epoch": 0.27241587749345153, + "grad_norm": 0.0745796486735344, + "learning_rate": 9.999452969887116e-05, + "loss": 0.2201, + "step": 1352 + }, + { + "epoch": 0.2728188595607495, + "grad_norm": 0.09241003543138504, + "learning_rate": 9.999433079873372e-05, + "loss": 0.2026, + "step": 1354 + }, + { + "epoch": 0.2732218416280476, + "grad_norm": 0.07417726516723633, + "learning_rate": 9.999412834707902e-05, + "loss": 0.1683, + "step": 1356 + }, + { + "epoch": 0.2736248236953456, + "grad_norm": 0.07573942840099335, + "learning_rate": 9.999392234392138e-05, + "loss": 0.2068, + "step": 1358 + }, + { + "epoch": 0.2740278057626436, + "grad_norm": 0.08692026883363724, + "learning_rate": 9.999371278927543e-05, + "loss": 0.23, + "step": 1360 + }, + { + "epoch": 0.27443078782994157, + "grad_norm": 0.07430460304021835, + "learning_rate": 9.99934996831561e-05, + "loss": 0.2022, + "step": 1362 + }, + { + "epoch": 0.27483376989723957, + "grad_norm": 0.09575946629047394, + "learning_rate": 9.99932830255785e-05, + "loss": 0.1642, + "step": 1364 + }, + { + "epoch": 0.27523675196453756, + "grad_norm": 0.11394454538822174, + "learning_rate": 9.999306281655803e-05, + "loss": 0.2194, + "step": 1366 + }, + { + "epoch": 0.27563973403183556, + "grad_norm": 0.09447552263736725, + "learning_rate": 9.99928390561103e-05, + "loss": 0.1799, + "step": 1368 + }, + { + "epoch": 0.2760427160991336, + "grad_norm": 0.09351827204227448, + "learning_rate": 9.999261174425127e-05, + "loss": 0.1997, + "step": 1370 + }, + { + "epoch": 0.2764456981664316, + "grad_norm": 0.06894346326589584, + "learning_rate": 9.999238088099704e-05, + "loss": 0.1359, + "step": 1372 + }, + { + "epoch": 0.2768486802337296, + "grad_norm": 0.10162418335676193, + "learning_rate": 9.999214646636404e-05, + "loss": 0.2037, + "step": 1374 + }, + { + "epoch": 0.2772516623010276, + "grad_norm": 0.08390204608440399, + "learning_rate": 9.999190850036889e-05, + "loss": 0.2224, + "step": 1376 + }, + { + "epoch": 0.2776546443683256, + "grad_norm": 0.06677880138158798, + "learning_rate": 9.99916669830285e-05, + "loss": 0.1331, + "step": 1378 + }, + { + "epoch": 0.2780576264356236, + "grad_norm": 0.06353922933340073, + "learning_rate": 9.999142191436004e-05, + "loss": 0.1537, + "step": 1380 + }, + { + "epoch": 0.27846060850292165, + "grad_norm": 0.079979807138443, + "learning_rate": 9.999117329438092e-05, + "loss": 0.2154, + "step": 1382 + }, + { + "epoch": 0.27886359057021964, + "grad_norm": 0.08919129520654678, + "learning_rate": 9.999092112310881e-05, + "loss": 0.2191, + "step": 1384 + }, + { + "epoch": 0.27926657263751764, + "grad_norm": 0.06999266147613525, + "learning_rate": 9.99906654005616e-05, + "loss": 0.2094, + "step": 1386 + }, + { + "epoch": 0.27966955470481564, + "grad_norm": 0.16508431732654572, + "learning_rate": 9.999040612675748e-05, + "loss": 0.2204, + "step": 1388 + }, + { + "epoch": 0.28007253677211363, + "grad_norm": 0.07799568772315979, + "learning_rate": 9.999014330171485e-05, + "loss": 0.1778, + "step": 1390 + }, + { + "epoch": 0.28047551883941163, + "grad_norm": 0.08466193079948425, + "learning_rate": 9.998987692545239e-05, + "loss": 0.1751, + "step": 1392 + }, + { + "epoch": 0.2808785009067096, + "grad_norm": 0.06041021645069122, + "learning_rate": 9.998960699798902e-05, + "loss": 0.2082, + "step": 1394 + }, + { + "epoch": 0.2812814829740077, + "grad_norm": 0.08678558468818665, + "learning_rate": 9.99893335193439e-05, + "loss": 0.2406, + "step": 1396 + }, + { + "epoch": 0.2816844650413057, + "grad_norm": 0.07207660377025604, + "learning_rate": 9.998905648953649e-05, + "loss": 0.1713, + "step": 1398 + }, + { + "epoch": 0.2820874471086037, + "grad_norm": 0.0904468297958374, + "learning_rate": 9.998877590858646e-05, + "loss": 0.18, + "step": 1400 + }, + { + "epoch": 0.28249042917590167, + "grad_norm": 0.09611000120639801, + "learning_rate": 9.998849177651371e-05, + "loss": 0.218, + "step": 1402 + }, + { + "epoch": 0.28289341124319967, + "grad_norm": 0.07803361862897873, + "learning_rate": 9.998820409333847e-05, + "loss": 0.1766, + "step": 1404 + }, + { + "epoch": 0.28329639331049766, + "grad_norm": 0.07811973243951797, + "learning_rate": 9.998791285908115e-05, + "loss": 0.1674, + "step": 1406 + }, + { + "epoch": 0.2836993753777957, + "grad_norm": 0.0652318000793457, + "learning_rate": 9.998761807376245e-05, + "loss": 0.2013, + "step": 1408 + }, + { + "epoch": 0.2841023574450937, + "grad_norm": 0.08281879127025604, + "learning_rate": 9.998731973740329e-05, + "loss": 0.2161, + "step": 1410 + }, + { + "epoch": 0.2845053395123917, + "grad_norm": 0.06643449515104294, + "learning_rate": 9.998701785002489e-05, + "loss": 0.1932, + "step": 1412 + }, + { + "epoch": 0.2849083215796897, + "grad_norm": 0.06392282247543335, + "learning_rate": 9.998671241164868e-05, + "loss": 0.1612, + "step": 1414 + }, + { + "epoch": 0.2853113036469877, + "grad_norm": 0.06907442212104797, + "learning_rate": 9.998640342229636e-05, + "loss": 0.2002, + "step": 1416 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.07263526320457458, + "learning_rate": 9.998609088198988e-05, + "loss": 0.2152, + "step": 1418 + }, + { + "epoch": 0.2861172677815837, + "grad_norm": 0.058846525847911835, + "learning_rate": 9.998577479075145e-05, + "loss": 0.1925, + "step": 1420 + }, + { + "epoch": 0.28652024984888175, + "grad_norm": 0.05752931907773018, + "learning_rate": 9.998545514860352e-05, + "loss": 0.1744, + "step": 1422 + }, + { + "epoch": 0.28692323191617974, + "grad_norm": 0.07596255093812943, + "learning_rate": 9.99851319555688e-05, + "loss": 0.1915, + "step": 1424 + }, + { + "epoch": 0.28732621398347774, + "grad_norm": 0.08544960618019104, + "learning_rate": 9.998480521167025e-05, + "loss": 0.1749, + "step": 1426 + }, + { + "epoch": 0.28772919605077574, + "grad_norm": 0.07587330788373947, + "learning_rate": 9.998447491693105e-05, + "loss": 0.1518, + "step": 1428 + }, + { + "epoch": 0.28813217811807373, + "grad_norm": 0.07118236273527145, + "learning_rate": 9.998414107137471e-05, + "loss": 0.1754, + "step": 1430 + }, + { + "epoch": 0.28853516018537173, + "grad_norm": 0.06920721381902695, + "learning_rate": 9.998380367502493e-05, + "loss": 0.1693, + "step": 1432 + }, + { + "epoch": 0.2889381422526698, + "grad_norm": 0.07532205432653427, + "learning_rate": 9.998346272790567e-05, + "loss": 0.2045, + "step": 1434 + }, + { + "epoch": 0.2893411243199678, + "grad_norm": 0.07022588700056076, + "learning_rate": 9.998311823004114e-05, + "loss": 0.1946, + "step": 1436 + }, + { + "epoch": 0.2897441063872658, + "grad_norm": 0.0767081007361412, + "learning_rate": 9.998277018145585e-05, + "loss": 0.1599, + "step": 1438 + }, + { + "epoch": 0.2901470884545638, + "grad_norm": 0.09657610207796097, + "learning_rate": 9.998241858217449e-05, + "loss": 0.1972, + "step": 1440 + }, + { + "epoch": 0.29055007052186177, + "grad_norm": 0.06490109115839005, + "learning_rate": 9.998206343222205e-05, + "loss": 0.2065, + "step": 1442 + }, + { + "epoch": 0.29095305258915977, + "grad_norm": 0.09028349071741104, + "learning_rate": 9.998170473162376e-05, + "loss": 0.187, + "step": 1444 + }, + { + "epoch": 0.29135603465645776, + "grad_norm": 0.08025387674570084, + "learning_rate": 9.99813424804051e-05, + "loss": 0.2205, + "step": 1446 + }, + { + "epoch": 0.2917590167237558, + "grad_norm": 0.08773567527532578, + "learning_rate": 9.99809766785918e-05, + "loss": 0.2921, + "step": 1448 + }, + { + "epoch": 0.2921619987910538, + "grad_norm": 0.07190251350402832, + "learning_rate": 9.998060732620985e-05, + "loss": 0.2012, + "step": 1450 + }, + { + "epoch": 0.2925649808583518, + "grad_norm": 0.08821823447942734, + "learning_rate": 9.998023442328549e-05, + "loss": 0.1774, + "step": 1452 + }, + { + "epoch": 0.2929679629256498, + "grad_norm": 0.07339081913232803, + "learning_rate": 9.99798579698452e-05, + "loss": 0.2081, + "step": 1454 + }, + { + "epoch": 0.2933709449929478, + "grad_norm": 0.07733795791864395, + "learning_rate": 9.997947796591573e-05, + "loss": 0.2352, + "step": 1456 + }, + { + "epoch": 0.2937739270602458, + "grad_norm": 0.061597902327775955, + "learning_rate": 9.99790944115241e-05, + "loss": 0.2202, + "step": 1458 + }, + { + "epoch": 0.29417690912754385, + "grad_norm": 0.06928958743810654, + "learning_rate": 9.997870730669752e-05, + "loss": 0.2337, + "step": 1460 + }, + { + "epoch": 0.29457989119484185, + "grad_norm": 0.07841575145721436, + "learning_rate": 9.997831665146348e-05, + "loss": 0.2247, + "step": 1462 + }, + { + "epoch": 0.29498287326213984, + "grad_norm": 0.08794905245304108, + "learning_rate": 9.997792244584978e-05, + "loss": 0.2142, + "step": 1464 + }, + { + "epoch": 0.29538585532943784, + "grad_norm": 0.09079600870609283, + "learning_rate": 9.997752468988439e-05, + "loss": 0.2001, + "step": 1466 + }, + { + "epoch": 0.29578883739673584, + "grad_norm": 0.06976237893104553, + "learning_rate": 9.997712338359557e-05, + "loss": 0.1637, + "step": 1468 + }, + { + "epoch": 0.29619181946403383, + "grad_norm": 0.06324432045221329, + "learning_rate": 9.997671852701185e-05, + "loss": 0.1678, + "step": 1470 + }, + { + "epoch": 0.29659480153133183, + "grad_norm": 0.05822910740971565, + "learning_rate": 9.997631012016195e-05, + "loss": 0.1582, + "step": 1472 + }, + { + "epoch": 0.2969977835986299, + "grad_norm": 0.07130776345729828, + "learning_rate": 9.997589816307491e-05, + "loss": 0.1777, + "step": 1474 + }, + { + "epoch": 0.2974007656659279, + "grad_norm": 0.0914449617266655, + "learning_rate": 9.997548265577998e-05, + "loss": 0.2279, + "step": 1476 + }, + { + "epoch": 0.2978037477332259, + "grad_norm": 0.07395707070827484, + "learning_rate": 9.99750635983067e-05, + "loss": 0.1786, + "step": 1478 + }, + { + "epoch": 0.2982067298005239, + "grad_norm": 0.0627974271774292, + "learning_rate": 9.997464099068484e-05, + "loss": 0.2187, + "step": 1480 + }, + { + "epoch": 0.29860971186782187, + "grad_norm": 0.08622007071971893, + "learning_rate": 9.997421483294438e-05, + "loss": 0.2272, + "step": 1482 + }, + { + "epoch": 0.29901269393511987, + "grad_norm": 0.05741937831044197, + "learning_rate": 9.997378512511561e-05, + "loss": 0.1968, + "step": 1484 + }, + { + "epoch": 0.2994156760024179, + "grad_norm": 0.07579632848501205, + "learning_rate": 9.997335186722909e-05, + "loss": 0.2003, + "step": 1486 + }, + { + "epoch": 0.2998186580697159, + "grad_norm": 0.06345254182815552, + "learning_rate": 9.997291505931558e-05, + "loss": 0.2027, + "step": 1488 + }, + { + "epoch": 0.3002216401370139, + "grad_norm": 0.05749038606882095, + "learning_rate": 9.997247470140608e-05, + "loss": 0.1728, + "step": 1490 + }, + { + "epoch": 0.3006246222043119, + "grad_norm": 0.06285730749368668, + "learning_rate": 9.99720307935319e-05, + "loss": 0.1839, + "step": 1492 + }, + { + "epoch": 0.3010276042716099, + "grad_norm": 0.08099797368049622, + "learning_rate": 9.99715833357246e-05, + "loss": 0.2482, + "step": 1494 + }, + { + "epoch": 0.3014305863389079, + "grad_norm": 0.06453213840723038, + "learning_rate": 9.997113232801592e-05, + "loss": 0.1932, + "step": 1496 + }, + { + "epoch": 0.3018335684062059, + "grad_norm": 0.07795906066894531, + "learning_rate": 9.99706777704379e-05, + "loss": 0.2067, + "step": 1498 + }, + { + "epoch": 0.30223655047350395, + "grad_norm": 0.06192241236567497, + "learning_rate": 9.997021966302287e-05, + "loss": 0.2083, + "step": 1500 + }, + { + "epoch": 0.30263953254080195, + "grad_norm": 0.07832437753677368, + "learning_rate": 9.996975800580333e-05, + "loss": 0.2086, + "step": 1502 + }, + { + "epoch": 0.30304251460809994, + "grad_norm": 0.06281822919845581, + "learning_rate": 9.996929279881211e-05, + "loss": 0.1819, + "step": 1504 + }, + { + "epoch": 0.30344549667539794, + "grad_norm": 0.08558258414268494, + "learning_rate": 9.996882404208224e-05, + "loss": 0.1979, + "step": 1506 + }, + { + "epoch": 0.30384847874269594, + "grad_norm": 0.07973451912403107, + "learning_rate": 9.996835173564703e-05, + "loss": 0.1916, + "step": 1508 + }, + { + "epoch": 0.30425146080999393, + "grad_norm": 0.09966539591550827, + "learning_rate": 9.996787587954001e-05, + "loss": 0.212, + "step": 1510 + }, + { + "epoch": 0.304654442877292, + "grad_norm": 0.06780754774808884, + "learning_rate": 9.996739647379501e-05, + "loss": 0.1834, + "step": 1512 + }, + { + "epoch": 0.30505742494459, + "grad_norm": 0.054154492914676666, + "learning_rate": 9.996691351844608e-05, + "loss": 0.1977, + "step": 1514 + }, + { + "epoch": 0.305460407011888, + "grad_norm": 0.08135867863893509, + "learning_rate": 9.996642701352752e-05, + "loss": 0.2166, + "step": 1516 + }, + { + "epoch": 0.305863389079186, + "grad_norm": 0.06651759892702103, + "learning_rate": 9.99659369590739e-05, + "loss": 0.1611, + "step": 1518 + }, + { + "epoch": 0.306266371146484, + "grad_norm": 0.07462992519140244, + "learning_rate": 9.996544335512001e-05, + "loss": 0.2129, + "step": 1520 + }, + { + "epoch": 0.30666935321378197, + "grad_norm": 0.0842273160815239, + "learning_rate": 9.996494620170094e-05, + "loss": 0.1879, + "step": 1522 + }, + { + "epoch": 0.30707233528107997, + "grad_norm": 0.06992574781179428, + "learning_rate": 9.9964445498852e-05, + "loss": 0.2261, + "step": 1524 + }, + { + "epoch": 0.307475317348378, + "grad_norm": 0.1636413186788559, + "learning_rate": 9.996394124660876e-05, + "loss": 0.2208, + "step": 1526 + }, + { + "epoch": 0.307878299415676, + "grad_norm": 0.07403954118490219, + "learning_rate": 9.996343344500705e-05, + "loss": 0.1993, + "step": 1528 + }, + { + "epoch": 0.308281281482974, + "grad_norm": 0.058403730392456055, + "learning_rate": 9.996292209408291e-05, + "loss": 0.2463, + "step": 1530 + }, + { + "epoch": 0.308684263550272, + "grad_norm": 0.06864384561777115, + "learning_rate": 9.996240719387271e-05, + "loss": 0.2136, + "step": 1532 + }, + { + "epoch": 0.30908724561757, + "grad_norm": 0.1078968495130539, + "learning_rate": 9.996188874441298e-05, + "loss": 0.2107, + "step": 1534 + }, + { + "epoch": 0.309490227684868, + "grad_norm": 0.06026478856801987, + "learning_rate": 9.996136674574059e-05, + "loss": 0.1582, + "step": 1536 + }, + { + "epoch": 0.30989320975216605, + "grad_norm": 0.059033844619989395, + "learning_rate": 9.996084119789262e-05, + "loss": 0.1873, + "step": 1538 + }, + { + "epoch": 0.31029619181946405, + "grad_norm": 0.05242142453789711, + "learning_rate": 9.996031210090637e-05, + "loss": 0.2194, + "step": 1540 + }, + { + "epoch": 0.31069917388676205, + "grad_norm": 0.07254286110401154, + "learning_rate": 9.995977945481946e-05, + "loss": 0.2227, + "step": 1542 + }, + { + "epoch": 0.31110215595406004, + "grad_norm": 0.07154665887355804, + "learning_rate": 9.995924325966973e-05, + "loss": 0.178, + "step": 1544 + }, + { + "epoch": 0.31150513802135804, + "grad_norm": 0.08451339602470398, + "learning_rate": 9.995870351549523e-05, + "loss": 0.2361, + "step": 1546 + }, + { + "epoch": 0.31190812008865604, + "grad_norm": 0.06354124844074249, + "learning_rate": 9.995816022233435e-05, + "loss": 0.1935, + "step": 1548 + }, + { + "epoch": 0.31231110215595403, + "grad_norm": 0.05122746527194977, + "learning_rate": 9.995761338022566e-05, + "loss": 0.1893, + "step": 1550 + }, + { + "epoch": 0.3127140842232521, + "grad_norm": 0.06881532818078995, + "learning_rate": 9.9957062989208e-05, + "loss": 0.1638, + "step": 1552 + }, + { + "epoch": 0.3131170662905501, + "grad_norm": 0.06362321227788925, + "learning_rate": 9.99565090493205e-05, + "loss": 0.1889, + "step": 1554 + }, + { + "epoch": 0.3135200483578481, + "grad_norm": 0.07629745453596115, + "learning_rate": 9.995595156060246e-05, + "loss": 0.2519, + "step": 1556 + }, + { + "epoch": 0.3139230304251461, + "grad_norm": 0.05831436440348625, + "learning_rate": 9.995539052309353e-05, + "loss": 0.1613, + "step": 1558 + }, + { + "epoch": 0.3143260124924441, + "grad_norm": 0.08071257919073105, + "learning_rate": 9.995482593683356e-05, + "loss": 0.2155, + "step": 1560 + }, + { + "epoch": 0.31472899455974207, + "grad_norm": 0.06999702006578445, + "learning_rate": 9.995425780186263e-05, + "loss": 0.2026, + "step": 1562 + }, + { + "epoch": 0.3151319766270401, + "grad_norm": 0.08394176512956619, + "learning_rate": 9.995368611822113e-05, + "loss": 0.2288, + "step": 1564 + }, + { + "epoch": 0.3155349586943381, + "grad_norm": 0.0686328262090683, + "learning_rate": 9.995311088594966e-05, + "loss": 0.1879, + "step": 1566 + }, + { + "epoch": 0.3159379407616361, + "grad_norm": 0.06367763131856918, + "learning_rate": 9.995253210508906e-05, + "loss": 0.198, + "step": 1568 + }, + { + "epoch": 0.3163409228289341, + "grad_norm": 0.0799856036901474, + "learning_rate": 9.995194977568047e-05, + "loss": 0.2044, + "step": 1570 + }, + { + "epoch": 0.3167439048962321, + "grad_norm": 0.059465620666742325, + "learning_rate": 9.995136389776527e-05, + "loss": 0.2186, + "step": 1572 + }, + { + "epoch": 0.3171468869635301, + "grad_norm": 0.07960893958806992, + "learning_rate": 9.995077447138506e-05, + "loss": 0.2338, + "step": 1574 + }, + { + "epoch": 0.3175498690308281, + "grad_norm": 0.05532738193869591, + "learning_rate": 9.995018149658171e-05, + "loss": 0.1919, + "step": 1576 + }, + { + "epoch": 0.31795285109812615, + "grad_norm": 0.06862778216600418, + "learning_rate": 9.994958497339735e-05, + "loss": 0.165, + "step": 1578 + }, + { + "epoch": 0.31835583316542415, + "grad_norm": 0.07541234791278839, + "learning_rate": 9.994898490187434e-05, + "loss": 0.211, + "step": 1580 + }, + { + "epoch": 0.31875881523272215, + "grad_norm": 0.05714013800024986, + "learning_rate": 9.994838128205535e-05, + "loss": 0.2176, + "step": 1582 + }, + { + "epoch": 0.31916179730002014, + "grad_norm": 0.08829407393932343, + "learning_rate": 9.994777411398323e-05, + "loss": 0.1978, + "step": 1584 + }, + { + "epoch": 0.31956477936731814, + "grad_norm": 0.11317098140716553, + "learning_rate": 9.994716339770111e-05, + "loss": 0.2177, + "step": 1586 + }, + { + "epoch": 0.31996776143461614, + "grad_norm": 0.05632421001791954, + "learning_rate": 9.994654913325239e-05, + "loss": 0.1938, + "step": 1588 + }, + { + "epoch": 0.3203707435019142, + "grad_norm": 0.1002504974603653, + "learning_rate": 9.994593132068068e-05, + "loss": 0.1982, + "step": 1590 + }, + { + "epoch": 0.3207737255692122, + "grad_norm": 0.07449984550476074, + "learning_rate": 9.99453099600299e-05, + "loss": 0.1514, + "step": 1592 + }, + { + "epoch": 0.3211767076365102, + "grad_norm": 0.06601531058549881, + "learning_rate": 9.994468505134417e-05, + "loss": 0.2112, + "step": 1594 + }, + { + "epoch": 0.3215796897038082, + "grad_norm": 0.08076255768537521, + "learning_rate": 9.994405659466791e-05, + "loss": 0.2217, + "step": 1596 + }, + { + "epoch": 0.3219826717711062, + "grad_norm": 0.06299552321434021, + "learning_rate": 9.994342459004571e-05, + "loss": 0.2193, + "step": 1598 + }, + { + "epoch": 0.3223856538384042, + "grad_norm": 0.07977569848299026, + "learning_rate": 9.994278903752252e-05, + "loss": 0.1955, + "step": 1600 + }, + { + "epoch": 0.32278863590570217, + "grad_norm": 0.06248839944601059, + "learning_rate": 9.994214993714346e-05, + "loss": 0.1976, + "step": 1602 + }, + { + "epoch": 0.3231916179730002, + "grad_norm": 0.09939960390329361, + "learning_rate": 9.994150728895394e-05, + "loss": 0.1709, + "step": 1604 + }, + { + "epoch": 0.3235946000402982, + "grad_norm": 0.07984374463558197, + "learning_rate": 9.994086109299961e-05, + "loss": 0.2249, + "step": 1606 + }, + { + "epoch": 0.3239975821075962, + "grad_norm": 0.08359546959400177, + "learning_rate": 9.994021134932638e-05, + "loss": 0.234, + "step": 1608 + }, + { + "epoch": 0.3244005641748942, + "grad_norm": 0.05540497601032257, + "learning_rate": 9.993955805798041e-05, + "loss": 0.1827, + "step": 1610 + }, + { + "epoch": 0.3248035462421922, + "grad_norm": 0.06334353983402252, + "learning_rate": 9.993890121900809e-05, + "loss": 0.1989, + "step": 1612 + }, + { + "epoch": 0.3252065283094902, + "grad_norm": 0.0763341635465622, + "learning_rate": 9.99382408324561e-05, + "loss": 0.1968, + "step": 1614 + }, + { + "epoch": 0.32560951037678826, + "grad_norm": 0.0655263289809227, + "learning_rate": 9.993757689837135e-05, + "loss": 0.1874, + "step": 1616 + }, + { + "epoch": 0.32601249244408625, + "grad_norm": 0.06662862002849579, + "learning_rate": 9.993690941680098e-05, + "loss": 0.1996, + "step": 1618 + }, + { + "epoch": 0.32641547451138425, + "grad_norm": 0.06894282251596451, + "learning_rate": 9.993623838779244e-05, + "loss": 0.2167, + "step": 1620 + }, + { + "epoch": 0.32681845657868225, + "grad_norm": 0.10926174372434616, + "learning_rate": 9.993556381139339e-05, + "loss": 0.2293, + "step": 1622 + }, + { + "epoch": 0.32722143864598024, + "grad_norm": 0.057909030467271805, + "learning_rate": 9.993488568765175e-05, + "loss": 0.192, + "step": 1624 + }, + { + "epoch": 0.32762442071327824, + "grad_norm": 0.06387782841920853, + "learning_rate": 9.993420401661569e-05, + "loss": 0.1674, + "step": 1626 + }, + { + "epoch": 0.32802740278057624, + "grad_norm": 0.07918224483728409, + "learning_rate": 9.993351879833363e-05, + "loss": 0.1965, + "step": 1628 + }, + { + "epoch": 0.3284303848478743, + "grad_norm": 0.07043313980102539, + "learning_rate": 9.993283003285425e-05, + "loss": 0.1587, + "step": 1630 + }, + { + "epoch": 0.3288333669151723, + "grad_norm": 0.06445840746164322, + "learning_rate": 9.993213772022648e-05, + "loss": 0.1999, + "step": 1632 + }, + { + "epoch": 0.3292363489824703, + "grad_norm": 0.07198817282915115, + "learning_rate": 9.99314418604995e-05, + "loss": 0.224, + "step": 1634 + }, + { + "epoch": 0.3296393310497683, + "grad_norm": 0.062218908220529556, + "learning_rate": 9.993074245372276e-05, + "loss": 0.2172, + "step": 1636 + }, + { + "epoch": 0.3300423131170663, + "grad_norm": 0.09242686629295349, + "learning_rate": 9.99300394999459e-05, + "loss": 0.1877, + "step": 1638 + }, + { + "epoch": 0.3304452951843643, + "grad_norm": 0.05355711653828621, + "learning_rate": 9.992933299921891e-05, + "loss": 0.2071, + "step": 1640 + }, + { + "epoch": 0.3308482772516623, + "grad_norm": 0.05850759148597717, + "learning_rate": 9.992862295159193e-05, + "loss": 0.157, + "step": 1642 + }, + { + "epoch": 0.3312512593189603, + "grad_norm": 0.06185588613152504, + "learning_rate": 9.992790935711544e-05, + "loss": 0.1621, + "step": 1644 + }, + { + "epoch": 0.3316542413862583, + "grad_norm": 0.07613895833492279, + "learning_rate": 9.992719221584012e-05, + "loss": 0.2315, + "step": 1646 + }, + { + "epoch": 0.3320572234535563, + "grad_norm": 0.09143196791410446, + "learning_rate": 9.99264715278169e-05, + "loss": 0.2347, + "step": 1648 + }, + { + "epoch": 0.3324602055208543, + "grad_norm": 0.08874918520450592, + "learning_rate": 9.992574729309701e-05, + "loss": 0.2206, + "step": 1650 + }, + { + "epoch": 0.3328631875881523, + "grad_norm": 0.11054286360740662, + "learning_rate": 9.992501951173186e-05, + "loss": 0.2128, + "step": 1652 + }, + { + "epoch": 0.3332661696554503, + "grad_norm": 0.05944681912660599, + "learning_rate": 9.992428818377318e-05, + "loss": 0.1919, + "step": 1654 + }, + { + "epoch": 0.33366915172274836, + "grad_norm": 0.09661544114351273, + "learning_rate": 9.992355330927288e-05, + "loss": 0.2246, + "step": 1656 + }, + { + "epoch": 0.33407213379004636, + "grad_norm": 0.05847775191068649, + "learning_rate": 9.992281488828322e-05, + "loss": 0.1888, + "step": 1658 + }, + { + "epoch": 0.33447511585734435, + "grad_norm": 0.05200980231165886, + "learning_rate": 9.992207292085662e-05, + "loss": 0.147, + "step": 1660 + }, + { + "epoch": 0.33487809792464235, + "grad_norm": 0.07436300069093704, + "learning_rate": 9.99213274070458e-05, + "loss": 0.1989, + "step": 1662 + }, + { + "epoch": 0.33528107999194035, + "grad_norm": 0.10399733483791351, + "learning_rate": 9.992057834690373e-05, + "loss": 0.2312, + "step": 1664 + }, + { + "epoch": 0.33568406205923834, + "grad_norm": 0.05904490128159523, + "learning_rate": 9.99198257404836e-05, + "loss": 0.1982, + "step": 1666 + }, + { + "epoch": 0.3360870441265364, + "grad_norm": 0.08844948559999466, + "learning_rate": 9.991906958783887e-05, + "loss": 0.1843, + "step": 1668 + }, + { + "epoch": 0.3364900261938344, + "grad_norm": 0.06130826473236084, + "learning_rate": 9.991830988902328e-05, + "loss": 0.2058, + "step": 1670 + }, + { + "epoch": 0.3368930082611324, + "grad_norm": 0.06297429651021957, + "learning_rate": 9.99175466440908e-05, + "loss": 0.1825, + "step": 1672 + }, + { + "epoch": 0.3372959903284304, + "grad_norm": 0.058773405849933624, + "learning_rate": 9.991677985309563e-05, + "loss": 0.245, + "step": 1674 + }, + { + "epoch": 0.3376989723957284, + "grad_norm": 0.06418730318546295, + "learning_rate": 9.991600951609226e-05, + "loss": 0.2072, + "step": 1676 + }, + { + "epoch": 0.3381019544630264, + "grad_norm": 0.07122557610273361, + "learning_rate": 9.991523563313538e-05, + "loss": 0.1764, + "step": 1678 + }, + { + "epoch": 0.3385049365303244, + "grad_norm": 0.08291307836771011, + "learning_rate": 9.991445820428e-05, + "loss": 0.1887, + "step": 1680 + }, + { + "epoch": 0.3389079185976224, + "grad_norm": 0.09296026825904846, + "learning_rate": 9.991367722958134e-05, + "loss": 0.2177, + "step": 1682 + }, + { + "epoch": 0.3393109006649204, + "grad_norm": 0.08643902093172073, + "learning_rate": 9.991289270909488e-05, + "loss": 0.1744, + "step": 1684 + }, + { + "epoch": 0.3397138827322184, + "grad_norm": 0.06946311146020889, + "learning_rate": 9.991210464287633e-05, + "loss": 0.237, + "step": 1686 + }, + { + "epoch": 0.3401168647995164, + "grad_norm": 0.06480636447668076, + "learning_rate": 9.99113130309817e-05, + "loss": 0.2472, + "step": 1688 + }, + { + "epoch": 0.3405198468668144, + "grad_norm": 0.0569583997130394, + "learning_rate": 9.991051787346721e-05, + "loss": 0.2188, + "step": 1690 + }, + { + "epoch": 0.3409228289341124, + "grad_norm": 0.06187818944454193, + "learning_rate": 9.990971917038933e-05, + "loss": 0.2236, + "step": 1692 + }, + { + "epoch": 0.34132581100141046, + "grad_norm": 0.07523096352815628, + "learning_rate": 9.990891692180485e-05, + "loss": 0.233, + "step": 1694 + }, + { + "epoch": 0.34172879306870846, + "grad_norm": 0.06507721543312073, + "learning_rate": 9.990811112777072e-05, + "loss": 0.1959, + "step": 1696 + }, + { + "epoch": 0.34213177513600646, + "grad_norm": 0.08201830089092255, + "learning_rate": 9.99073017883442e-05, + "loss": 0.1792, + "step": 1698 + }, + { + "epoch": 0.34253475720330445, + "grad_norm": 0.06180296465754509, + "learning_rate": 9.990648890358277e-05, + "loss": 0.18, + "step": 1700 + }, + { + "epoch": 0.34293773927060245, + "grad_norm": 0.09496378898620605, + "learning_rate": 9.990567247354416e-05, + "loss": 0.2402, + "step": 1702 + }, + { + "epoch": 0.34334072133790045, + "grad_norm": 0.06735090166330338, + "learning_rate": 9.990485249828641e-05, + "loss": 0.1585, + "step": 1704 + }, + { + "epoch": 0.34374370340519844, + "grad_norm": 0.060190655291080475, + "learning_rate": 9.990402897786775e-05, + "loss": 0.22, + "step": 1706 + }, + { + "epoch": 0.3441466854724965, + "grad_norm": 0.08018799871206284, + "learning_rate": 9.990320191234667e-05, + "loss": 0.2305, + "step": 1708 + }, + { + "epoch": 0.3445496675397945, + "grad_norm": 0.09596269577741623, + "learning_rate": 9.990237130178194e-05, + "loss": 0.2066, + "step": 1710 + }, + { + "epoch": 0.3449526496070925, + "grad_norm": 0.07448262721300125, + "learning_rate": 9.990153714623257e-05, + "loss": 0.1807, + "step": 1712 + }, + { + "epoch": 0.3453556316743905, + "grad_norm": 0.07886528223752975, + "learning_rate": 9.99006994457578e-05, + "loss": 0.2259, + "step": 1714 + }, + { + "epoch": 0.3457586137416885, + "grad_norm": 0.13016141951084137, + "learning_rate": 9.989985820041714e-05, + "loss": 0.2409, + "step": 1716 + }, + { + "epoch": 0.3461615958089865, + "grad_norm": 0.06906406581401825, + "learning_rate": 9.989901341027037e-05, + "loss": 0.1858, + "step": 1718 + }, + { + "epoch": 0.34656457787628453, + "grad_norm": 0.09606174379587173, + "learning_rate": 9.989816507537748e-05, + "loss": 0.2201, + "step": 1720 + }, + { + "epoch": 0.3469675599435825, + "grad_norm": 0.0815054252743721, + "learning_rate": 9.989731319579873e-05, + "loss": 0.1897, + "step": 1722 + }, + { + "epoch": 0.3473705420108805, + "grad_norm": 0.08122877776622772, + "learning_rate": 9.989645777159467e-05, + "loss": 0.1985, + "step": 1724 + }, + { + "epoch": 0.3477735240781785, + "grad_norm": 0.06756141036748886, + "learning_rate": 9.989559880282604e-05, + "loss": 0.1802, + "step": 1726 + }, + { + "epoch": 0.3481765061454765, + "grad_norm": 0.1047460213303566, + "learning_rate": 9.989473628955387e-05, + "loss": 0.1752, + "step": 1728 + }, + { + "epoch": 0.3485794882127745, + "grad_norm": 0.08227745443582535, + "learning_rate": 9.989387023183943e-05, + "loss": 0.1597, + "step": 1730 + }, + { + "epoch": 0.3489824702800725, + "grad_norm": 0.0924132764339447, + "learning_rate": 9.989300062974424e-05, + "loss": 0.1911, + "step": 1732 + }, + { + "epoch": 0.34938545234737056, + "grad_norm": 0.05753399431705475, + "learning_rate": 9.989212748333008e-05, + "loss": 0.1474, + "step": 1734 + }, + { + "epoch": 0.34978843441466856, + "grad_norm": 0.10338200628757477, + "learning_rate": 9.989125079265896e-05, + "loss": 0.1717, + "step": 1736 + }, + { + "epoch": 0.35019141648196656, + "grad_norm": 0.0935504212975502, + "learning_rate": 9.989037055779318e-05, + "loss": 0.2091, + "step": 1738 + }, + { + "epoch": 0.35059439854926455, + "grad_norm": 0.06872272491455078, + "learning_rate": 9.988948677879528e-05, + "loss": 0.1865, + "step": 1740 + }, + { + "epoch": 0.35099738061656255, + "grad_norm": 0.08346632122993469, + "learning_rate": 9.988859945572802e-05, + "loss": 0.1935, + "step": 1742 + }, + { + "epoch": 0.35140036268386055, + "grad_norm": 0.07062527537345886, + "learning_rate": 9.988770858865441e-05, + "loss": 0.2459, + "step": 1744 + }, + { + "epoch": 0.3518033447511586, + "grad_norm": 0.08531786501407623, + "learning_rate": 9.98868141776378e-05, + "loss": 0.2299, + "step": 1746 + }, + { + "epoch": 0.3522063268184566, + "grad_norm": 0.05672174692153931, + "learning_rate": 9.988591622274169e-05, + "loss": 0.164, + "step": 1748 + }, + { + "epoch": 0.3526093088857546, + "grad_norm": 0.05787867307662964, + "learning_rate": 9.988501472402984e-05, + "loss": 0.2275, + "step": 1750 + }, + { + "epoch": 0.3530122909530526, + "grad_norm": 0.09398354589939117, + "learning_rate": 9.988410968156637e-05, + "loss": 0.2173, + "step": 1752 + }, + { + "epoch": 0.3534152730203506, + "grad_norm": 0.06995268166065216, + "learning_rate": 9.988320109541549e-05, + "loss": 0.2, + "step": 1754 + }, + { + "epoch": 0.3538182550876486, + "grad_norm": 0.061387669295072556, + "learning_rate": 9.98822889656418e-05, + "loss": 0.2059, + "step": 1756 + }, + { + "epoch": 0.3542212371549466, + "grad_norm": 0.06817382574081421, + "learning_rate": 9.988137329231007e-05, + "loss": 0.2139, + "step": 1758 + }, + { + "epoch": 0.35462421922224463, + "grad_norm": 0.2687901258468628, + "learning_rate": 9.988045407548534e-05, + "loss": 0.2202, + "step": 1760 + }, + { + "epoch": 0.3550272012895426, + "grad_norm": 0.07906468957662582, + "learning_rate": 9.987953131523295e-05, + "loss": 0.2178, + "step": 1762 + }, + { + "epoch": 0.3554301833568406, + "grad_norm": 0.06844101846218109, + "learning_rate": 9.987860501161841e-05, + "loss": 0.197, + "step": 1764 + }, + { + "epoch": 0.3558331654241386, + "grad_norm": 0.06971251219511032, + "learning_rate": 9.987767516470754e-05, + "loss": 0.2172, + "step": 1766 + }, + { + "epoch": 0.3562361474914366, + "grad_norm": 0.06165212765336037, + "learning_rate": 9.98767417745664e-05, + "loss": 0.2212, + "step": 1768 + }, + { + "epoch": 0.3566391295587346, + "grad_norm": 0.07242526859045029, + "learning_rate": 9.987580484126129e-05, + "loss": 0.2357, + "step": 1770 + }, + { + "epoch": 0.35704211162603267, + "grad_norm": 0.07110414654016495, + "learning_rate": 9.987486436485877e-05, + "loss": 0.2393, + "step": 1772 + }, + { + "epoch": 0.35744509369333066, + "grad_norm": 0.06459176540374756, + "learning_rate": 9.987392034542564e-05, + "loss": 0.1703, + "step": 1774 + }, + { + "epoch": 0.35784807576062866, + "grad_norm": 0.0674254521727562, + "learning_rate": 9.987297278302898e-05, + "loss": 0.2301, + "step": 1776 + }, + { + "epoch": 0.35825105782792666, + "grad_norm": 0.054965537041425705, + "learning_rate": 9.987202167773609e-05, + "loss": 0.1904, + "step": 1778 + }, + { + "epoch": 0.35865403989522465, + "grad_norm": 0.08467627316713333, + "learning_rate": 9.987106702961453e-05, + "loss": 0.1696, + "step": 1780 + }, + { + "epoch": 0.35905702196252265, + "grad_norm": 0.0902966558933258, + "learning_rate": 9.987010883873214e-05, + "loss": 0.228, + "step": 1782 + }, + { + "epoch": 0.35946000402982065, + "grad_norm": 0.12665781378746033, + "learning_rate": 9.986914710515697e-05, + "loss": 0.2427, + "step": 1784 + }, + { + "epoch": 0.3598629860971187, + "grad_norm": 0.08858704566955566, + "learning_rate": 9.986818182895734e-05, + "loss": 0.1922, + "step": 1786 + }, + { + "epoch": 0.3602659681644167, + "grad_norm": 0.08362315595149994, + "learning_rate": 9.986721301020181e-05, + "loss": 0.2165, + "step": 1788 + }, + { + "epoch": 0.3606689502317147, + "grad_norm": 0.06421255320310593, + "learning_rate": 9.986624064895924e-05, + "loss": 0.2342, + "step": 1790 + }, + { + "epoch": 0.3610719322990127, + "grad_norm": 0.0623495914041996, + "learning_rate": 9.986526474529868e-05, + "loss": 0.1901, + "step": 1792 + }, + { + "epoch": 0.3614749143663107, + "grad_norm": 0.16917704045772552, + "learning_rate": 9.986428529928946e-05, + "loss": 0.222, + "step": 1794 + }, + { + "epoch": 0.3618778964336087, + "grad_norm": 0.19206245243549347, + "learning_rate": 9.986330231100116e-05, + "loss": 0.2488, + "step": 1796 + }, + { + "epoch": 0.36228087850090673, + "grad_norm": 0.08742326498031616, + "learning_rate": 9.986231578050361e-05, + "loss": 0.2056, + "step": 1798 + }, + { + "epoch": 0.36268386056820473, + "grad_norm": 0.616911768913269, + "learning_rate": 9.986132570786688e-05, + "loss": 0.2006, + "step": 1800 + }, + { + "epoch": 0.3630868426355027, + "grad_norm": 0.06567348539829254, + "learning_rate": 9.986033209316132e-05, + "loss": 0.1604, + "step": 1802 + }, + { + "epoch": 0.3634898247028007, + "grad_norm": 0.06277676671743393, + "learning_rate": 9.98593349364575e-05, + "loss": 0.1995, + "step": 1804 + }, + { + "epoch": 0.3638928067700987, + "grad_norm": 0.08555983006954193, + "learning_rate": 9.985833423782626e-05, + "loss": 0.2083, + "step": 1806 + }, + { + "epoch": 0.3642957888373967, + "grad_norm": 0.07542190700769424, + "learning_rate": 9.985732999733872e-05, + "loss": 0.2448, + "step": 1808 + }, + { + "epoch": 0.3646987709046947, + "grad_norm": 0.06384123116731644, + "learning_rate": 9.985632221506617e-05, + "loss": 0.1966, + "step": 1810 + }, + { + "epoch": 0.36510175297199277, + "grad_norm": 0.08267765492200851, + "learning_rate": 9.985531089108023e-05, + "loss": 0.1711, + "step": 1812 + }, + { + "epoch": 0.36550473503929076, + "grad_norm": 0.05912201106548309, + "learning_rate": 9.985429602545274e-05, + "loss": 0.2477, + "step": 1814 + }, + { + "epoch": 0.36590771710658876, + "grad_norm": 0.08566808700561523, + "learning_rate": 9.985327761825577e-05, + "loss": 0.1865, + "step": 1816 + }, + { + "epoch": 0.36631069917388676, + "grad_norm": 0.06715419143438339, + "learning_rate": 9.98522556695617e-05, + "loss": 0.2159, + "step": 1818 + }, + { + "epoch": 0.36671368124118475, + "grad_norm": 0.08008668571710587, + "learning_rate": 9.985123017944311e-05, + "loss": 0.177, + "step": 1820 + }, + { + "epoch": 0.36711666330848275, + "grad_norm": 0.07066267728805542, + "learning_rate": 9.985020114797287e-05, + "loss": 0.207, + "step": 1822 + }, + { + "epoch": 0.3675196453757808, + "grad_norm": 0.1034514307975769, + "learning_rate": 9.984916857522404e-05, + "loss": 0.1647, + "step": 1824 + }, + { + "epoch": 0.3679226274430788, + "grad_norm": 0.0847577378153801, + "learning_rate": 9.984813246127002e-05, + "loss": 0.1412, + "step": 1826 + }, + { + "epoch": 0.3683256095103768, + "grad_norm": 0.08753248304128647, + "learning_rate": 9.984709280618438e-05, + "loss": 0.2522, + "step": 1828 + }, + { + "epoch": 0.3687285915776748, + "grad_norm": 0.0821591392159462, + "learning_rate": 9.984604961004098e-05, + "loss": 0.1748, + "step": 1830 + }, + { + "epoch": 0.3691315736449728, + "grad_norm": 0.0583636499941349, + "learning_rate": 9.984500287291393e-05, + "loss": 0.1763, + "step": 1832 + }, + { + "epoch": 0.3695345557122708, + "grad_norm": 0.05514378100633621, + "learning_rate": 9.98439525948776e-05, + "loss": 0.1618, + "step": 1834 + }, + { + "epoch": 0.36993753777956884, + "grad_norm": 0.06351856887340546, + "learning_rate": 9.984289877600659e-05, + "loss": 0.1949, + "step": 1836 + }, + { + "epoch": 0.37034051984686683, + "grad_norm": 0.06602806597948074, + "learning_rate": 9.984184141637576e-05, + "loss": 0.2321, + "step": 1838 + }, + { + "epoch": 0.37074350191416483, + "grad_norm": 0.07348185032606125, + "learning_rate": 9.984078051606022e-05, + "loss": 0.1378, + "step": 1840 + }, + { + "epoch": 0.3711464839814628, + "grad_norm": 0.06466913968324661, + "learning_rate": 9.983971607513536e-05, + "loss": 0.1866, + "step": 1842 + }, + { + "epoch": 0.3715494660487608, + "grad_norm": 0.059923093765974045, + "learning_rate": 9.983864809367676e-05, + "loss": 0.214, + "step": 1844 + }, + { + "epoch": 0.3719524481160588, + "grad_norm": 0.07116147130727768, + "learning_rate": 9.983757657176032e-05, + "loss": 0.2026, + "step": 1846 + }, + { + "epoch": 0.3723554301833568, + "grad_norm": 0.11628149449825287, + "learning_rate": 9.983650150946213e-05, + "loss": 0.192, + "step": 1848 + }, + { + "epoch": 0.37275841225065487, + "grad_norm": 0.0832078754901886, + "learning_rate": 9.983542290685859e-05, + "loss": 0.196, + "step": 1850 + }, + { + "epoch": 0.37316139431795287, + "grad_norm": 0.07628065347671509, + "learning_rate": 9.98343407640263e-05, + "loss": 0.2624, + "step": 1852 + }, + { + "epoch": 0.37356437638525086, + "grad_norm": 0.07016732543706894, + "learning_rate": 9.983325508104214e-05, + "loss": 0.1612, + "step": 1854 + }, + { + "epoch": 0.37396735845254886, + "grad_norm": 0.0670395940542221, + "learning_rate": 9.983216585798322e-05, + "loss": 0.215, + "step": 1856 + }, + { + "epoch": 0.37437034051984686, + "grad_norm": 0.07254261523485184, + "learning_rate": 9.983107309492693e-05, + "loss": 0.211, + "step": 1858 + }, + { + "epoch": 0.37477332258714485, + "grad_norm": 0.08141341805458069, + "learning_rate": 9.982997679195092e-05, + "loss": 0.2174, + "step": 1860 + }, + { + "epoch": 0.3751763046544429, + "grad_norm": 0.0632607713341713, + "learning_rate": 9.982887694913306e-05, + "loss": 0.1653, + "step": 1862 + }, + { + "epoch": 0.3755792867217409, + "grad_norm": 0.06702928990125656, + "learning_rate": 9.982777356655144e-05, + "loss": 0.2359, + "step": 1864 + }, + { + "epoch": 0.3759822687890389, + "grad_norm": 0.052461106330156326, + "learning_rate": 9.98266666442845e-05, + "loss": 0.1681, + "step": 1866 + }, + { + "epoch": 0.3763852508563369, + "grad_norm": 0.05812010541558266, + "learning_rate": 9.982555618241082e-05, + "loss": 0.2286, + "step": 1868 + }, + { + "epoch": 0.3767882329236349, + "grad_norm": 0.08102001249790192, + "learning_rate": 9.982444218100935e-05, + "loss": 0.2297, + "step": 1870 + }, + { + "epoch": 0.3771912149909329, + "grad_norm": 0.08580035716295242, + "learning_rate": 9.982332464015915e-05, + "loss": 0.2389, + "step": 1872 + }, + { + "epoch": 0.3775941970582309, + "grad_norm": 0.07281485199928284, + "learning_rate": 9.982220355993968e-05, + "loss": 0.2064, + "step": 1874 + }, + { + "epoch": 0.37799717912552894, + "grad_norm": 0.0776129812002182, + "learning_rate": 9.982107894043053e-05, + "loss": 0.2068, + "step": 1876 + }, + { + "epoch": 0.37840016119282693, + "grad_norm": 0.087554931640625, + "learning_rate": 9.981995078171162e-05, + "loss": 0.1778, + "step": 1878 + }, + { + "epoch": 0.37880314326012493, + "grad_norm": 0.06286180019378662, + "learning_rate": 9.981881908386308e-05, + "loss": 0.1755, + "step": 1880 + }, + { + "epoch": 0.3792061253274229, + "grad_norm": 0.08133803308010101, + "learning_rate": 9.98176838469653e-05, + "loss": 0.1926, + "step": 1882 + }, + { + "epoch": 0.3796091073947209, + "grad_norm": 0.08006949722766876, + "learning_rate": 9.981654507109893e-05, + "loss": 0.1832, + "step": 1884 + }, + { + "epoch": 0.3800120894620189, + "grad_norm": 0.06335432082414627, + "learning_rate": 9.981540275634487e-05, + "loss": 0.1761, + "step": 1886 + }, + { + "epoch": 0.380415071529317, + "grad_norm": 0.0711664929986, + "learning_rate": 9.981425690278426e-05, + "loss": 0.173, + "step": 1888 + }, + { + "epoch": 0.38081805359661497, + "grad_norm": 0.08654266595840454, + "learning_rate": 9.981310751049851e-05, + "loss": 0.2176, + "step": 1890 + }, + { + "epoch": 0.38122103566391297, + "grad_norm": 0.06413505226373672, + "learning_rate": 9.981195457956928e-05, + "loss": 0.2199, + "step": 1892 + }, + { + "epoch": 0.38162401773121096, + "grad_norm": 0.06541746854782104, + "learning_rate": 9.981079811007845e-05, + "loss": 0.2112, + "step": 1894 + }, + { + "epoch": 0.38202699979850896, + "grad_norm": 0.07289738208055496, + "learning_rate": 9.980963810210817e-05, + "loss": 0.1691, + "step": 1896 + }, + { + "epoch": 0.38242998186580696, + "grad_norm": 0.10542286932468414, + "learning_rate": 9.980847455574087e-05, + "loss": 0.2083, + "step": 1898 + }, + { + "epoch": 0.38283296393310495, + "grad_norm": 0.08158999681472778, + "learning_rate": 9.98073074710592e-05, + "loss": 0.2275, + "step": 1900 + }, + { + "epoch": 0.383235946000403, + "grad_norm": 0.07118821889162064, + "learning_rate": 9.980613684814606e-05, + "loss": 0.2156, + "step": 1902 + }, + { + "epoch": 0.383638928067701, + "grad_norm": 0.06565722823143005, + "learning_rate": 9.980496268708461e-05, + "loss": 0.1858, + "step": 1904 + }, + { + "epoch": 0.384041910134999, + "grad_norm": 0.05890239030122757, + "learning_rate": 9.980378498795825e-05, + "loss": 0.1843, + "step": 1906 + }, + { + "epoch": 0.384444892202297, + "grad_norm": 0.06181297451257706, + "learning_rate": 9.980260375085067e-05, + "loss": 0.2045, + "step": 1908 + }, + { + "epoch": 0.384847874269595, + "grad_norm": 0.05857539921998978, + "learning_rate": 9.980141897584576e-05, + "loss": 0.1791, + "step": 1910 + }, + { + "epoch": 0.385250856336893, + "grad_norm": 0.07684215158224106, + "learning_rate": 9.98002306630277e-05, + "loss": 0.1961, + "step": 1912 + }, + { + "epoch": 0.38565383840419104, + "grad_norm": 0.06899187713861465, + "learning_rate": 9.979903881248088e-05, + "loss": 0.1255, + "step": 1914 + }, + { + "epoch": 0.38605682047148904, + "grad_norm": 0.08205246925354004, + "learning_rate": 9.979784342429e-05, + "loss": 0.1641, + "step": 1916 + }, + { + "epoch": 0.38645980253878703, + "grad_norm": 0.08253847807645798, + "learning_rate": 9.979664449853996e-05, + "loss": 0.1602, + "step": 1918 + }, + { + "epoch": 0.38686278460608503, + "grad_norm": 0.0585266575217247, + "learning_rate": 9.979544203531592e-05, + "loss": 0.1817, + "step": 1920 + }, + { + "epoch": 0.387265766673383, + "grad_norm": 0.10539548099040985, + "learning_rate": 9.979423603470333e-05, + "loss": 0.2231, + "step": 1922 + }, + { + "epoch": 0.387668748740681, + "grad_norm": 0.0943535566329956, + "learning_rate": 9.979302649678783e-05, + "loss": 0.2369, + "step": 1924 + }, + { + "epoch": 0.388071730807979, + "grad_norm": 0.07806706428527832, + "learning_rate": 9.979181342165538e-05, + "loss": 0.2279, + "step": 1926 + }, + { + "epoch": 0.3884747128752771, + "grad_norm": 0.06602683663368225, + "learning_rate": 9.979059680939213e-05, + "loss": 0.2579, + "step": 1928 + }, + { + "epoch": 0.38887769494257507, + "grad_norm": 0.08012797683477402, + "learning_rate": 9.97893766600845e-05, + "loss": 0.2526, + "step": 1930 + }, + { + "epoch": 0.38928067700987307, + "grad_norm": 0.058674756437540054, + "learning_rate": 9.978815297381919e-05, + "loss": 0.2077, + "step": 1932 + }, + { + "epoch": 0.38968365907717106, + "grad_norm": 0.06406107544898987, + "learning_rate": 9.97869257506831e-05, + "loss": 0.1945, + "step": 1934 + }, + { + "epoch": 0.39008664114446906, + "grad_norm": 0.06540010869503021, + "learning_rate": 9.978569499076345e-05, + "loss": 0.1683, + "step": 1936 + }, + { + "epoch": 0.39048962321176706, + "grad_norm": 0.056828927248716354, + "learning_rate": 9.978446069414763e-05, + "loss": 0.2444, + "step": 1938 + }, + { + "epoch": 0.3908926052790651, + "grad_norm": 0.05789874494075775, + "learning_rate": 9.978322286092334e-05, + "loss": 0.2166, + "step": 1940 + }, + { + "epoch": 0.3912955873463631, + "grad_norm": 0.06070829555392265, + "learning_rate": 9.978198149117852e-05, + "loss": 0.1722, + "step": 1942 + }, + { + "epoch": 0.3916985694136611, + "grad_norm": 0.06728588044643402, + "learning_rate": 9.978073658500135e-05, + "loss": 0.1812, + "step": 1944 + }, + { + "epoch": 0.3921015514809591, + "grad_norm": 0.07258196920156479, + "learning_rate": 9.977948814248028e-05, + "loss": 0.1818, + "step": 1946 + }, + { + "epoch": 0.3925045335482571, + "grad_norm": 0.07481992989778519, + "learning_rate": 9.977823616370397e-05, + "loss": 0.2135, + "step": 1948 + }, + { + "epoch": 0.3929075156155551, + "grad_norm": 0.08128810673952103, + "learning_rate": 9.977698064876136e-05, + "loss": 0.2571, + "step": 1950 + }, + { + "epoch": 0.3933104976828531, + "grad_norm": 0.09639985859394073, + "learning_rate": 9.977572159774167e-05, + "loss": 0.2428, + "step": 1952 + }, + { + "epoch": 0.39371347975015114, + "grad_norm": 0.06070376932621002, + "learning_rate": 9.977445901073431e-05, + "loss": 0.1951, + "step": 1954 + }, + { + "epoch": 0.39411646181744914, + "grad_norm": 0.06723980605602264, + "learning_rate": 9.9773192887829e-05, + "loss": 0.1721, + "step": 1956 + }, + { + "epoch": 0.39451944388474713, + "grad_norm": 0.061029743403196335, + "learning_rate": 9.977192322911565e-05, + "loss": 0.2161, + "step": 1958 + }, + { + "epoch": 0.39492242595204513, + "grad_norm": 0.06752429157495499, + "learning_rate": 9.977065003468447e-05, + "loss": 0.1729, + "step": 1960 + }, + { + "epoch": 0.3953254080193431, + "grad_norm": 0.07469742000102997, + "learning_rate": 9.976937330462593e-05, + "loss": 0.1864, + "step": 1962 + }, + { + "epoch": 0.3957283900866411, + "grad_norm": 0.06833580881357193, + "learning_rate": 9.976809303903069e-05, + "loss": 0.2202, + "step": 1964 + }, + { + "epoch": 0.3961313721539392, + "grad_norm": 0.06711506843566895, + "learning_rate": 9.976680923798971e-05, + "loss": 0.2449, + "step": 1966 + }, + { + "epoch": 0.3965343542212372, + "grad_norm": 0.0628892183303833, + "learning_rate": 9.97655219015942e-05, + "loss": 0.2307, + "step": 1968 + }, + { + "epoch": 0.39693733628853517, + "grad_norm": 0.04486105963587761, + "learning_rate": 9.97642310299356e-05, + "loss": 0.1519, + "step": 1970 + }, + { + "epoch": 0.39734031835583317, + "grad_norm": 0.060665931552648544, + "learning_rate": 9.976293662310561e-05, + "loss": 0.1907, + "step": 1972 + }, + { + "epoch": 0.39774330042313116, + "grad_norm": 0.0764361023902893, + "learning_rate": 9.97616386811962e-05, + "loss": 0.1947, + "step": 1974 + }, + { + "epoch": 0.39814628249042916, + "grad_norm": 0.08323927223682404, + "learning_rate": 9.976033720429954e-05, + "loss": 0.1534, + "step": 1976 + }, + { + "epoch": 0.39854926455772716, + "grad_norm": 0.10088339447975159, + "learning_rate": 9.97590321925081e-05, + "loss": 0.2169, + "step": 1978 + }, + { + "epoch": 0.3989522466250252, + "grad_norm": 0.06974364817142487, + "learning_rate": 9.975772364591461e-05, + "loss": 0.1841, + "step": 1980 + }, + { + "epoch": 0.3993552286923232, + "grad_norm": 0.051529139280319214, + "learning_rate": 9.9756411564612e-05, + "loss": 0.156, + "step": 1982 + }, + { + "epoch": 0.3997582107596212, + "grad_norm": 0.04539600387215614, + "learning_rate": 9.97550959486935e-05, + "loss": 0.1884, + "step": 1984 + }, + { + "epoch": 0.4001611928269192, + "grad_norm": 0.07553747296333313, + "learning_rate": 9.975377679825254e-05, + "loss": 0.1764, + "step": 1986 + }, + { + "epoch": 0.4005641748942172, + "grad_norm": 0.06788526475429535, + "learning_rate": 9.975245411338286e-05, + "loss": 0.1896, + "step": 1988 + }, + { + "epoch": 0.4009671569615152, + "grad_norm": 0.0715952068567276, + "learning_rate": 9.975112789417839e-05, + "loss": 0.1663, + "step": 1990 + }, + { + "epoch": 0.40137013902881324, + "grad_norm": 0.0693850964307785, + "learning_rate": 9.974979814073335e-05, + "loss": 0.1951, + "step": 1992 + }, + { + "epoch": 0.40177312109611124, + "grad_norm": 0.06748230755329132, + "learning_rate": 9.974846485314225e-05, + "loss": 0.2539, + "step": 1994 + }, + { + "epoch": 0.40217610316340924, + "grad_norm": 0.07175850868225098, + "learning_rate": 9.974712803149974e-05, + "loss": 0.1882, + "step": 1996 + }, + { + "epoch": 0.40257908523070723, + "grad_norm": 0.05859972909092903, + "learning_rate": 9.974578767590081e-05, + "loss": 0.2038, + "step": 1998 + }, + { + "epoch": 0.40298206729800523, + "grad_norm": 0.0738549679517746, + "learning_rate": 9.97444437864407e-05, + "loss": 0.2094, + "step": 2000 + }, + { + "epoch": 0.4033850493653032, + "grad_norm": 0.07968976348638535, + "learning_rate": 9.974309636321484e-05, + "loss": 0.1927, + "step": 2002 + }, + { + "epoch": 0.4037880314326012, + "grad_norm": 0.06320148706436157, + "learning_rate": 9.974174540631898e-05, + "loss": 0.2125, + "step": 2004 + }, + { + "epoch": 0.4041910134998993, + "grad_norm": 0.06155259907245636, + "learning_rate": 9.974039091584908e-05, + "loss": 0.2159, + "step": 2006 + }, + { + "epoch": 0.4045939955671973, + "grad_norm": 0.12573003768920898, + "learning_rate": 9.973903289190134e-05, + "loss": 0.2388, + "step": 2008 + }, + { + "epoch": 0.40499697763449527, + "grad_norm": 0.06864972412586212, + "learning_rate": 9.973767133457225e-05, + "loss": 0.1782, + "step": 2010 + }, + { + "epoch": 0.40539995970179327, + "grad_norm": 0.1890304684638977, + "learning_rate": 9.973630624395856e-05, + "loss": 0.2289, + "step": 2012 + }, + { + "epoch": 0.40580294176909126, + "grad_norm": 0.06997068971395493, + "learning_rate": 9.973493762015719e-05, + "loss": 0.226, + "step": 2014 + }, + { + "epoch": 0.40620592383638926, + "grad_norm": 0.048897821456193924, + "learning_rate": 9.973356546326539e-05, + "loss": 0.2236, + "step": 2016 + }, + { + "epoch": 0.4066089059036873, + "grad_norm": 0.06851400434970856, + "learning_rate": 9.973218977338064e-05, + "loss": 0.1874, + "step": 2018 + }, + { + "epoch": 0.4070118879709853, + "grad_norm": 0.07441789656877518, + "learning_rate": 9.973081055060067e-05, + "loss": 0.1879, + "step": 2020 + }, + { + "epoch": 0.4074148700382833, + "grad_norm": 0.06250349432229996, + "learning_rate": 9.972942779502345e-05, + "loss": 0.269, + "step": 2022 + }, + { + "epoch": 0.4078178521055813, + "grad_norm": 0.06143819913268089, + "learning_rate": 9.972804150674722e-05, + "loss": 0.173, + "step": 2024 + }, + { + "epoch": 0.4082208341728793, + "grad_norm": 0.06300696730613708, + "learning_rate": 9.972665168587043e-05, + "loss": 0.2063, + "step": 2026 + }, + { + "epoch": 0.4086238162401773, + "grad_norm": 0.05857381224632263, + "learning_rate": 9.972525833249184e-05, + "loss": 0.1931, + "step": 2028 + }, + { + "epoch": 0.4090267983074753, + "grad_norm": 0.054102227091789246, + "learning_rate": 9.972386144671043e-05, + "loss": 0.1767, + "step": 2030 + }, + { + "epoch": 0.40942978037477334, + "grad_norm": 0.08161073923110962, + "learning_rate": 9.97224610286254e-05, + "loss": 0.2202, + "step": 2032 + }, + { + "epoch": 0.40983276244207134, + "grad_norm": 0.06625241786241531, + "learning_rate": 9.972105707833628e-05, + "loss": 0.2147, + "step": 2034 + }, + { + "epoch": 0.41023574450936934, + "grad_norm": 0.06663915514945984, + "learning_rate": 9.971964959594276e-05, + "loss": 0.2014, + "step": 2036 + }, + { + "epoch": 0.41063872657666733, + "grad_norm": 0.06275123357772827, + "learning_rate": 9.971823858154487e-05, + "loss": 0.2785, + "step": 2038 + }, + { + "epoch": 0.41104170864396533, + "grad_norm": 0.06302032619714737, + "learning_rate": 9.971682403524281e-05, + "loss": 0.1869, + "step": 2040 + }, + { + "epoch": 0.41144469071126333, + "grad_norm": 0.13110080361366272, + "learning_rate": 9.971540595713709e-05, + "loss": 0.1643, + "step": 2042 + }, + { + "epoch": 0.4118476727785614, + "grad_norm": 0.16180118918418884, + "learning_rate": 9.971398434732843e-05, + "loss": 0.1396, + "step": 2044 + }, + { + "epoch": 0.4122506548458594, + "grad_norm": 0.06744285672903061, + "learning_rate": 9.971255920591784e-05, + "loss": 0.186, + "step": 2046 + }, + { + "epoch": 0.4126536369131574, + "grad_norm": 0.06002974510192871, + "learning_rate": 9.971113053300653e-05, + "loss": 0.1548, + "step": 2048 + }, + { + "epoch": 0.41305661898045537, + "grad_norm": 0.07181243598461151, + "learning_rate": 9.970969832869603e-05, + "loss": 0.2426, + "step": 2050 + }, + { + "epoch": 0.41345960104775337, + "grad_norm": 0.06398441642522812, + "learning_rate": 9.970826259308805e-05, + "loss": 0.1657, + "step": 2052 + }, + { + "epoch": 0.41386258311505136, + "grad_norm": 0.06433025002479553, + "learning_rate": 9.970682332628459e-05, + "loss": 0.2147, + "step": 2054 + }, + { + "epoch": 0.41426556518234936, + "grad_norm": 0.18160675466060638, + "learning_rate": 9.970538052838789e-05, + "loss": 0.2346, + "step": 2056 + }, + { + "epoch": 0.4146685472496474, + "grad_norm": 0.06317636370658875, + "learning_rate": 9.970393419950046e-05, + "loss": 0.2058, + "step": 2058 + }, + { + "epoch": 0.4150715293169454, + "grad_norm": 0.053193751722574234, + "learning_rate": 9.970248433972503e-05, + "loss": 0.1896, + "step": 2060 + }, + { + "epoch": 0.4154745113842434, + "grad_norm": 0.05231672152876854, + "learning_rate": 9.970103094916459e-05, + "loss": 0.1512, + "step": 2062 + }, + { + "epoch": 0.4158774934515414, + "grad_norm": 0.05676732212305069, + "learning_rate": 9.96995740279224e-05, + "loss": 0.1831, + "step": 2064 + }, + { + "epoch": 0.4162804755188394, + "grad_norm": 0.17120619118213654, + "learning_rate": 9.969811357610197e-05, + "loss": 0.2052, + "step": 2066 + }, + { + "epoch": 0.4166834575861374, + "grad_norm": 0.04707195237278938, + "learning_rate": 9.969664959380702e-05, + "loss": 0.1296, + "step": 2068 + }, + { + "epoch": 0.41708643965343545, + "grad_norm": 0.05727340281009674, + "learning_rate": 9.969518208114157e-05, + "loss": 0.2024, + "step": 2070 + }, + { + "epoch": 0.41748942172073344, + "grad_norm": 0.058648549020290375, + "learning_rate": 9.969371103820983e-05, + "loss": 0.2046, + "step": 2072 + }, + { + "epoch": 0.41789240378803144, + "grad_norm": 0.2652423083782196, + "learning_rate": 9.969223646511636e-05, + "loss": 0.1867, + "step": 2074 + }, + { + "epoch": 0.41829538585532944, + "grad_norm": 0.35208940505981445, + "learning_rate": 9.969075836196589e-05, + "loss": 0.2112, + "step": 2076 + }, + { + "epoch": 0.41869836792262743, + "grad_norm": 0.06454955041408539, + "learning_rate": 9.968927672886339e-05, + "loss": 0.2225, + "step": 2078 + }, + { + "epoch": 0.41910134998992543, + "grad_norm": 0.05421265587210655, + "learning_rate": 9.968779156591414e-05, + "loss": 0.1717, + "step": 2080 + }, + { + "epoch": 0.41950433205722343, + "grad_norm": 0.05428704246878624, + "learning_rate": 9.968630287322367e-05, + "loss": 0.1961, + "step": 2082 + }, + { + "epoch": 0.4199073141245215, + "grad_norm": 0.10567606985569, + "learning_rate": 9.968481065089768e-05, + "loss": 0.1915, + "step": 2084 + }, + { + "epoch": 0.4203102961918195, + "grad_norm": 0.06691636145114899, + "learning_rate": 9.96833148990422e-05, + "loss": 0.1842, + "step": 2086 + }, + { + "epoch": 0.4207132782591175, + "grad_norm": 0.06900250166654587, + "learning_rate": 9.968181561776348e-05, + "loss": 0.1391, + "step": 2088 + }, + { + "epoch": 0.42111626032641547, + "grad_norm": 0.05278802663087845, + "learning_rate": 9.968031280716805e-05, + "loss": 0.177, + "step": 2090 + }, + { + "epoch": 0.42151924239371347, + "grad_norm": 0.08458192646503448, + "learning_rate": 9.967880646736265e-05, + "loss": 0.2298, + "step": 2092 + }, + { + "epoch": 0.42192222446101146, + "grad_norm": 0.06902390718460083, + "learning_rate": 9.967729659845428e-05, + "loss": 0.2137, + "step": 2094 + }, + { + "epoch": 0.4223252065283095, + "grad_norm": 0.056869540363550186, + "learning_rate": 9.967578320055023e-05, + "loss": 0.2007, + "step": 2096 + }, + { + "epoch": 0.4227281885956075, + "grad_norm": 0.07104408740997314, + "learning_rate": 9.967426627375796e-05, + "loss": 0.1854, + "step": 2098 + }, + { + "epoch": 0.4231311706629055, + "grad_norm": 0.062130045145750046, + "learning_rate": 9.967274581818524e-05, + "loss": 0.1184, + "step": 2100 + }, + { + "epoch": 0.4235341527302035, + "grad_norm": 0.07842563092708588, + "learning_rate": 9.967122183394013e-05, + "loss": 0.2273, + "step": 2102 + }, + { + "epoch": 0.4239371347975015, + "grad_norm": 0.06722037494182587, + "learning_rate": 9.966969432113085e-05, + "loss": 0.1704, + "step": 2104 + }, + { + "epoch": 0.4243401168647995, + "grad_norm": 0.07529338449239731, + "learning_rate": 9.966816327986591e-05, + "loss": 0.1818, + "step": 2106 + }, + { + "epoch": 0.4247430989320975, + "grad_norm": 0.06540022790431976, + "learning_rate": 9.96666287102541e-05, + "loss": 0.2518, + "step": 2108 + }, + { + "epoch": 0.42514608099939555, + "grad_norm": 0.2302924394607544, + "learning_rate": 9.96650906124044e-05, + "loss": 0.2158, + "step": 2110 + }, + { + "epoch": 0.42554906306669354, + "grad_norm": 0.062155935913324356, + "learning_rate": 9.966354898642609e-05, + "loss": 0.2174, + "step": 2112 + }, + { + "epoch": 0.42595204513399154, + "grad_norm": 0.09047554433345795, + "learning_rate": 9.96620038324287e-05, + "loss": 0.1489, + "step": 2114 + }, + { + "epoch": 0.42635502720128954, + "grad_norm": 0.11305224895477295, + "learning_rate": 9.966045515052197e-05, + "loss": 0.1712, + "step": 2116 + }, + { + "epoch": 0.42675800926858753, + "grad_norm": 0.09782232344150543, + "learning_rate": 9.965890294081592e-05, + "loss": 0.1961, + "step": 2118 + }, + { + "epoch": 0.42716099133588553, + "grad_norm": 0.06156953424215317, + "learning_rate": 9.965734720342084e-05, + "loss": 0.182, + "step": 2120 + }, + { + "epoch": 0.4275639734031836, + "grad_norm": 0.07488352805376053, + "learning_rate": 9.965578793844723e-05, + "loss": 0.2113, + "step": 2122 + }, + { + "epoch": 0.4279669554704816, + "grad_norm": 0.24926647543907166, + "learning_rate": 9.965422514600585e-05, + "loss": 0.2136, + "step": 2124 + }, + { + "epoch": 0.4283699375377796, + "grad_norm": 0.06749057024717331, + "learning_rate": 9.965265882620771e-05, + "loss": 0.2188, + "step": 2126 + }, + { + "epoch": 0.4287729196050776, + "grad_norm": 0.07169239223003387, + "learning_rate": 9.965108897916411e-05, + "loss": 0.2276, + "step": 2128 + }, + { + "epoch": 0.42917590167237557, + "grad_norm": 0.0644363984465599, + "learning_rate": 9.964951560498657e-05, + "loss": 0.171, + "step": 2130 + }, + { + "epoch": 0.42957888373967357, + "grad_norm": 0.07681705802679062, + "learning_rate": 9.964793870378681e-05, + "loss": 0.2332, + "step": 2132 + }, + { + "epoch": 0.42998186580697156, + "grad_norm": 0.047269873321056366, + "learning_rate": 9.964635827567691e-05, + "loss": 0.134, + "step": 2134 + }, + { + "epoch": 0.4303848478742696, + "grad_norm": 0.06874144822359085, + "learning_rate": 9.964477432076911e-05, + "loss": 0.1645, + "step": 2136 + }, + { + "epoch": 0.4307878299415676, + "grad_norm": 0.05638271942734718, + "learning_rate": 9.964318683917593e-05, + "loss": 0.2089, + "step": 2138 + }, + { + "epoch": 0.4311908120088656, + "grad_norm": 0.07065290212631226, + "learning_rate": 9.964159583101016e-05, + "loss": 0.2175, + "step": 2140 + }, + { + "epoch": 0.4315937940761636, + "grad_norm": 0.0586637519299984, + "learning_rate": 9.96400012963848e-05, + "loss": 0.207, + "step": 2142 + }, + { + "epoch": 0.4319967761434616, + "grad_norm": 0.061192356050014496, + "learning_rate": 9.963840323541314e-05, + "loss": 0.212, + "step": 2144 + }, + { + "epoch": 0.4323997582107596, + "grad_norm": 0.06529909372329712, + "learning_rate": 9.96368016482087e-05, + "loss": 0.1886, + "step": 2146 + }, + { + "epoch": 0.43280274027805765, + "grad_norm": 0.06861956417560577, + "learning_rate": 9.963519653488527e-05, + "loss": 0.2226, + "step": 2148 + }, + { + "epoch": 0.43320572234535565, + "grad_norm": 0.07531817257404327, + "learning_rate": 9.963358789555683e-05, + "loss": 0.2213, + "step": 2150 + }, + { + "epoch": 0.43360870441265364, + "grad_norm": 0.0557052381336689, + "learning_rate": 9.96319757303377e-05, + "loss": 0.1994, + "step": 2152 + }, + { + "epoch": 0.43401168647995164, + "grad_norm": 0.05331805348396301, + "learning_rate": 9.963036003934238e-05, + "loss": 0.2163, + "step": 2154 + }, + { + "epoch": 0.43441466854724964, + "grad_norm": 0.05752795189619064, + "learning_rate": 9.962874082268567e-05, + "loss": 0.2135, + "step": 2156 + }, + { + "epoch": 0.43481765061454763, + "grad_norm": 0.060762520879507065, + "learning_rate": 9.962711808048258e-05, + "loss": 0.2401, + "step": 2158 + }, + { + "epoch": 0.43522063268184563, + "grad_norm": 0.05835457518696785, + "learning_rate": 9.962549181284838e-05, + "loss": 0.1785, + "step": 2160 + }, + { + "epoch": 0.4356236147491437, + "grad_norm": 0.06465306878089905, + "learning_rate": 9.96238620198986e-05, + "loss": 0.1975, + "step": 2162 + }, + { + "epoch": 0.4360265968164417, + "grad_norm": 0.0581306591629982, + "learning_rate": 9.962222870174902e-05, + "loss": 0.1602, + "step": 2164 + }, + { + "epoch": 0.4364295788837397, + "grad_norm": 0.04962344840168953, + "learning_rate": 9.962059185851569e-05, + "loss": 0.2231, + "step": 2166 + }, + { + "epoch": 0.4368325609510377, + "grad_norm": 0.0643840953707695, + "learning_rate": 9.961895149031486e-05, + "loss": 0.1481, + "step": 2168 + }, + { + "epoch": 0.43723554301833567, + "grad_norm": 0.12278378009796143, + "learning_rate": 9.961730759726307e-05, + "loss": 0.2492, + "step": 2170 + }, + { + "epoch": 0.43763852508563367, + "grad_norm": 0.05890028551220894, + "learning_rate": 9.96156601794771e-05, + "loss": 0.1423, + "step": 2172 + }, + { + "epoch": 0.4380415071529317, + "grad_norm": 0.05931360647082329, + "learning_rate": 9.961400923707398e-05, + "loss": 0.1958, + "step": 2174 + }, + { + "epoch": 0.4384444892202297, + "grad_norm": 0.07436667382717133, + "learning_rate": 9.961235477017098e-05, + "loss": 0.2163, + "step": 2176 + }, + { + "epoch": 0.4388474712875277, + "grad_norm": 0.08503016084432602, + "learning_rate": 9.961069677888566e-05, + "loss": 0.2187, + "step": 2178 + }, + { + "epoch": 0.4392504533548257, + "grad_norm": 0.10647798329591751, + "learning_rate": 9.960903526333576e-05, + "loss": 0.2981, + "step": 2180 + }, + { + "epoch": 0.4396534354221237, + "grad_norm": 0.06522869318723679, + "learning_rate": 9.960737022363935e-05, + "loss": 0.1779, + "step": 2182 + }, + { + "epoch": 0.4400564174894217, + "grad_norm": 0.050729621201753616, + "learning_rate": 9.960570165991469e-05, + "loss": 0.1355, + "step": 2184 + }, + { + "epoch": 0.4404593995567197, + "grad_norm": 0.0786505937576294, + "learning_rate": 9.960402957228032e-05, + "loss": 0.175, + "step": 2186 + }, + { + "epoch": 0.44086238162401775, + "grad_norm": 0.09716632962226868, + "learning_rate": 9.960235396085502e-05, + "loss": 0.2767, + "step": 2188 + }, + { + "epoch": 0.44126536369131575, + "grad_norm": 0.13773681223392487, + "learning_rate": 9.960067482575781e-05, + "loss": 0.242, + "step": 2190 + }, + { + "epoch": 0.44166834575861375, + "grad_norm": 0.06173882633447647, + "learning_rate": 9.9598992167108e-05, + "loss": 0.213, + "step": 2192 + }, + { + "epoch": 0.44207132782591174, + "grad_norm": 0.06848306208848953, + "learning_rate": 9.95973059850251e-05, + "loss": 0.2, + "step": 2194 + }, + { + "epoch": 0.44247430989320974, + "grad_norm": 0.06073886528611183, + "learning_rate": 9.95956162796289e-05, + "loss": 0.1993, + "step": 2196 + }, + { + "epoch": 0.44287729196050774, + "grad_norm": 0.07946296781301498, + "learning_rate": 9.959392305103943e-05, + "loss": 0.261, + "step": 2198 + }, + { + "epoch": 0.4432802740278058, + "grad_norm": 0.051856525242328644, + "learning_rate": 9.959222629937699e-05, + "loss": 0.2111, + "step": 2200 + }, + { + "epoch": 0.4436832560951038, + "grad_norm": 0.05881345272064209, + "learning_rate": 9.95905260247621e-05, + "loss": 0.2147, + "step": 2202 + }, + { + "epoch": 0.4440862381624018, + "grad_norm": 0.04665559157729149, + "learning_rate": 9.958882222731555e-05, + "loss": 0.184, + "step": 2204 + }, + { + "epoch": 0.4444892202296998, + "grad_norm": 0.06761233508586884, + "learning_rate": 9.958711490715838e-05, + "loss": 0.2161, + "step": 2206 + }, + { + "epoch": 0.4448922022969978, + "grad_norm": 0.09146778285503387, + "learning_rate": 9.958540406441187e-05, + "loss": 0.2292, + "step": 2208 + }, + { + "epoch": 0.44529518436429577, + "grad_norm": 0.06186634674668312, + "learning_rate": 9.958368969919756e-05, + "loss": 0.1743, + "step": 2210 + }, + { + "epoch": 0.44569816643159377, + "grad_norm": 0.06448390334844589, + "learning_rate": 9.958197181163722e-05, + "loss": 0.165, + "step": 2212 + }, + { + "epoch": 0.4461011484988918, + "grad_norm": 0.07400643825531006, + "learning_rate": 9.95802504018529e-05, + "loss": 0.204, + "step": 2214 + }, + { + "epoch": 0.4465041305661898, + "grad_norm": 0.0617455393075943, + "learning_rate": 9.957852546996688e-05, + "loss": 0.1909, + "step": 2216 + }, + { + "epoch": 0.4469071126334878, + "grad_norm": 0.0583464615046978, + "learning_rate": 9.957679701610171e-05, + "loss": 0.1884, + "step": 2218 + }, + { + "epoch": 0.4473100947007858, + "grad_norm": 0.06541068851947784, + "learning_rate": 9.957506504038015e-05, + "loss": 0.1666, + "step": 2220 + }, + { + "epoch": 0.4477130767680838, + "grad_norm": 0.06339999288320541, + "learning_rate": 9.957332954292526e-05, + "loss": 0.2063, + "step": 2222 + }, + { + "epoch": 0.4481160588353818, + "grad_norm": 0.0743519738316536, + "learning_rate": 9.957159052386033e-05, + "loss": 0.1872, + "step": 2224 + }, + { + "epoch": 0.44851904090267986, + "grad_norm": 0.08613268285989761, + "learning_rate": 9.956984798330888e-05, + "loss": 0.2495, + "step": 2226 + }, + { + "epoch": 0.44892202296997785, + "grad_norm": 0.06281512975692749, + "learning_rate": 9.956810192139471e-05, + "loss": 0.2057, + "step": 2228 + }, + { + "epoch": 0.44932500503727585, + "grad_norm": 0.08816128969192505, + "learning_rate": 9.956635233824185e-05, + "loss": 0.2602, + "step": 2230 + }, + { + "epoch": 0.44972798710457385, + "grad_norm": 0.10240280628204346, + "learning_rate": 9.956459923397459e-05, + "loss": 0.2232, + "step": 2232 + }, + { + "epoch": 0.45013096917187184, + "grad_norm": 0.06470140814781189, + "learning_rate": 9.956284260871745e-05, + "loss": 0.2375, + "step": 2234 + }, + { + "epoch": 0.45053395123916984, + "grad_norm": 0.06823807209730148, + "learning_rate": 9.956108246259526e-05, + "loss": 0.1698, + "step": 2236 + }, + { + "epoch": 0.45093693330646784, + "grad_norm": 0.0750935971736908, + "learning_rate": 9.955931879573302e-05, + "loss": 0.1979, + "step": 2238 + }, + { + "epoch": 0.4513399153737659, + "grad_norm": 0.09087410569190979, + "learning_rate": 9.955755160825604e-05, + "loss": 0.2525, + "step": 2240 + }, + { + "epoch": 0.4517428974410639, + "grad_norm": 0.0783960372209549, + "learning_rate": 9.955578090028983e-05, + "loss": 0.1925, + "step": 2242 + }, + { + "epoch": 0.4521458795083619, + "grad_norm": 0.06691782921552658, + "learning_rate": 9.955400667196021e-05, + "loss": 0.2186, + "step": 2244 + }, + { + "epoch": 0.4525488615756599, + "grad_norm": 0.0832383930683136, + "learning_rate": 9.95522289233932e-05, + "loss": 0.219, + "step": 2246 + }, + { + "epoch": 0.4529518436429579, + "grad_norm": 0.07930655032396317, + "learning_rate": 9.95504476547151e-05, + "loss": 0.2159, + "step": 2248 + }, + { + "epoch": 0.45335482571025587, + "grad_norm": 0.07651390880346298, + "learning_rate": 9.954866286605246e-05, + "loss": 0.2016, + "step": 2250 + }, + { + "epoch": 0.4537578077775539, + "grad_norm": 0.06129351630806923, + "learning_rate": 9.954687455753202e-05, + "loss": 0.193, + "step": 2252 + }, + { + "epoch": 0.4541607898448519, + "grad_norm": 0.07738249748945236, + "learning_rate": 9.954508272928087e-05, + "loss": 0.1875, + "step": 2254 + }, + { + "epoch": 0.4545637719121499, + "grad_norm": 0.0622461661696434, + "learning_rate": 9.954328738142628e-05, + "loss": 0.212, + "step": 2256 + }, + { + "epoch": 0.4549667539794479, + "grad_norm": 0.08971701562404633, + "learning_rate": 9.954148851409577e-05, + "loss": 0.1996, + "step": 2258 + }, + { + "epoch": 0.4553697360467459, + "grad_norm": 0.08519180119037628, + "learning_rate": 9.953968612741717e-05, + "loss": 0.2016, + "step": 2260 + }, + { + "epoch": 0.4557727181140439, + "grad_norm": 0.06239600107073784, + "learning_rate": 9.953788022151848e-05, + "loss": 0.2145, + "step": 2262 + }, + { + "epoch": 0.4561757001813419, + "grad_norm": 0.06886850297451019, + "learning_rate": 9.9536070796528e-05, + "loss": 0.1723, + "step": 2264 + }, + { + "epoch": 0.45657868224863996, + "grad_norm": 0.07382401078939438, + "learning_rate": 9.953425785257428e-05, + "loss": 0.247, + "step": 2266 + }, + { + "epoch": 0.45698166431593795, + "grad_norm": 0.07136175781488419, + "learning_rate": 9.953244138978608e-05, + "loss": 0.2037, + "step": 2268 + }, + { + "epoch": 0.45738464638323595, + "grad_norm": 0.07412570714950562, + "learning_rate": 9.953062140829249e-05, + "loss": 0.2047, + "step": 2270 + }, + { + "epoch": 0.45778762845053395, + "grad_norm": 0.07109946012496948, + "learning_rate": 9.952879790822276e-05, + "loss": 0.1648, + "step": 2272 + }, + { + "epoch": 0.45819061051783194, + "grad_norm": 0.09064648300409317, + "learning_rate": 9.952697088970642e-05, + "loss": 0.2203, + "step": 2274 + }, + { + "epoch": 0.45859359258512994, + "grad_norm": 0.06263390183448792, + "learning_rate": 9.952514035287328e-05, + "loss": 0.1801, + "step": 2276 + }, + { + "epoch": 0.458996574652428, + "grad_norm": 0.06853969395160675, + "learning_rate": 9.952330629785338e-05, + "loss": 0.1989, + "step": 2278 + }, + { + "epoch": 0.459399556719726, + "grad_norm": 0.06831579655408859, + "learning_rate": 9.9521468724777e-05, + "loss": 0.2149, + "step": 2280 + }, + { + "epoch": 0.459802538787024, + "grad_norm": 0.07879806309938431, + "learning_rate": 9.951962763377469e-05, + "loss": 0.2391, + "step": 2282 + }, + { + "epoch": 0.460205520854322, + "grad_norm": 0.07946325838565826, + "learning_rate": 9.95177830249772e-05, + "loss": 0.217, + "step": 2284 + }, + { + "epoch": 0.46060850292162, + "grad_norm": 0.04904184117913246, + "learning_rate": 9.951593489851562e-05, + "loss": 0.2036, + "step": 2286 + }, + { + "epoch": 0.461011484988918, + "grad_norm": 0.06621547043323517, + "learning_rate": 9.95140832545212e-05, + "loss": 0.2289, + "step": 2288 + }, + { + "epoch": 0.46141446705621597, + "grad_norm": 0.3637445867061615, + "learning_rate": 9.95122280931255e-05, + "loss": 0.212, + "step": 2290 + }, + { + "epoch": 0.461817449123514, + "grad_norm": 0.10791198909282684, + "learning_rate": 9.95103694144603e-05, + "loss": 0.2213, + "step": 2292 + }, + { + "epoch": 0.462220431190812, + "grad_norm": 0.05832645669579506, + "learning_rate": 9.950850721865763e-05, + "loss": 0.1969, + "step": 2294 + }, + { + "epoch": 0.46262341325811, + "grad_norm": 0.051262617111206055, + "learning_rate": 9.950664150584979e-05, + "loss": 0.2189, + "step": 2296 + }, + { + "epoch": 0.463026395325408, + "grad_norm": 0.08473943918943405, + "learning_rate": 9.950477227616931e-05, + "loss": 0.1805, + "step": 2298 + }, + { + "epoch": 0.463429377392706, + "grad_norm": 0.06247183680534363, + "learning_rate": 9.950289952974898e-05, + "loss": 0.1754, + "step": 2300 + }, + { + "epoch": 0.463832359460004, + "grad_norm": 0.09339699894189835, + "learning_rate": 9.950102326672184e-05, + "loss": 0.2281, + "step": 2302 + }, + { + "epoch": 0.46423534152730206, + "grad_norm": 0.06101800501346588, + "learning_rate": 9.949914348722116e-05, + "loss": 0.2179, + "step": 2304 + }, + { + "epoch": 0.46463832359460006, + "grad_norm": 0.08553671091794968, + "learning_rate": 9.94972601913805e-05, + "loss": 0.2065, + "step": 2306 + }, + { + "epoch": 0.46504130566189805, + "grad_norm": 0.07908914238214493, + "learning_rate": 9.949537337933363e-05, + "loss": 0.1714, + "step": 2308 + }, + { + "epoch": 0.46544428772919605, + "grad_norm": 0.05011114850640297, + "learning_rate": 9.949348305121459e-05, + "loss": 0.1514, + "step": 2310 + }, + { + "epoch": 0.46584726979649405, + "grad_norm": 0.07748831063508987, + "learning_rate": 9.949158920715766e-05, + "loss": 0.217, + "step": 2312 + }, + { + "epoch": 0.46625025186379204, + "grad_norm": 0.07742765545845032, + "learning_rate": 9.94896918472974e-05, + "loss": 0.2007, + "step": 2314 + }, + { + "epoch": 0.46665323393109004, + "grad_norm": 0.06443807482719421, + "learning_rate": 9.948779097176857e-05, + "loss": 0.2428, + "step": 2316 + }, + { + "epoch": 0.4670562159983881, + "grad_norm": 0.060489460825920105, + "learning_rate": 9.948588658070622e-05, + "loss": 0.2127, + "step": 2318 + }, + { + "epoch": 0.4674591980656861, + "grad_norm": 0.0675460621714592, + "learning_rate": 9.948397867424562e-05, + "loss": 0.21, + "step": 2320 + }, + { + "epoch": 0.4678621801329841, + "grad_norm": 0.061212215572595596, + "learning_rate": 9.948206725252231e-05, + "loss": 0.1984, + "step": 2322 + }, + { + "epoch": 0.4682651622002821, + "grad_norm": 0.062292277812957764, + "learning_rate": 9.948015231567208e-05, + "loss": 0.166, + "step": 2324 + }, + { + "epoch": 0.4686681442675801, + "grad_norm": 0.048851098865270615, + "learning_rate": 9.947823386383097e-05, + "loss": 0.1606, + "step": 2326 + }, + { + "epoch": 0.4690711263348781, + "grad_norm": 0.05528166517615318, + "learning_rate": 9.947631189713524e-05, + "loss": 0.1853, + "step": 2328 + }, + { + "epoch": 0.4694741084021761, + "grad_norm": 0.051568541675806046, + "learning_rate": 9.947438641572145e-05, + "loss": 0.2046, + "step": 2330 + }, + { + "epoch": 0.4698770904694741, + "grad_norm": 0.09390091896057129, + "learning_rate": 9.947245741972638e-05, + "loss": 0.2408, + "step": 2332 + }, + { + "epoch": 0.4702800725367721, + "grad_norm": 0.055641692131757736, + "learning_rate": 9.947052490928704e-05, + "loss": 0.2001, + "step": 2334 + }, + { + "epoch": 0.4706830546040701, + "grad_norm": 0.05879371985793114, + "learning_rate": 9.946858888454072e-05, + "loss": 0.2079, + "step": 2336 + }, + { + "epoch": 0.4710860366713681, + "grad_norm": 0.06226501986384392, + "learning_rate": 9.946664934562497e-05, + "loss": 0.2495, + "step": 2338 + }, + { + "epoch": 0.4714890187386661, + "grad_norm": 0.059466030448675156, + "learning_rate": 9.946470629267756e-05, + "loss": 0.1897, + "step": 2340 + }, + { + "epoch": 0.4718920008059641, + "grad_norm": 0.05849766731262207, + "learning_rate": 9.946275972583651e-05, + "loss": 0.1906, + "step": 2342 + }, + { + "epoch": 0.47229498287326216, + "grad_norm": 0.051197245717048645, + "learning_rate": 9.946080964524013e-05, + "loss": 0.1546, + "step": 2344 + }, + { + "epoch": 0.47269796494056016, + "grad_norm": 0.058286767452955246, + "learning_rate": 9.945885605102694e-05, + "loss": 0.1748, + "step": 2346 + }, + { + "epoch": 0.47310094700785815, + "grad_norm": 0.064874567091465, + "learning_rate": 9.94568989433357e-05, + "loss": 0.2119, + "step": 2348 + }, + { + "epoch": 0.47350392907515615, + "grad_norm": 0.05664130300283432, + "learning_rate": 9.945493832230546e-05, + "loss": 0.22, + "step": 2350 + }, + { + "epoch": 0.47390691114245415, + "grad_norm": 0.1102651059627533, + "learning_rate": 9.945297418807549e-05, + "loss": 0.2036, + "step": 2352 + }, + { + "epoch": 0.47430989320975214, + "grad_norm": 0.10592664033174515, + "learning_rate": 9.945100654078532e-05, + "loss": 0.2118, + "step": 2354 + }, + { + "epoch": 0.4747128752770502, + "grad_norm": 0.06491530686616898, + "learning_rate": 9.944903538057473e-05, + "loss": 0.1692, + "step": 2356 + }, + { + "epoch": 0.4751158573443482, + "grad_norm": 0.08583839237689972, + "learning_rate": 9.944706070758373e-05, + "loss": 0.2252, + "step": 2358 + }, + { + "epoch": 0.4755188394116462, + "grad_norm": 0.07632534205913544, + "learning_rate": 9.944508252195264e-05, + "loss": 0.2198, + "step": 2360 + }, + { + "epoch": 0.4759218214789442, + "grad_norm": 0.08482904732227325, + "learning_rate": 9.944310082382198e-05, + "loss": 0.1565, + "step": 2362 + }, + { + "epoch": 0.4763248035462422, + "grad_norm": 0.0885341688990593, + "learning_rate": 9.944111561333248e-05, + "loss": 0.202, + "step": 2364 + }, + { + "epoch": 0.4767277856135402, + "grad_norm": 0.050671521574258804, + "learning_rate": 9.94391268906252e-05, + "loss": 0.1886, + "step": 2366 + }, + { + "epoch": 0.47713076768083823, + "grad_norm": 0.058330848813056946, + "learning_rate": 9.943713465584143e-05, + "loss": 0.1961, + "step": 2368 + }, + { + "epoch": 0.4775337497481362, + "grad_norm": 0.04063691198825836, + "learning_rate": 9.943513890912266e-05, + "loss": 0.1735, + "step": 2370 + }, + { + "epoch": 0.4779367318154342, + "grad_norm": 0.04938462749123573, + "learning_rate": 9.943313965061069e-05, + "loss": 0.1799, + "step": 2372 + }, + { + "epoch": 0.4783397138827322, + "grad_norm": 0.0486234650015831, + "learning_rate": 9.943113688044753e-05, + "loss": 0.1653, + "step": 2374 + }, + { + "epoch": 0.4787426959500302, + "grad_norm": 0.05696294456720352, + "learning_rate": 9.942913059877546e-05, + "loss": 0.1955, + "step": 2376 + }, + { + "epoch": 0.4791456780173282, + "grad_norm": 0.043870650231838226, + "learning_rate": 9.9427120805737e-05, + "loss": 0.156, + "step": 2378 + }, + { + "epoch": 0.4795486600846262, + "grad_norm": 0.06747590750455856, + "learning_rate": 9.942510750147493e-05, + "loss": 0.2309, + "step": 2380 + }, + { + "epoch": 0.47995164215192426, + "grad_norm": 0.05680996552109718, + "learning_rate": 9.942309068613227e-05, + "loss": 0.1492, + "step": 2382 + }, + { + "epoch": 0.48035462421922226, + "grad_norm": 0.08178498595952988, + "learning_rate": 9.942107035985229e-05, + "loss": 0.2317, + "step": 2384 + }, + { + "epoch": 0.48075760628652026, + "grad_norm": 0.055255163460969925, + "learning_rate": 9.941904652277849e-05, + "loss": 0.2344, + "step": 2386 + }, + { + "epoch": 0.48116058835381825, + "grad_norm": 0.07890515774488449, + "learning_rate": 9.941701917505468e-05, + "loss": 0.173, + "step": 2388 + }, + { + "epoch": 0.48156357042111625, + "grad_norm": 0.07036946713924408, + "learning_rate": 9.941498831682486e-05, + "loss": 0.1778, + "step": 2390 + }, + { + "epoch": 0.48196655248841425, + "grad_norm": 0.05692208930850029, + "learning_rate": 9.941295394823328e-05, + "loss": 0.1556, + "step": 2392 + }, + { + "epoch": 0.4823695345557123, + "grad_norm": 0.06822335720062256, + "learning_rate": 9.941091606942447e-05, + "loss": 0.1858, + "step": 2394 + }, + { + "epoch": 0.4827725166230103, + "grad_norm": 0.05806328356266022, + "learning_rate": 9.940887468054323e-05, + "loss": 0.1881, + "step": 2396 + }, + { + "epoch": 0.4831754986903083, + "grad_norm": 0.06417976319789886, + "learning_rate": 9.940682978173455e-05, + "loss": 0.1779, + "step": 2398 + }, + { + "epoch": 0.4835784807576063, + "grad_norm": 0.057871218770742416, + "learning_rate": 9.940478137314368e-05, + "loss": 0.1556, + "step": 2400 + }, + { + "epoch": 0.4839814628249043, + "grad_norm": 0.06445048749446869, + "learning_rate": 9.940272945491616e-05, + "loss": 0.1756, + "step": 2402 + }, + { + "epoch": 0.4843844448922023, + "grad_norm": 0.05714261159300804, + "learning_rate": 9.940067402719773e-05, + "loss": 0.2266, + "step": 2404 + }, + { + "epoch": 0.4847874269595003, + "grad_norm": 0.060727428644895554, + "learning_rate": 9.939861509013444e-05, + "loss": 0.2172, + "step": 2406 + }, + { + "epoch": 0.48519040902679833, + "grad_norm": 0.05237874761223793, + "learning_rate": 9.939655264387253e-05, + "loss": 0.1958, + "step": 2408 + }, + { + "epoch": 0.4855933910940963, + "grad_norm": 0.05351502448320389, + "learning_rate": 9.939448668855853e-05, + "loss": 0.2019, + "step": 2410 + }, + { + "epoch": 0.4859963731613943, + "grad_norm": 0.07500231266021729, + "learning_rate": 9.939241722433918e-05, + "loss": 0.1265, + "step": 2412 + }, + { + "epoch": 0.4863993552286923, + "grad_norm": 0.06734588742256165, + "learning_rate": 9.939034425136152e-05, + "loss": 0.2397, + "step": 2414 + }, + { + "epoch": 0.4868023372959903, + "grad_norm": 0.07433614134788513, + "learning_rate": 9.938826776977276e-05, + "loss": 0.1875, + "step": 2416 + }, + { + "epoch": 0.4872053193632883, + "grad_norm": 0.090156689286232, + "learning_rate": 9.938618777972046e-05, + "loss": 0.1549, + "step": 2418 + }, + { + "epoch": 0.48760830143058637, + "grad_norm": 0.058888860046863556, + "learning_rate": 9.938410428135236e-05, + "loss": 0.2471, + "step": 2420 + }, + { + "epoch": 0.48801128349788436, + "grad_norm": 0.08173494786024094, + "learning_rate": 9.938201727481647e-05, + "loss": 0.2159, + "step": 2422 + }, + { + "epoch": 0.48841426556518236, + "grad_norm": 0.08549089729785919, + "learning_rate": 9.937992676026105e-05, + "loss": 0.1892, + "step": 2424 + }, + { + "epoch": 0.48881724763248036, + "grad_norm": 0.06504713743925095, + "learning_rate": 9.93778327378346e-05, + "loss": 0.2715, + "step": 2426 + }, + { + "epoch": 0.48922022969977835, + "grad_norm": 0.04682251811027527, + "learning_rate": 9.937573520768589e-05, + "loss": 0.1937, + "step": 2428 + }, + { + "epoch": 0.48962321176707635, + "grad_norm": 0.05704076960682869, + "learning_rate": 9.93736341699639e-05, + "loss": 0.2076, + "step": 2430 + }, + { + "epoch": 0.49002619383437435, + "grad_norm": 0.04885469004511833, + "learning_rate": 9.93715296248179e-05, + "loss": 0.2168, + "step": 2432 + }, + { + "epoch": 0.4904291759016724, + "grad_norm": 0.05703292787075043, + "learning_rate": 9.936942157239741e-05, + "loss": 0.1514, + "step": 2434 + }, + { + "epoch": 0.4908321579689704, + "grad_norm": 0.055790483951568604, + "learning_rate": 9.936731001285215e-05, + "loss": 0.2213, + "step": 2436 + }, + { + "epoch": 0.4912351400362684, + "grad_norm": 0.06825339794158936, + "learning_rate": 9.936519494633216e-05, + "loss": 0.2115, + "step": 2438 + }, + { + "epoch": 0.4916381221035664, + "grad_norm": 0.04873501509428024, + "learning_rate": 9.936307637298765e-05, + "loss": 0.2197, + "step": 2440 + }, + { + "epoch": 0.4920411041708644, + "grad_norm": 0.09479009360074997, + "learning_rate": 9.936095429296915e-05, + "loss": 0.2088, + "step": 2442 + }, + { + "epoch": 0.4924440862381624, + "grad_norm": 0.0469268262386322, + "learning_rate": 9.93588287064274e-05, + "loss": 0.1793, + "step": 2444 + }, + { + "epoch": 0.49284706830546043, + "grad_norm": 0.05863165855407715, + "learning_rate": 9.935669961351336e-05, + "loss": 0.1689, + "step": 2446 + }, + { + "epoch": 0.49325005037275843, + "grad_norm": 0.053911954164505005, + "learning_rate": 9.935456701437835e-05, + "loss": 0.1522, + "step": 2448 + }, + { + "epoch": 0.4936530324400564, + "grad_norm": 0.055318981409072876, + "learning_rate": 9.935243090917383e-05, + "loss": 0.1643, + "step": 2450 + }, + { + "epoch": 0.4940560145073544, + "grad_norm": 0.0797191932797432, + "learning_rate": 9.935029129805153e-05, + "loss": 0.2051, + "step": 2452 + }, + { + "epoch": 0.4944589965746524, + "grad_norm": 0.048973795026540756, + "learning_rate": 9.934814818116348e-05, + "loss": 0.1867, + "step": 2454 + }, + { + "epoch": 0.4948619786419504, + "grad_norm": 0.05954229086637497, + "learning_rate": 9.93460015586619e-05, + "loss": 0.1994, + "step": 2456 + }, + { + "epoch": 0.4952649607092484, + "grad_norm": 0.075109101831913, + "learning_rate": 9.934385143069927e-05, + "loss": 0.2131, + "step": 2458 + }, + { + "epoch": 0.49566794277654647, + "grad_norm": 0.07947821170091629, + "learning_rate": 9.934169779742837e-05, + "loss": 0.1959, + "step": 2460 + }, + { + "epoch": 0.49607092484384446, + "grad_norm": 0.06468906998634338, + "learning_rate": 9.933954065900215e-05, + "loss": 0.194, + "step": 2462 + }, + { + "epoch": 0.49647390691114246, + "grad_norm": 0.07549004256725311, + "learning_rate": 9.933738001557386e-05, + "loss": 0.204, + "step": 2464 + }, + { + "epoch": 0.49687688897844046, + "grad_norm": 0.06827647238969803, + "learning_rate": 9.933521586729703e-05, + "loss": 0.1886, + "step": 2466 + }, + { + "epoch": 0.49727987104573845, + "grad_norm": 0.053156349807977676, + "learning_rate": 9.933304821432535e-05, + "loss": 0.1744, + "step": 2468 + }, + { + "epoch": 0.49768285311303645, + "grad_norm": 0.05513354763388634, + "learning_rate": 9.933087705681281e-05, + "loss": 0.1869, + "step": 2470 + }, + { + "epoch": 0.4980858351803345, + "grad_norm": 0.051556315273046494, + "learning_rate": 9.932870239491367e-05, + "loss": 0.152, + "step": 2472 + }, + { + "epoch": 0.4984888172476325, + "grad_norm": 0.054548539221286774, + "learning_rate": 9.932652422878239e-05, + "loss": 0.1541, + "step": 2474 + }, + { + "epoch": 0.4988917993149305, + "grad_norm": 0.05240153521299362, + "learning_rate": 9.932434255857372e-05, + "loss": 0.2452, + "step": 2476 + }, + { + "epoch": 0.4992947813822285, + "grad_norm": 0.09754245728254318, + "learning_rate": 9.932215738444263e-05, + "loss": 0.2503, + "step": 2478 + }, + { + "epoch": 0.4996977634495265, + "grad_norm": 0.05348599702119827, + "learning_rate": 9.931996870654438e-05, + "loss": 0.2018, + "step": 2480 + }, + { + "epoch": 0.5001007455168245, + "grad_norm": 0.055064063519239426, + "learning_rate": 9.931777652503442e-05, + "loss": 0.1739, + "step": 2482 + }, + { + "epoch": 0.5005037275841225, + "grad_norm": 0.07443667203187943, + "learning_rate": 9.931558084006849e-05, + "loss": 0.2414, + "step": 2484 + }, + { + "epoch": 0.5009067096514205, + "grad_norm": 0.06823594868183136, + "learning_rate": 9.931338165180254e-05, + "loss": 0.188, + "step": 2486 + }, + { + "epoch": 0.5013096917187185, + "grad_norm": 0.0575629360973835, + "learning_rate": 9.931117896039286e-05, + "loss": 0.2041, + "step": 2488 + }, + { + "epoch": 0.5017126737860165, + "grad_norm": 0.05056982487440109, + "learning_rate": 9.930897276599587e-05, + "loss": 0.1903, + "step": 2490 + }, + { + "epoch": 0.5021156558533145, + "grad_norm": 0.0499715618789196, + "learning_rate": 9.930676306876832e-05, + "loss": 0.188, + "step": 2492 + }, + { + "epoch": 0.5025186379206126, + "grad_norm": 0.06276939809322357, + "learning_rate": 9.930454986886716e-05, + "loss": 0.1728, + "step": 2494 + }, + { + "epoch": 0.5029216199879105, + "grad_norm": 0.05897986888885498, + "learning_rate": 9.930233316644963e-05, + "loss": 0.1712, + "step": 2496 + }, + { + "epoch": 0.5033246020552086, + "grad_norm": 0.06042663753032684, + "learning_rate": 9.93001129616732e-05, + "loss": 0.1711, + "step": 2498 + }, + { + "epoch": 0.5037275841225065, + "grad_norm": 0.0756792277097702, + "learning_rate": 9.92978892546956e-05, + "loss": 0.2415, + "step": 2500 + }, + { + "epoch": 0.5041305661898046, + "grad_norm": 0.07142479717731476, + "learning_rate": 9.92956620456748e-05, + "loss": 0.2126, + "step": 2502 + }, + { + "epoch": 0.5045335482571025, + "grad_norm": 0.045365698635578156, + "learning_rate": 9.929343133476898e-05, + "loss": 0.2324, + "step": 2504 + }, + { + "epoch": 0.5049365303244006, + "grad_norm": 0.0676538497209549, + "learning_rate": 9.929119712213664e-05, + "loss": 0.2497, + "step": 2506 + }, + { + "epoch": 0.5053395123916986, + "grad_norm": 0.08482253551483154, + "learning_rate": 9.92889594079365e-05, + "loss": 0.1959, + "step": 2508 + }, + { + "epoch": 0.5057424944589965, + "grad_norm": 0.05569139122962952, + "learning_rate": 9.928671819232749e-05, + "loss": 0.2229, + "step": 2510 + }, + { + "epoch": 0.5061454765262946, + "grad_norm": 0.052241578698158264, + "learning_rate": 9.928447347546885e-05, + "loss": 0.2421, + "step": 2512 + }, + { + "epoch": 0.5065484585935925, + "grad_norm": 0.04659108445048332, + "learning_rate": 9.928222525752002e-05, + "loss": 0.1722, + "step": 2514 + }, + { + "epoch": 0.5069514406608906, + "grad_norm": 0.06179108843207359, + "learning_rate": 9.927997353864073e-05, + "loss": 0.1461, + "step": 2516 + }, + { + "epoch": 0.5073544227281886, + "grad_norm": 0.05853302776813507, + "learning_rate": 9.927771831899095e-05, + "loss": 0.1849, + "step": 2518 + }, + { + "epoch": 0.5077574047954866, + "grad_norm": 0.05103042721748352, + "learning_rate": 9.927545959873086e-05, + "loss": 0.2018, + "step": 2520 + }, + { + "epoch": 0.5081603868627846, + "grad_norm": 0.047647956758737564, + "learning_rate": 9.92731973780209e-05, + "loss": 0.1589, + "step": 2522 + }, + { + "epoch": 0.5085633689300826, + "grad_norm": 0.07556428760290146, + "learning_rate": 9.927093165702182e-05, + "loss": 0.2306, + "step": 2524 + }, + { + "epoch": 0.5089663509973806, + "grad_norm": 0.06066511198878288, + "learning_rate": 9.926866243589456e-05, + "loss": 0.1792, + "step": 2526 + }, + { + "epoch": 0.5093693330646786, + "grad_norm": 0.06090389937162399, + "learning_rate": 9.92663897148003e-05, + "loss": 0.2135, + "step": 2528 + }, + { + "epoch": 0.5097723151319766, + "grad_norm": 0.04129406809806824, + "learning_rate": 9.92641134939005e-05, + "loss": 0.1798, + "step": 2530 + }, + { + "epoch": 0.5101752971992747, + "grad_norm": 0.0553179495036602, + "learning_rate": 9.926183377335689e-05, + "loss": 0.2488, + "step": 2532 + }, + { + "epoch": 0.5105782792665726, + "grad_norm": 0.04894782975316048, + "learning_rate": 9.925955055333136e-05, + "loss": 0.218, + "step": 2534 + }, + { + "epoch": 0.5109812613338707, + "grad_norm": 0.05691911652684212, + "learning_rate": 9.925726383398617e-05, + "loss": 0.1882, + "step": 2536 + }, + { + "epoch": 0.5113842434011686, + "grad_norm": 0.040179017931222916, + "learning_rate": 9.925497361548371e-05, + "loss": 0.1446, + "step": 2538 + }, + { + "epoch": 0.5117872254684667, + "grad_norm": 0.06504693627357483, + "learning_rate": 9.92526798979867e-05, + "loss": 0.1943, + "step": 2540 + }, + { + "epoch": 0.5121902075357646, + "grad_norm": 0.06121726706624031, + "learning_rate": 9.925038268165808e-05, + "loss": 0.1886, + "step": 2542 + }, + { + "epoch": 0.5125931896030627, + "grad_norm": 0.05474965274333954, + "learning_rate": 9.924808196666103e-05, + "loss": 0.206, + "step": 2544 + }, + { + "epoch": 0.5129961716703607, + "grad_norm": 0.06797949224710464, + "learning_rate": 9.924577775315901e-05, + "loss": 0.1983, + "step": 2546 + }, + { + "epoch": 0.5133991537376587, + "grad_norm": 0.05731568858027458, + "learning_rate": 9.924347004131568e-05, + "loss": 0.226, + "step": 2548 + }, + { + "epoch": 0.5138021358049567, + "grad_norm": 0.059852488338947296, + "learning_rate": 9.924115883129501e-05, + "loss": 0.1754, + "step": 2550 + }, + { + "epoch": 0.5142051178722546, + "grad_norm": 0.05445285141468048, + "learning_rate": 9.923884412326116e-05, + "loss": 0.217, + "step": 2552 + }, + { + "epoch": 0.5146080999395527, + "grad_norm": 0.07617855072021484, + "learning_rate": 9.923652591737856e-05, + "loss": 0.2535, + "step": 2554 + }, + { + "epoch": 0.5150110820068506, + "grad_norm": 0.06034578010439873, + "learning_rate": 9.923420421381191e-05, + "loss": 0.1515, + "step": 2556 + }, + { + "epoch": 0.5154140640741487, + "grad_norm": 0.05395951867103577, + "learning_rate": 9.923187901272613e-05, + "loss": 0.194, + "step": 2558 + }, + { + "epoch": 0.5158170461414467, + "grad_norm": 0.06721216440200806, + "learning_rate": 9.92295503142864e-05, + "loss": 0.1354, + "step": 2560 + }, + { + "epoch": 0.5162200282087447, + "grad_norm": 0.05250773951411247, + "learning_rate": 9.922721811865815e-05, + "loss": 0.2008, + "step": 2562 + }, + { + "epoch": 0.5166230102760427, + "grad_norm": 0.06109349802136421, + "learning_rate": 9.922488242600705e-05, + "loss": 0.1996, + "step": 2564 + }, + { + "epoch": 0.5170259923433407, + "grad_norm": 0.057626061141490936, + "learning_rate": 9.922254323649902e-05, + "loss": 0.2194, + "step": 2566 + }, + { + "epoch": 0.5174289744106387, + "grad_norm": 0.08392177522182465, + "learning_rate": 9.922020055030025e-05, + "loss": 0.2169, + "step": 2568 + }, + { + "epoch": 0.5178319564779368, + "grad_norm": 0.04369249939918518, + "learning_rate": 9.921785436757713e-05, + "loss": 0.2107, + "step": 2570 + }, + { + "epoch": 0.5182349385452347, + "grad_norm": 0.04233145713806152, + "learning_rate": 9.921550468849636e-05, + "loss": 0.1723, + "step": 2572 + }, + { + "epoch": 0.5186379206125328, + "grad_norm": 0.06697847694158554, + "learning_rate": 9.921315151322486e-05, + "loss": 0.18, + "step": 2574 + }, + { + "epoch": 0.5190409026798307, + "grad_norm": 0.06663229316473007, + "learning_rate": 9.921079484192975e-05, + "loss": 0.2025, + "step": 2576 + }, + { + "epoch": 0.5194438847471288, + "grad_norm": 0.05028081312775612, + "learning_rate": 9.92084346747785e-05, + "loss": 0.1953, + "step": 2578 + }, + { + "epoch": 0.5198468668144267, + "grad_norm": 0.05140357092022896, + "learning_rate": 9.920607101193875e-05, + "loss": 0.1753, + "step": 2580 + }, + { + "epoch": 0.5202498488817248, + "grad_norm": 0.048799578100442886, + "learning_rate": 9.920370385357839e-05, + "loss": 0.1184, + "step": 2582 + }, + { + "epoch": 0.5206528309490228, + "grad_norm": 0.04645717889070511, + "learning_rate": 9.92013331998656e-05, + "loss": 0.1956, + "step": 2584 + }, + { + "epoch": 0.5210558130163208, + "grad_norm": 0.06382746249437332, + "learning_rate": 9.91989590509688e-05, + "loss": 0.2239, + "step": 2586 + }, + { + "epoch": 0.5214587950836188, + "grad_norm": 0.05598960071802139, + "learning_rate": 9.919658140705662e-05, + "loss": 0.2089, + "step": 2588 + }, + { + "epoch": 0.5218617771509168, + "grad_norm": 0.04480404034256935, + "learning_rate": 9.919420026829797e-05, + "loss": 0.2062, + "step": 2590 + }, + { + "epoch": 0.5222647592182148, + "grad_norm": 0.0500781387090683, + "learning_rate": 9.919181563486201e-05, + "loss": 0.1724, + "step": 2592 + }, + { + "epoch": 0.5226677412855127, + "grad_norm": 0.07896167784929276, + "learning_rate": 9.918942750691816e-05, + "loss": 0.1899, + "step": 2594 + }, + { + "epoch": 0.5230707233528108, + "grad_norm": 0.06729870289564133, + "learning_rate": 9.918703588463603e-05, + "loss": 0.197, + "step": 2596 + }, + { + "epoch": 0.5234737054201088, + "grad_norm": 0.059732042253017426, + "learning_rate": 9.918464076818553e-05, + "loss": 0.2084, + "step": 2598 + }, + { + "epoch": 0.5238766874874068, + "grad_norm": 0.07242099940776825, + "learning_rate": 9.918224215773682e-05, + "loss": 0.1948, + "step": 2600 + }, + { + "epoch": 0.5242796695547048, + "grad_norm": 0.05805491283535957, + "learning_rate": 9.917984005346027e-05, + "loss": 0.2387, + "step": 2602 + }, + { + "epoch": 0.5246826516220028, + "grad_norm": 0.06213317811489105, + "learning_rate": 9.917743445552654e-05, + "loss": 0.2208, + "step": 2604 + }, + { + "epoch": 0.5250856336893008, + "grad_norm": 0.05157310143113136, + "learning_rate": 9.917502536410652e-05, + "loss": 0.19, + "step": 2606 + }, + { + "epoch": 0.5254886157565988, + "grad_norm": 0.06837864220142365, + "learning_rate": 9.917261277937133e-05, + "loss": 0.2255, + "step": 2608 + }, + { + "epoch": 0.5258915978238968, + "grad_norm": 0.06203755363821983, + "learning_rate": 9.917019670149236e-05, + "loss": 0.2151, + "step": 2610 + }, + { + "epoch": 0.5262945798911949, + "grad_norm": 0.05853395164012909, + "learning_rate": 9.916777713064129e-05, + "loss": 0.2176, + "step": 2612 + }, + { + "epoch": 0.5266975619584928, + "grad_norm": 0.0638599693775177, + "learning_rate": 9.916535406698994e-05, + "loss": 0.1831, + "step": 2614 + }, + { + "epoch": 0.5271005440257909, + "grad_norm": 0.055155493319034576, + "learning_rate": 9.916292751071046e-05, + "loss": 0.231, + "step": 2616 + }, + { + "epoch": 0.5275035260930888, + "grad_norm": 0.04611392319202423, + "learning_rate": 9.916049746197524e-05, + "loss": 0.1901, + "step": 2618 + }, + { + "epoch": 0.5279065081603869, + "grad_norm": 0.05793232470750809, + "learning_rate": 9.91580639209569e-05, + "loss": 0.1608, + "step": 2620 + }, + { + "epoch": 0.5283094902276849, + "grad_norm": 0.05881835147738457, + "learning_rate": 9.915562688782832e-05, + "loss": 0.2177, + "step": 2622 + }, + { + "epoch": 0.5287124722949829, + "grad_norm": 0.04847261682152748, + "learning_rate": 9.915318636276262e-05, + "loss": 0.183, + "step": 2624 + }, + { + "epoch": 0.5291154543622809, + "grad_norm": 0.0664074644446373, + "learning_rate": 9.915074234593316e-05, + "loss": 0.2255, + "step": 2626 + }, + { + "epoch": 0.5295184364295789, + "grad_norm": 0.06530804187059402, + "learning_rate": 9.914829483751358e-05, + "loss": 0.1786, + "step": 2628 + }, + { + "epoch": 0.5299214184968769, + "grad_norm": 0.0650443434715271, + "learning_rate": 9.914584383767773e-05, + "loss": 0.1792, + "step": 2630 + }, + { + "epoch": 0.5303244005641748, + "grad_norm": 0.05127038061618805, + "learning_rate": 9.914338934659973e-05, + "loss": 0.242, + "step": 2632 + }, + { + "epoch": 0.5307273826314729, + "grad_norm": 0.051562655717134476, + "learning_rate": 9.914093136445395e-05, + "loss": 0.1925, + "step": 2634 + }, + { + "epoch": 0.531130364698771, + "grad_norm": 0.058482736349105835, + "learning_rate": 9.913846989141499e-05, + "loss": 0.2273, + "step": 2636 + }, + { + "epoch": 0.5315333467660689, + "grad_norm": 0.058450616896152496, + "learning_rate": 9.913600492765771e-05, + "loss": 0.1778, + "step": 2638 + }, + { + "epoch": 0.531936328833367, + "grad_norm": 0.04832917079329491, + "learning_rate": 9.913353647335723e-05, + "loss": 0.2222, + "step": 2640 + }, + { + "epoch": 0.5323393109006649, + "grad_norm": 0.05004747956991196, + "learning_rate": 9.91310645286889e-05, + "loss": 0.1857, + "step": 2642 + }, + { + "epoch": 0.5327422929679629, + "grad_norm": 0.05869967117905617, + "learning_rate": 9.91285890938283e-05, + "loss": 0.1851, + "step": 2644 + }, + { + "epoch": 0.5331452750352609, + "grad_norm": 0.06229320168495178, + "learning_rate": 9.912611016895131e-05, + "loss": 0.1887, + "step": 2646 + }, + { + "epoch": 0.5335482571025589, + "grad_norm": 0.05405684933066368, + "learning_rate": 9.912362775423403e-05, + "loss": 0.1712, + "step": 2648 + }, + { + "epoch": 0.533951239169857, + "grad_norm": 0.06194634735584259, + "learning_rate": 9.912114184985279e-05, + "loss": 0.2287, + "step": 2650 + }, + { + "epoch": 0.5343542212371549, + "grad_norm": 0.04791036620736122, + "learning_rate": 9.911865245598419e-05, + "loss": 0.1607, + "step": 2652 + }, + { + "epoch": 0.534757203304453, + "grad_norm": 0.06112559512257576, + "learning_rate": 9.911615957280506e-05, + "loss": 0.2234, + "step": 2654 + }, + { + "epoch": 0.5351601853717509, + "grad_norm": 0.04859330132603645, + "learning_rate": 9.911366320049253e-05, + "loss": 0.2157, + "step": 2656 + }, + { + "epoch": 0.535563167439049, + "grad_norm": 0.07223501056432724, + "learning_rate": 9.91111633392239e-05, + "loss": 0.2384, + "step": 2658 + }, + { + "epoch": 0.535966149506347, + "grad_norm": 0.04936986416578293, + "learning_rate": 9.910865998917675e-05, + "loss": 0.2292, + "step": 2660 + }, + { + "epoch": 0.536369131573645, + "grad_norm": 0.05428776890039444, + "learning_rate": 9.910615315052896e-05, + "loss": 0.201, + "step": 2662 + }, + { + "epoch": 0.536772113640943, + "grad_norm": 0.05943161994218826, + "learning_rate": 9.910364282345857e-05, + "loss": 0.219, + "step": 2664 + }, + { + "epoch": 0.537175095708241, + "grad_norm": 0.058624330908060074, + "learning_rate": 9.910112900814393e-05, + "loss": 0.1767, + "step": 2666 + }, + { + "epoch": 0.537578077775539, + "grad_norm": 0.056275349110364914, + "learning_rate": 9.90986117047636e-05, + "loss": 0.1772, + "step": 2668 + }, + { + "epoch": 0.537981059842837, + "grad_norm": 0.06266789138317108, + "learning_rate": 9.90960909134964e-05, + "loss": 0.1881, + "step": 2670 + }, + { + "epoch": 0.538384041910135, + "grad_norm": 0.05568632483482361, + "learning_rate": 9.909356663452146e-05, + "loss": 0.1773, + "step": 2672 + }, + { + "epoch": 0.538787023977433, + "grad_norm": 0.050182417035102844, + "learning_rate": 9.909103886801803e-05, + "loss": 0.2263, + "step": 2674 + }, + { + "epoch": 0.539190006044731, + "grad_norm": 0.06228543445467949, + "learning_rate": 9.908850761416573e-05, + "loss": 0.2069, + "step": 2676 + }, + { + "epoch": 0.539592988112029, + "grad_norm": 0.06411072611808777, + "learning_rate": 9.908597287314434e-05, + "loss": 0.1986, + "step": 2678 + }, + { + "epoch": 0.539995970179327, + "grad_norm": 0.04468056932091713, + "learning_rate": 9.908343464513394e-05, + "loss": 0.1636, + "step": 2680 + }, + { + "epoch": 0.540398952246625, + "grad_norm": 0.05913592502474785, + "learning_rate": 9.908089293031483e-05, + "loss": 0.2376, + "step": 2682 + }, + { + "epoch": 0.540801934313923, + "grad_norm": 0.253537118434906, + "learning_rate": 9.907834772886761e-05, + "loss": 0.1615, + "step": 2684 + }, + { + "epoch": 0.541204916381221, + "grad_norm": 0.0542256236076355, + "learning_rate": 9.907579904097305e-05, + "loss": 0.165, + "step": 2686 + }, + { + "epoch": 0.5416078984485191, + "grad_norm": 0.06959859281778336, + "learning_rate": 9.907324686681218e-05, + "loss": 0.1724, + "step": 2688 + }, + { + "epoch": 0.542010880515817, + "grad_norm": 0.05335042253136635, + "learning_rate": 9.907069120656636e-05, + "loss": 0.2072, + "step": 2690 + }, + { + "epoch": 0.5424138625831151, + "grad_norm": 0.04987449571490288, + "learning_rate": 9.90681320604171e-05, + "loss": 0.1566, + "step": 2692 + }, + { + "epoch": 0.542816844650413, + "grad_norm": 0.04512554407119751, + "learning_rate": 9.906556942854623e-05, + "loss": 0.2381, + "step": 2694 + }, + { + "epoch": 0.5432198267177111, + "grad_norm": 0.042101211845874786, + "learning_rate": 9.906300331113576e-05, + "loss": 0.1501, + "step": 2696 + }, + { + "epoch": 0.543622808785009, + "grad_norm": 0.06392179429531097, + "learning_rate": 9.9060433708368e-05, + "loss": 0.2025, + "step": 2698 + }, + { + "epoch": 0.5440257908523071, + "grad_norm": 0.05543966218829155, + "learning_rate": 9.905786062042551e-05, + "loss": 0.1677, + "step": 2700 + }, + { + "epoch": 0.5444287729196051, + "grad_norm": 0.06768188625574112, + "learning_rate": 9.905528404749102e-05, + "loss": 0.197, + "step": 2702 + }, + { + "epoch": 0.5448317549869031, + "grad_norm": 0.09007920324802399, + "learning_rate": 9.905270398974763e-05, + "loss": 0.1605, + "step": 2704 + }, + { + "epoch": 0.5452347370542011, + "grad_norm": 0.06303185969591141, + "learning_rate": 9.90501204473786e-05, + "loss": 0.1849, + "step": 2706 + }, + { + "epoch": 0.545637719121499, + "grad_norm": 0.08676113933324814, + "learning_rate": 9.904753342056746e-05, + "loss": 0.1749, + "step": 2708 + }, + { + "epoch": 0.5460407011887971, + "grad_norm": 0.056663088500499725, + "learning_rate": 9.904494290949797e-05, + "loss": 0.204, + "step": 2710 + }, + { + "epoch": 0.5464436832560952, + "grad_norm": 0.06061787158250809, + "learning_rate": 9.904234891435416e-05, + "loss": 0.1735, + "step": 2712 + }, + { + "epoch": 0.5468466653233931, + "grad_norm": 0.0656784325838089, + "learning_rate": 9.903975143532034e-05, + "loss": 0.2053, + "step": 2714 + }, + { + "epoch": 0.5472496473906912, + "grad_norm": 0.0533025786280632, + "learning_rate": 9.9037150472581e-05, + "loss": 0.1506, + "step": 2716 + }, + { + "epoch": 0.5476526294579891, + "grad_norm": 0.060621704906225204, + "learning_rate": 9.903454602632092e-05, + "loss": 0.2182, + "step": 2718 + }, + { + "epoch": 0.5480556115252871, + "grad_norm": 0.05371670052409172, + "learning_rate": 9.903193809672509e-05, + "loss": 0.145, + "step": 2720 + }, + { + "epoch": 0.5484585935925851, + "grad_norm": 0.08136498928070068, + "learning_rate": 9.90293266839788e-05, + "loss": 0.2058, + "step": 2722 + }, + { + "epoch": 0.5488615756598831, + "grad_norm": 0.06714341789484024, + "learning_rate": 9.902671178826757e-05, + "loss": 0.2286, + "step": 2724 + }, + { + "epoch": 0.5492645577271812, + "grad_norm": 0.05731106176972389, + "learning_rate": 9.902409340977713e-05, + "loss": 0.1818, + "step": 2726 + }, + { + "epoch": 0.5496675397944791, + "grad_norm": 0.06478509306907654, + "learning_rate": 9.902147154869348e-05, + "loss": 0.1992, + "step": 2728 + }, + { + "epoch": 0.5500705218617772, + "grad_norm": 0.06453832238912582, + "learning_rate": 9.901884620520291e-05, + "loss": 0.2373, + "step": 2730 + }, + { + "epoch": 0.5504735039290751, + "grad_norm": 0.06110521778464317, + "learning_rate": 9.901621737949189e-05, + "loss": 0.1849, + "step": 2732 + }, + { + "epoch": 0.5508764859963732, + "grad_norm": 0.04943872615695, + "learning_rate": 9.901358507174719e-05, + "loss": 0.179, + "step": 2734 + }, + { + "epoch": 0.5512794680636711, + "grad_norm": 0.046395331621170044, + "learning_rate": 9.901094928215577e-05, + "loss": 0.2282, + "step": 2736 + }, + { + "epoch": 0.5516824501309692, + "grad_norm": 0.05042804405093193, + "learning_rate": 9.900831001090491e-05, + "loss": 0.2087, + "step": 2738 + }, + { + "epoch": 0.5520854321982672, + "grad_norm": 0.040862563997507095, + "learning_rate": 9.900566725818208e-05, + "loss": 0.2409, + "step": 2740 + }, + { + "epoch": 0.5524884142655652, + "grad_norm": 0.058182161301374435, + "learning_rate": 9.900302102417502e-05, + "loss": 0.2004, + "step": 2742 + }, + { + "epoch": 0.5528913963328632, + "grad_norm": 0.05086760222911835, + "learning_rate": 9.900037130907171e-05, + "loss": 0.2169, + "step": 2744 + }, + { + "epoch": 0.5532943784001612, + "grad_norm": 0.04748394712805748, + "learning_rate": 9.89977181130604e-05, + "loss": 0.1727, + "step": 2746 + }, + { + "epoch": 0.5536973604674592, + "grad_norm": 0.05247688293457031, + "learning_rate": 9.899506143632954e-05, + "loss": 0.2065, + "step": 2748 + }, + { + "epoch": 0.5541003425347572, + "grad_norm": 0.05333872139453888, + "learning_rate": 9.899240127906791e-05, + "loss": 0.209, + "step": 2750 + }, + { + "epoch": 0.5545033246020552, + "grad_norm": 0.05926572158932686, + "learning_rate": 9.89897376414644e-05, + "loss": 0.2409, + "step": 2752 + }, + { + "epoch": 0.5549063066693533, + "grad_norm": 0.05073244497179985, + "learning_rate": 9.89870705237083e-05, + "loss": 0.1838, + "step": 2754 + }, + { + "epoch": 0.5553092887366512, + "grad_norm": 0.04936111718416214, + "learning_rate": 9.898439992598904e-05, + "loss": 0.2102, + "step": 2756 + }, + { + "epoch": 0.5557122708039492, + "grad_norm": 0.06824660301208496, + "learning_rate": 9.898172584849636e-05, + "loss": 0.2545, + "step": 2758 + }, + { + "epoch": 0.5561152528712472, + "grad_norm": 0.07397361099720001, + "learning_rate": 9.89790482914202e-05, + "loss": 0.2195, + "step": 2760 + }, + { + "epoch": 0.5565182349385452, + "grad_norm": 0.0612940639257431, + "learning_rate": 9.897636725495078e-05, + "loss": 0.2221, + "step": 2762 + }, + { + "epoch": 0.5569212170058433, + "grad_norm": 0.05344128981232643, + "learning_rate": 9.897368273927857e-05, + "loss": 0.193, + "step": 2764 + }, + { + "epoch": 0.5573241990731412, + "grad_norm": 0.06365928053855896, + "learning_rate": 9.897099474459424e-05, + "loss": 0.1599, + "step": 2766 + }, + { + "epoch": 0.5577271811404393, + "grad_norm": 0.05055849254131317, + "learning_rate": 9.896830327108878e-05, + "loss": 0.1981, + "step": 2768 + }, + { + "epoch": 0.5581301632077372, + "grad_norm": 0.05964979901909828, + "learning_rate": 9.896560831895335e-05, + "loss": 0.1928, + "step": 2770 + }, + { + "epoch": 0.5585331452750353, + "grad_norm": 0.05901632830500603, + "learning_rate": 9.896290988837942e-05, + "loss": 0.1739, + "step": 2772 + }, + { + "epoch": 0.5589361273423332, + "grad_norm": 0.05880974978208542, + "learning_rate": 9.896020797955868e-05, + "loss": 0.212, + "step": 2774 + }, + { + "epoch": 0.5593391094096313, + "grad_norm": 0.07250037789344788, + "learning_rate": 9.895750259268307e-05, + "loss": 0.1928, + "step": 2776 + }, + { + "epoch": 0.5597420914769293, + "grad_norm": 0.05343032628297806, + "learning_rate": 9.895479372794477e-05, + "loss": 0.206, + "step": 2778 + }, + { + "epoch": 0.5601450735442273, + "grad_norm": 0.052220240235328674, + "learning_rate": 9.89520813855362e-05, + "loss": 0.1788, + "step": 2780 + }, + { + "epoch": 0.5605480556115253, + "grad_norm": 0.056068528443574905, + "learning_rate": 9.894936556565008e-05, + "loss": 0.2249, + "step": 2782 + }, + { + "epoch": 0.5609510376788233, + "grad_norm": 0.050359394401311874, + "learning_rate": 9.89466462684793e-05, + "loss": 0.151, + "step": 2784 + }, + { + "epoch": 0.5613540197461213, + "grad_norm": 0.058597322553396225, + "learning_rate": 9.894392349421704e-05, + "loss": 0.1641, + "step": 2786 + }, + { + "epoch": 0.5617570018134193, + "grad_norm": 0.0488249696791172, + "learning_rate": 9.894119724305675e-05, + "loss": 0.1949, + "step": 2788 + }, + { + "epoch": 0.5621599838807173, + "grad_norm": 0.0449531264603138, + "learning_rate": 9.893846751519205e-05, + "loss": 0.1553, + "step": 2790 + }, + { + "epoch": 0.5625629659480154, + "grad_norm": 0.09443546086549759, + "learning_rate": 9.89357343108169e-05, + "loss": 0.214, + "step": 2792 + }, + { + "epoch": 0.5629659480153133, + "grad_norm": 0.08376643806695938, + "learning_rate": 9.893299763012545e-05, + "loss": 0.2011, + "step": 2794 + }, + { + "epoch": 0.5633689300826114, + "grad_norm": 0.05647800862789154, + "learning_rate": 9.893025747331211e-05, + "loss": 0.1575, + "step": 2796 + }, + { + "epoch": 0.5637719121499093, + "grad_norm": 0.05209295079112053, + "learning_rate": 9.89275138405715e-05, + "loss": 0.2157, + "step": 2798 + }, + { + "epoch": 0.5641748942172073, + "grad_norm": 0.06927008181810379, + "learning_rate": 9.892476673209858e-05, + "loss": 0.1606, + "step": 2800 + }, + { + "epoch": 0.5645778762845053, + "grad_norm": 0.06608123332262039, + "learning_rate": 9.892201614808848e-05, + "loss": 0.2286, + "step": 2802 + }, + { + "epoch": 0.5649808583518033, + "grad_norm": 0.05888240784406662, + "learning_rate": 9.891926208873658e-05, + "loss": 0.2173, + "step": 2804 + }, + { + "epoch": 0.5653838404191014, + "grad_norm": 0.05698193237185478, + "learning_rate": 9.891650455423854e-05, + "loss": 0.2126, + "step": 2806 + }, + { + "epoch": 0.5657868224863993, + "grad_norm": 0.05356952175498009, + "learning_rate": 9.891374354479025e-05, + "loss": 0.2118, + "step": 2808 + }, + { + "epoch": 0.5661898045536974, + "grad_norm": 0.05336877331137657, + "learning_rate": 9.891097906058784e-05, + "loss": 0.1706, + "step": 2810 + }, + { + "epoch": 0.5665927866209953, + "grad_norm": 0.04258881136775017, + "learning_rate": 9.890821110182769e-05, + "loss": 0.2012, + "step": 2812 + }, + { + "epoch": 0.5669957686882934, + "grad_norm": 0.041180457919836044, + "learning_rate": 9.890543966870646e-05, + "loss": 0.1952, + "step": 2814 + }, + { + "epoch": 0.5673987507555914, + "grad_norm": 0.0453479178249836, + "learning_rate": 9.8902664761421e-05, + "loss": 0.2232, + "step": 2816 + }, + { + "epoch": 0.5678017328228894, + "grad_norm": 0.0834401398897171, + "learning_rate": 9.889988638016844e-05, + "loss": 0.1682, + "step": 2818 + }, + { + "epoch": 0.5682047148901874, + "grad_norm": 0.053984783589839935, + "learning_rate": 9.889710452514616e-05, + "loss": 0.2373, + "step": 2820 + }, + { + "epoch": 0.5686076969574854, + "grad_norm": 0.08120491355657578, + "learning_rate": 9.889431919655176e-05, + "loss": 0.1928, + "step": 2822 + }, + { + "epoch": 0.5690106790247834, + "grad_norm": 0.0568217933177948, + "learning_rate": 9.889153039458314e-05, + "loss": 0.2374, + "step": 2824 + }, + { + "epoch": 0.5694136610920814, + "grad_norm": 0.04305284842848778, + "learning_rate": 9.888873811943838e-05, + "loss": 0.1806, + "step": 2826 + }, + { + "epoch": 0.5698166431593794, + "grad_norm": 0.05167011171579361, + "learning_rate": 9.888594237131586e-05, + "loss": 0.2031, + "step": 2828 + }, + { + "epoch": 0.5702196252266775, + "grad_norm": 0.042756181210279465, + "learning_rate": 9.888314315041417e-05, + "loss": 0.218, + "step": 2830 + }, + { + "epoch": 0.5706226072939754, + "grad_norm": 0.05174868926405907, + "learning_rate": 9.888034045693215e-05, + "loss": 0.1962, + "step": 2832 + }, + { + "epoch": 0.5710255893612735, + "grad_norm": 0.04308900609612465, + "learning_rate": 9.887753429106894e-05, + "loss": 0.1956, + "step": 2834 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.050183191895484924, + "learning_rate": 9.887472465302386e-05, + "loss": 0.1871, + "step": 2836 + }, + { + "epoch": 0.5718315534958694, + "grad_norm": 0.052778877317905426, + "learning_rate": 9.887191154299649e-05, + "loss": 0.2299, + "step": 2838 + }, + { + "epoch": 0.5722345355631674, + "grad_norm": 0.05091378092765808, + "learning_rate": 9.886909496118668e-05, + "loss": 0.2262, + "step": 2840 + }, + { + "epoch": 0.5726375176304654, + "grad_norm": 0.054165229201316833, + "learning_rate": 9.886627490779452e-05, + "loss": 0.1676, + "step": 2842 + }, + { + "epoch": 0.5730404996977635, + "grad_norm": 0.0649203434586525, + "learning_rate": 9.886345138302035e-05, + "loss": 0.1646, + "step": 2844 + }, + { + "epoch": 0.5734434817650614, + "grad_norm": 0.052568770945072174, + "learning_rate": 9.886062438706474e-05, + "loss": 0.2328, + "step": 2846 + }, + { + "epoch": 0.5738464638323595, + "grad_norm": 0.047305673360824585, + "learning_rate": 9.885779392012852e-05, + "loss": 0.1873, + "step": 2848 + }, + { + "epoch": 0.5742494458996574, + "grad_norm": 0.05377272143959999, + "learning_rate": 9.885495998241275e-05, + "loss": 0.1747, + "step": 2850 + }, + { + "epoch": 0.5746524279669555, + "grad_norm": 0.061727289110422134, + "learning_rate": 9.885212257411875e-05, + "loss": 0.2086, + "step": 2852 + }, + { + "epoch": 0.5750554100342534, + "grad_norm": 0.056558411568403244, + "learning_rate": 9.88492816954481e-05, + "loss": 0.2031, + "step": 2854 + }, + { + "epoch": 0.5754583921015515, + "grad_norm": 0.06308440864086151, + "learning_rate": 9.88464373466026e-05, + "loss": 0.1929, + "step": 2856 + }, + { + "epoch": 0.5758613741688495, + "grad_norm": 0.06898178160190582, + "learning_rate": 9.88435895277843e-05, + "loss": 0.1831, + "step": 2858 + }, + { + "epoch": 0.5762643562361475, + "grad_norm": 0.06173472851514816, + "learning_rate": 9.884073823919553e-05, + "loss": 0.2316, + "step": 2860 + }, + { + "epoch": 0.5766673383034455, + "grad_norm": 0.04862646386027336, + "learning_rate": 9.88378834810388e-05, + "loss": 0.2015, + "step": 2862 + }, + { + "epoch": 0.5770703203707435, + "grad_norm": 0.06486746668815613, + "learning_rate": 9.883502525351695e-05, + "loss": 0.2179, + "step": 2864 + }, + { + "epoch": 0.5774733024380415, + "grad_norm": 0.0436808243393898, + "learning_rate": 9.8832163556833e-05, + "loss": 0.2002, + "step": 2866 + }, + { + "epoch": 0.5778762845053396, + "grad_norm": 0.04756801575422287, + "learning_rate": 9.882929839119025e-05, + "loss": 0.1953, + "step": 2868 + }, + { + "epoch": 0.5782792665726375, + "grad_norm": 0.05114509537816048, + "learning_rate": 9.882642975679224e-05, + "loss": 0.2263, + "step": 2870 + }, + { + "epoch": 0.5786822486399356, + "grad_norm": 0.06755529344081879, + "learning_rate": 9.882355765384273e-05, + "loss": 0.2195, + "step": 2872 + }, + { + "epoch": 0.5790852307072335, + "grad_norm": 0.049488335847854614, + "learning_rate": 9.882068208254578e-05, + "loss": 0.236, + "step": 2874 + }, + { + "epoch": 0.5794882127745316, + "grad_norm": 0.05077878385782242, + "learning_rate": 9.881780304310564e-05, + "loss": 0.1497, + "step": 2876 + }, + { + "epoch": 0.5798911948418295, + "grad_norm": 0.0435592457652092, + "learning_rate": 9.881492053572685e-05, + "loss": 0.1614, + "step": 2878 + }, + { + "epoch": 0.5802941769091275, + "grad_norm": 0.060861654579639435, + "learning_rate": 9.881203456061418e-05, + "loss": 0.1771, + "step": 2880 + }, + { + "epoch": 0.5806971589764256, + "grad_norm": 0.0579148605465889, + "learning_rate": 9.880914511797262e-05, + "loss": 0.2413, + "step": 2882 + }, + { + "epoch": 0.5811001410437235, + "grad_norm": 0.05515383556485176, + "learning_rate": 9.880625220800746e-05, + "loss": 0.2095, + "step": 2884 + }, + { + "epoch": 0.5815031231110216, + "grad_norm": 0.0689605101943016, + "learning_rate": 9.880335583092421e-05, + "loss": 0.2078, + "step": 2886 + }, + { + "epoch": 0.5819061051783195, + "grad_norm": 0.05493571236729622, + "learning_rate": 9.88004559869286e-05, + "loss": 0.1886, + "step": 2888 + }, + { + "epoch": 0.5823090872456176, + "grad_norm": 0.054588478058576584, + "learning_rate": 9.879755267622664e-05, + "loss": 0.201, + "step": 2890 + }, + { + "epoch": 0.5827120693129155, + "grad_norm": 0.06213277950882912, + "learning_rate": 9.879464589902458e-05, + "loss": 0.1827, + "step": 2892 + }, + { + "epoch": 0.5831150513802136, + "grad_norm": 0.05389844626188278, + "learning_rate": 9.879173565552891e-05, + "loss": 0.2057, + "step": 2894 + }, + { + "epoch": 0.5835180334475116, + "grad_norm": 0.04522683843970299, + "learning_rate": 9.878882194594637e-05, + "loss": 0.2145, + "step": 2896 + }, + { + "epoch": 0.5839210155148096, + "grad_norm": 0.039040908217430115, + "learning_rate": 9.878590477048394e-05, + "loss": 0.1494, + "step": 2898 + }, + { + "epoch": 0.5843239975821076, + "grad_norm": 0.06023675948381424, + "learning_rate": 9.878298412934886e-05, + "loss": 0.1985, + "step": 2900 + }, + { + "epoch": 0.5847269796494056, + "grad_norm": 0.07509973645210266, + "learning_rate": 9.87800600227486e-05, + "loss": 0.2325, + "step": 2902 + }, + { + "epoch": 0.5851299617167036, + "grad_norm": 0.06361687183380127, + "learning_rate": 9.877713245089089e-05, + "loss": 0.2007, + "step": 2904 + }, + { + "epoch": 0.5855329437840016, + "grad_norm": 0.05781874805688858, + "learning_rate": 9.87742014139837e-05, + "loss": 0.2256, + "step": 2906 + }, + { + "epoch": 0.5859359258512996, + "grad_norm": 0.043421436101198196, + "learning_rate": 9.877126691223525e-05, + "loss": 0.1871, + "step": 2908 + }, + { + "epoch": 0.5863389079185977, + "grad_norm": 0.05317642167210579, + "learning_rate": 9.8768328945854e-05, + "loss": 0.2071, + "step": 2910 + }, + { + "epoch": 0.5867418899858956, + "grad_norm": 0.05214408412575722, + "learning_rate": 9.876538751504865e-05, + "loss": 0.2015, + "step": 2912 + }, + { + "epoch": 0.5871448720531937, + "grad_norm": 0.050086211413145065, + "learning_rate": 9.876244262002817e-05, + "loss": 0.1946, + "step": 2914 + }, + { + "epoch": 0.5875478541204916, + "grad_norm": 0.0423690602183342, + "learning_rate": 9.875949426100172e-05, + "loss": 0.2099, + "step": 2916 + }, + { + "epoch": 0.5879508361877896, + "grad_norm": 0.0662025436758995, + "learning_rate": 9.87565424381788e-05, + "loss": 0.1724, + "step": 2918 + }, + { + "epoch": 0.5883538182550877, + "grad_norm": 0.06472502648830414, + "learning_rate": 9.875358715176908e-05, + "loss": 0.2587, + "step": 2920 + }, + { + "epoch": 0.5887568003223856, + "grad_norm": 0.08011851459741592, + "learning_rate": 9.875062840198248e-05, + "loss": 0.2292, + "step": 2922 + }, + { + "epoch": 0.5891597823896837, + "grad_norm": 0.11067450046539307, + "learning_rate": 9.874766618902922e-05, + "loss": 0.2778, + "step": 2924 + }, + { + "epoch": 0.5895627644569816, + "grad_norm": 0.05279851332306862, + "learning_rate": 9.874470051311971e-05, + "loss": 0.1758, + "step": 2926 + }, + { + "epoch": 0.5899657465242797, + "grad_norm": 0.07390250265598297, + "learning_rate": 9.874173137446463e-05, + "loss": 0.2023, + "step": 2928 + }, + { + "epoch": 0.5903687285915776, + "grad_norm": 0.05263578146696091, + "learning_rate": 9.873875877327491e-05, + "loss": 0.2448, + "step": 2930 + }, + { + "epoch": 0.5907717106588757, + "grad_norm": 0.05811066925525665, + "learning_rate": 9.873578270976172e-05, + "loss": 0.1733, + "step": 2932 + }, + { + "epoch": 0.5911746927261737, + "grad_norm": 0.05098055303096771, + "learning_rate": 9.873280318413644e-05, + "loss": 0.1623, + "step": 2934 + }, + { + "epoch": 0.5915776747934717, + "grad_norm": 0.050901107490062714, + "learning_rate": 9.87298201966108e-05, + "loss": 0.2252, + "step": 2936 + }, + { + "epoch": 0.5919806568607697, + "grad_norm": 0.06555388867855072, + "learning_rate": 9.872683374739662e-05, + "loss": 0.1906, + "step": 2938 + }, + { + "epoch": 0.5923836389280677, + "grad_norm": 0.05196414515376091, + "learning_rate": 9.872384383670611e-05, + "loss": 0.1696, + "step": 2940 + }, + { + "epoch": 0.5927866209953657, + "grad_norm": 0.15211619436740875, + "learning_rate": 9.872085046475169e-05, + "loss": 0.2249, + "step": 2942 + }, + { + "epoch": 0.5931896030626637, + "grad_norm": 0.07226165384054184, + "learning_rate": 9.871785363174592e-05, + "loss": 0.2197, + "step": 2944 + }, + { + "epoch": 0.5935925851299617, + "grad_norm": 0.07411706447601318, + "learning_rate": 9.871485333790178e-05, + "loss": 0.235, + "step": 2946 + }, + { + "epoch": 0.5939955671972598, + "grad_norm": 0.07494895905256271, + "learning_rate": 9.871184958343234e-05, + "loss": 0.2677, + "step": 2948 + }, + { + "epoch": 0.5943985492645577, + "grad_norm": 0.06455809623003006, + "learning_rate": 9.870884236855103e-05, + "loss": 0.2008, + "step": 2950 + }, + { + "epoch": 0.5948015313318558, + "grad_norm": 0.07862219214439392, + "learning_rate": 9.870583169347146e-05, + "loss": 0.203, + "step": 2952 + }, + { + "epoch": 0.5952045133991537, + "grad_norm": 0.09578139334917068, + "learning_rate": 9.870281755840747e-05, + "loss": 0.1717, + "step": 2954 + }, + { + "epoch": 0.5956074954664518, + "grad_norm": 0.07982958853244781, + "learning_rate": 9.869979996357323e-05, + "loss": 0.2288, + "step": 2956 + }, + { + "epoch": 0.5960104775337497, + "grad_norm": 0.15855936706066132, + "learning_rate": 9.869677890918307e-05, + "loss": 0.1945, + "step": 2958 + }, + { + "epoch": 0.5964134596010477, + "grad_norm": 0.04864702373743057, + "learning_rate": 9.869375439545164e-05, + "loss": 0.1835, + "step": 2960 + }, + { + "epoch": 0.5968164416683458, + "grad_norm": 0.08095841109752655, + "learning_rate": 9.869072642259375e-05, + "loss": 0.2169, + "step": 2962 + }, + { + "epoch": 0.5972194237356437, + "grad_norm": 0.051869262009859085, + "learning_rate": 9.868769499082453e-05, + "loss": 0.1718, + "step": 2964 + }, + { + "epoch": 0.5976224058029418, + "grad_norm": 0.06466794013977051, + "learning_rate": 9.868466010035932e-05, + "loss": 0.1867, + "step": 2966 + }, + { + "epoch": 0.5980253878702397, + "grad_norm": 0.07450581341981888, + "learning_rate": 9.868162175141373e-05, + "loss": 0.2033, + "step": 2968 + }, + { + "epoch": 0.5984283699375378, + "grad_norm": 0.06121667101979256, + "learning_rate": 9.867857994420357e-05, + "loss": 0.2116, + "step": 2970 + }, + { + "epoch": 0.5988313520048358, + "grad_norm": 0.0859074667096138, + "learning_rate": 9.867553467894494e-05, + "loss": 0.1945, + "step": 2972 + }, + { + "epoch": 0.5992343340721338, + "grad_norm": 0.0621977373957634, + "learning_rate": 9.867248595585419e-05, + "loss": 0.1979, + "step": 2974 + }, + { + "epoch": 0.5996373161394318, + "grad_norm": 0.057669565081596375, + "learning_rate": 9.866943377514787e-05, + "loss": 0.2332, + "step": 2976 + }, + { + "epoch": 0.6000402982067298, + "grad_norm": 0.05102207884192467, + "learning_rate": 9.86663781370428e-05, + "loss": 0.1866, + "step": 2978 + }, + { + "epoch": 0.6004432802740278, + "grad_norm": 0.04888800159096718, + "learning_rate": 9.866331904175608e-05, + "loss": 0.2701, + "step": 2980 + }, + { + "epoch": 0.6008462623413258, + "grad_norm": 0.17599685490131378, + "learning_rate": 9.866025648950496e-05, + "loss": 0.243, + "step": 2982 + }, + { + "epoch": 0.6012492444086238, + "grad_norm": 0.04738638177514076, + "learning_rate": 9.865719048050707e-05, + "loss": 0.1871, + "step": 2984 + }, + { + "epoch": 0.6016522264759219, + "grad_norm": 0.0663914680480957, + "learning_rate": 9.865412101498019e-05, + "loss": 0.1896, + "step": 2986 + }, + { + "epoch": 0.6020552085432198, + "grad_norm": 0.3060331344604492, + "learning_rate": 9.865104809314234e-05, + "loss": 0.2408, + "step": 2988 + }, + { + "epoch": 0.6024581906105179, + "grad_norm": 0.3577231168746948, + "learning_rate": 9.864797171521185e-05, + "loss": 0.1561, + "step": 2990 + }, + { + "epoch": 0.6028611726778158, + "grad_norm": 0.12939517199993134, + "learning_rate": 9.864489188140727e-05, + "loss": 0.1593, + "step": 2992 + }, + { + "epoch": 0.6032641547451139, + "grad_norm": 0.08543020486831665, + "learning_rate": 9.864180859194734e-05, + "loss": 1.2892, + "step": 2994 + }, + { + "epoch": 0.6036671368124118, + "grad_norm": 0.06838931888341904, + "learning_rate": 9.863872184705111e-05, + "loss": 0.2024, + "step": 2996 + }, + { + "epoch": 0.6040701188797098, + "grad_norm": 0.09500443190336227, + "learning_rate": 9.86356316469379e-05, + "loss": 0.1507, + "step": 2998 + }, + { + "epoch": 0.6044731009470079, + "grad_norm": 10.331274032592773, + "learning_rate": 9.863253799182718e-05, + "loss": 0.2497, + "step": 3000 + }, + { + "epoch": 0.6048760830143058, + "grad_norm": 0.3732292652130127, + "learning_rate": 9.862944088193874e-05, + "loss": 0.2482, + "step": 3002 + }, + { + "epoch": 0.6052790650816039, + "grad_norm": 0.0717553049325943, + "learning_rate": 9.862634031749258e-05, + "loss": 0.1793, + "step": 3004 + }, + { + "epoch": 0.6056820471489018, + "grad_norm": 0.06944863498210907, + "learning_rate": 9.862323629870899e-05, + "loss": 0.1742, + "step": 3006 + }, + { + "epoch": 0.6060850292161999, + "grad_norm": 0.07637281715869904, + "learning_rate": 9.862012882580845e-05, + "loss": 0.1575, + "step": 3008 + }, + { + "epoch": 0.6064880112834978, + "grad_norm": 0.3652530610561371, + "learning_rate": 9.86170178990117e-05, + "loss": 0.2212, + "step": 3010 + }, + { + "epoch": 0.6068909933507959, + "grad_norm": 0.07585793733596802, + "learning_rate": 9.861390351853976e-05, + "loss": 0.1926, + "step": 3012 + }, + { + "epoch": 0.6072939754180939, + "grad_norm": 0.06400326639413834, + "learning_rate": 9.861078568461386e-05, + "loss": 0.2252, + "step": 3014 + }, + { + "epoch": 0.6076969574853919, + "grad_norm": 0.4003634452819824, + "learning_rate": 9.86076643974555e-05, + "loss": 0.1863, + "step": 3016 + }, + { + "epoch": 0.6080999395526899, + "grad_norm": 0.07460381835699081, + "learning_rate": 9.860453965728638e-05, + "loss": 0.1905, + "step": 3018 + }, + { + "epoch": 0.6085029216199879, + "grad_norm": 0.0575389601290226, + "learning_rate": 9.860141146432848e-05, + "loss": 0.2124, + "step": 3020 + }, + { + "epoch": 0.6089059036872859, + "grad_norm": 0.07313038408756256, + "learning_rate": 9.859827981880408e-05, + "loss": 0.2669, + "step": 3022 + }, + { + "epoch": 0.609308885754584, + "grad_norm": 0.08500310778617859, + "learning_rate": 9.859514472093557e-05, + "loss": 0.2018, + "step": 3024 + }, + { + "epoch": 0.6097118678218819, + "grad_norm": 0.10082163661718369, + "learning_rate": 9.85920061709457e-05, + "loss": 0.1774, + "step": 3026 + }, + { + "epoch": 0.61011484988918, + "grad_norm": 0.5841183066368103, + "learning_rate": 9.858886416905741e-05, + "loss": 0.2417, + "step": 3028 + }, + { + "epoch": 0.6105178319564779, + "grad_norm": 1.5156642198562622, + "learning_rate": 9.858571871549394e-05, + "loss": 0.196, + "step": 3030 + }, + { + "epoch": 0.610920814023776, + "grad_norm": 8.291611671447754, + "learning_rate": 9.858256981047871e-05, + "loss": 0.1798, + "step": 3032 + }, + { + "epoch": 0.6113237960910739, + "grad_norm": 18.14417839050293, + "learning_rate": 9.857941745423541e-05, + "loss": 3.6401, + "step": 3034 + }, + { + "epoch": 0.611726778158372, + "grad_norm": 286.70599365234375, + "learning_rate": 9.857626164698798e-05, + "loss": 8.12, + "step": 3036 + }, + { + "epoch": 0.61212976022567, + "grad_norm": 0.9753457903862, + "learning_rate": 9.857310238896062e-05, + "loss": 1.2561, + "step": 3038 + }, + { + "epoch": 0.612532742292968, + "grad_norm": 0.18690338730812073, + "learning_rate": 9.856993968037775e-05, + "loss": 0.2009, + "step": 3040 + }, + { + "epoch": 0.612935724360266, + "grad_norm": 0.24463942646980286, + "learning_rate": 9.856677352146404e-05, + "loss": 0.2085, + "step": 3042 + }, + { + "epoch": 0.6133387064275639, + "grad_norm": 0.10862796753644943, + "learning_rate": 9.856360391244441e-05, + "loss": 0.215, + "step": 3044 + }, + { + "epoch": 0.613741688494862, + "grad_norm": 0.32057657837867737, + "learning_rate": 9.856043085354402e-05, + "loss": 0.2343, + "step": 3046 + }, + { + "epoch": 0.6141446705621599, + "grad_norm": 0.07785500586032867, + "learning_rate": 9.855725434498828e-05, + "loss": 0.1888, + "step": 3048 + }, + { + "epoch": 0.614547652629458, + "grad_norm": 0.047291629016399384, + "learning_rate": 9.855407438700284e-05, + "loss": 0.199, + "step": 3050 + }, + { + "epoch": 0.614950634696756, + "grad_norm": 0.0839415118098259, + "learning_rate": 9.855089097981362e-05, + "loss": 0.2054, + "step": 3052 + }, + { + "epoch": 0.615353616764054, + "grad_norm": 0.07940755039453506, + "learning_rate": 9.854770412364676e-05, + "loss": 0.2376, + "step": 3054 + }, + { + "epoch": 0.615756598831352, + "grad_norm": 0.07128242403268814, + "learning_rate": 9.854451381872862e-05, + "loss": 0.2176, + "step": 3056 + }, + { + "epoch": 0.61615958089865, + "grad_norm": 0.08210190385580063, + "learning_rate": 9.854132006528586e-05, + "loss": 0.2131, + "step": 3058 + }, + { + "epoch": 0.616562562965948, + "grad_norm": 0.09783251583576202, + "learning_rate": 9.853812286354536e-05, + "loss": 0.2359, + "step": 3060 + }, + { + "epoch": 0.616965545033246, + "grad_norm": 0.06867846846580505, + "learning_rate": 9.853492221373421e-05, + "loss": 0.1647, + "step": 3062 + }, + { + "epoch": 0.617368527100544, + "grad_norm": 0.053430572152137756, + "learning_rate": 9.853171811607983e-05, + "loss": 0.1712, + "step": 3064 + }, + { + "epoch": 0.6177715091678421, + "grad_norm": 0.06458867341279984, + "learning_rate": 9.852851057080982e-05, + "loss": 0.1941, + "step": 3066 + }, + { + "epoch": 0.61817449123514, + "grad_norm": 0.052936289459466934, + "learning_rate": 9.852529957815202e-05, + "loss": 0.2173, + "step": 3068 + }, + { + "epoch": 0.6185774733024381, + "grad_norm": 0.05379785597324371, + "learning_rate": 9.852208513833454e-05, + "loss": 0.1515, + "step": 3070 + }, + { + "epoch": 0.618980455369736, + "grad_norm": 0.06087642163038254, + "learning_rate": 9.851886725158573e-05, + "loss": 0.1818, + "step": 3072 + }, + { + "epoch": 0.6193834374370341, + "grad_norm": 0.09215617924928665, + "learning_rate": 9.851564591813418e-05, + "loss": 0.2171, + "step": 3074 + }, + { + "epoch": 0.6197864195043321, + "grad_norm": 0.0784756988286972, + "learning_rate": 9.851242113820873e-05, + "loss": 0.1924, + "step": 3076 + }, + { + "epoch": 0.62018940157163, + "grad_norm": 0.10722616314888, + "learning_rate": 9.850919291203848e-05, + "loss": 0.2037, + "step": 3078 + }, + { + "epoch": 0.6205923836389281, + "grad_norm": 0.06373076885938644, + "learning_rate": 9.850596123985274e-05, + "loss": 0.1746, + "step": 3080 + }, + { + "epoch": 0.620995365706226, + "grad_norm": 0.08903231471776962, + "learning_rate": 9.850272612188109e-05, + "loss": 0.2424, + "step": 3082 + }, + { + "epoch": 0.6213983477735241, + "grad_norm": 0.05747120454907417, + "learning_rate": 9.849948755835333e-05, + "loss": 0.2722, + "step": 3084 + }, + { + "epoch": 0.621801329840822, + "grad_norm": 0.05946004018187523, + "learning_rate": 9.849624554949954e-05, + "loss": 0.1631, + "step": 3086 + }, + { + "epoch": 0.6222043119081201, + "grad_norm": 0.07288770377635956, + "learning_rate": 9.849300009555005e-05, + "loss": 0.2492, + "step": 3088 + }, + { + "epoch": 0.6226072939754181, + "grad_norm": 0.056010086089372635, + "learning_rate": 9.848975119673536e-05, + "loss": 0.2085, + "step": 3090 + }, + { + "epoch": 0.6230102760427161, + "grad_norm": 0.05710703134536743, + "learning_rate": 9.848649885328631e-05, + "loss": 0.1921, + "step": 3092 + }, + { + "epoch": 0.6234132581100141, + "grad_norm": 0.07084643095731735, + "learning_rate": 9.848324306543391e-05, + "loss": 0.2246, + "step": 3094 + }, + { + "epoch": 0.6238162401773121, + "grad_norm": 0.061563413590192795, + "learning_rate": 9.847998383340947e-05, + "loss": 0.1917, + "step": 3096 + }, + { + "epoch": 0.6242192222446101, + "grad_norm": 0.044611282646656036, + "learning_rate": 9.847672115744451e-05, + "loss": 0.183, + "step": 3098 + }, + { + "epoch": 0.6246222043119081, + "grad_norm": 0.06666868925094604, + "learning_rate": 9.847345503777079e-05, + "loss": 0.228, + "step": 3100 + }, + { + "epoch": 0.6250251863792061, + "grad_norm": 0.08843137323856354, + "learning_rate": 9.847018547462037e-05, + "loss": 0.2709, + "step": 3102 + }, + { + "epoch": 0.6254281684465042, + "grad_norm": 0.05686771869659424, + "learning_rate": 9.846691246822548e-05, + "loss": 0.1932, + "step": 3104 + }, + { + "epoch": 0.6258311505138021, + "grad_norm": 0.0559655986726284, + "learning_rate": 9.846363601881862e-05, + "loss": 0.1875, + "step": 3106 + }, + { + "epoch": 0.6262341325811002, + "grad_norm": 0.04997412487864494, + "learning_rate": 9.846035612663261e-05, + "loss": 0.1792, + "step": 3108 + }, + { + "epoch": 0.6266371146483981, + "grad_norm": 0.07034078985452652, + "learning_rate": 9.845707279190037e-05, + "loss": 0.1944, + "step": 3110 + }, + { + "epoch": 0.6270400967156962, + "grad_norm": 0.06097714975476265, + "learning_rate": 9.845378601485517e-05, + "loss": 0.1702, + "step": 3112 + }, + { + "epoch": 0.6274430787829941, + "grad_norm": 0.055597711354494095, + "learning_rate": 9.845049579573051e-05, + "loss": 0.1629, + "step": 3114 + }, + { + "epoch": 0.6278460608502922, + "grad_norm": 0.0588107630610466, + "learning_rate": 9.844720213476012e-05, + "loss": 0.1652, + "step": 3116 + }, + { + "epoch": 0.6282490429175902, + "grad_norm": 0.05339299514889717, + "learning_rate": 9.844390503217796e-05, + "loss": 0.1681, + "step": 3118 + }, + { + "epoch": 0.6286520249848881, + "grad_norm": 0.0750700905919075, + "learning_rate": 9.844060448821827e-05, + "loss": 0.1935, + "step": 3120 + }, + { + "epoch": 0.6290550070521862, + "grad_norm": 0.061377957463264465, + "learning_rate": 9.843730050311551e-05, + "loss": 0.1798, + "step": 3122 + }, + { + "epoch": 0.6294579891194841, + "grad_norm": 0.053399331867694855, + "learning_rate": 9.843399307710437e-05, + "loss": 0.1576, + "step": 3124 + }, + { + "epoch": 0.6298609711867822, + "grad_norm": 0.08089054375886917, + "learning_rate": 9.843068221041982e-05, + "loss": 0.1891, + "step": 3126 + }, + { + "epoch": 0.6302639532540802, + "grad_norm": 0.17065788805484772, + "learning_rate": 9.842736790329707e-05, + "loss": 0.2374, + "step": 3128 + }, + { + "epoch": 0.6306669353213782, + "grad_norm": 0.071701280772686, + "learning_rate": 9.842405015597156e-05, + "loss": 0.1846, + "step": 3130 + }, + { + "epoch": 0.6310699173886762, + "grad_norm": 0.06457066535949707, + "learning_rate": 9.842072896867895e-05, + "loss": 0.1606, + "step": 3132 + }, + { + "epoch": 0.6314728994559742, + "grad_norm": 0.0577966645359993, + "learning_rate": 9.84174043416552e-05, + "loss": 0.1813, + "step": 3134 + }, + { + "epoch": 0.6318758815232722, + "grad_norm": 0.12219101935625076, + "learning_rate": 9.841407627513649e-05, + "loss": 0.214, + "step": 3136 + }, + { + "epoch": 0.6322788635905702, + "grad_norm": 0.16911907494068146, + "learning_rate": 9.841074476935921e-05, + "loss": 0.2144, + "step": 3138 + }, + { + "epoch": 0.6326818456578682, + "grad_norm": 0.05600879341363907, + "learning_rate": 9.840740982456005e-05, + "loss": 0.1586, + "step": 3140 + }, + { + "epoch": 0.6330848277251663, + "grad_norm": 0.05861913040280342, + "learning_rate": 9.840407144097593e-05, + "loss": 0.2022, + "step": 3142 + }, + { + "epoch": 0.6334878097924642, + "grad_norm": 0.08657240867614746, + "learning_rate": 9.840072961884396e-05, + "loss": 0.2582, + "step": 3144 + }, + { + "epoch": 0.6338907918597623, + "grad_norm": 0.064541295170784, + "learning_rate": 9.839738435840157e-05, + "loss": 0.1894, + "step": 3146 + }, + { + "epoch": 0.6342937739270602, + "grad_norm": 0.10545245558023453, + "learning_rate": 9.83940356598864e-05, + "loss": 0.245, + "step": 3148 + }, + { + "epoch": 0.6346967559943583, + "grad_norm": 0.05893385037779808, + "learning_rate": 9.839068352353633e-05, + "loss": 0.2393, + "step": 3150 + }, + { + "epoch": 0.6350997380616562, + "grad_norm": 0.06363905221223831, + "learning_rate": 9.838732794958949e-05, + "loss": 0.204, + "step": 3152 + }, + { + "epoch": 0.6355027201289543, + "grad_norm": 0.053424712270498276, + "learning_rate": 9.838396893828426e-05, + "loss": 0.1796, + "step": 3154 + }, + { + "epoch": 0.6359057021962523, + "grad_norm": 0.04647869989275932, + "learning_rate": 9.838060648985925e-05, + "loss": 0.2063, + "step": 3156 + }, + { + "epoch": 0.6363086842635503, + "grad_norm": 0.09858336299657822, + "learning_rate": 9.837724060455333e-05, + "loss": 0.2552, + "step": 3158 + }, + { + "epoch": 0.6367116663308483, + "grad_norm": 0.06279218941926956, + "learning_rate": 9.83738712826056e-05, + "loss": 0.1997, + "step": 3160 + }, + { + "epoch": 0.6371146483981462, + "grad_norm": 0.07511857897043228, + "learning_rate": 9.837049852425544e-05, + "loss": 0.1842, + "step": 3162 + }, + { + "epoch": 0.6375176304654443, + "grad_norm": 0.0642283633351326, + "learning_rate": 9.83671223297424e-05, + "loss": 0.2447, + "step": 3164 + }, + { + "epoch": 0.6379206125327423, + "grad_norm": 0.04851103946566582, + "learning_rate": 9.836374269930635e-05, + "loss": 0.1869, + "step": 3166 + }, + { + "epoch": 0.6383235946000403, + "grad_norm": 0.05043719336390495, + "learning_rate": 9.836035963318735e-05, + "loss": 0.1988, + "step": 3168 + }, + { + "epoch": 0.6387265766673383, + "grad_norm": 0.06849416345357895, + "learning_rate": 9.835697313162577e-05, + "loss": 0.1624, + "step": 3170 + }, + { + "epoch": 0.6391295587346363, + "grad_norm": 0.049817949533462524, + "learning_rate": 9.835358319486212e-05, + "loss": 0.1497, + "step": 3172 + }, + { + "epoch": 0.6395325408019343, + "grad_norm": 0.06779654324054718, + "learning_rate": 9.835018982313729e-05, + "loss": 0.2263, + "step": 3174 + }, + { + "epoch": 0.6399355228692323, + "grad_norm": 0.05196770653128624, + "learning_rate": 9.834679301669227e-05, + "loss": 0.2165, + "step": 3176 + }, + { + "epoch": 0.6403385049365303, + "grad_norm": 0.08842454105615616, + "learning_rate": 9.83433927757684e-05, + "loss": 0.2257, + "step": 3178 + }, + { + "epoch": 0.6407414870038284, + "grad_norm": 0.05393417552113533, + "learning_rate": 9.833998910060723e-05, + "loss": 0.179, + "step": 3180 + }, + { + "epoch": 0.6411444690711263, + "grad_norm": 0.06792977452278137, + "learning_rate": 9.833658199145053e-05, + "loss": 0.2053, + "step": 3182 + }, + { + "epoch": 0.6415474511384244, + "grad_norm": 0.06019643694162369, + "learning_rate": 9.833317144854035e-05, + "loss": 0.1592, + "step": 3184 + }, + { + "epoch": 0.6419504332057223, + "grad_norm": 0.06538268178701401, + "learning_rate": 9.832975747211896e-05, + "loss": 0.1791, + "step": 3186 + }, + { + "epoch": 0.6423534152730204, + "grad_norm": 0.06463984400033951, + "learning_rate": 9.832634006242891e-05, + "loss": 0.2425, + "step": 3188 + }, + { + "epoch": 0.6427563973403183, + "grad_norm": 0.050466809421777725, + "learning_rate": 9.832291921971295e-05, + "loss": 0.1764, + "step": 3190 + }, + { + "epoch": 0.6431593794076164, + "grad_norm": 0.0687469020485878, + "learning_rate": 9.831949494421409e-05, + "loss": 0.2187, + "step": 3192 + }, + { + "epoch": 0.6435623614749144, + "grad_norm": 0.06412825733423233, + "learning_rate": 9.831606723617557e-05, + "loss": 0.1452, + "step": 3194 + }, + { + "epoch": 0.6439653435422124, + "grad_norm": 0.07165306806564331, + "learning_rate": 9.831263609584091e-05, + "loss": 0.1617, + "step": 3196 + }, + { + "epoch": 0.6443683256095104, + "grad_norm": 0.050628188997507095, + "learning_rate": 9.830920152345385e-05, + "loss": 0.2079, + "step": 3198 + }, + { + "epoch": 0.6447713076768083, + "grad_norm": 0.07396326214075089, + "learning_rate": 9.830576351925836e-05, + "loss": 0.2052, + "step": 3200 + }, + { + "epoch": 0.6451742897441064, + "grad_norm": 0.05688002333045006, + "learning_rate": 9.83023220834987e-05, + "loss": 0.2432, + "step": 3202 + }, + { + "epoch": 0.6455772718114043, + "grad_norm": 0.06849195063114166, + "learning_rate": 9.829887721641931e-05, + "loss": 0.2222, + "step": 3204 + }, + { + "epoch": 0.6459802538787024, + "grad_norm": 0.08155640959739685, + "learning_rate": 9.829542891826493e-05, + "loss": 0.2307, + "step": 3206 + }, + { + "epoch": 0.6463832359460004, + "grad_norm": 0.06775806099176407, + "learning_rate": 9.829197718928053e-05, + "loss": 0.2101, + "step": 3208 + }, + { + "epoch": 0.6467862180132984, + "grad_norm": 0.05798272415995598, + "learning_rate": 9.828852202971129e-05, + "loss": 0.2196, + "step": 3210 + }, + { + "epoch": 0.6471892000805964, + "grad_norm": 0.04944278672337532, + "learning_rate": 9.828506343980269e-05, + "loss": 0.2321, + "step": 3212 + }, + { + "epoch": 0.6475921821478944, + "grad_norm": 0.09051971882581711, + "learning_rate": 9.828160141980037e-05, + "loss": 0.1878, + "step": 3214 + }, + { + "epoch": 0.6479951642151924, + "grad_norm": 0.05372557416558266, + "learning_rate": 9.827813596995033e-05, + "loss": 0.1763, + "step": 3216 + }, + { + "epoch": 0.6483981462824905, + "grad_norm": 0.05551968887448311, + "learning_rate": 9.82746670904987e-05, + "loss": 0.1705, + "step": 3218 + }, + { + "epoch": 0.6488011283497884, + "grad_norm": 0.07168183475732803, + "learning_rate": 9.827119478169194e-05, + "loss": 0.2073, + "step": 3220 + }, + { + "epoch": 0.6492041104170865, + "grad_norm": 0.05864137038588524, + "learning_rate": 9.82677190437767e-05, + "loss": 0.2129, + "step": 3222 + }, + { + "epoch": 0.6496070924843844, + "grad_norm": 0.04913345351815224, + "learning_rate": 9.826423987699988e-05, + "loss": 0.1874, + "step": 3224 + }, + { + "epoch": 0.6500100745516825, + "grad_norm": 0.07202889025211334, + "learning_rate": 9.826075728160863e-05, + "loss": 0.193, + "step": 3226 + }, + { + "epoch": 0.6504130566189804, + "grad_norm": 0.06070972606539726, + "learning_rate": 9.82572712578504e-05, + "loss": 0.2112, + "step": 3228 + }, + { + "epoch": 0.6508160386862785, + "grad_norm": 0.07425591349601746, + "learning_rate": 9.825378180597278e-05, + "loss": 0.1363, + "step": 3230 + }, + { + "epoch": 0.6512190207535765, + "grad_norm": 0.05426356941461563, + "learning_rate": 9.825028892622367e-05, + "loss": 0.2429, + "step": 3232 + }, + { + "epoch": 0.6516220028208745, + "grad_norm": 0.07585328072309494, + "learning_rate": 9.824679261885122e-05, + "loss": 0.185, + "step": 3234 + }, + { + "epoch": 0.6520249848881725, + "grad_norm": 0.05416212975978851, + "learning_rate": 9.824329288410376e-05, + "loss": 0.2271, + "step": 3236 + }, + { + "epoch": 0.6524279669554705, + "grad_norm": 0.07425907999277115, + "learning_rate": 9.823978972222993e-05, + "loss": 0.1591, + "step": 3238 + }, + { + "epoch": 0.6528309490227685, + "grad_norm": 0.06274693459272385, + "learning_rate": 9.823628313347859e-05, + "loss": 0.2194, + "step": 3240 + }, + { + "epoch": 0.6532339310900664, + "grad_norm": 0.10922153294086456, + "learning_rate": 9.823277311809884e-05, + "loss": 0.2269, + "step": 3242 + }, + { + "epoch": 0.6536369131573645, + "grad_norm": 0.05456622317433357, + "learning_rate": 9.822925967634003e-05, + "loss": 0.2249, + "step": 3244 + }, + { + "epoch": 0.6540398952246625, + "grad_norm": 0.07304324954748154, + "learning_rate": 9.822574280845171e-05, + "loss": 0.1911, + "step": 3246 + }, + { + "epoch": 0.6544428772919605, + "grad_norm": 0.14694897830486298, + "learning_rate": 9.822222251468378e-05, + "loss": 0.2003, + "step": 3248 + }, + { + "epoch": 0.6548458593592585, + "grad_norm": 0.07201841473579407, + "learning_rate": 9.821869879528628e-05, + "loss": 0.1586, + "step": 3250 + }, + { + "epoch": 0.6552488414265565, + "grad_norm": 0.0460667610168457, + "learning_rate": 9.821517165050953e-05, + "loss": 0.1666, + "step": 3252 + }, + { + "epoch": 0.6556518234938545, + "grad_norm": 0.07998193055391312, + "learning_rate": 9.821164108060407e-05, + "loss": 0.2349, + "step": 3254 + }, + { + "epoch": 0.6560548055611525, + "grad_norm": 0.07132408767938614, + "learning_rate": 9.820810708582077e-05, + "loss": 0.1718, + "step": 3256 + }, + { + "epoch": 0.6564577876284505, + "grad_norm": 0.06714514642953873, + "learning_rate": 9.820456966641063e-05, + "loss": 0.2025, + "step": 3258 + }, + { + "epoch": 0.6568607696957486, + "grad_norm": 0.07556632906198502, + "learning_rate": 9.820102882262494e-05, + "loss": 0.2169, + "step": 3260 + }, + { + "epoch": 0.6572637517630465, + "grad_norm": 0.05741098150610924, + "learning_rate": 9.819748455471525e-05, + "loss": 0.2155, + "step": 3262 + }, + { + "epoch": 0.6576667338303446, + "grad_norm": 0.052905187010765076, + "learning_rate": 9.819393686293334e-05, + "loss": 0.1433, + "step": 3264 + }, + { + "epoch": 0.6580697158976425, + "grad_norm": 0.08418918401002884, + "learning_rate": 9.819038574753123e-05, + "loss": 0.1992, + "step": 3266 + }, + { + "epoch": 0.6584726979649406, + "grad_norm": 0.073238305747509, + "learning_rate": 9.818683120876119e-05, + "loss": 0.2118, + "step": 3268 + }, + { + "epoch": 0.6588756800322386, + "grad_norm": 0.0671781674027443, + "learning_rate": 9.818327324687572e-05, + "loss": 0.186, + "step": 3270 + }, + { + "epoch": 0.6592786620995366, + "grad_norm": 0.06390842795372009, + "learning_rate": 9.817971186212758e-05, + "loss": 0.2308, + "step": 3272 + }, + { + "epoch": 0.6596816441668346, + "grad_norm": 0.04517597705125809, + "learning_rate": 9.817614705476976e-05, + "loss": 0.1754, + "step": 3274 + }, + { + "epoch": 0.6600846262341326, + "grad_norm": 0.061243556439876556, + "learning_rate": 9.81725788250555e-05, + "loss": 0.2033, + "step": 3276 + }, + { + "epoch": 0.6604876083014306, + "grad_norm": 0.05314614623785019, + "learning_rate": 9.816900717323827e-05, + "loss": 0.187, + "step": 3278 + }, + { + "epoch": 0.6608905903687285, + "grad_norm": 0.05768498033285141, + "learning_rate": 9.816543209957181e-05, + "loss": 0.1838, + "step": 3280 + }, + { + "epoch": 0.6612935724360266, + "grad_norm": 0.0638279840350151, + "learning_rate": 9.816185360431009e-05, + "loss": 0.1805, + "step": 3282 + }, + { + "epoch": 0.6616965545033247, + "grad_norm": 0.053918592631816864, + "learning_rate": 9.815827168770733e-05, + "loss": 0.1993, + "step": 3284 + }, + { + "epoch": 0.6620995365706226, + "grad_norm": 0.06501603126525879, + "learning_rate": 9.815468635001794e-05, + "loss": 0.1783, + "step": 3286 + }, + { + "epoch": 0.6625025186379206, + "grad_norm": 0.055193666368722916, + "learning_rate": 9.815109759149665e-05, + "loss": 0.2104, + "step": 3288 + }, + { + "epoch": 0.6629055007052186, + "grad_norm": 0.06583480536937714, + "learning_rate": 9.814750541239838e-05, + "loss": 0.2385, + "step": 3290 + }, + { + "epoch": 0.6633084827725166, + "grad_norm": 0.07717015594244003, + "learning_rate": 9.814390981297836e-05, + "loss": 0.1724, + "step": 3292 + }, + { + "epoch": 0.6637114648398146, + "grad_norm": 0.04836519435048103, + "learning_rate": 9.814031079349197e-05, + "loss": 0.1589, + "step": 3294 + }, + { + "epoch": 0.6641144469071126, + "grad_norm": 0.06062225624918938, + "learning_rate": 9.813670835419488e-05, + "loss": 0.197, + "step": 3296 + }, + { + "epoch": 0.6645174289744107, + "grad_norm": 0.04774460569024086, + "learning_rate": 9.813310249534301e-05, + "loss": 0.2113, + "step": 3298 + }, + { + "epoch": 0.6649204110417086, + "grad_norm": 0.060781124979257584, + "learning_rate": 9.812949321719252e-05, + "loss": 0.2177, + "step": 3300 + }, + { + "epoch": 0.6653233931090067, + "grad_norm": 0.053476136177778244, + "learning_rate": 9.812588051999981e-05, + "loss": 0.195, + "step": 3302 + }, + { + "epoch": 0.6657263751763046, + "grad_norm": 0.07423517107963562, + "learning_rate": 9.81222644040215e-05, + "loss": 0.1965, + "step": 3304 + }, + { + "epoch": 0.6661293572436027, + "grad_norm": 0.0641479343175888, + "learning_rate": 9.81186448695145e-05, + "loss": 0.191, + "step": 3306 + }, + { + "epoch": 0.6665323393109006, + "grad_norm": 0.29882076382637024, + "learning_rate": 9.811502191673591e-05, + "loss": 0.2615, + "step": 3308 + }, + { + "epoch": 0.6669353213781987, + "grad_norm": 0.05279373377561569, + "learning_rate": 9.811139554594314e-05, + "loss": 0.2141, + "step": 3310 + }, + { + "epoch": 0.6673383034454967, + "grad_norm": 0.053844355046749115, + "learning_rate": 9.810776575739375e-05, + "loss": 0.1597, + "step": 3312 + }, + { + "epoch": 0.6677412855127947, + "grad_norm": 0.05351219326257706, + "learning_rate": 9.810413255134561e-05, + "loss": 0.1949, + "step": 3314 + }, + { + "epoch": 0.6681442675800927, + "grad_norm": 0.06659112870693207, + "learning_rate": 9.810049592805684e-05, + "loss": 0.1775, + "step": 3316 + }, + { + "epoch": 0.6685472496473907, + "grad_norm": 0.07621248066425323, + "learning_rate": 9.809685588778577e-05, + "loss": 0.2378, + "step": 3318 + }, + { + "epoch": 0.6689502317146887, + "grad_norm": 0.07413389533758163, + "learning_rate": 9.809321243079096e-05, + "loss": 0.2343, + "step": 3320 + }, + { + "epoch": 0.6693532137819868, + "grad_norm": 0.053018804639577866, + "learning_rate": 9.808956555733126e-05, + "loss": 0.1307, + "step": 3322 + }, + { + "epoch": 0.6697561958492847, + "grad_norm": 0.09446662664413452, + "learning_rate": 9.808591526766573e-05, + "loss": 0.2026, + "step": 3324 + }, + { + "epoch": 0.6701591779165827, + "grad_norm": 0.19013406336307526, + "learning_rate": 9.808226156205369e-05, + "loss": 0.203, + "step": 3326 + }, + { + "epoch": 0.6705621599838807, + "grad_norm": 0.06566434353590012, + "learning_rate": 9.807860444075467e-05, + "loss": 0.1664, + "step": 3328 + }, + { + "epoch": 0.6709651420511787, + "grad_norm": 0.06202316656708717, + "learning_rate": 9.807494390402849e-05, + "loss": 0.1753, + "step": 3330 + }, + { + "epoch": 0.6713681241184767, + "grad_norm": 0.06079009175300598, + "learning_rate": 9.807127995213518e-05, + "loss": 0.224, + "step": 3332 + }, + { + "epoch": 0.6717711061857747, + "grad_norm": 0.056815944612026215, + "learning_rate": 9.8067612585335e-05, + "loss": 0.2313, + "step": 3334 + }, + { + "epoch": 0.6721740882530728, + "grad_norm": 0.06725732982158661, + "learning_rate": 9.806394180388854e-05, + "loss": 0.1753, + "step": 3336 + }, + { + "epoch": 0.6725770703203707, + "grad_norm": 0.06362208724021912, + "learning_rate": 9.806026760805649e-05, + "loss": 0.1315, + "step": 3338 + }, + { + "epoch": 0.6729800523876688, + "grad_norm": 0.062081463634967804, + "learning_rate": 9.80565899980999e-05, + "loss": 0.2372, + "step": 3340 + }, + { + "epoch": 0.6733830344549667, + "grad_norm": 0.06947702914476395, + "learning_rate": 9.805290897428e-05, + "loss": 0.153, + "step": 3342 + }, + { + "epoch": 0.6737860165222648, + "grad_norm": 0.06031232699751854, + "learning_rate": 9.80492245368583e-05, + "loss": 0.1669, + "step": 3344 + }, + { + "epoch": 0.6741889985895627, + "grad_norm": 0.07842449098825455, + "learning_rate": 9.804553668609654e-05, + "loss": 0.209, + "step": 3346 + }, + { + "epoch": 0.6745919806568608, + "grad_norm": 0.10207051783800125, + "learning_rate": 9.804184542225669e-05, + "loss": 0.2374, + "step": 3348 + }, + { + "epoch": 0.6749949627241588, + "grad_norm": 0.06684952229261398, + "learning_rate": 9.803815074560096e-05, + "loss": 0.2037, + "step": 3350 + }, + { + "epoch": 0.6753979447914568, + "grad_norm": 0.049116067588329315, + "learning_rate": 9.803445265639184e-05, + "loss": 0.1943, + "step": 3352 + }, + { + "epoch": 0.6758009268587548, + "grad_norm": 0.07274952530860901, + "learning_rate": 9.803075115489203e-05, + "loss": 0.2178, + "step": 3354 + }, + { + "epoch": 0.6762039089260528, + "grad_norm": 0.06149638071656227, + "learning_rate": 9.802704624136444e-05, + "loss": 0.2173, + "step": 3356 + }, + { + "epoch": 0.6766068909933508, + "grad_norm": 0.08942967653274536, + "learning_rate": 9.802333791607233e-05, + "loss": 0.2088, + "step": 3358 + }, + { + "epoch": 0.6770098730606487, + "grad_norm": 0.08084887266159058, + "learning_rate": 9.801962617927907e-05, + "loss": 0.2187, + "step": 3360 + }, + { + "epoch": 0.6774128551279468, + "grad_norm": 0.06932730227708817, + "learning_rate": 9.801591103124837e-05, + "loss": 0.2379, + "step": 3362 + }, + { + "epoch": 0.6778158371952449, + "grad_norm": 0.06372372061014175, + "learning_rate": 9.801219247224415e-05, + "loss": 0.1857, + "step": 3364 + }, + { + "epoch": 0.6782188192625428, + "grad_norm": 0.06282777339220047, + "learning_rate": 9.800847050253055e-05, + "loss": 0.2028, + "step": 3366 + }, + { + "epoch": 0.6786218013298408, + "grad_norm": 0.05891014263033867, + "learning_rate": 9.800474512237199e-05, + "loss": 0.2422, + "step": 3368 + }, + { + "epoch": 0.6790247833971388, + "grad_norm": 0.057885557413101196, + "learning_rate": 9.80010163320331e-05, + "loss": 0.2128, + "step": 3370 + }, + { + "epoch": 0.6794277654644368, + "grad_norm": 0.0572814866900444, + "learning_rate": 9.799728413177878e-05, + "loss": 0.2079, + "step": 3372 + }, + { + "epoch": 0.6798307475317349, + "grad_norm": 0.04849035665392876, + "learning_rate": 9.799354852187417e-05, + "loss": 0.1852, + "step": 3374 + }, + { + "epoch": 0.6802337295990328, + "grad_norm": 0.04705261439085007, + "learning_rate": 9.79898095025846e-05, + "loss": 0.1822, + "step": 3376 + }, + { + "epoch": 0.6806367116663309, + "grad_norm": 0.0639222264289856, + "learning_rate": 9.798606707417573e-05, + "loss": 0.2139, + "step": 3378 + }, + { + "epoch": 0.6810396937336288, + "grad_norm": 0.08927220851182938, + "learning_rate": 9.79823212369134e-05, + "loss": 0.2152, + "step": 3380 + }, + { + "epoch": 0.6814426758009269, + "grad_norm": 0.0470849983394146, + "learning_rate": 9.797857199106369e-05, + "loss": 0.2052, + "step": 3382 + }, + { + "epoch": 0.6818456578682248, + "grad_norm": 0.05196920037269592, + "learning_rate": 9.797481933689296e-05, + "loss": 0.161, + "step": 3384 + }, + { + "epoch": 0.6822486399355229, + "grad_norm": 0.07091446220874786, + "learning_rate": 9.79710632746678e-05, + "loss": 0.1761, + "step": 3386 + }, + { + "epoch": 0.6826516220028209, + "grad_norm": 0.05899330973625183, + "learning_rate": 9.796730380465502e-05, + "loss": 0.1786, + "step": 3388 + }, + { + "epoch": 0.6830546040701189, + "grad_norm": 0.12023943662643433, + "learning_rate": 9.796354092712168e-05, + "loss": 0.2054, + "step": 3390 + }, + { + "epoch": 0.6834575861374169, + "grad_norm": 0.05540652573108673, + "learning_rate": 9.795977464233513e-05, + "loss": 0.1603, + "step": 3392 + }, + { + "epoch": 0.6838605682047149, + "grad_norm": 0.059661611914634705, + "learning_rate": 9.795600495056285e-05, + "loss": 0.1768, + "step": 3394 + }, + { + "epoch": 0.6842635502720129, + "grad_norm": 0.07932312786579132, + "learning_rate": 9.79522318520727e-05, + "loss": 0.2197, + "step": 3396 + }, + { + "epoch": 0.6846665323393109, + "grad_norm": 0.06426575034856796, + "learning_rate": 9.794845534713266e-05, + "loss": 0.2275, + "step": 3398 + }, + { + "epoch": 0.6850695144066089, + "grad_norm": 0.041860274970531464, + "learning_rate": 9.794467543601106e-05, + "loss": 0.1523, + "step": 3400 + }, + { + "epoch": 0.685472496473907, + "grad_norm": 0.05880004167556763, + "learning_rate": 9.794089211897638e-05, + "loss": 0.196, + "step": 3402 + }, + { + "epoch": 0.6858754785412049, + "grad_norm": 0.05922669917345047, + "learning_rate": 9.79371053962974e-05, + "loss": 0.2885, + "step": 3404 + }, + { + "epoch": 0.686278460608503, + "grad_norm": 0.05087222531437874, + "learning_rate": 9.793331526824312e-05, + "loss": 0.128, + "step": 3406 + }, + { + "epoch": 0.6866814426758009, + "grad_norm": 0.07046099007129669, + "learning_rate": 9.792952173508277e-05, + "loss": 0.2136, + "step": 3408 + }, + { + "epoch": 0.6870844247430989, + "grad_norm": 0.05308796837925911, + "learning_rate": 9.792572479708586e-05, + "loss": 0.2458, + "step": 3410 + }, + { + "epoch": 0.6874874068103969, + "grad_norm": 0.05528232827782631, + "learning_rate": 9.79219244545221e-05, + "loss": 0.1772, + "step": 3412 + }, + { + "epoch": 0.6878903888776949, + "grad_norm": 0.04996131733059883, + "learning_rate": 9.791812070766147e-05, + "loss": 0.1741, + "step": 3414 + }, + { + "epoch": 0.688293370944993, + "grad_norm": 0.060730304569005966, + "learning_rate": 9.791431355677416e-05, + "loss": 0.1764, + "step": 3416 + }, + { + "epoch": 0.6886963530122909, + "grad_norm": 0.049104318022727966, + "learning_rate": 9.791050300213066e-05, + "loss": 0.1766, + "step": 3418 + }, + { + "epoch": 0.689099335079589, + "grad_norm": 0.043497003614902496, + "learning_rate": 9.790668904400165e-05, + "loss": 0.2286, + "step": 3420 + }, + { + "epoch": 0.6895023171468869, + "grad_norm": 0.054365627467632294, + "learning_rate": 9.790287168265806e-05, + "loss": 0.1956, + "step": 3422 + }, + { + "epoch": 0.689905299214185, + "grad_norm": 0.07255329191684723, + "learning_rate": 9.789905091837109e-05, + "loss": 0.1929, + "step": 3424 + }, + { + "epoch": 0.690308281281483, + "grad_norm": 0.038361676037311554, + "learning_rate": 9.789522675141212e-05, + "loss": 0.18, + "step": 3426 + }, + { + "epoch": 0.690711263348781, + "grad_norm": 0.0500890351831913, + "learning_rate": 9.789139918205285e-05, + "loss": 0.1705, + "step": 3428 + }, + { + "epoch": 0.691114245416079, + "grad_norm": 0.054220765829086304, + "learning_rate": 9.788756821056517e-05, + "loss": 0.256, + "step": 3430 + }, + { + "epoch": 0.691517227483377, + "grad_norm": 0.05716811493039131, + "learning_rate": 9.788373383722125e-05, + "loss": 0.1494, + "step": 3432 + }, + { + "epoch": 0.691920209550675, + "grad_norm": 0.057214152067899704, + "learning_rate": 9.787989606229343e-05, + "loss": 0.226, + "step": 3434 + }, + { + "epoch": 0.692323191617973, + "grad_norm": 0.05963205546140671, + "learning_rate": 9.787605488605438e-05, + "loss": 0.1787, + "step": 3436 + }, + { + "epoch": 0.692726173685271, + "grad_norm": 0.06281402707099915, + "learning_rate": 9.787221030877696e-05, + "loss": 0.1814, + "step": 3438 + }, + { + "epoch": 0.6931291557525691, + "grad_norm": 0.044817693531513214, + "learning_rate": 9.786836233073427e-05, + "loss": 0.2141, + "step": 3440 + }, + { + "epoch": 0.693532137819867, + "grad_norm": 0.07125949114561081, + "learning_rate": 9.786451095219967e-05, + "loss": 0.2133, + "step": 3442 + }, + { + "epoch": 0.693935119887165, + "grad_norm": 0.06045832857489586, + "learning_rate": 9.786065617344677e-05, + "loss": 0.1693, + "step": 3444 + }, + { + "epoch": 0.694338101954463, + "grad_norm": 0.05925006419420242, + "learning_rate": 9.78567979947494e-05, + "loss": 0.1897, + "step": 3446 + }, + { + "epoch": 0.694741084021761, + "grad_norm": 0.047181349247694016, + "learning_rate": 9.785293641638162e-05, + "loss": 0.1675, + "step": 3448 + }, + { + "epoch": 0.695144066089059, + "grad_norm": 0.045355185866355896, + "learning_rate": 9.784907143861779e-05, + "loss": 0.2318, + "step": 3450 + }, + { + "epoch": 0.695547048156357, + "grad_norm": 0.06112281605601311, + "learning_rate": 9.784520306173244e-05, + "loss": 0.1841, + "step": 3452 + }, + { + "epoch": 0.6959500302236551, + "grad_norm": 0.06289122998714447, + "learning_rate": 9.784133128600037e-05, + "loss": 0.1489, + "step": 3454 + }, + { + "epoch": 0.696353012290953, + "grad_norm": 0.06532011181116104, + "learning_rate": 9.783745611169665e-05, + "loss": 0.1487, + "step": 3456 + }, + { + "epoch": 0.6967559943582511, + "grad_norm": 0.055435534566640854, + "learning_rate": 9.783357753909654e-05, + "loss": 0.1732, + "step": 3458 + }, + { + "epoch": 0.697158976425549, + "grad_norm": 0.06668412685394287, + "learning_rate": 9.78296955684756e-05, + "loss": 0.2097, + "step": 3460 + }, + { + "epoch": 0.6975619584928471, + "grad_norm": 0.05773022770881653, + "learning_rate": 9.782581020010956e-05, + "loss": 0.2037, + "step": 3462 + }, + { + "epoch": 0.697964940560145, + "grad_norm": 0.06257740408182144, + "learning_rate": 9.782192143427446e-05, + "loss": 0.2101, + "step": 3464 + }, + { + "epoch": 0.6983679226274431, + "grad_norm": 0.07159219682216644, + "learning_rate": 9.781802927124652e-05, + "loss": 0.2461, + "step": 3466 + }, + { + "epoch": 0.6987709046947411, + "grad_norm": 0.04788602888584137, + "learning_rate": 9.781413371130228e-05, + "loss": 0.1685, + "step": 3468 + }, + { + "epoch": 0.6991738867620391, + "grad_norm": 0.05469825863838196, + "learning_rate": 9.781023475471845e-05, + "loss": 0.2121, + "step": 3470 + }, + { + "epoch": 0.6995768688293371, + "grad_norm": 0.042920369654893875, + "learning_rate": 9.780633240177198e-05, + "loss": 0.2039, + "step": 3472 + }, + { + "epoch": 0.6999798508966351, + "grad_norm": 0.043295204639434814, + "learning_rate": 9.780242665274013e-05, + "loss": 0.1937, + "step": 3474 + }, + { + "epoch": 0.7003828329639331, + "grad_norm": 0.042053669691085815, + "learning_rate": 9.779851750790033e-05, + "loss": 0.182, + "step": 3476 + }, + { + "epoch": 0.7007858150312312, + "grad_norm": 0.050461430102586746, + "learning_rate": 9.77946049675303e-05, + "loss": 0.1847, + "step": 3478 + }, + { + "epoch": 0.7011887970985291, + "grad_norm": 0.05842139571905136, + "learning_rate": 9.779068903190796e-05, + "loss": 0.2068, + "step": 3480 + }, + { + "epoch": 0.7015917791658272, + "grad_norm": 0.055488720536231995, + "learning_rate": 9.77867697013115e-05, + "loss": 0.2043, + "step": 3482 + }, + { + "epoch": 0.7019947612331251, + "grad_norm": 0.05179367586970329, + "learning_rate": 9.778284697601934e-05, + "loss": 0.1916, + "step": 3484 + }, + { + "epoch": 0.7023977433004231, + "grad_norm": 0.056457649916410446, + "learning_rate": 9.777892085631016e-05, + "loss": 0.2545, + "step": 3486 + }, + { + "epoch": 0.7028007253677211, + "grad_norm": 0.05595328286290169, + "learning_rate": 9.777499134246285e-05, + "loss": 0.1836, + "step": 3488 + }, + { + "epoch": 0.7032037074350191, + "grad_norm": 0.04369045048952103, + "learning_rate": 9.777105843475655e-05, + "loss": 0.1407, + "step": 3490 + }, + { + "epoch": 0.7036066895023172, + "grad_norm": 0.05445627123117447, + "learning_rate": 9.776712213347068e-05, + "loss": 0.2406, + "step": 3492 + }, + { + "epoch": 0.7040096715696151, + "grad_norm": 0.0552406869828701, + "learning_rate": 9.776318243888482e-05, + "loss": 0.1942, + "step": 3494 + }, + { + "epoch": 0.7044126536369132, + "grad_norm": 0.04776231199502945, + "learning_rate": 9.775923935127889e-05, + "loss": 0.2204, + "step": 3496 + }, + { + "epoch": 0.7048156357042111, + "grad_norm": 0.04784173145890236, + "learning_rate": 9.775529287093296e-05, + "loss": 0.243, + "step": 3498 + }, + { + "epoch": 0.7052186177715092, + "grad_norm": 0.04918511211872101, + "learning_rate": 9.77513429981274e-05, + "loss": 0.18, + "step": 3500 + }, + { + "epoch": 0.7056215998388071, + "grad_norm": 0.062354523688554764, + "learning_rate": 9.774738973314281e-05, + "loss": 0.2023, + "step": 3502 + }, + { + "epoch": 0.7060245819061052, + "grad_norm": 0.05601181089878082, + "learning_rate": 9.774343307626e-05, + "loss": 0.2051, + "step": 3504 + }, + { + "epoch": 0.7064275639734032, + "grad_norm": 0.07394832372665405, + "learning_rate": 9.773947302776006e-05, + "loss": 0.1896, + "step": 3506 + }, + { + "epoch": 0.7068305460407012, + "grad_norm": 0.05391557887196541, + "learning_rate": 9.77355095879243e-05, + "loss": 0.2055, + "step": 3508 + }, + { + "epoch": 0.7072335281079992, + "grad_norm": 0.06169082224369049, + "learning_rate": 9.77315427570343e-05, + "loss": 0.2209, + "step": 3510 + }, + { + "epoch": 0.7076365101752972, + "grad_norm": 0.050488028675317764, + "learning_rate": 9.772757253537184e-05, + "loss": 0.1484, + "step": 3512 + }, + { + "epoch": 0.7080394922425952, + "grad_norm": 0.04180103540420532, + "learning_rate": 9.772359892321893e-05, + "loss": 0.1583, + "step": 3514 + }, + { + "epoch": 0.7084424743098932, + "grad_norm": 0.059814367443323135, + "learning_rate": 9.771962192085789e-05, + "loss": 0.1898, + "step": 3516 + }, + { + "epoch": 0.7088454563771912, + "grad_norm": 0.06913917511701584, + "learning_rate": 9.771564152857123e-05, + "loss": 0.1652, + "step": 3518 + }, + { + "epoch": 0.7092484384444893, + "grad_norm": 0.060908909887075424, + "learning_rate": 9.77116577466417e-05, + "loss": 0.2386, + "step": 3520 + }, + { + "epoch": 0.7096514205117872, + "grad_norm": 0.03974832966923714, + "learning_rate": 9.77076705753523e-05, + "loss": 0.1894, + "step": 3522 + }, + { + "epoch": 0.7100544025790853, + "grad_norm": 0.04386703670024872, + "learning_rate": 9.770368001498629e-05, + "loss": 0.201, + "step": 3524 + }, + { + "epoch": 0.7104573846463832, + "grad_norm": 0.04671144112944603, + "learning_rate": 9.769968606582713e-05, + "loss": 0.1959, + "step": 3526 + }, + { + "epoch": 0.7108603667136812, + "grad_norm": 0.044962078332901, + "learning_rate": 9.769568872815856e-05, + "loss": 0.2122, + "step": 3528 + }, + { + "epoch": 0.7112633487809793, + "grad_norm": 0.046789754182100296, + "learning_rate": 9.769168800226454e-05, + "loss": 0.1655, + "step": 3530 + }, + { + "epoch": 0.7116663308482772, + "grad_norm": 0.04514605551958084, + "learning_rate": 9.768768388842929e-05, + "loss": 0.2152, + "step": 3532 + }, + { + "epoch": 0.7120693129155753, + "grad_norm": 0.044290874153375626, + "learning_rate": 9.768367638693723e-05, + "loss": 0.1686, + "step": 3534 + }, + { + "epoch": 0.7124722949828732, + "grad_norm": 0.05953364819288254, + "learning_rate": 9.767966549807306e-05, + "loss": 0.1464, + "step": 3536 + }, + { + "epoch": 0.7128752770501713, + "grad_norm": 0.04944094642996788, + "learning_rate": 9.767565122212171e-05, + "loss": 0.1621, + "step": 3538 + }, + { + "epoch": 0.7132782591174692, + "grad_norm": 0.07431504875421524, + "learning_rate": 9.767163355936835e-05, + "loss": 0.1801, + "step": 3540 + }, + { + "epoch": 0.7136812411847673, + "grad_norm": 0.06648550927639008, + "learning_rate": 9.766761251009836e-05, + "loss": 0.202, + "step": 3542 + }, + { + "epoch": 0.7140842232520653, + "grad_norm": 0.06690148264169693, + "learning_rate": 9.766358807459742e-05, + "loss": 0.1898, + "step": 3544 + }, + { + "epoch": 0.7144872053193633, + "grad_norm": 0.06820599734783173, + "learning_rate": 9.765956025315142e-05, + "loss": 0.2158, + "step": 3546 + }, + { + "epoch": 0.7148901873866613, + "grad_norm": 0.05998406931757927, + "learning_rate": 9.765552904604647e-05, + "loss": 0.2057, + "step": 3548 + }, + { + "epoch": 0.7152931694539593, + "grad_norm": 0.08947255462408066, + "learning_rate": 9.765149445356894e-05, + "loss": 0.2408, + "step": 3550 + }, + { + "epoch": 0.7156961515212573, + "grad_norm": 0.0403655469417572, + "learning_rate": 9.764745647600545e-05, + "loss": 0.1495, + "step": 3552 + }, + { + "epoch": 0.7160991335885553, + "grad_norm": 0.04842576012015343, + "learning_rate": 9.764341511364288e-05, + "loss": 0.1605, + "step": 3554 + }, + { + "epoch": 0.7165021156558533, + "grad_norm": 0.06308891624212265, + "learning_rate": 9.763937036676829e-05, + "loss": 0.2036, + "step": 3556 + }, + { + "epoch": 0.7169050977231514, + "grad_norm": 0.05665654316544533, + "learning_rate": 9.7635322235669e-05, + "loss": 0.2227, + "step": 3558 + }, + { + "epoch": 0.7173080797904493, + "grad_norm": 0.05124920234084129, + "learning_rate": 9.763127072063261e-05, + "loss": 0.1838, + "step": 3560 + }, + { + "epoch": 0.7177110618577474, + "grad_norm": 0.05555025488138199, + "learning_rate": 9.762721582194692e-05, + "loss": 0.2314, + "step": 3562 + }, + { + "epoch": 0.7181140439250453, + "grad_norm": 0.06558901816606522, + "learning_rate": 9.762315753989999e-05, + "loss": 0.1591, + "step": 3564 + }, + { + "epoch": 0.7185170259923434, + "grad_norm": 0.08271800726652145, + "learning_rate": 9.76190958747801e-05, + "loss": 0.2709, + "step": 3566 + }, + { + "epoch": 0.7189200080596413, + "grad_norm": 0.05561373382806778, + "learning_rate": 9.76150308268758e-05, + "loss": 0.1889, + "step": 3568 + }, + { + "epoch": 0.7193229901269393, + "grad_norm": 0.05947286635637283, + "learning_rate": 9.761096239647588e-05, + "loss": 0.2221, + "step": 3570 + }, + { + "epoch": 0.7197259721942374, + "grad_norm": 0.05997093394398689, + "learning_rate": 9.760689058386929e-05, + "loss": 0.2229, + "step": 3572 + }, + { + "epoch": 0.7201289542615353, + "grad_norm": 0.06581594049930573, + "learning_rate": 9.760281538934536e-05, + "loss": 0.2132, + "step": 3574 + }, + { + "epoch": 0.7205319363288334, + "grad_norm": 0.05122917890548706, + "learning_rate": 9.759873681319354e-05, + "loss": 0.2357, + "step": 3576 + }, + { + "epoch": 0.7209349183961313, + "grad_norm": 0.0549938790500164, + "learning_rate": 9.75946548557036e-05, + "loss": 0.2176, + "step": 3578 + }, + { + "epoch": 0.7213379004634294, + "grad_norm": 0.038299329578876495, + "learning_rate": 9.759056951716548e-05, + "loss": 0.2005, + "step": 3580 + }, + { + "epoch": 0.7217408825307274, + "grad_norm": 0.04204018786549568, + "learning_rate": 9.758648079786941e-05, + "loss": 0.1957, + "step": 3582 + }, + { + "epoch": 0.7221438645980254, + "grad_norm": 0.0423753559589386, + "learning_rate": 9.758238869810585e-05, + "loss": 0.2046, + "step": 3584 + }, + { + "epoch": 0.7225468466653234, + "grad_norm": 0.04697240889072418, + "learning_rate": 9.75782932181655e-05, + "loss": 0.1985, + "step": 3586 + }, + { + "epoch": 0.7229498287326214, + "grad_norm": 0.054548196494579315, + "learning_rate": 9.757419435833928e-05, + "loss": 0.1823, + "step": 3588 + }, + { + "epoch": 0.7233528107999194, + "grad_norm": 0.040596771985292435, + "learning_rate": 9.757009211891839e-05, + "loss": 0.184, + "step": 3590 + }, + { + "epoch": 0.7237557928672174, + "grad_norm": 0.04304562136530876, + "learning_rate": 9.756598650019421e-05, + "loss": 0.2115, + "step": 3592 + }, + { + "epoch": 0.7241587749345154, + "grad_norm": 0.04047536477446556, + "learning_rate": 9.756187750245844e-05, + "loss": 0.2026, + "step": 3594 + }, + { + "epoch": 0.7245617570018135, + "grad_norm": 0.050050195306539536, + "learning_rate": 9.755776512600295e-05, + "loss": 0.2037, + "step": 3596 + }, + { + "epoch": 0.7249647390691114, + "grad_norm": 0.063184455037117, + "learning_rate": 9.755364937111988e-05, + "loss": 0.1819, + "step": 3598 + }, + { + "epoch": 0.7253677211364095, + "grad_norm": 0.05444857105612755, + "learning_rate": 9.754953023810162e-05, + "loss": 0.1766, + "step": 3600 + }, + { + "epoch": 0.7257707032037074, + "grad_norm": 0.049810487776994705, + "learning_rate": 9.754540772724077e-05, + "loss": 0.1551, + "step": 3602 + }, + { + "epoch": 0.7261736852710055, + "grad_norm": 0.060143712908029556, + "learning_rate": 9.754128183883018e-05, + "loss": 0.2045, + "step": 3604 + }, + { + "epoch": 0.7265766673383034, + "grad_norm": 0.055027224123477936, + "learning_rate": 9.753715257316298e-05, + "loss": 0.1711, + "step": 3606 + }, + { + "epoch": 0.7269796494056014, + "grad_norm": 0.054849088191986084, + "learning_rate": 9.753301993053247e-05, + "loss": 0.2089, + "step": 3608 + }, + { + "epoch": 0.7273826314728995, + "grad_norm": 0.06147737428545952, + "learning_rate": 9.752888391123224e-05, + "loss": 0.2434, + "step": 3610 + }, + { + "epoch": 0.7277856135401974, + "grad_norm": 0.0619664303958416, + "learning_rate": 9.752474451555614e-05, + "loss": 0.2034, + "step": 3612 + }, + { + "epoch": 0.7281885956074955, + "grad_norm": 0.053098902106285095, + "learning_rate": 9.752060174379816e-05, + "loss": 0.2415, + "step": 3614 + }, + { + "epoch": 0.7285915776747934, + "grad_norm": 0.05426261946558952, + "learning_rate": 9.751645559625264e-05, + "loss": 0.2463, + "step": 3616 + }, + { + "epoch": 0.7289945597420915, + "grad_norm": 0.07284572720527649, + "learning_rate": 9.751230607321411e-05, + "loss": 0.2031, + "step": 3618 + }, + { + "epoch": 0.7293975418093894, + "grad_norm": 0.05113440379500389, + "learning_rate": 9.750815317497733e-05, + "loss": 0.1986, + "step": 3620 + }, + { + "epoch": 0.7298005238766875, + "grad_norm": 0.07082262635231018, + "learning_rate": 9.750399690183733e-05, + "loss": 0.1869, + "step": 3622 + }, + { + "epoch": 0.7302035059439855, + "grad_norm": 0.04850250855088234, + "learning_rate": 9.749983725408938e-05, + "loss": 0.1853, + "step": 3624 + }, + { + "epoch": 0.7306064880112835, + "grad_norm": 0.062024109065532684, + "learning_rate": 9.749567423202893e-05, + "loss": 0.203, + "step": 3626 + }, + { + "epoch": 0.7310094700785815, + "grad_norm": 0.062060195952653885, + "learning_rate": 9.749150783595176e-05, + "loss": 0.2347, + "step": 3628 + }, + { + "epoch": 0.7314124521458795, + "grad_norm": 0.04899689182639122, + "learning_rate": 9.748733806615382e-05, + "loss": 0.2279, + "step": 3630 + }, + { + "epoch": 0.7318154342131775, + "grad_norm": 0.05050105229020119, + "learning_rate": 9.748316492293132e-05, + "loss": 0.2057, + "step": 3632 + }, + { + "epoch": 0.7322184162804756, + "grad_norm": 0.05160733684897423, + "learning_rate": 9.747898840658072e-05, + "loss": 0.2122, + "step": 3634 + }, + { + "epoch": 0.7326213983477735, + "grad_norm": 0.051758818328380585, + "learning_rate": 9.747480851739872e-05, + "loss": 0.2239, + "step": 3636 + }, + { + "epoch": 0.7330243804150716, + "grad_norm": 0.07806959003210068, + "learning_rate": 9.747062525568226e-05, + "loss": 0.2139, + "step": 3638 + }, + { + "epoch": 0.7334273624823695, + "grad_norm": 0.03898259997367859, + "learning_rate": 9.746643862172849e-05, + "loss": 0.2334, + "step": 3640 + }, + { + "epoch": 0.7338303445496676, + "grad_norm": 0.048747796565294266, + "learning_rate": 9.746224861583484e-05, + "loss": 0.2342, + "step": 3642 + }, + { + "epoch": 0.7342333266169655, + "grad_norm": 0.04277431219816208, + "learning_rate": 9.745805523829893e-05, + "loss": 0.1758, + "step": 3644 + }, + { + "epoch": 0.7346363086842636, + "grad_norm": 0.05254721641540527, + "learning_rate": 9.74538584894187e-05, + "loss": 0.2353, + "step": 3646 + }, + { + "epoch": 0.7350392907515616, + "grad_norm": 0.04728619009256363, + "learning_rate": 9.744965836949225e-05, + "loss": 0.193, + "step": 3648 + }, + { + "epoch": 0.7354422728188595, + "grad_norm": 0.04738273471593857, + "learning_rate": 9.744545487881793e-05, + "loss": 0.1914, + "step": 3650 + }, + { + "epoch": 0.7358452548861576, + "grad_norm": 0.047282394021749496, + "learning_rate": 9.74412480176944e-05, + "loss": 0.2354, + "step": 3652 + }, + { + "epoch": 0.7362482369534555, + "grad_norm": 0.06668908894062042, + "learning_rate": 9.743703778642047e-05, + "loss": 0.2104, + "step": 3654 + }, + { + "epoch": 0.7366512190207536, + "grad_norm": 0.042231086641550064, + "learning_rate": 9.743282418529525e-05, + "loss": 0.1355, + "step": 3656 + }, + { + "epoch": 0.7370542010880515, + "grad_norm": 0.06251658499240875, + "learning_rate": 9.742860721461804e-05, + "loss": 0.22, + "step": 3658 + }, + { + "epoch": 0.7374571831553496, + "grad_norm": 0.040848251432180405, + "learning_rate": 9.742438687468843e-05, + "loss": 0.1877, + "step": 3660 + }, + { + "epoch": 0.7378601652226476, + "grad_norm": 0.07239284366369247, + "learning_rate": 9.742016316580622e-05, + "loss": 0.2361, + "step": 3662 + }, + { + "epoch": 0.7382631472899456, + "grad_norm": 0.048893146216869354, + "learning_rate": 9.741593608827146e-05, + "loss": 0.232, + "step": 3664 + }, + { + "epoch": 0.7386661293572436, + "grad_norm": 0.04481977969408035, + "learning_rate": 9.741170564238444e-05, + "loss": 0.181, + "step": 3666 + }, + { + "epoch": 0.7390691114245416, + "grad_norm": 0.04950220510363579, + "learning_rate": 9.740747182844567e-05, + "loss": 0.2143, + "step": 3668 + }, + { + "epoch": 0.7394720934918396, + "grad_norm": 0.04517577216029167, + "learning_rate": 9.740323464675591e-05, + "loss": 0.2452, + "step": 3670 + }, + { + "epoch": 0.7398750755591377, + "grad_norm": 0.04096828028559685, + "learning_rate": 9.739899409761617e-05, + "loss": 0.2507, + "step": 3672 + }, + { + "epoch": 0.7402780576264356, + "grad_norm": 0.04736701026558876, + "learning_rate": 9.739475018132771e-05, + "loss": 0.1706, + "step": 3674 + }, + { + "epoch": 0.7406810396937337, + "grad_norm": 0.06137583777308464, + "learning_rate": 9.739050289819198e-05, + "loss": 0.1851, + "step": 3676 + }, + { + "epoch": 0.7410840217610316, + "grad_norm": 0.05057776719331741, + "learning_rate": 9.738625224851071e-05, + "loss": 0.1475, + "step": 3678 + }, + { + "epoch": 0.7414870038283297, + "grad_norm": 0.058802492916584015, + "learning_rate": 9.738199823258587e-05, + "loss": 0.1984, + "step": 3680 + }, + { + "epoch": 0.7418899858956276, + "grad_norm": 0.0801457017660141, + "learning_rate": 9.737774085071965e-05, + "loss": 0.198, + "step": 3682 + }, + { + "epoch": 0.7422929679629257, + "grad_norm": 0.04764263331890106, + "learning_rate": 9.73734801032145e-05, + "loss": 0.2157, + "step": 3684 + }, + { + "epoch": 0.7426959500302237, + "grad_norm": 0.04858770966529846, + "learning_rate": 9.736921599037307e-05, + "loss": 0.1898, + "step": 3686 + }, + { + "epoch": 0.7430989320975216, + "grad_norm": 0.06790611147880554, + "learning_rate": 9.73649485124983e-05, + "loss": 0.1888, + "step": 3688 + }, + { + "epoch": 0.7435019141648197, + "grad_norm": 0.05359569564461708, + "learning_rate": 9.736067766989333e-05, + "loss": 0.2109, + "step": 3690 + }, + { + "epoch": 0.7439048962321176, + "grad_norm": 0.045951735228300095, + "learning_rate": 9.735640346286157e-05, + "loss": 0.2321, + "step": 3692 + }, + { + "epoch": 0.7443078782994157, + "grad_norm": 0.038241468369960785, + "learning_rate": 9.735212589170664e-05, + "loss": 0.1565, + "step": 3694 + }, + { + "epoch": 0.7447108603667136, + "grad_norm": 0.054619934409856796, + "learning_rate": 9.734784495673242e-05, + "loss": 0.2008, + "step": 3696 + }, + { + "epoch": 0.7451138424340117, + "grad_norm": 0.04232056066393852, + "learning_rate": 9.734356065824301e-05, + "loss": 0.1715, + "step": 3698 + }, + { + "epoch": 0.7455168245013097, + "grad_norm": 0.05665093660354614, + "learning_rate": 9.733927299654277e-05, + "loss": 0.1869, + "step": 3700 + }, + { + "epoch": 0.7459198065686077, + "grad_norm": 0.05509399622678757, + "learning_rate": 9.733498197193627e-05, + "loss": 0.2342, + "step": 3702 + }, + { + "epoch": 0.7463227886359057, + "grad_norm": 0.051437657326459885, + "learning_rate": 9.733068758472836e-05, + "loss": 0.2195, + "step": 3704 + }, + { + "epoch": 0.7467257707032037, + "grad_norm": 0.06696417927742004, + "learning_rate": 9.73263898352241e-05, + "loss": 0.1925, + "step": 3706 + }, + { + "epoch": 0.7471287527705017, + "grad_norm": 0.05943130701780319, + "learning_rate": 9.73220887237288e-05, + "loss": 0.1996, + "step": 3708 + }, + { + "epoch": 0.7475317348377997, + "grad_norm": 0.05695728957653046, + "learning_rate": 9.731778425054801e-05, + "loss": 0.2164, + "step": 3710 + }, + { + "epoch": 0.7479347169050977, + "grad_norm": 0.05246898904442787, + "learning_rate": 9.731347641598747e-05, + "loss": 0.2263, + "step": 3712 + }, + { + "epoch": 0.7483376989723958, + "grad_norm": 0.05090521275997162, + "learning_rate": 9.730916522035325e-05, + "loss": 0.2062, + "step": 3714 + }, + { + "epoch": 0.7487406810396937, + "grad_norm": 0.0509493350982666, + "learning_rate": 9.730485066395158e-05, + "loss": 0.2105, + "step": 3716 + }, + { + "epoch": 0.7491436631069918, + "grad_norm": 0.06465563923120499, + "learning_rate": 9.730053274708898e-05, + "loss": 0.2148, + "step": 3718 + }, + { + "epoch": 0.7495466451742897, + "grad_norm": 0.06121086701750755, + "learning_rate": 9.729621147007218e-05, + "loss": 0.2228, + "step": 3720 + }, + { + "epoch": 0.7499496272415878, + "grad_norm": 0.05174422636628151, + "learning_rate": 9.729188683320816e-05, + "loss": 0.2304, + "step": 3722 + }, + { + "epoch": 0.7503526093088858, + "grad_norm": 0.052721381187438965, + "learning_rate": 9.728755883680412e-05, + "loss": 0.1822, + "step": 3724 + }, + { + "epoch": 0.7507555913761838, + "grad_norm": 0.04520372301340103, + "learning_rate": 9.728322748116754e-05, + "loss": 0.1936, + "step": 3726 + }, + { + "epoch": 0.7511585734434818, + "grad_norm": 0.04828318580985069, + "learning_rate": 9.727889276660608e-05, + "loss": 0.175, + "step": 3728 + }, + { + "epoch": 0.7515615555107797, + "grad_norm": 0.04844732955098152, + "learning_rate": 9.72745546934277e-05, + "loss": 0.2404, + "step": 3730 + }, + { + "epoch": 0.7519645375780778, + "grad_norm": 0.04914208874106407, + "learning_rate": 9.727021326194057e-05, + "loss": 0.199, + "step": 3732 + }, + { + "epoch": 0.7523675196453757, + "grad_norm": 0.05813221260905266, + "learning_rate": 9.726586847245308e-05, + "loss": 0.1974, + "step": 3734 + }, + { + "epoch": 0.7527705017126738, + "grad_norm": 0.07353874295949936, + "learning_rate": 9.726152032527386e-05, + "loss": 0.194, + "step": 3736 + }, + { + "epoch": 0.7531734837799718, + "grad_norm": 0.06467559933662415, + "learning_rate": 9.725716882071185e-05, + "loss": 0.2251, + "step": 3738 + }, + { + "epoch": 0.7535764658472698, + "grad_norm": 0.08326564729213715, + "learning_rate": 9.725281395907612e-05, + "loss": 0.2149, + "step": 3740 + }, + { + "epoch": 0.7539794479145678, + "grad_norm": 0.06660158187150955, + "learning_rate": 9.724845574067607e-05, + "loss": 0.1908, + "step": 3742 + }, + { + "epoch": 0.7543824299818658, + "grad_norm": 0.04767664894461632, + "learning_rate": 9.724409416582129e-05, + "loss": 0.1932, + "step": 3744 + }, + { + "epoch": 0.7547854120491638, + "grad_norm": 0.05308603122830391, + "learning_rate": 9.723972923482163e-05, + "loss": 0.2752, + "step": 3746 + }, + { + "epoch": 0.7551883941164618, + "grad_norm": 0.05647696554660797, + "learning_rate": 9.723536094798713e-05, + "loss": 0.2571, + "step": 3748 + }, + { + "epoch": 0.7555913761837598, + "grad_norm": 0.06191622465848923, + "learning_rate": 9.723098930562813e-05, + "loss": 0.1999, + "step": 3750 + }, + { + "epoch": 0.7559943582510579, + "grad_norm": 0.06864877790212631, + "learning_rate": 9.72266143080552e-05, + "loss": 0.1778, + "step": 3752 + }, + { + "epoch": 0.7563973403183558, + "grad_norm": 0.04861884191632271, + "learning_rate": 9.72222359555791e-05, + "loss": 0.1615, + "step": 3754 + }, + { + "epoch": 0.7568003223856539, + "grad_norm": 0.06259770691394806, + "learning_rate": 9.721785424851089e-05, + "loss": 0.2407, + "step": 3756 + }, + { + "epoch": 0.7572033044529518, + "grad_norm": 0.07162967324256897, + "learning_rate": 9.721346918716184e-05, + "loss": 0.2126, + "step": 3758 + }, + { + "epoch": 0.7576062865202499, + "grad_norm": 0.07644571363925934, + "learning_rate": 9.720908077184341e-05, + "loss": 0.2492, + "step": 3760 + }, + { + "epoch": 0.7580092685875478, + "grad_norm": 0.10371597111225128, + "learning_rate": 9.720468900286741e-05, + "loss": 0.2252, + "step": 3762 + }, + { + "epoch": 0.7584122506548459, + "grad_norm": 0.23490440845489502, + "learning_rate": 9.720029388054578e-05, + "loss": 0.2502, + "step": 3764 + }, + { + "epoch": 0.7588152327221439, + "grad_norm": 0.05937601253390312, + "learning_rate": 9.719589540519077e-05, + "loss": 0.2005, + "step": 3766 + }, + { + "epoch": 0.7592182147894418, + "grad_norm": 0.05609803646802902, + "learning_rate": 9.719149357711483e-05, + "loss": 0.2276, + "step": 3768 + }, + { + "epoch": 0.7596211968567399, + "grad_norm": 0.059711702167987823, + "learning_rate": 9.718708839663065e-05, + "loss": 0.2541, + "step": 3770 + }, + { + "epoch": 0.7600241789240378, + "grad_norm": 0.05018291249871254, + "learning_rate": 9.718267986405118e-05, + "loss": 0.1599, + "step": 3772 + }, + { + "epoch": 0.7604271609913359, + "grad_norm": 0.05470029264688492, + "learning_rate": 9.717826797968958e-05, + "loss": 0.1962, + "step": 3774 + }, + { + "epoch": 0.760830143058634, + "grad_norm": 0.05150442197918892, + "learning_rate": 9.717385274385929e-05, + "loss": 0.198, + "step": 3776 + }, + { + "epoch": 0.7612331251259319, + "grad_norm": 0.06866799294948578, + "learning_rate": 9.716943415687394e-05, + "loss": 0.2038, + "step": 3778 + }, + { + "epoch": 0.7616361071932299, + "grad_norm": 0.046754587441682816, + "learning_rate": 9.716501221904741e-05, + "loss": 0.1902, + "step": 3780 + }, + { + "epoch": 0.7620390892605279, + "grad_norm": 0.05241142213344574, + "learning_rate": 9.716058693069386e-05, + "loss": 0.2104, + "step": 3782 + }, + { + "epoch": 0.7624420713278259, + "grad_norm": 0.06286633759737015, + "learning_rate": 9.715615829212763e-05, + "loss": 0.1956, + "step": 3784 + }, + { + "epoch": 0.7628450533951239, + "grad_norm": 0.09779243171215057, + "learning_rate": 9.715172630366334e-05, + "loss": 0.2315, + "step": 3786 + }, + { + "epoch": 0.7632480354624219, + "grad_norm": 0.061123188585042953, + "learning_rate": 9.71472909656158e-05, + "loss": 0.1918, + "step": 3788 + }, + { + "epoch": 0.76365101752972, + "grad_norm": 0.0447014644742012, + "learning_rate": 9.714285227830013e-05, + "loss": 0.192, + "step": 3790 + }, + { + "epoch": 0.7640539995970179, + "grad_norm": 0.08232752978801727, + "learning_rate": 9.71384102420316e-05, + "loss": 0.2029, + "step": 3792 + }, + { + "epoch": 0.764456981664316, + "grad_norm": 0.05152542516589165, + "learning_rate": 9.713396485712583e-05, + "loss": 0.2, + "step": 3794 + }, + { + "epoch": 0.7648599637316139, + "grad_norm": 0.06779137253761292, + "learning_rate": 9.712951612389855e-05, + "loss": 0.2021, + "step": 3796 + }, + { + "epoch": 0.765262945798912, + "grad_norm": 0.04836380109190941, + "learning_rate": 9.712506404266583e-05, + "loss": 0.1895, + "step": 3798 + }, + { + "epoch": 0.7656659278662099, + "grad_norm": 0.062117740511894226, + "learning_rate": 9.712060861374391e-05, + "loss": 0.1953, + "step": 3800 + }, + { + "epoch": 0.766068909933508, + "grad_norm": 0.057543814182281494, + "learning_rate": 9.711614983744932e-05, + "loss": 0.1926, + "step": 3802 + }, + { + "epoch": 0.766471892000806, + "grad_norm": 0.07145657390356064, + "learning_rate": 9.711168771409882e-05, + "loss": 0.1997, + "step": 3804 + }, + { + "epoch": 0.766874874068104, + "grad_norm": 0.07367728650569916, + "learning_rate": 9.710722224400935e-05, + "loss": 0.256, + "step": 3806 + }, + { + "epoch": 0.767277856135402, + "grad_norm": 0.5376330018043518, + "learning_rate": 9.710275342749813e-05, + "loss": 0.2055, + "step": 3808 + }, + { + "epoch": 0.7676808382026999, + "grad_norm": 0.048469942063093185, + "learning_rate": 9.709828126488265e-05, + "loss": 0.213, + "step": 3810 + }, + { + "epoch": 0.768083820269998, + "grad_norm": 0.05275251343846321, + "learning_rate": 9.709380575648061e-05, + "loss": 0.2052, + "step": 3812 + }, + { + "epoch": 0.7684868023372959, + "grad_norm": 0.046325862407684326, + "learning_rate": 9.70893269026099e-05, + "loss": 0.2063, + "step": 3814 + }, + { + "epoch": 0.768889784404594, + "grad_norm": 0.04846194013953209, + "learning_rate": 9.708484470358873e-05, + "loss": 0.2443, + "step": 3816 + }, + { + "epoch": 0.769292766471892, + "grad_norm": 0.06712636351585388, + "learning_rate": 9.708035915973548e-05, + "loss": 0.1973, + "step": 3818 + }, + { + "epoch": 0.76969574853919, + "grad_norm": 0.04823420196771622, + "learning_rate": 9.707587027136882e-05, + "loss": 0.2347, + "step": 3820 + }, + { + "epoch": 0.770098730606488, + "grad_norm": 0.05545537546277046, + "learning_rate": 9.707137803880762e-05, + "loss": 0.1621, + "step": 3822 + }, + { + "epoch": 0.770501712673786, + "grad_norm": 0.06045274809002876, + "learning_rate": 9.706688246237101e-05, + "loss": 0.1736, + "step": 3824 + }, + { + "epoch": 0.770904694741084, + "grad_norm": 0.04846609756350517, + "learning_rate": 9.706238354237833e-05, + "loss": 0.1557, + "step": 3826 + }, + { + "epoch": 0.7713076768083821, + "grad_norm": 0.056797757744789124, + "learning_rate": 9.70578812791492e-05, + "loss": 0.1836, + "step": 3828 + }, + { + "epoch": 0.77171065887568, + "grad_norm": 0.06821225583553314, + "learning_rate": 9.705337567300343e-05, + "loss": 0.2045, + "step": 3830 + }, + { + "epoch": 0.7721136409429781, + "grad_norm": 0.052724190056324005, + "learning_rate": 9.704886672426111e-05, + "loss": 0.1744, + "step": 3832 + }, + { + "epoch": 0.772516623010276, + "grad_norm": 0.12339714169502258, + "learning_rate": 9.704435443324254e-05, + "loss": 0.2365, + "step": 3834 + }, + { + "epoch": 0.7729196050775741, + "grad_norm": 0.05186685919761658, + "learning_rate": 9.703983880026827e-05, + "loss": 0.1393, + "step": 3836 + }, + { + "epoch": 0.773322587144872, + "grad_norm": 0.09036187082529068, + "learning_rate": 9.703531982565907e-05, + "loss": 0.2183, + "step": 3838 + }, + { + "epoch": 0.7737255692121701, + "grad_norm": 0.041202742606401443, + "learning_rate": 9.703079750973598e-05, + "loss": 0.2083, + "step": 3840 + }, + { + "epoch": 0.7741285512794681, + "grad_norm": 0.06593915820121765, + "learning_rate": 9.702627185282026e-05, + "loss": 0.2433, + "step": 3842 + }, + { + "epoch": 0.774531533346766, + "grad_norm": 0.05617796629667282, + "learning_rate": 9.702174285523337e-05, + "loss": 0.1957, + "step": 3844 + }, + { + "epoch": 0.7749345154140641, + "grad_norm": 0.05651909485459328, + "learning_rate": 9.70172105172971e-05, + "loss": 0.2095, + "step": 3846 + }, + { + "epoch": 0.775337497481362, + "grad_norm": 0.07869057357311249, + "learning_rate": 9.701267483933337e-05, + "loss": 0.2107, + "step": 3848 + }, + { + "epoch": 0.7757404795486601, + "grad_norm": 0.09256685525178909, + "learning_rate": 9.70081358216644e-05, + "loss": 0.2304, + "step": 3850 + }, + { + "epoch": 0.776143461615958, + "grad_norm": 0.0674339234828949, + "learning_rate": 9.700359346461265e-05, + "loss": 0.2421, + "step": 3852 + }, + { + "epoch": 0.7765464436832561, + "grad_norm": 0.050970036536455154, + "learning_rate": 9.699904776850078e-05, + "loss": 0.2318, + "step": 3854 + }, + { + "epoch": 0.7769494257505541, + "grad_norm": 0.049779586493968964, + "learning_rate": 9.699449873365173e-05, + "loss": 0.1531, + "step": 3856 + }, + { + "epoch": 0.7773524078178521, + "grad_norm": 0.04937596619129181, + "learning_rate": 9.698994636038864e-05, + "loss": 0.155, + "step": 3858 + }, + { + "epoch": 0.7777553898851501, + "grad_norm": 0.06441762298345566, + "learning_rate": 9.698539064903491e-05, + "loss": 0.2013, + "step": 3860 + }, + { + "epoch": 0.7781583719524481, + "grad_norm": 0.0680166631937027, + "learning_rate": 9.698083159991418e-05, + "loss": 0.1606, + "step": 3862 + }, + { + "epoch": 0.7785613540197461, + "grad_norm": 0.06702303141355515, + "learning_rate": 9.69762692133503e-05, + "loss": 0.2218, + "step": 3864 + }, + { + "epoch": 0.7789643360870441, + "grad_norm": 0.0592481829226017, + "learning_rate": 9.697170348966738e-05, + "loss": 0.2439, + "step": 3866 + }, + { + "epoch": 0.7793673181543421, + "grad_norm": 0.057907577604055405, + "learning_rate": 9.696713442918977e-05, + "loss": 0.1622, + "step": 3868 + }, + { + "epoch": 0.7797703002216402, + "grad_norm": 0.05967063456773758, + "learning_rate": 9.696256203224205e-05, + "loss": 0.2157, + "step": 3870 + }, + { + "epoch": 0.7801732822889381, + "grad_norm": 0.061265114694833755, + "learning_rate": 9.6957986299149e-05, + "loss": 0.1746, + "step": 3872 + }, + { + "epoch": 0.7805762643562362, + "grad_norm": 0.07470270991325378, + "learning_rate": 9.695340723023574e-05, + "loss": 0.1761, + "step": 3874 + }, + { + "epoch": 0.7809792464235341, + "grad_norm": 0.06045358628034592, + "learning_rate": 9.69488248258275e-05, + "loss": 0.2112, + "step": 3876 + }, + { + "epoch": 0.7813822284908322, + "grad_norm": 0.058921560645103455, + "learning_rate": 9.694423908624983e-05, + "loss": 0.2867, + "step": 3878 + }, + { + "epoch": 0.7817852105581302, + "grad_norm": 0.057590458542108536, + "learning_rate": 9.693965001182849e-05, + "loss": 0.2301, + "step": 3880 + }, + { + "epoch": 0.7821881926254282, + "grad_norm": 0.048990052193403244, + "learning_rate": 9.693505760288948e-05, + "loss": 0.2059, + "step": 3882 + }, + { + "epoch": 0.7825911746927262, + "grad_norm": 0.06255292892456055, + "learning_rate": 9.693046185975905e-05, + "loss": 0.2433, + "step": 3884 + }, + { + "epoch": 0.7829941567600242, + "grad_norm": 0.05485512688755989, + "learning_rate": 9.692586278276366e-05, + "loss": 0.2001, + "step": 3886 + }, + { + "epoch": 0.7833971388273222, + "grad_norm": 0.061514101922512054, + "learning_rate": 9.692126037223002e-05, + "loss": 0.1689, + "step": 3888 + }, + { + "epoch": 0.7838001208946201, + "grad_norm": 0.04852156713604927, + "learning_rate": 9.691665462848508e-05, + "loss": 0.2094, + "step": 3890 + }, + { + "epoch": 0.7842031029619182, + "grad_norm": 0.058205705136060715, + "learning_rate": 9.691204555185603e-05, + "loss": 0.1868, + "step": 3892 + }, + { + "epoch": 0.7846060850292162, + "grad_norm": 0.0600285641849041, + "learning_rate": 9.690743314267029e-05, + "loss": 0.2324, + "step": 3894 + }, + { + "epoch": 0.7850090670965142, + "grad_norm": 0.05361782759428024, + "learning_rate": 9.690281740125552e-05, + "loss": 0.2192, + "step": 3896 + }, + { + "epoch": 0.7854120491638122, + "grad_norm": 0.0479610301554203, + "learning_rate": 9.689819832793961e-05, + "loss": 0.2361, + "step": 3898 + }, + { + "epoch": 0.7858150312311102, + "grad_norm": 0.04447110369801521, + "learning_rate": 9.689357592305069e-05, + "loss": 0.1594, + "step": 3900 + }, + { + "epoch": 0.7862180132984082, + "grad_norm": 0.05070003494620323, + "learning_rate": 9.688895018691713e-05, + "loss": 0.1618, + "step": 3902 + }, + { + "epoch": 0.7866209953657062, + "grad_norm": 0.12754860520362854, + "learning_rate": 9.688432111986754e-05, + "loss": 0.2075, + "step": 3904 + }, + { + "epoch": 0.7870239774330042, + "grad_norm": 0.07254056632518768, + "learning_rate": 9.687968872223077e-05, + "loss": 0.2761, + "step": 3906 + }, + { + "epoch": 0.7874269595003023, + "grad_norm": 0.05646049603819847, + "learning_rate": 9.687505299433587e-05, + "loss": 0.1883, + "step": 3908 + }, + { + "epoch": 0.7878299415676002, + "grad_norm": 0.05001097545027733, + "learning_rate": 9.687041393651217e-05, + "loss": 0.1723, + "step": 3910 + }, + { + "epoch": 0.7882329236348983, + "grad_norm": 0.06254450976848602, + "learning_rate": 9.686577154908924e-05, + "loss": 0.2134, + "step": 3912 + }, + { + "epoch": 0.7886359057021962, + "grad_norm": 0.05184612423181534, + "learning_rate": 9.686112583239684e-05, + "loss": 0.177, + "step": 3914 + }, + { + "epoch": 0.7890388877694943, + "grad_norm": 0.05817123129963875, + "learning_rate": 9.6856476786765e-05, + "loss": 0.1708, + "step": 3916 + }, + { + "epoch": 0.7894418698367922, + "grad_norm": 0.04600764811038971, + "learning_rate": 9.685182441252398e-05, + "loss": 0.195, + "step": 3918 + }, + { + "epoch": 0.7898448519040903, + "grad_norm": 0.048723481595516205, + "learning_rate": 9.684716871000429e-05, + "loss": 0.1915, + "step": 3920 + }, + { + "epoch": 0.7902478339713883, + "grad_norm": 0.04340605065226555, + "learning_rate": 9.684250967953666e-05, + "loss": 0.2266, + "step": 3922 + }, + { + "epoch": 0.7906508160386863, + "grad_norm": 0.06534282863140106, + "learning_rate": 9.683784732145205e-05, + "loss": 0.2265, + "step": 3924 + }, + { + "epoch": 0.7910537981059843, + "grad_norm": 0.047532081604003906, + "learning_rate": 9.683318163608166e-05, + "loss": 0.1832, + "step": 3926 + }, + { + "epoch": 0.7914567801732822, + "grad_norm": 0.05659693479537964, + "learning_rate": 9.682851262375696e-05, + "loss": 0.2089, + "step": 3928 + }, + { + "epoch": 0.7918597622405803, + "grad_norm": 0.04877660050988197, + "learning_rate": 9.682384028480962e-05, + "loss": 0.1407, + "step": 3930 + }, + { + "epoch": 0.7922627443078784, + "grad_norm": 0.060812223702669144, + "learning_rate": 9.681916461957155e-05, + "loss": 0.2014, + "step": 3932 + }, + { + "epoch": 0.7926657263751763, + "grad_norm": 0.03950156643986702, + "learning_rate": 9.681448562837489e-05, + "loss": 0.1924, + "step": 3934 + }, + { + "epoch": 0.7930687084424743, + "grad_norm": 0.05669216439127922, + "learning_rate": 9.680980331155204e-05, + "loss": 0.2227, + "step": 3936 + }, + { + "epoch": 0.7934716905097723, + "grad_norm": 0.0466766394674778, + "learning_rate": 9.680511766943563e-05, + "loss": 0.1991, + "step": 3938 + }, + { + "epoch": 0.7938746725770703, + "grad_norm": 0.05456336587667465, + "learning_rate": 9.68004287023585e-05, + "loss": 0.2363, + "step": 3940 + }, + { + "epoch": 0.7942776546443683, + "grad_norm": 0.05615445598959923, + "learning_rate": 9.679573641065378e-05, + "loss": 0.1821, + "step": 3942 + }, + { + "epoch": 0.7946806367116663, + "grad_norm": 0.04117673635482788, + "learning_rate": 9.679104079465478e-05, + "loss": 0.1608, + "step": 3944 + }, + { + "epoch": 0.7950836187789644, + "grad_norm": 0.06808658689260483, + "learning_rate": 9.678634185469507e-05, + "loss": 0.2022, + "step": 3946 + }, + { + "epoch": 0.7954866008462623, + "grad_norm": 0.03982819616794586, + "learning_rate": 9.678163959110846e-05, + "loss": 0.1872, + "step": 3948 + }, + { + "epoch": 0.7958895829135604, + "grad_norm": 0.05367998778820038, + "learning_rate": 9.677693400422898e-05, + "loss": 0.2246, + "step": 3950 + }, + { + "epoch": 0.7962925649808583, + "grad_norm": 0.0663178563117981, + "learning_rate": 9.677222509439094e-05, + "loss": 0.2514, + "step": 3952 + }, + { + "epoch": 0.7966955470481564, + "grad_norm": 0.06157734617590904, + "learning_rate": 9.67675128619288e-05, + "loss": 0.2064, + "step": 3954 + }, + { + "epoch": 0.7970985291154543, + "grad_norm": 0.04360115900635719, + "learning_rate": 9.676279730717737e-05, + "loss": 0.1879, + "step": 3956 + }, + { + "epoch": 0.7975015111827524, + "grad_norm": 0.04340951517224312, + "learning_rate": 9.675807843047159e-05, + "loss": 0.1763, + "step": 3958 + }, + { + "epoch": 0.7979044932500504, + "grad_norm": 0.04515016824007034, + "learning_rate": 9.67533562321467e-05, + "loss": 0.2203, + "step": 3960 + }, + { + "epoch": 0.7983074753173484, + "grad_norm": 0.04955977573990822, + "learning_rate": 9.674863071253815e-05, + "loss": 0.2147, + "step": 3962 + }, + { + "epoch": 0.7987104573846464, + "grad_norm": 0.0752781331539154, + "learning_rate": 9.674390187198163e-05, + "loss": 0.1997, + "step": 3964 + }, + { + "epoch": 0.7991134394519444, + "grad_norm": 0.05286385491490364, + "learning_rate": 9.67391697108131e-05, + "loss": 0.2361, + "step": 3966 + }, + { + "epoch": 0.7995164215192424, + "grad_norm": 0.05145072564482689, + "learning_rate": 9.673443422936867e-05, + "loss": 0.2219, + "step": 3968 + }, + { + "epoch": 0.7999194035865403, + "grad_norm": 0.05920419469475746, + "learning_rate": 9.67296954279848e-05, + "loss": 0.146, + "step": 3970 + }, + { + "epoch": 0.8003223856538384, + "grad_norm": 0.05192543566226959, + "learning_rate": 9.672495330699808e-05, + "loss": 0.2088, + "step": 3972 + }, + { + "epoch": 0.8007253677211364, + "grad_norm": 0.04138815402984619, + "learning_rate": 9.672020786674543e-05, + "loss": 0.2041, + "step": 3974 + }, + { + "epoch": 0.8011283497884344, + "grad_norm": 0.04955004155635834, + "learning_rate": 9.671545910756392e-05, + "loss": 0.2155, + "step": 3976 + }, + { + "epoch": 0.8015313318557324, + "grad_norm": 0.04830560460686684, + "learning_rate": 9.67107070297909e-05, + "loss": 0.195, + "step": 3978 + }, + { + "epoch": 0.8019343139230304, + "grad_norm": 0.04398579150438309, + "learning_rate": 9.670595163376394e-05, + "loss": 0.2121, + "step": 3980 + }, + { + "epoch": 0.8023372959903284, + "grad_norm": 0.06084743142127991, + "learning_rate": 9.670119291982089e-05, + "loss": 0.2057, + "step": 3982 + }, + { + "epoch": 0.8027402780576265, + "grad_norm": 0.05149463191628456, + "learning_rate": 9.669643088829978e-05, + "loss": 0.2216, + "step": 3984 + }, + { + "epoch": 0.8031432601249244, + "grad_norm": 0.03941832110285759, + "learning_rate": 9.66916655395389e-05, + "loss": 0.1706, + "step": 3986 + }, + { + "epoch": 0.8035462421922225, + "grad_norm": 0.06777830421924591, + "learning_rate": 9.668689687387678e-05, + "loss": 0.2356, + "step": 3988 + }, + { + "epoch": 0.8039492242595204, + "grad_norm": 0.05458809807896614, + "learning_rate": 9.668212489165216e-05, + "loss": 0.1913, + "step": 3990 + }, + { + "epoch": 0.8043522063268185, + "grad_norm": 0.06013821065425873, + "learning_rate": 9.667734959320405e-05, + "loss": 0.2519, + "step": 3992 + }, + { + "epoch": 0.8047551883941164, + "grad_norm": 0.07752058655023575, + "learning_rate": 9.667257097887167e-05, + "loss": 0.2241, + "step": 3994 + }, + { + "epoch": 0.8051581704614145, + "grad_norm": 0.06690877676010132, + "learning_rate": 9.666778904899449e-05, + "loss": 0.183, + "step": 3996 + }, + { + "epoch": 0.8055611525287125, + "grad_norm": 0.050800006836652756, + "learning_rate": 9.666300380391222e-05, + "loss": 0.1714, + "step": 3998 + }, + { + "epoch": 0.8059641345960105, + "grad_norm": 0.06313162297010422, + "learning_rate": 9.665821524396476e-05, + "loss": 0.2002, + "step": 4000 + }, + { + "epoch": 0.8063671166633085, + "grad_norm": 0.06548028439283371, + "learning_rate": 9.665342336949232e-05, + "loss": 0.2249, + "step": 4002 + }, + { + "epoch": 0.8067700987306065, + "grad_norm": 0.05883391574025154, + "learning_rate": 9.664862818083531e-05, + "loss": 0.1774, + "step": 4004 + }, + { + "epoch": 0.8071730807979045, + "grad_norm": 0.06782971322536469, + "learning_rate": 9.664382967833435e-05, + "loss": 0.1652, + "step": 4006 + }, + { + "epoch": 0.8075760628652024, + "grad_norm": 0.056911651045084, + "learning_rate": 9.663902786233032e-05, + "loss": 0.1955, + "step": 4008 + }, + { + "epoch": 0.8079790449325005, + "grad_norm": 0.06347520649433136, + "learning_rate": 9.663422273316433e-05, + "loss": 0.2332, + "step": 4010 + }, + { + "epoch": 0.8083820269997986, + "grad_norm": 0.040850598365068436, + "learning_rate": 9.662941429117775e-05, + "loss": 0.141, + "step": 4012 + }, + { + "epoch": 0.8087850090670965, + "grad_norm": 0.05497581511735916, + "learning_rate": 9.662460253671216e-05, + "loss": 0.2611, + "step": 4014 + }, + { + "epoch": 0.8091879911343945, + "grad_norm": 0.05755303055047989, + "learning_rate": 9.661978747010936e-05, + "loss": 0.2626, + "step": 4016 + }, + { + "epoch": 0.8095909732016925, + "grad_norm": 0.04909120127558708, + "learning_rate": 9.661496909171141e-05, + "loss": 0.1573, + "step": 4018 + }, + { + "epoch": 0.8099939552689905, + "grad_norm": 0.0518498420715332, + "learning_rate": 9.661014740186063e-05, + "loss": 0.2242, + "step": 4020 + }, + { + "epoch": 0.8103969373362885, + "grad_norm": 0.07708277553319931, + "learning_rate": 9.66053224008995e-05, + "loss": 0.2089, + "step": 4022 + }, + { + "epoch": 0.8107999194035865, + "grad_norm": 0.07529980689287186, + "learning_rate": 9.66004940891708e-05, + "loss": 0.1794, + "step": 4024 + }, + { + "epoch": 0.8112029014708846, + "grad_norm": 0.04863161966204643, + "learning_rate": 9.659566246701753e-05, + "loss": 0.2272, + "step": 4026 + }, + { + "epoch": 0.8116058835381825, + "grad_norm": 0.03779434785246849, + "learning_rate": 9.659082753478292e-05, + "loss": 0.1701, + "step": 4028 + }, + { + "epoch": 0.8120088656054806, + "grad_norm": 0.05291910842061043, + "learning_rate": 9.658598929281042e-05, + "loss": 0.2102, + "step": 4030 + }, + { + "epoch": 0.8124118476727785, + "grad_norm": 0.03803229704499245, + "learning_rate": 9.658114774144376e-05, + "loss": 0.1921, + "step": 4032 + }, + { + "epoch": 0.8128148297400766, + "grad_norm": 0.0535709448158741, + "learning_rate": 9.657630288102686e-05, + "loss": 0.1913, + "step": 4034 + }, + { + "epoch": 0.8132178118073746, + "grad_norm": 0.045826297253370285, + "learning_rate": 9.657145471190388e-05, + "loss": 0.226, + "step": 4036 + }, + { + "epoch": 0.8136207938746726, + "grad_norm": 0.0441228449344635, + "learning_rate": 9.656660323441924e-05, + "loss": 0.1807, + "step": 4038 + }, + { + "epoch": 0.8140237759419706, + "grad_norm": 0.04264240711927414, + "learning_rate": 9.656174844891759e-05, + "loss": 0.2037, + "step": 4040 + }, + { + "epoch": 0.8144267580092686, + "grad_norm": 0.04649609327316284, + "learning_rate": 9.655689035574378e-05, + "loss": 0.2423, + "step": 4042 + }, + { + "epoch": 0.8148297400765666, + "grad_norm": 0.03742203488945961, + "learning_rate": 9.655202895524294e-05, + "loss": 0.1589, + "step": 4044 + }, + { + "epoch": 0.8152327221438646, + "grad_norm": 0.06208263710141182, + "learning_rate": 9.654716424776041e-05, + "loss": 0.1986, + "step": 4046 + }, + { + "epoch": 0.8156357042111626, + "grad_norm": 0.04026419296860695, + "learning_rate": 9.654229623364177e-05, + "loss": 0.1577, + "step": 4048 + }, + { + "epoch": 0.8160386862784607, + "grad_norm": 0.0645565316081047, + "learning_rate": 9.653742491323286e-05, + "loss": 0.1589, + "step": 4050 + }, + { + "epoch": 0.8164416683457586, + "grad_norm": 0.043083306401968, + "learning_rate": 9.653255028687969e-05, + "loss": 0.2045, + "step": 4052 + }, + { + "epoch": 0.8168446504130566, + "grad_norm": 0.054640740156173706, + "learning_rate": 9.652767235492856e-05, + "loss": 0.1936, + "step": 4054 + }, + { + "epoch": 0.8172476324803546, + "grad_norm": 0.06548301875591278, + "learning_rate": 9.652279111772603e-05, + "loss": 0.1989, + "step": 4056 + }, + { + "epoch": 0.8176506145476526, + "grad_norm": 0.07910330593585968, + "learning_rate": 9.651790657561879e-05, + "loss": 0.2104, + "step": 4058 + }, + { + "epoch": 0.8180535966149506, + "grad_norm": 0.06049291044473648, + "learning_rate": 9.651301872895387e-05, + "loss": 0.2042, + "step": 4060 + }, + { + "epoch": 0.8184565786822486, + "grad_norm": 0.05698911473155022, + "learning_rate": 9.650812757807848e-05, + "loss": 0.2017, + "step": 4062 + }, + { + "epoch": 0.8188595607495467, + "grad_norm": 0.048121679574251175, + "learning_rate": 9.650323312334008e-05, + "loss": 0.1826, + "step": 4064 + }, + { + "epoch": 0.8192625428168446, + "grad_norm": 0.06803172826766968, + "learning_rate": 9.649833536508639e-05, + "loss": 0.2246, + "step": 4066 + }, + { + "epoch": 0.8196655248841427, + "grad_norm": 0.05307772755622864, + "learning_rate": 9.649343430366531e-05, + "loss": 0.1693, + "step": 4068 + }, + { + "epoch": 0.8200685069514406, + "grad_norm": 0.04842696711421013, + "learning_rate": 9.648852993942501e-05, + "loss": 0.2509, + "step": 4070 + }, + { + "epoch": 0.8204714890187387, + "grad_norm": 0.06046567112207413, + "learning_rate": 9.64836222727139e-05, + "loss": 0.202, + "step": 4072 + }, + { + "epoch": 0.8208744710860366, + "grad_norm": 0.0492112897336483, + "learning_rate": 9.647871130388059e-05, + "loss": 0.1805, + "step": 4074 + }, + { + "epoch": 0.8212774531533347, + "grad_norm": 0.039268992841243744, + "learning_rate": 9.647379703327396e-05, + "loss": 0.1686, + "step": 4076 + }, + { + "epoch": 0.8216804352206327, + "grad_norm": 0.04768858104944229, + "learning_rate": 9.646887946124313e-05, + "loss": 0.2717, + "step": 4078 + }, + { + "epoch": 0.8220834172879307, + "grad_norm": 0.04175850749015808, + "learning_rate": 9.646395858813739e-05, + "loss": 0.1466, + "step": 4080 + }, + { + "epoch": 0.8224863993552287, + "grad_norm": 0.05806022137403488, + "learning_rate": 9.645903441430637e-05, + "loss": 0.2093, + "step": 4082 + }, + { + "epoch": 0.8228893814225267, + "grad_norm": 0.23905980587005615, + "learning_rate": 9.645410694009984e-05, + "loss": 0.224, + "step": 4084 + }, + { + "epoch": 0.8232923634898247, + "grad_norm": 0.05047454684972763, + "learning_rate": 9.644917616586783e-05, + "loss": 0.1972, + "step": 4086 + }, + { + "epoch": 0.8236953455571228, + "grad_norm": 0.05037853121757507, + "learning_rate": 9.644424209196064e-05, + "loss": 0.1878, + "step": 4088 + }, + { + "epoch": 0.8240983276244207, + "grad_norm": 0.05729193612933159, + "learning_rate": 9.643930471872877e-05, + "loss": 0.1767, + "step": 4090 + }, + { + "epoch": 0.8245013096917188, + "grad_norm": 0.061119064688682556, + "learning_rate": 9.643436404652295e-05, + "loss": 0.1605, + "step": 4092 + }, + { + "epoch": 0.8249042917590167, + "grad_norm": 0.03909744322299957, + "learning_rate": 9.642942007569418e-05, + "loss": 0.1822, + "step": 4094 + }, + { + "epoch": 0.8253072738263147, + "grad_norm": 0.06683524698019028, + "learning_rate": 9.642447280659365e-05, + "loss": 0.1889, + "step": 4096 + }, + { + "epoch": 0.8257102558936127, + "grad_norm": 0.06590251624584198, + "learning_rate": 9.641952223957282e-05, + "loss": 0.1949, + "step": 4098 + }, + { + "epoch": 0.8261132379609107, + "grad_norm": 0.05025814473628998, + "learning_rate": 9.641456837498338e-05, + "loss": 0.1786, + "step": 4100 + }, + { + "epoch": 0.8265162200282088, + "grad_norm": 0.06974563002586365, + "learning_rate": 9.640961121317722e-05, + "loss": 0.2245, + "step": 4102 + }, + { + "epoch": 0.8269192020955067, + "grad_norm": 0.05516672879457474, + "learning_rate": 9.640465075450651e-05, + "loss": 0.1922, + "step": 4104 + }, + { + "epoch": 0.8273221841628048, + "grad_norm": 0.05775739997625351, + "learning_rate": 9.639968699932361e-05, + "loss": 0.1736, + "step": 4106 + }, + { + "epoch": 0.8277251662301027, + "grad_norm": 0.04958879202604294, + "learning_rate": 9.639471994798117e-05, + "loss": 0.2281, + "step": 4108 + }, + { + "epoch": 0.8281281482974008, + "grad_norm": 0.049606986343860626, + "learning_rate": 9.6389749600832e-05, + "loss": 0.1881, + "step": 4110 + }, + { + "epoch": 0.8285311303646987, + "grad_norm": 0.0543147549033165, + "learning_rate": 9.638477595822922e-05, + "loss": 0.2541, + "step": 4112 + }, + { + "epoch": 0.8289341124319968, + "grad_norm": 0.048618897795677185, + "learning_rate": 9.637979902052614e-05, + "loss": 0.2474, + "step": 4114 + }, + { + "epoch": 0.8293370944992948, + "grad_norm": 0.053704481571912766, + "learning_rate": 9.63748187880763e-05, + "loss": 0.1822, + "step": 4116 + }, + { + "epoch": 0.8297400765665928, + "grad_norm": 0.0653117224574089, + "learning_rate": 9.636983526123351e-05, + "loss": 0.2378, + "step": 4118 + }, + { + "epoch": 0.8301430586338908, + "grad_norm": 0.05925583839416504, + "learning_rate": 9.636484844035179e-05, + "loss": 0.2085, + "step": 4120 + }, + { + "epoch": 0.8305460407011888, + "grad_norm": 0.05028758570551872, + "learning_rate": 9.635985832578536e-05, + "loss": 0.1704, + "step": 4122 + }, + { + "epoch": 0.8309490227684868, + "grad_norm": 0.07257190346717834, + "learning_rate": 9.635486491788875e-05, + "loss": 0.2143, + "step": 4124 + }, + { + "epoch": 0.8313520048357848, + "grad_norm": 0.11785920709371567, + "learning_rate": 9.634986821701667e-05, + "loss": 0.2413, + "step": 4126 + }, + { + "epoch": 0.8317549869030828, + "grad_norm": 0.04574896767735481, + "learning_rate": 9.634486822352408e-05, + "loss": 0.2276, + "step": 4128 + }, + { + "epoch": 0.8321579689703809, + "grad_norm": 0.05423697084188461, + "learning_rate": 9.633986493776617e-05, + "loss": 0.1688, + "step": 4130 + }, + { + "epoch": 0.8325609510376788, + "grad_norm": 0.05051505193114281, + "learning_rate": 9.633485836009836e-05, + "loss": 0.2345, + "step": 4132 + }, + { + "epoch": 0.8329639331049769, + "grad_norm": 0.045644182711839676, + "learning_rate": 9.63298484908763e-05, + "loss": 0.1883, + "step": 4134 + }, + { + "epoch": 0.8333669151722748, + "grad_norm": 0.04327382519841194, + "learning_rate": 9.632483533045592e-05, + "loss": 0.1658, + "step": 4136 + }, + { + "epoch": 0.8337698972395728, + "grad_norm": 0.05330512300133705, + "learning_rate": 9.631981887919332e-05, + "loss": 0.2277, + "step": 4138 + }, + { + "epoch": 0.8341728793068709, + "grad_norm": 0.04282519593834877, + "learning_rate": 9.631479913744486e-05, + "loss": 0.144, + "step": 4140 + }, + { + "epoch": 0.8345758613741688, + "grad_norm": 0.05408427491784096, + "learning_rate": 9.630977610556713e-05, + "loss": 0.1884, + "step": 4142 + }, + { + "epoch": 0.8349788434414669, + "grad_norm": 0.05264348164200783, + "learning_rate": 9.630474978391697e-05, + "loss": 0.2291, + "step": 4144 + }, + { + "epoch": 0.8353818255087648, + "grad_norm": 0.04659276828169823, + "learning_rate": 9.629972017285144e-05, + "loss": 0.2049, + "step": 4146 + }, + { + "epoch": 0.8357848075760629, + "grad_norm": 0.0541483573615551, + "learning_rate": 9.629468727272785e-05, + "loss": 0.1918, + "step": 4148 + }, + { + "epoch": 0.8361877896433608, + "grad_norm": 0.04391827434301376, + "learning_rate": 9.62896510839037e-05, + "loss": 0.1756, + "step": 4150 + }, + { + "epoch": 0.8365907717106589, + "grad_norm": 0.05609648674726486, + "learning_rate": 9.628461160673676e-05, + "loss": 0.2171, + "step": 4152 + }, + { + "epoch": 0.8369937537779569, + "grad_norm": 0.06236313283443451, + "learning_rate": 9.627956884158505e-05, + "loss": 0.1794, + "step": 4154 + }, + { + "epoch": 0.8373967358452549, + "grad_norm": 0.054331183433532715, + "learning_rate": 9.627452278880677e-05, + "loss": 0.1556, + "step": 4156 + }, + { + "epoch": 0.8377997179125529, + "grad_norm": 0.051585860550403595, + "learning_rate": 9.62694734487604e-05, + "loss": 0.1957, + "step": 4158 + }, + { + "epoch": 0.8382026999798509, + "grad_norm": 0.05942024290561676, + "learning_rate": 9.626442082180463e-05, + "loss": 0.1722, + "step": 4160 + }, + { + "epoch": 0.8386056820471489, + "grad_norm": 0.06458491086959839, + "learning_rate": 9.625936490829842e-05, + "loss": 0.1744, + "step": 4162 + }, + { + "epoch": 0.8390086641144469, + "grad_norm": 0.05910806357860565, + "learning_rate": 9.625430570860087e-05, + "loss": 0.1826, + "step": 4164 + }, + { + "epoch": 0.8394116461817449, + "grad_norm": 0.08108662813901901, + "learning_rate": 9.624924322307142e-05, + "loss": 0.1826, + "step": 4166 + }, + { + "epoch": 0.839814628249043, + "grad_norm": 0.057688187807798386, + "learning_rate": 9.62441774520697e-05, + "loss": 0.2289, + "step": 4168 + }, + { + "epoch": 0.8402176103163409, + "grad_norm": 0.05357905849814415, + "learning_rate": 9.62391083959556e-05, + "loss": 0.1976, + "step": 4170 + }, + { + "epoch": 0.840620592383639, + "grad_norm": 0.07163774967193604, + "learning_rate": 9.623403605508916e-05, + "loss": 0.2353, + "step": 4172 + }, + { + "epoch": 0.8410235744509369, + "grad_norm": 0.04970443621277809, + "learning_rate": 9.622896042983075e-05, + "loss": 0.2062, + "step": 4174 + }, + { + "epoch": 0.841426556518235, + "grad_norm": 0.0592183880507946, + "learning_rate": 9.622388152054092e-05, + "loss": 0.191, + "step": 4176 + }, + { + "epoch": 0.8418295385855329, + "grad_norm": 0.05800905451178551, + "learning_rate": 9.621879932758045e-05, + "loss": 0.2192, + "step": 4178 + }, + { + "epoch": 0.8422325206528309, + "grad_norm": 0.06246621534228325, + "learning_rate": 9.621371385131042e-05, + "loss": 0.1652, + "step": 4180 + }, + { + "epoch": 0.842635502720129, + "grad_norm": 0.06259558349847794, + "learning_rate": 9.620862509209206e-05, + "loss": 0.1838, + "step": 4182 + }, + { + "epoch": 0.8430384847874269, + "grad_norm": 0.047461558133363724, + "learning_rate": 9.620353305028687e-05, + "loss": 0.2317, + "step": 4184 + }, + { + "epoch": 0.843441466854725, + "grad_norm": 0.06128053739666939, + "learning_rate": 9.619843772625657e-05, + "loss": 0.2001, + "step": 4186 + }, + { + "epoch": 0.8438444489220229, + "grad_norm": 0.0504336915910244, + "learning_rate": 9.619333912036314e-05, + "loss": 0.2138, + "step": 4188 + }, + { + "epoch": 0.844247430989321, + "grad_norm": 0.0516376867890358, + "learning_rate": 9.618823723296879e-05, + "loss": 0.1869, + "step": 4190 + }, + { + "epoch": 0.844650413056619, + "grad_norm": 0.05454495549201965, + "learning_rate": 9.618313206443595e-05, + "loss": 0.2028, + "step": 4192 + }, + { + "epoch": 0.845053395123917, + "grad_norm": 0.04650101065635681, + "learning_rate": 9.617802361512723e-05, + "loss": 0.2244, + "step": 4194 + }, + { + "epoch": 0.845456377191215, + "grad_norm": 0.05743958428502083, + "learning_rate": 9.617291188540558e-05, + "loss": 0.2123, + "step": 4196 + }, + { + "epoch": 0.845859359258513, + "grad_norm": 0.07449749112129211, + "learning_rate": 9.616779687563411e-05, + "loss": 0.208, + "step": 4198 + }, + { + "epoch": 0.846262341325811, + "grad_norm": 0.047410573810338974, + "learning_rate": 9.616267858617617e-05, + "loss": 0.2003, + "step": 4200 + }, + { + "epoch": 0.846665323393109, + "grad_norm": 0.04203968122601509, + "learning_rate": 9.61575570173954e-05, + "loss": 0.2223, + "step": 4202 + }, + { + "epoch": 0.847068305460407, + "grad_norm": 0.047013357281684875, + "learning_rate": 9.61524321696556e-05, + "loss": 0.1855, + "step": 4204 + }, + { + "epoch": 0.8474712875277051, + "grad_norm": 0.046448126435279846, + "learning_rate": 9.614730404332079e-05, + "loss": 0.1873, + "step": 4206 + }, + { + "epoch": 0.847874269595003, + "grad_norm": 0.045094847679138184, + "learning_rate": 9.614217263875533e-05, + "loss": 0.2429, + "step": 4208 + }, + { + "epoch": 0.8482772516623011, + "grad_norm": 0.04861465469002724, + "learning_rate": 9.613703795632372e-05, + "loss": 0.2428, + "step": 4210 + }, + { + "epoch": 0.848680233729599, + "grad_norm": 0.05912397801876068, + "learning_rate": 9.61318999963907e-05, + "loss": 0.2101, + "step": 4212 + }, + { + "epoch": 0.849083215796897, + "grad_norm": 0.05289468914270401, + "learning_rate": 9.61267587593213e-05, + "loss": 0.2354, + "step": 4214 + }, + { + "epoch": 0.849486197864195, + "grad_norm": 0.04913996905088425, + "learning_rate": 9.612161424548072e-05, + "loss": 0.1922, + "step": 4216 + }, + { + "epoch": 0.849889179931493, + "grad_norm": 0.046018872410058975, + "learning_rate": 9.611646645523442e-05, + "loss": 0.1793, + "step": 4218 + }, + { + "epoch": 0.8502921619987911, + "grad_norm": 0.05342131480574608, + "learning_rate": 9.611131538894811e-05, + "loss": 0.2079, + "step": 4220 + }, + { + "epoch": 0.850695144066089, + "grad_norm": 0.05977238342165947, + "learning_rate": 9.610616104698768e-05, + "loss": 0.1883, + "step": 4222 + }, + { + "epoch": 0.8510981261333871, + "grad_norm": 0.04549986496567726, + "learning_rate": 9.610100342971932e-05, + "loss": 0.224, + "step": 4224 + }, + { + "epoch": 0.851501108200685, + "grad_norm": 0.048815593123435974, + "learning_rate": 9.60958425375094e-05, + "loss": 0.1479, + "step": 4226 + }, + { + "epoch": 0.8519040902679831, + "grad_norm": 0.06040544807910919, + "learning_rate": 9.609067837072454e-05, + "loss": 0.2188, + "step": 4228 + }, + { + "epoch": 0.8523070723352811, + "grad_norm": 0.06375422328710556, + "learning_rate": 9.60855109297316e-05, + "loss": 0.194, + "step": 4230 + }, + { + "epoch": 0.8527100544025791, + "grad_norm": 0.06123984232544899, + "learning_rate": 9.608034021489766e-05, + "loss": 0.2149, + "step": 4232 + }, + { + "epoch": 0.8531130364698771, + "grad_norm": 0.06358905136585236, + "learning_rate": 9.607516622659007e-05, + "loss": 0.1769, + "step": 4234 + }, + { + "epoch": 0.8535160185371751, + "grad_norm": 0.05531733110547066, + "learning_rate": 9.606998896517634e-05, + "loss": 0.2195, + "step": 4236 + }, + { + "epoch": 0.8539190006044731, + "grad_norm": 0.051569703966379166, + "learning_rate": 9.606480843102428e-05, + "loss": 0.2088, + "step": 4238 + }, + { + "epoch": 0.8543219826717711, + "grad_norm": 0.0638023391366005, + "learning_rate": 9.605962462450188e-05, + "loss": 0.1862, + "step": 4240 + }, + { + "epoch": 0.8547249647390691, + "grad_norm": 0.06016799062490463, + "learning_rate": 9.605443754597742e-05, + "loss": 0.2086, + "step": 4242 + }, + { + "epoch": 0.8551279468063672, + "grad_norm": 0.05225469917058945, + "learning_rate": 9.604924719581938e-05, + "loss": 0.19, + "step": 4244 + }, + { + "epoch": 0.8555309288736651, + "grad_norm": 0.04729381203651428, + "learning_rate": 9.604405357439646e-05, + "loss": 0.1858, + "step": 4246 + }, + { + "epoch": 0.8559339109409632, + "grad_norm": 0.04221804067492485, + "learning_rate": 9.603885668207762e-05, + "loss": 0.1857, + "step": 4248 + }, + { + "epoch": 0.8563368930082611, + "grad_norm": 0.08133088797330856, + "learning_rate": 9.6033656519232e-05, + "loss": 0.2085, + "step": 4250 + }, + { + "epoch": 0.8567398750755592, + "grad_norm": 0.048365265130996704, + "learning_rate": 9.602845308622905e-05, + "loss": 0.189, + "step": 4252 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.044929929077625275, + "learning_rate": 9.602324638343843e-05, + "loss": 0.1709, + "step": 4254 + }, + { + "epoch": 0.8575458392101551, + "grad_norm": 0.04540219157934189, + "learning_rate": 9.601803641122998e-05, + "loss": 0.1952, + "step": 4256 + }, + { + "epoch": 0.8579488212774532, + "grad_norm": 0.06363467872142792, + "learning_rate": 9.60128231699738e-05, + "loss": 0.1461, + "step": 4258 + }, + { + "epoch": 0.8583518033447511, + "grad_norm": 0.04269418120384216, + "learning_rate": 9.600760666004025e-05, + "loss": 0.1845, + "step": 4260 + }, + { + "epoch": 0.8587547854120492, + "grad_norm": 0.04981771484017372, + "learning_rate": 9.60023868817999e-05, + "loss": 0.2093, + "step": 4262 + }, + { + "epoch": 0.8591577674793471, + "grad_norm": 0.0496847927570343, + "learning_rate": 9.599716383562358e-05, + "loss": 0.1933, + "step": 4264 + }, + { + "epoch": 0.8595607495466452, + "grad_norm": 0.038256920874118805, + "learning_rate": 9.59919375218823e-05, + "loss": 0.158, + "step": 4266 + }, + { + "epoch": 0.8599637316139431, + "grad_norm": 0.04268384724855423, + "learning_rate": 9.59867079409473e-05, + "loss": 0.2097, + "step": 4268 + }, + { + "epoch": 0.8603667136812412, + "grad_norm": 0.059887178242206573, + "learning_rate": 9.598147509319015e-05, + "loss": 0.1843, + "step": 4270 + }, + { + "epoch": 0.8607696957485392, + "grad_norm": 0.04287001118063927, + "learning_rate": 9.597623897898251e-05, + "loss": 0.1696, + "step": 4272 + }, + { + "epoch": 0.8611726778158372, + "grad_norm": 0.056464433670043945, + "learning_rate": 9.597099959869641e-05, + "loss": 0.2019, + "step": 4274 + }, + { + "epoch": 0.8615756598831352, + "grad_norm": 0.04815569892525673, + "learning_rate": 9.596575695270402e-05, + "loss": 0.1792, + "step": 4276 + }, + { + "epoch": 0.8619786419504332, + "grad_norm": 0.053726229816675186, + "learning_rate": 9.596051104137775e-05, + "loss": 0.2035, + "step": 4278 + }, + { + "epoch": 0.8623816240177312, + "grad_norm": 0.053749434649944305, + "learning_rate": 9.595526186509028e-05, + "loss": 0.2442, + "step": 4280 + }, + { + "epoch": 0.8627846060850293, + "grad_norm": 0.052994389086961746, + "learning_rate": 9.59500094242145e-05, + "loss": 0.2216, + "step": 4282 + }, + { + "epoch": 0.8631875881523272, + "grad_norm": 0.040021564811468124, + "learning_rate": 9.594475371912355e-05, + "loss": 0.1594, + "step": 4284 + }, + { + "epoch": 0.8635905702196253, + "grad_norm": 0.044010285288095474, + "learning_rate": 9.593949475019076e-05, + "loss": 0.1902, + "step": 4286 + }, + { + "epoch": 0.8639935522869232, + "grad_norm": 0.056951433420181274, + "learning_rate": 9.593423251778975e-05, + "loss": 0.197, + "step": 4288 + }, + { + "epoch": 0.8643965343542213, + "grad_norm": 0.08449291437864304, + "learning_rate": 9.59289670222943e-05, + "loss": 0.2241, + "step": 4290 + }, + { + "epoch": 0.8647995164215192, + "grad_norm": 0.05323825031518936, + "learning_rate": 9.59236982640785e-05, + "loss": 0.195, + "step": 4292 + }, + { + "epoch": 0.8652024984888173, + "grad_norm": 0.05166636407375336, + "learning_rate": 9.591842624351661e-05, + "loss": 0.1491, + "step": 4294 + }, + { + "epoch": 0.8656054805561153, + "grad_norm": 0.0477205291390419, + "learning_rate": 9.591315096098316e-05, + "loss": 0.1932, + "step": 4296 + }, + { + "epoch": 0.8660084626234132, + "grad_norm": 0.06305918842554092, + "learning_rate": 9.59078724168529e-05, + "loss": 0.2263, + "step": 4298 + }, + { + "epoch": 0.8664114446907113, + "grad_norm": 0.052747536450624466, + "learning_rate": 9.590259061150079e-05, + "loss": 0.1766, + "step": 4300 + }, + { + "epoch": 0.8668144267580092, + "grad_norm": 0.048709429800510406, + "learning_rate": 9.589730554530208e-05, + "loss": 0.1508, + "step": 4302 + }, + { + "epoch": 0.8672174088253073, + "grad_norm": 0.0500153973698616, + "learning_rate": 9.589201721863214e-05, + "loss": 0.1934, + "step": 4304 + }, + { + "epoch": 0.8676203908926052, + "grad_norm": 0.06479570269584656, + "learning_rate": 9.588672563186674e-05, + "loss": 0.2535, + "step": 4306 + }, + { + "epoch": 0.8680233729599033, + "grad_norm": 0.0527423657476902, + "learning_rate": 9.58814307853817e-05, + "loss": 0.1761, + "step": 4308 + }, + { + "epoch": 0.8684263550272013, + "grad_norm": 0.07280410081148148, + "learning_rate": 9.58761326795532e-05, + "loss": 0.212, + "step": 4310 + }, + { + "epoch": 0.8688293370944993, + "grad_norm": 0.11216331273317337, + "learning_rate": 9.587083131475762e-05, + "loss": 0.1958, + "step": 4312 + }, + { + "epoch": 0.8692323191617973, + "grad_norm": 0.0491580106317997, + "learning_rate": 9.586552669137152e-05, + "loss": 0.2232, + "step": 4314 + }, + { + "epoch": 0.8696353012290953, + "grad_norm": 0.0734315887093544, + "learning_rate": 9.586021880977177e-05, + "loss": 0.1532, + "step": 4316 + }, + { + "epoch": 0.8700382832963933, + "grad_norm": 0.0685787945985794, + "learning_rate": 9.585490767033543e-05, + "loss": 0.1724, + "step": 4318 + }, + { + "epoch": 0.8704412653636913, + "grad_norm": 0.0568709559738636, + "learning_rate": 9.584959327343976e-05, + "loss": 0.1806, + "step": 4320 + }, + { + "epoch": 0.8708442474309893, + "grad_norm": 0.05582389608025551, + "learning_rate": 9.584427561946232e-05, + "loss": 0.1688, + "step": 4322 + }, + { + "epoch": 0.8712472294982874, + "grad_norm": 0.07161648571491241, + "learning_rate": 9.583895470878085e-05, + "loss": 0.1877, + "step": 4324 + }, + { + "epoch": 0.8716502115655853, + "grad_norm": 0.07797323167324066, + "learning_rate": 9.583363054177335e-05, + "loss": 0.1632, + "step": 4326 + }, + { + "epoch": 0.8720531936328834, + "grad_norm": 0.05685307830572128, + "learning_rate": 9.582830311881803e-05, + "loss": 0.1465, + "step": 4328 + }, + { + "epoch": 0.8724561757001813, + "grad_norm": 0.06444845348596573, + "learning_rate": 9.582297244029336e-05, + "loss": 0.1738, + "step": 4330 + }, + { + "epoch": 0.8728591577674794, + "grad_norm": 0.05205952376127243, + "learning_rate": 9.581763850657801e-05, + "loss": 0.1724, + "step": 4332 + }, + { + "epoch": 0.8732621398347774, + "grad_norm": 0.0661550834774971, + "learning_rate": 9.581230131805088e-05, + "loss": 0.1713, + "step": 4334 + }, + { + "epoch": 0.8736651219020753, + "grad_norm": 0.06810151785612106, + "learning_rate": 9.580696087509115e-05, + "loss": 0.2472, + "step": 4336 + }, + { + "epoch": 0.8740681039693734, + "grad_norm": 0.05709342285990715, + "learning_rate": 9.580161717807816e-05, + "loss": 0.1988, + "step": 4338 + }, + { + "epoch": 0.8744710860366713, + "grad_norm": 0.06400882452726364, + "learning_rate": 9.579627022739155e-05, + "loss": 0.2413, + "step": 4340 + }, + { + "epoch": 0.8748740681039694, + "grad_norm": 0.07204491645097733, + "learning_rate": 9.579092002341112e-05, + "loss": 0.204, + "step": 4342 + }, + { + "epoch": 0.8752770501712673, + "grad_norm": 0.06638916581869125, + "learning_rate": 9.578556656651699e-05, + "loss": 0.1755, + "step": 4344 + }, + { + "epoch": 0.8756800322385654, + "grad_norm": 0.04414273798465729, + "learning_rate": 9.578020985708942e-05, + "loss": 0.1649, + "step": 4346 + }, + { + "epoch": 0.8760830143058634, + "grad_norm": 0.06172650679945946, + "learning_rate": 9.577484989550896e-05, + "loss": 0.2017, + "step": 4348 + }, + { + "epoch": 0.8764859963731614, + "grad_norm": 0.045115333050489426, + "learning_rate": 9.576948668215638e-05, + "loss": 0.2568, + "step": 4350 + }, + { + "epoch": 0.8768889784404594, + "grad_norm": 0.05499257892370224, + "learning_rate": 9.576412021741264e-05, + "loss": 0.1959, + "step": 4352 + }, + { + "epoch": 0.8772919605077574, + "grad_norm": 0.049140289425849915, + "learning_rate": 9.575875050165902e-05, + "loss": 0.1836, + "step": 4354 + }, + { + "epoch": 0.8776949425750554, + "grad_norm": 0.04703235626220703, + "learning_rate": 9.575337753527692e-05, + "loss": 0.2211, + "step": 4356 + }, + { + "epoch": 0.8780979246423534, + "grad_norm": 0.05271073803305626, + "learning_rate": 9.574800131864805e-05, + "loss": 0.2134, + "step": 4358 + }, + { + "epoch": 0.8785009067096514, + "grad_norm": 0.056267786771059036, + "learning_rate": 9.574262185215433e-05, + "loss": 0.1945, + "step": 4360 + }, + { + "epoch": 0.8789038887769495, + "grad_norm": 0.05832262709736824, + "learning_rate": 9.573723913617791e-05, + "loss": 0.156, + "step": 4362 + }, + { + "epoch": 0.8793068708442474, + "grad_norm": 0.051851604133844376, + "learning_rate": 9.573185317110119e-05, + "loss": 0.2601, + "step": 4364 + }, + { + "epoch": 0.8797098529115455, + "grad_norm": 0.0523109994828701, + "learning_rate": 9.572646395730673e-05, + "loss": 0.2256, + "step": 4366 + }, + { + "epoch": 0.8801128349788434, + "grad_norm": 0.05357774719595909, + "learning_rate": 9.572107149517741e-05, + "loss": 0.1877, + "step": 4368 + }, + { + "epoch": 0.8805158170461415, + "grad_norm": 0.05208495631814003, + "learning_rate": 9.571567578509629e-05, + "loss": 0.1873, + "step": 4370 + }, + { + "epoch": 0.8809187991134394, + "grad_norm": 0.0533597432076931, + "learning_rate": 9.571027682744668e-05, + "loss": 0.2484, + "step": 4372 + }, + { + "epoch": 0.8813217811807375, + "grad_norm": 0.05073506757616997, + "learning_rate": 9.57048746226121e-05, + "loss": 0.2035, + "step": 4374 + }, + { + "epoch": 0.8817247632480355, + "grad_norm": 0.05859789997339249, + "learning_rate": 9.569946917097631e-05, + "loss": 0.181, + "step": 4376 + }, + { + "epoch": 0.8821277453153334, + "grad_norm": 0.06273086369037628, + "learning_rate": 9.569406047292334e-05, + "loss": 0.1764, + "step": 4378 + }, + { + "epoch": 0.8825307273826315, + "grad_norm": 0.05160843953490257, + "learning_rate": 9.568864852883739e-05, + "loss": 0.2122, + "step": 4380 + }, + { + "epoch": 0.8829337094499294, + "grad_norm": 0.05211114138364792, + "learning_rate": 9.56832333391029e-05, + "loss": 0.241, + "step": 4382 + }, + { + "epoch": 0.8833366915172275, + "grad_norm": 0.05442271754145622, + "learning_rate": 9.567781490410456e-05, + "loss": 0.1976, + "step": 4384 + }, + { + "epoch": 0.8837396735845255, + "grad_norm": 0.051495131105184555, + "learning_rate": 9.567239322422734e-05, + "loss": 0.1859, + "step": 4386 + }, + { + "epoch": 0.8841426556518235, + "grad_norm": 0.06409385055303574, + "learning_rate": 9.566696829985633e-05, + "loss": 0.1892, + "step": 4388 + }, + { + "epoch": 0.8845456377191215, + "grad_norm": 0.05642905831336975, + "learning_rate": 9.566154013137691e-05, + "loss": 0.1252, + "step": 4390 + }, + { + "epoch": 0.8849486197864195, + "grad_norm": 0.07028786838054657, + "learning_rate": 9.565610871917472e-05, + "loss": 0.1937, + "step": 4392 + }, + { + "epoch": 0.8853516018537175, + "grad_norm": 0.048712924122810364, + "learning_rate": 9.565067406363556e-05, + "loss": 0.2038, + "step": 4394 + }, + { + "epoch": 0.8857545839210155, + "grad_norm": 0.07108557969331741, + "learning_rate": 9.564523616514556e-05, + "loss": 0.2014, + "step": 4396 + }, + { + "epoch": 0.8861575659883135, + "grad_norm": 0.05257798731327057, + "learning_rate": 9.563979502409096e-05, + "loss": 0.1502, + "step": 4398 + }, + { + "epoch": 0.8865605480556116, + "grad_norm": 0.07098128646612167, + "learning_rate": 9.563435064085832e-05, + "loss": 0.2139, + "step": 4400 + }, + { + "epoch": 0.8869635301229095, + "grad_norm": 0.050052594393491745, + "learning_rate": 9.562890301583438e-05, + "loss": 0.1656, + "step": 4402 + }, + { + "epoch": 0.8873665121902076, + "grad_norm": 0.05565377324819565, + "learning_rate": 9.562345214940616e-05, + "loss": 0.2131, + "step": 4404 + }, + { + "epoch": 0.8877694942575055, + "grad_norm": 0.06834172457456589, + "learning_rate": 9.561799804196083e-05, + "loss": 0.1691, + "step": 4406 + }, + { + "epoch": 0.8881724763248036, + "grad_norm": 0.05031581595540047, + "learning_rate": 9.56125406938859e-05, + "loss": 0.193, + "step": 4408 + }, + { + "epoch": 0.8885754583921015, + "grad_norm": 0.04691687971353531, + "learning_rate": 9.560708010556902e-05, + "loss": 0.1703, + "step": 4410 + }, + { + "epoch": 0.8889784404593996, + "grad_norm": 0.038260504603385925, + "learning_rate": 9.560161627739813e-05, + "loss": 0.1592, + "step": 4412 + }, + { + "epoch": 0.8893814225266976, + "grad_norm": 0.10631363838911057, + "learning_rate": 9.559614920976131e-05, + "loss": 0.2233, + "step": 4414 + }, + { + "epoch": 0.8897844045939955, + "grad_norm": 0.05809139087796211, + "learning_rate": 9.559067890304698e-05, + "loss": 0.2693, + "step": 4416 + }, + { + "epoch": 0.8901873866612936, + "grad_norm": 0.03824347257614136, + "learning_rate": 9.558520535764375e-05, + "loss": 0.1971, + "step": 4418 + }, + { + "epoch": 0.8905903687285915, + "grad_norm": 0.061272189021110535, + "learning_rate": 9.557972857394042e-05, + "loss": 0.2159, + "step": 4420 + }, + { + "epoch": 0.8909933507958896, + "grad_norm": 0.04731274023652077, + "learning_rate": 9.557424855232608e-05, + "loss": 0.2196, + "step": 4422 + }, + { + "epoch": 0.8913963328631875, + "grad_norm": 0.058554090559482574, + "learning_rate": 9.556876529318999e-05, + "loss": 0.1422, + "step": 4424 + }, + { + "epoch": 0.8917993149304856, + "grad_norm": 0.05500726401805878, + "learning_rate": 9.55632787969217e-05, + "loss": 0.2067, + "step": 4426 + }, + { + "epoch": 0.8922022969977836, + "grad_norm": 0.04614582657814026, + "learning_rate": 9.555778906391095e-05, + "loss": 0.2027, + "step": 4428 + }, + { + "epoch": 0.8926052790650816, + "grad_norm": 0.042596716433763504, + "learning_rate": 9.555229609454772e-05, + "loss": 0.1479, + "step": 4430 + }, + { + "epoch": 0.8930082611323796, + "grad_norm": 0.08022289723157883, + "learning_rate": 9.554679988922222e-05, + "loss": 0.2432, + "step": 4432 + }, + { + "epoch": 0.8934112431996776, + "grad_norm": 0.05268188938498497, + "learning_rate": 9.554130044832492e-05, + "loss": 0.1994, + "step": 4434 + }, + { + "epoch": 0.8938142252669756, + "grad_norm": 0.049188628792762756, + "learning_rate": 9.553579777224644e-05, + "loss": 0.1807, + "step": 4436 + }, + { + "epoch": 0.8942172073342737, + "grad_norm": 0.043887216597795486, + "learning_rate": 9.553029186137775e-05, + "loss": 0.1264, + "step": 4438 + }, + { + "epoch": 0.8946201894015716, + "grad_norm": 0.11060819029808044, + "learning_rate": 9.552478271610989e-05, + "loss": 0.217, + "step": 4440 + }, + { + "epoch": 0.8950231714688697, + "grad_norm": 0.05788939818739891, + "learning_rate": 9.55192703368343e-05, + "loss": 0.1859, + "step": 4442 + }, + { + "epoch": 0.8954261535361676, + "grad_norm": 0.059715982526540756, + "learning_rate": 9.551375472394255e-05, + "loss": 0.175, + "step": 4444 + }, + { + "epoch": 0.8958291356034657, + "grad_norm": 0.04088887572288513, + "learning_rate": 9.550823587782645e-05, + "loss": 0.1929, + "step": 4446 + }, + { + "epoch": 0.8962321176707636, + "grad_norm": 0.05613197386264801, + "learning_rate": 9.550271379887805e-05, + "loss": 0.1827, + "step": 4448 + }, + { + "epoch": 0.8966350997380617, + "grad_norm": 0.04531438648700714, + "learning_rate": 9.549718848748962e-05, + "loss": 0.183, + "step": 4450 + }, + { + "epoch": 0.8970380818053597, + "grad_norm": 0.06092282757163048, + "learning_rate": 9.54916599440537e-05, + "loss": 0.1447, + "step": 4452 + }, + { + "epoch": 0.8974410638726577, + "grad_norm": 0.03984326496720314, + "learning_rate": 9.548612816896298e-05, + "loss": 0.1448, + "step": 4454 + }, + { + "epoch": 0.8978440459399557, + "grad_norm": 0.05323481187224388, + "learning_rate": 9.548059316261049e-05, + "loss": 0.1924, + "step": 4456 + }, + { + "epoch": 0.8982470280072536, + "grad_norm": 0.06252434849739075, + "learning_rate": 9.54750549253894e-05, + "loss": 0.2178, + "step": 4458 + }, + { + "epoch": 0.8986500100745517, + "grad_norm": 0.06720856577157974, + "learning_rate": 9.546951345769311e-05, + "loss": 0.1585, + "step": 4460 + }, + { + "epoch": 0.8990529921418496, + "grad_norm": 0.060730621218681335, + "learning_rate": 9.546396875991532e-05, + "loss": 0.2401, + "step": 4462 + }, + { + "epoch": 0.8994559742091477, + "grad_norm": 0.07229790836572647, + "learning_rate": 9.54584208324499e-05, + "loss": 0.1737, + "step": 4464 + }, + { + "epoch": 0.8998589562764457, + "grad_norm": 0.05511532351374626, + "learning_rate": 9.545286967569095e-05, + "loss": 0.1808, + "step": 4466 + }, + { + "epoch": 0.9002619383437437, + "grad_norm": 0.056663528084754944, + "learning_rate": 9.544731529003283e-05, + "loss": 0.1716, + "step": 4468 + }, + { + "epoch": 0.9006649204110417, + "grad_norm": 0.04725079610943794, + "learning_rate": 9.544175767587012e-05, + "loss": 0.2016, + "step": 4470 + }, + { + "epoch": 0.9010679024783397, + "grad_norm": 0.05176713317632675, + "learning_rate": 9.543619683359762e-05, + "loss": 0.186, + "step": 4472 + }, + { + "epoch": 0.9014708845456377, + "grad_norm": 0.05347689613699913, + "learning_rate": 9.543063276361037e-05, + "loss": 0.1651, + "step": 4474 + }, + { + "epoch": 0.9018738666129357, + "grad_norm": 0.053653594106435776, + "learning_rate": 9.54250654663036e-05, + "loss": 0.1924, + "step": 4476 + }, + { + "epoch": 0.9022768486802337, + "grad_norm": 0.06456420570611954, + "learning_rate": 9.541949494207286e-05, + "loss": 0.1796, + "step": 4478 + }, + { + "epoch": 0.9026798307475318, + "grad_norm": 0.05221540853381157, + "learning_rate": 9.54139211913138e-05, + "loss": 0.2247, + "step": 4480 + }, + { + "epoch": 0.9030828128148297, + "grad_norm": 0.049423567950725555, + "learning_rate": 9.540834421442243e-05, + "loss": 0.2272, + "step": 4482 + }, + { + "epoch": 0.9034857948821278, + "grad_norm": 0.051466915756464005, + "learning_rate": 9.54027640117949e-05, + "loss": 0.1817, + "step": 4484 + }, + { + "epoch": 0.9038887769494257, + "grad_norm": 0.06747356057167053, + "learning_rate": 9.539718058382763e-05, + "loss": 0.2122, + "step": 4486 + }, + { + "epoch": 0.9042917590167238, + "grad_norm": 0.04708021134138107, + "learning_rate": 9.539159393091726e-05, + "loss": 0.2082, + "step": 4488 + }, + { + "epoch": 0.9046947410840218, + "grad_norm": 0.051016103476285934, + "learning_rate": 9.538600405346064e-05, + "loss": 0.2112, + "step": 4490 + }, + { + "epoch": 0.9050977231513198, + "grad_norm": 0.05513007566332817, + "learning_rate": 9.538041095185486e-05, + "loss": 0.1696, + "step": 4492 + }, + { + "epoch": 0.9055007052186178, + "grad_norm": 0.04744619131088257, + "learning_rate": 9.537481462649729e-05, + "loss": 0.1648, + "step": 4494 + }, + { + "epoch": 0.9059036872859157, + "grad_norm": 0.04048705846071243, + "learning_rate": 9.536921507778543e-05, + "loss": 0.1534, + "step": 4496 + }, + { + "epoch": 0.9063066693532138, + "grad_norm": 0.05609700828790665, + "learning_rate": 9.53636123061171e-05, + "loss": 0.1872, + "step": 4498 + }, + { + "epoch": 0.9067096514205117, + "grad_norm": 0.061345573514699936, + "learning_rate": 9.535800631189032e-05, + "loss": 0.2813, + "step": 4500 + }, + { + "epoch": 0.9071126334878098, + "grad_norm": 0.059606775641441345, + "learning_rate": 9.535239709550328e-05, + "loss": 0.2559, + "step": 4502 + }, + { + "epoch": 0.9075156155551078, + "grad_norm": 0.06905293464660645, + "learning_rate": 9.534678465735449e-05, + "loss": 0.1876, + "step": 4504 + }, + { + "epoch": 0.9079185976224058, + "grad_norm": 0.07575200498104095, + "learning_rate": 9.534116899784265e-05, + "loss": 0.2222, + "step": 4506 + }, + { + "epoch": 0.9083215796897038, + "grad_norm": 0.053717587143182755, + "learning_rate": 9.533555011736667e-05, + "loss": 0.1471, + "step": 4508 + }, + { + "epoch": 0.9087245617570018, + "grad_norm": 0.04818922281265259, + "learning_rate": 9.532992801632571e-05, + "loss": 0.2059, + "step": 4510 + }, + { + "epoch": 0.9091275438242998, + "grad_norm": 0.07846032083034515, + "learning_rate": 9.532430269511916e-05, + "loss": 0.2092, + "step": 4512 + }, + { + "epoch": 0.9095305258915978, + "grad_norm": 0.055181194096803665, + "learning_rate": 9.531867415414664e-05, + "loss": 0.2307, + "step": 4514 + }, + { + "epoch": 0.9099335079588958, + "grad_norm": 0.0974973514676094, + "learning_rate": 9.531304239380797e-05, + "loss": 0.2617, + "step": 4516 + }, + { + "epoch": 0.9103364900261939, + "grad_norm": 0.053954772651195526, + "learning_rate": 9.530740741450323e-05, + "loss": 0.2005, + "step": 4518 + }, + { + "epoch": 0.9107394720934918, + "grad_norm": 0.04149053618311882, + "learning_rate": 9.530176921663275e-05, + "loss": 0.2076, + "step": 4520 + }, + { + "epoch": 0.9111424541607899, + "grad_norm": 0.04992946609854698, + "learning_rate": 9.529612780059703e-05, + "loss": 0.1643, + "step": 4522 + }, + { + "epoch": 0.9115454362280878, + "grad_norm": 0.0528724230825901, + "learning_rate": 9.529048316679682e-05, + "loss": 0.2377, + "step": 4524 + }, + { + "epoch": 0.9119484182953859, + "grad_norm": 0.043224893510341644, + "learning_rate": 9.528483531563313e-05, + "loss": 0.2093, + "step": 4526 + }, + { + "epoch": 0.9123514003626838, + "grad_norm": 0.04171156883239746, + "learning_rate": 9.527918424750715e-05, + "loss": 0.1763, + "step": 4528 + }, + { + "epoch": 0.9127543824299819, + "grad_norm": 0.03696437180042267, + "learning_rate": 9.527352996282033e-05, + "loss": 0.2211, + "step": 4530 + }, + { + "epoch": 0.9131573644972799, + "grad_norm": 0.051768265664577484, + "learning_rate": 9.526787246197436e-05, + "loss": 0.1906, + "step": 4532 + }, + { + "epoch": 0.9135603465645779, + "grad_norm": 0.044482771307229996, + "learning_rate": 9.526221174537111e-05, + "loss": 0.1931, + "step": 4534 + }, + { + "epoch": 0.9139633286318759, + "grad_norm": 0.05503168702125549, + "learning_rate": 9.525654781341274e-05, + "loss": 0.2304, + "step": 4536 + }, + { + "epoch": 0.9143663106991738, + "grad_norm": 0.046392567455768585, + "learning_rate": 9.525088066650158e-05, + "loss": 0.2202, + "step": 4538 + }, + { + "epoch": 0.9147692927664719, + "grad_norm": 0.07175581157207489, + "learning_rate": 9.524521030504023e-05, + "loss": 0.218, + "step": 4540 + }, + { + "epoch": 0.91517227483377, + "grad_norm": 0.050982605665922165, + "learning_rate": 9.523953672943152e-05, + "loss": 0.226, + "step": 4542 + }, + { + "epoch": 0.9155752569010679, + "grad_norm": 0.05275282263755798, + "learning_rate": 9.523385994007843e-05, + "loss": 0.1566, + "step": 4544 + }, + { + "epoch": 0.9159782389683659, + "grad_norm": 0.04982639476656914, + "learning_rate": 9.522817993738429e-05, + "loss": 0.1909, + "step": 4546 + }, + { + "epoch": 0.9163812210356639, + "grad_norm": 0.0570659339427948, + "learning_rate": 9.522249672175259e-05, + "loss": 0.1553, + "step": 4548 + }, + { + "epoch": 0.9167842031029619, + "grad_norm": 0.04874482378363609, + "learning_rate": 9.521681029358702e-05, + "loss": 0.1659, + "step": 4550 + }, + { + "epoch": 0.9171871851702599, + "grad_norm": 0.06070302054286003, + "learning_rate": 9.521112065329159e-05, + "loss": 0.1794, + "step": 4552 + }, + { + "epoch": 0.9175901672375579, + "grad_norm": 0.057424820959568024, + "learning_rate": 9.520542780127044e-05, + "loss": 0.182, + "step": 4554 + }, + { + "epoch": 0.917993149304856, + "grad_norm": 0.043012555688619614, + "learning_rate": 9.519973173792798e-05, + "loss": 0.1809, + "step": 4556 + }, + { + "epoch": 0.9183961313721539, + "grad_norm": 0.07553418725728989, + "learning_rate": 9.519403246366888e-05, + "loss": 0.1845, + "step": 4558 + }, + { + "epoch": 0.918799113439452, + "grad_norm": 0.051994435489177704, + "learning_rate": 9.518832997889798e-05, + "loss": 0.213, + "step": 4560 + }, + { + "epoch": 0.9192020955067499, + "grad_norm": 0.05973916873335838, + "learning_rate": 9.51826242840204e-05, + "loss": 0.2248, + "step": 4562 + }, + { + "epoch": 0.919605077574048, + "grad_norm": 0.0520995669066906, + "learning_rate": 9.517691537944145e-05, + "loss": 0.2318, + "step": 4564 + }, + { + "epoch": 0.9200080596413459, + "grad_norm": 0.050771716982126236, + "learning_rate": 9.517120326556666e-05, + "loss": 0.2165, + "step": 4566 + }, + { + "epoch": 0.920411041708644, + "grad_norm": 0.055689260363578796, + "learning_rate": 9.516548794280185e-05, + "loss": 0.1839, + "step": 4568 + }, + { + "epoch": 0.920814023775942, + "grad_norm": 0.05374159663915634, + "learning_rate": 9.5159769411553e-05, + "loss": 0.175, + "step": 4570 + }, + { + "epoch": 0.92121700584324, + "grad_norm": 0.07564659416675568, + "learning_rate": 9.515404767222636e-05, + "loss": 0.2277, + "step": 4572 + }, + { + "epoch": 0.921619987910538, + "grad_norm": 0.05168713629245758, + "learning_rate": 9.514832272522838e-05, + "loss": 0.2263, + "step": 4574 + }, + { + "epoch": 0.922022969977836, + "grad_norm": 0.05899692326784134, + "learning_rate": 9.514259457096578e-05, + "loss": 0.2301, + "step": 4576 + }, + { + "epoch": 0.922425952045134, + "grad_norm": 0.040742505341768265, + "learning_rate": 9.513686320984543e-05, + "loss": 0.1854, + "step": 4578 + }, + { + "epoch": 0.9228289341124319, + "grad_norm": 0.04289477691054344, + "learning_rate": 9.513112864227451e-05, + "loss": 0.1922, + "step": 4580 + }, + { + "epoch": 0.92323191617973, + "grad_norm": 0.057433657348155975, + "learning_rate": 9.512539086866038e-05, + "loss": 0.1583, + "step": 4582 + }, + { + "epoch": 0.923634898247028, + "grad_norm": 0.09156624227762222, + "learning_rate": 9.511964988941067e-05, + "loss": 0.1966, + "step": 4584 + }, + { + "epoch": 0.924037880314326, + "grad_norm": 0.04218217730522156, + "learning_rate": 9.511390570493317e-05, + "loss": 0.1794, + "step": 4586 + }, + { + "epoch": 0.924440862381624, + "grad_norm": 0.04996601492166519, + "learning_rate": 9.510815831563596e-05, + "loss": 0.2086, + "step": 4588 + }, + { + "epoch": 0.924843844448922, + "grad_norm": 0.21532565355300903, + "learning_rate": 9.510240772192733e-05, + "loss": 0.2615, + "step": 4590 + }, + { + "epoch": 0.92524682651622, + "grad_norm": 0.06121666356921196, + "learning_rate": 9.509665392421579e-05, + "loss": 0.233, + "step": 4592 + }, + { + "epoch": 0.9256498085835181, + "grad_norm": 0.04677826538681984, + "learning_rate": 9.509089692291006e-05, + "loss": 0.1455, + "step": 4594 + }, + { + "epoch": 0.926052790650816, + "grad_norm": 0.043626219034194946, + "learning_rate": 9.508513671841914e-05, + "loss": 0.1787, + "step": 4596 + }, + { + "epoch": 0.9264557727181141, + "grad_norm": 0.04962493106722832, + "learning_rate": 9.507937331115222e-05, + "loss": 0.193, + "step": 4598 + }, + { + "epoch": 0.926858754785412, + "grad_norm": 0.05599873512983322, + "learning_rate": 9.50736067015187e-05, + "loss": 0.1929, + "step": 4600 + }, + { + "epoch": 0.9272617368527101, + "grad_norm": 0.0480925627052784, + "learning_rate": 9.506783688992824e-05, + "loss": 0.1696, + "step": 4602 + }, + { + "epoch": 0.927664718920008, + "grad_norm": 0.05178828909993172, + "learning_rate": 9.506206387679073e-05, + "loss": 0.1668, + "step": 4604 + }, + { + "epoch": 0.9280677009873061, + "grad_norm": 0.04958764463663101, + "learning_rate": 9.505628766251628e-05, + "loss": 0.1871, + "step": 4606 + }, + { + "epoch": 0.9284706830546041, + "grad_norm": 0.05447972193360329, + "learning_rate": 9.50505082475152e-05, + "loss": 0.2132, + "step": 4608 + }, + { + "epoch": 0.9288736651219021, + "grad_norm": 0.05043847858905792, + "learning_rate": 9.504472563219805e-05, + "loss": 0.2384, + "step": 4610 + }, + { + "epoch": 0.9292766471892001, + "grad_norm": 0.053794488310813904, + "learning_rate": 9.503893981697565e-05, + "loss": 0.2195, + "step": 4612 + }, + { + "epoch": 0.929679629256498, + "grad_norm": 0.044235896319150925, + "learning_rate": 9.503315080225897e-05, + "loss": 0.1764, + "step": 4614 + }, + { + "epoch": 0.9300826113237961, + "grad_norm": 0.052948713302612305, + "learning_rate": 9.50273585884593e-05, + "loss": 0.1739, + "step": 4616 + }, + { + "epoch": 0.930485593391094, + "grad_norm": 0.054978448897600174, + "learning_rate": 9.502156317598807e-05, + "loss": 0.2081, + "step": 4618 + }, + { + "epoch": 0.9308885754583921, + "grad_norm": 0.0541086308658123, + "learning_rate": 9.501576456525701e-05, + "loss": 0.2139, + "step": 4620 + }, + { + "epoch": 0.9312915575256902, + "grad_norm": 0.06622260063886642, + "learning_rate": 9.500996275667802e-05, + "loss": 0.1911, + "step": 4622 + }, + { + "epoch": 0.9316945395929881, + "grad_norm": 0.05048837512731552, + "learning_rate": 9.500415775066324e-05, + "loss": 0.2073, + "step": 4624 + }, + { + "epoch": 0.9320975216602861, + "grad_norm": 0.04079214856028557, + "learning_rate": 9.49983495476251e-05, + "loss": 0.1657, + "step": 4626 + }, + { + "epoch": 0.9325005037275841, + "grad_norm": 0.054248470813035965, + "learning_rate": 9.499253814797615e-05, + "loss": 0.2275, + "step": 4628 + }, + { + "epoch": 0.9329034857948821, + "grad_norm": 0.05754832178354263, + "learning_rate": 9.498672355212925e-05, + "loss": 0.1923, + "step": 4630 + }, + { + "epoch": 0.9333064678621801, + "grad_norm": 0.05790744349360466, + "learning_rate": 9.498090576049745e-05, + "loss": 0.2118, + "step": 4632 + }, + { + "epoch": 0.9337094499294781, + "grad_norm": 0.08360709995031357, + "learning_rate": 9.497508477349406e-05, + "loss": 0.2253, + "step": 4634 + }, + { + "epoch": 0.9341124319967762, + "grad_norm": 0.11953411996364594, + "learning_rate": 9.496926059153254e-05, + "loss": 0.1517, + "step": 4636 + }, + { + "epoch": 0.9345154140640741, + "grad_norm": 0.048100925981998444, + "learning_rate": 9.49634332150267e-05, + "loss": 0.2065, + "step": 4638 + }, + { + "epoch": 0.9349183961313722, + "grad_norm": 0.04823688790202141, + "learning_rate": 9.495760264439046e-05, + "loss": 0.1974, + "step": 4640 + }, + { + "epoch": 0.9353213781986701, + "grad_norm": 0.05790715292096138, + "learning_rate": 9.495176888003803e-05, + "loss": 0.2179, + "step": 4642 + }, + { + "epoch": 0.9357243602659682, + "grad_norm": 0.05937306210398674, + "learning_rate": 9.494593192238382e-05, + "loss": 0.1792, + "step": 4644 + }, + { + "epoch": 0.9361273423332662, + "grad_norm": 0.06410137563943863, + "learning_rate": 9.494009177184248e-05, + "loss": 0.1481, + "step": 4646 + }, + { + "epoch": 0.9365303244005642, + "grad_norm": 0.03937869146466255, + "learning_rate": 9.493424842882892e-05, + "loss": 0.1937, + "step": 4648 + }, + { + "epoch": 0.9369333064678622, + "grad_norm": 0.05343402177095413, + "learning_rate": 9.492840189375819e-05, + "loss": 0.2059, + "step": 4650 + }, + { + "epoch": 0.9373362885351602, + "grad_norm": 0.05615449324250221, + "learning_rate": 9.492255216704564e-05, + "loss": 0.2398, + "step": 4652 + }, + { + "epoch": 0.9377392706024582, + "grad_norm": 0.07987558841705322, + "learning_rate": 9.491669924910684e-05, + "loss": 0.2611, + "step": 4654 + }, + { + "epoch": 0.9381422526697561, + "grad_norm": 0.042240921407938004, + "learning_rate": 9.491084314035756e-05, + "loss": 0.1426, + "step": 4656 + }, + { + "epoch": 0.9385452347370542, + "grad_norm": 0.05875542387366295, + "learning_rate": 9.49049838412138e-05, + "loss": 0.2371, + "step": 4658 + }, + { + "epoch": 0.9389482168043523, + "grad_norm": 0.05499307066202164, + "learning_rate": 9.48991213520918e-05, + "loss": 0.1818, + "step": 4660 + }, + { + "epoch": 0.9393511988716502, + "grad_norm": 0.060871563851833344, + "learning_rate": 9.489325567340804e-05, + "loss": 0.1856, + "step": 4662 + }, + { + "epoch": 0.9397541809389482, + "grad_norm": 0.10326692461967468, + "learning_rate": 9.488738680557919e-05, + "loss": 0.2386, + "step": 4664 + }, + { + "epoch": 0.9401571630062462, + "grad_norm": 0.04238074645400047, + "learning_rate": 9.488151474902215e-05, + "loss": 0.1813, + "step": 4666 + }, + { + "epoch": 0.9405601450735442, + "grad_norm": 0.07186947017908096, + "learning_rate": 9.487563950415409e-05, + "loss": 0.1766, + "step": 4668 + }, + { + "epoch": 0.9409631271408422, + "grad_norm": 0.04443906992673874, + "learning_rate": 9.486976107139237e-05, + "loss": 0.171, + "step": 4670 + }, + { + "epoch": 0.9413661092081402, + "grad_norm": 0.06593028455972672, + "learning_rate": 9.486387945115458e-05, + "loss": 0.1799, + "step": 4672 + }, + { + "epoch": 0.9417690912754383, + "grad_norm": 0.09184325486421585, + "learning_rate": 9.485799464385854e-05, + "loss": 0.2211, + "step": 4674 + }, + { + "epoch": 0.9421720733427362, + "grad_norm": 0.05752957612276077, + "learning_rate": 9.48521066499223e-05, + "loss": 0.2312, + "step": 4676 + }, + { + "epoch": 0.9425750554100343, + "grad_norm": 0.06228434666991234, + "learning_rate": 9.484621546976415e-05, + "loss": 0.1777, + "step": 4678 + }, + { + "epoch": 0.9429780374773322, + "grad_norm": 0.06209741532802582, + "learning_rate": 9.484032110380256e-05, + "loss": 0.2325, + "step": 4680 + }, + { + "epoch": 0.9433810195446303, + "grad_norm": 0.04441828653216362, + "learning_rate": 9.483442355245626e-05, + "loss": 0.1211, + "step": 4682 + }, + { + "epoch": 0.9437840016119282, + "grad_norm": 0.039253611117601395, + "learning_rate": 9.482852281614423e-05, + "loss": 0.1761, + "step": 4684 + }, + { + "epoch": 0.9441869836792263, + "grad_norm": 0.04427387937903404, + "learning_rate": 9.482261889528563e-05, + "loss": 0.1874, + "step": 4686 + }, + { + "epoch": 0.9445899657465243, + "grad_norm": 0.056583307683467865, + "learning_rate": 9.481671179029985e-05, + "loss": 0.2145, + "step": 4688 + }, + { + "epoch": 0.9449929478138223, + "grad_norm": 0.04767727851867676, + "learning_rate": 9.481080150160656e-05, + "loss": 0.1776, + "step": 4690 + }, + { + "epoch": 0.9453959298811203, + "grad_norm": 0.04897291958332062, + "learning_rate": 9.480488802962559e-05, + "loss": 0.1925, + "step": 4692 + }, + { + "epoch": 0.9457989119484183, + "grad_norm": 0.06027200073003769, + "learning_rate": 9.479897137477702e-05, + "loss": 0.2359, + "step": 4694 + }, + { + "epoch": 0.9462018940157163, + "grad_norm": 0.04494505375623703, + "learning_rate": 9.479305153748116e-05, + "loss": 0.1743, + "step": 4696 + }, + { + "epoch": 0.9466048760830144, + "grad_norm": 0.060913268476724625, + "learning_rate": 9.478712851815858e-05, + "loss": 0.1648, + "step": 4698 + }, + { + "epoch": 0.9470078581503123, + "grad_norm": 0.04992394894361496, + "learning_rate": 9.478120231723001e-05, + "loss": 0.1343, + "step": 4700 + }, + { + "epoch": 0.9474108402176104, + "grad_norm": 0.04095543920993805, + "learning_rate": 9.477527293511644e-05, + "loss": 0.1663, + "step": 4702 + }, + { + "epoch": 0.9478138222849083, + "grad_norm": 0.066753089427948, + "learning_rate": 9.476934037223909e-05, + "loss": 0.2191, + "step": 4704 + }, + { + "epoch": 0.9482168043522063, + "grad_norm": 0.06055450811982155, + "learning_rate": 9.47634046290194e-05, + "loss": 0.209, + "step": 4706 + }, + { + "epoch": 0.9486197864195043, + "grad_norm": 0.045478709042072296, + "learning_rate": 9.475746570587903e-05, + "loss": 0.2269, + "step": 4708 + }, + { + "epoch": 0.9490227684868023, + "grad_norm": 0.06353277713060379, + "learning_rate": 9.475152360323987e-05, + "loss": 0.2201, + "step": 4710 + }, + { + "epoch": 0.9494257505541004, + "grad_norm": 0.05817262828350067, + "learning_rate": 9.474557832152405e-05, + "loss": 0.2127, + "step": 4712 + }, + { + "epoch": 0.9498287326213983, + "grad_norm": 0.05550335347652435, + "learning_rate": 9.47396298611539e-05, + "loss": 0.2479, + "step": 4714 + }, + { + "epoch": 0.9502317146886964, + "grad_norm": 0.06964296102523804, + "learning_rate": 9.473367822255202e-05, + "loss": 0.2014, + "step": 4716 + }, + { + "epoch": 0.9506346967559943, + "grad_norm": 0.14811141788959503, + "learning_rate": 9.472772340614115e-05, + "loss": 0.2747, + "step": 4718 + }, + { + "epoch": 0.9510376788232924, + "grad_norm": 0.05050405487418175, + "learning_rate": 9.472176541234435e-05, + "loss": 0.2006, + "step": 4720 + }, + { + "epoch": 0.9514406608905903, + "grad_norm": 0.038916632533073425, + "learning_rate": 9.471580424158486e-05, + "loss": 0.1612, + "step": 4722 + }, + { + "epoch": 0.9518436429578884, + "grad_norm": 0.05323829501867294, + "learning_rate": 9.470983989428615e-05, + "loss": 0.1914, + "step": 4724 + }, + { + "epoch": 0.9522466250251864, + "grad_norm": 0.06356607377529144, + "learning_rate": 9.47038723708719e-05, + "loss": 0.2505, + "step": 4726 + }, + { + "epoch": 0.9526496070924844, + "grad_norm": 0.04368700832128525, + "learning_rate": 9.469790167176606e-05, + "loss": 0.1307, + "step": 4728 + }, + { + "epoch": 0.9530525891597824, + "grad_norm": 0.050323549658060074, + "learning_rate": 9.469192779739278e-05, + "loss": 0.1773, + "step": 4730 + }, + { + "epoch": 0.9534555712270804, + "grad_norm": 0.08620080351829529, + "learning_rate": 9.468595074817641e-05, + "loss": 0.2042, + "step": 4732 + }, + { + "epoch": 0.9538585532943784, + "grad_norm": 0.04472964629530907, + "learning_rate": 9.467997052454157e-05, + "loss": 0.1965, + "step": 4734 + }, + { + "epoch": 0.9542615353616765, + "grad_norm": 0.04990183562040329, + "learning_rate": 9.467398712691308e-05, + "loss": 0.1418, + "step": 4736 + }, + { + "epoch": 0.9546645174289744, + "grad_norm": 0.05777883529663086, + "learning_rate": 9.466800055571599e-05, + "loss": 0.1987, + "step": 4738 + }, + { + "epoch": 0.9550674994962725, + "grad_norm": 0.04438330978155136, + "learning_rate": 9.466201081137557e-05, + "loss": 0.2193, + "step": 4740 + }, + { + "epoch": 0.9554704815635704, + "grad_norm": 0.07162509113550186, + "learning_rate": 9.465601789431733e-05, + "loss": 0.1679, + "step": 4742 + }, + { + "epoch": 0.9558734636308684, + "grad_norm": 0.07432812452316284, + "learning_rate": 9.465002180496701e-05, + "loss": 0.2137, + "step": 4744 + }, + { + "epoch": 0.9562764456981664, + "grad_norm": 0.045975614339113235, + "learning_rate": 9.464402254375053e-05, + "loss": 0.174, + "step": 4746 + }, + { + "epoch": 0.9566794277654644, + "grad_norm": 0.04282781109213829, + "learning_rate": 9.463802011109409e-05, + "loss": 0.2672, + "step": 4748 + }, + { + "epoch": 0.9570824098327625, + "grad_norm": 0.06693075597286224, + "learning_rate": 9.46320145074241e-05, + "loss": 0.1993, + "step": 4750 + }, + { + "epoch": 0.9574853919000604, + "grad_norm": 0.07140269875526428, + "learning_rate": 9.462600573316715e-05, + "loss": 0.182, + "step": 4752 + }, + { + "epoch": 0.9578883739673585, + "grad_norm": 0.0505669005215168, + "learning_rate": 9.461999378875015e-05, + "loss": 0.1665, + "step": 4754 + }, + { + "epoch": 0.9582913560346564, + "grad_norm": 0.057108256965875626, + "learning_rate": 9.461397867460014e-05, + "loss": 0.1331, + "step": 4756 + }, + { + "epoch": 0.9586943381019545, + "grad_norm": 0.05740061402320862, + "learning_rate": 9.460796039114443e-05, + "loss": 0.1687, + "step": 4758 + }, + { + "epoch": 0.9590973201692524, + "grad_norm": 0.0820176899433136, + "learning_rate": 9.460193893881057e-05, + "loss": 0.2129, + "step": 4760 + }, + { + "epoch": 0.9595003022365505, + "grad_norm": 0.06103077530860901, + "learning_rate": 9.459591431802628e-05, + "loss": 0.1989, + "step": 4762 + }, + { + "epoch": 0.9599032843038485, + "grad_norm": 0.05346281826496124, + "learning_rate": 9.458988652921957e-05, + "loss": 0.2003, + "step": 4764 + }, + { + "epoch": 0.9603062663711465, + "grad_norm": 0.0755985677242279, + "learning_rate": 9.458385557281862e-05, + "loss": 0.1845, + "step": 4766 + }, + { + "epoch": 0.9607092484384445, + "grad_norm": 0.053794801235198975, + "learning_rate": 9.457782144925188e-05, + "loss": 0.2163, + "step": 4768 + }, + { + "epoch": 0.9611122305057425, + "grad_norm": 0.04686738923192024, + "learning_rate": 9.4571784158948e-05, + "loss": 0.2205, + "step": 4770 + }, + { + "epoch": 0.9615152125730405, + "grad_norm": 0.050479158759117126, + "learning_rate": 9.456574370233584e-05, + "loss": 0.1789, + "step": 4772 + }, + { + "epoch": 0.9619181946403385, + "grad_norm": 0.0576460175216198, + "learning_rate": 9.455970007984453e-05, + "loss": 0.1383, + "step": 4774 + }, + { + "epoch": 0.9623211767076365, + "grad_norm": 0.06098479777574539, + "learning_rate": 9.45536532919034e-05, + "loss": 0.1946, + "step": 4776 + }, + { + "epoch": 0.9627241587749346, + "grad_norm": 0.057289186865091324, + "learning_rate": 9.454760333894197e-05, + "loss": 0.2384, + "step": 4778 + }, + { + "epoch": 0.9631271408422325, + "grad_norm": 0.05714469403028488, + "learning_rate": 9.454155022139006e-05, + "loss": 0.2139, + "step": 4780 + }, + { + "epoch": 0.9635301229095306, + "grad_norm": 0.06614606082439423, + "learning_rate": 9.453549393967764e-05, + "loss": 0.2204, + "step": 4782 + }, + { + "epoch": 0.9639331049768285, + "grad_norm": 0.050378940999507904, + "learning_rate": 9.452943449423497e-05, + "loss": 0.2274, + "step": 4784 + }, + { + "epoch": 0.9643360870441265, + "grad_norm": 0.06546289473772049, + "learning_rate": 9.452337188549248e-05, + "loss": 0.245, + "step": 4786 + }, + { + "epoch": 0.9647390691114246, + "grad_norm": 0.04873489961028099, + "learning_rate": 9.451730611388086e-05, + "loss": 0.2005, + "step": 4788 + }, + { + "epoch": 0.9651420511787225, + "grad_norm": 0.04178796708583832, + "learning_rate": 9.451123717983101e-05, + "loss": 0.1826, + "step": 4790 + }, + { + "epoch": 0.9655450332460206, + "grad_norm": 0.05364158749580383, + "learning_rate": 9.450516508377405e-05, + "loss": 0.1845, + "step": 4792 + }, + { + "epoch": 0.9659480153133185, + "grad_norm": 0.05494103580713272, + "learning_rate": 9.449908982614133e-05, + "loss": 0.2375, + "step": 4794 + }, + { + "epoch": 0.9663509973806166, + "grad_norm": 0.04489286616444588, + "learning_rate": 9.449301140736446e-05, + "loss": 0.2198, + "step": 4796 + }, + { + "epoch": 0.9667539794479145, + "grad_norm": 0.07947932928800583, + "learning_rate": 9.44869298278752e-05, + "loss": 0.2555, + "step": 4798 + }, + { + "epoch": 0.9671569615152126, + "grad_norm": 0.05839437618851662, + "learning_rate": 9.448084508810559e-05, + "loss": 0.2151, + "step": 4800 + }, + { + "epoch": 0.9675599435825106, + "grad_norm": 0.04776003211736679, + "learning_rate": 9.447475718848788e-05, + "loss": 0.183, + "step": 4802 + }, + { + "epoch": 0.9679629256498086, + "grad_norm": 0.04061594605445862, + "learning_rate": 9.446866612945455e-05, + "loss": 0.1549, + "step": 4804 + }, + { + "epoch": 0.9683659077171066, + "grad_norm": 0.042147375643253326, + "learning_rate": 9.44625719114383e-05, + "loss": 0.1591, + "step": 4806 + }, + { + "epoch": 0.9687688897844046, + "grad_norm": 0.05506439134478569, + "learning_rate": 9.445647453487204e-05, + "loss": 0.2761, + "step": 4808 + }, + { + "epoch": 0.9691718718517026, + "grad_norm": 0.04046489670872688, + "learning_rate": 9.445037400018892e-05, + "loss": 0.1516, + "step": 4810 + }, + { + "epoch": 0.9695748539190006, + "grad_norm": 0.04889726638793945, + "learning_rate": 9.444427030782234e-05, + "loss": 0.147, + "step": 4812 + }, + { + "epoch": 0.9699778359862986, + "grad_norm": 0.049571387469768524, + "learning_rate": 9.443816345820587e-05, + "loss": 0.2026, + "step": 4814 + }, + { + "epoch": 0.9703808180535967, + "grad_norm": 0.06491173058748245, + "learning_rate": 9.443205345177333e-05, + "loss": 0.1986, + "step": 4816 + }, + { + "epoch": 0.9707838001208946, + "grad_norm": 0.04497947171330452, + "learning_rate": 9.442594028895877e-05, + "loss": 0.2304, + "step": 4818 + }, + { + "epoch": 0.9711867821881927, + "grad_norm": 0.045850589871406555, + "learning_rate": 9.441982397019647e-05, + "loss": 0.194, + "step": 4820 + }, + { + "epoch": 0.9715897642554906, + "grad_norm": 0.0547361746430397, + "learning_rate": 9.44137044959209e-05, + "loss": 0.1766, + "step": 4822 + }, + { + "epoch": 0.9719927463227886, + "grad_norm": 0.06350360810756683, + "learning_rate": 9.44075818665668e-05, + "loss": 0.2033, + "step": 4824 + }, + { + "epoch": 0.9723957283900866, + "grad_norm": 0.05815676599740982, + "learning_rate": 9.44014560825691e-05, + "loss": 0.2162, + "step": 4826 + }, + { + "epoch": 0.9727987104573846, + "grad_norm": 0.047649532556533813, + "learning_rate": 9.439532714436297e-05, + "loss": 0.2091, + "step": 4828 + }, + { + "epoch": 0.9732016925246827, + "grad_norm": 0.05103228986263275, + "learning_rate": 9.43891950523838e-05, + "loss": 0.1958, + "step": 4830 + }, + { + "epoch": 0.9736046745919806, + "grad_norm": 0.05040536820888519, + "learning_rate": 9.438305980706721e-05, + "loss": 0.2203, + "step": 4832 + }, + { + "epoch": 0.9740076566592787, + "grad_norm": 0.03978874534368515, + "learning_rate": 9.437692140884902e-05, + "loss": 0.1667, + "step": 4834 + }, + { + "epoch": 0.9744106387265766, + "grad_norm": 0.03923666477203369, + "learning_rate": 9.437077985816532e-05, + "loss": 0.1951, + "step": 4836 + }, + { + "epoch": 0.9748136207938747, + "grad_norm": 0.047468025237321854, + "learning_rate": 9.436463515545237e-05, + "loss": 0.1986, + "step": 4838 + }, + { + "epoch": 0.9752166028611727, + "grad_norm": 0.04159865155816078, + "learning_rate": 9.435848730114668e-05, + "loss": 0.2168, + "step": 4840 + }, + { + "epoch": 0.9756195849284707, + "grad_norm": 0.04819134995341301, + "learning_rate": 9.4352336295685e-05, + "loss": 0.1822, + "step": 4842 + }, + { + "epoch": 0.9760225669957687, + "grad_norm": 0.042101290076971054, + "learning_rate": 9.434618213950428e-05, + "loss": 0.152, + "step": 4844 + }, + { + "epoch": 0.9764255490630667, + "grad_norm": 0.060625553131103516, + "learning_rate": 9.434002483304172e-05, + "loss": 0.2029, + "step": 4846 + }, + { + "epoch": 0.9768285311303647, + "grad_norm": 0.04648581147193909, + "learning_rate": 9.433386437673468e-05, + "loss": 0.1906, + "step": 4848 + }, + { + "epoch": 0.9772315131976627, + "grad_norm": 0.045150671154260635, + "learning_rate": 9.432770077102084e-05, + "loss": 0.1537, + "step": 4850 + }, + { + "epoch": 0.9776344952649607, + "grad_norm": 0.0670241042971611, + "learning_rate": 9.4321534016338e-05, + "loss": 0.2136, + "step": 4852 + }, + { + "epoch": 0.9780374773322588, + "grad_norm": 0.04682036116719246, + "learning_rate": 9.431536411312429e-05, + "loss": 0.1993, + "step": 4854 + }, + { + "epoch": 0.9784404593995567, + "grad_norm": 0.05474329739809036, + "learning_rate": 9.430919106181799e-05, + "loss": 0.195, + "step": 4856 + }, + { + "epoch": 0.9788434414668548, + "grad_norm": 0.09272141754627228, + "learning_rate": 9.43030148628576e-05, + "loss": 0.2943, + "step": 4858 + }, + { + "epoch": 0.9792464235341527, + "grad_norm": 0.044193752110004425, + "learning_rate": 9.429683551668189e-05, + "loss": 0.1576, + "step": 4860 + }, + { + "epoch": 0.9796494056014508, + "grad_norm": 0.04668412357568741, + "learning_rate": 9.429065302372984e-05, + "loss": 0.203, + "step": 4862 + }, + { + "epoch": 0.9800523876687487, + "grad_norm": 0.049144770950078964, + "learning_rate": 9.42844673844406e-05, + "loss": 0.2408, + "step": 4864 + }, + { + "epoch": 0.9804553697360467, + "grad_norm": 0.05974644795060158, + "learning_rate": 9.427827859925366e-05, + "loss": 0.2047, + "step": 4866 + }, + { + "epoch": 0.9808583518033448, + "grad_norm": 0.059690799564123154, + "learning_rate": 9.427208666860859e-05, + "loss": 0.232, + "step": 4868 + }, + { + "epoch": 0.9812613338706427, + "grad_norm": 0.05260715261101723, + "learning_rate": 9.42658915929453e-05, + "loss": 0.1786, + "step": 4870 + }, + { + "epoch": 0.9816643159379408, + "grad_norm": 0.0499386303126812, + "learning_rate": 9.425969337270386e-05, + "loss": 0.2155, + "step": 4872 + }, + { + "epoch": 0.9820672980052387, + "grad_norm": 0.03995073586702347, + "learning_rate": 9.425349200832459e-05, + "loss": 0.1629, + "step": 4874 + }, + { + "epoch": 0.9824702800725368, + "grad_norm": 0.04229956492781639, + "learning_rate": 9.424728750024802e-05, + "loss": 0.1997, + "step": 4876 + }, + { + "epoch": 0.9828732621398347, + "grad_norm": 0.04232575744390488, + "learning_rate": 9.424107984891491e-05, + "loss": 0.2051, + "step": 4878 + }, + { + "epoch": 0.9832762442071328, + "grad_norm": 0.05002473667263985, + "learning_rate": 9.423486905476624e-05, + "loss": 0.1571, + "step": 4880 + }, + { + "epoch": 0.9836792262744308, + "grad_norm": 0.039196085184812546, + "learning_rate": 9.422865511824322e-05, + "loss": 0.2116, + "step": 4882 + }, + { + "epoch": 0.9840822083417288, + "grad_norm": 0.0501602478325367, + "learning_rate": 9.422243803978726e-05, + "loss": 0.1995, + "step": 4884 + }, + { + "epoch": 0.9844851904090268, + "grad_norm": 0.0668732151389122, + "learning_rate": 9.421621781984004e-05, + "loss": 0.2035, + "step": 4886 + }, + { + "epoch": 0.9848881724763248, + "grad_norm": 0.0779421254992485, + "learning_rate": 9.42099944588434e-05, + "loss": 0.2211, + "step": 4888 + }, + { + "epoch": 0.9852911545436228, + "grad_norm": 0.052955832332372665, + "learning_rate": 9.420376795723947e-05, + "loss": 0.2195, + "step": 4890 + }, + { + "epoch": 0.9856941366109209, + "grad_norm": 0.05393822491168976, + "learning_rate": 9.419753831547056e-05, + "loss": 0.2528, + "step": 4892 + }, + { + "epoch": 0.9860971186782188, + "grad_norm": 0.05363127589225769, + "learning_rate": 9.419130553397921e-05, + "loss": 0.2517, + "step": 4894 + }, + { + "epoch": 0.9865001007455169, + "grad_norm": 0.05999777838587761, + "learning_rate": 9.418506961320819e-05, + "loss": 0.191, + "step": 4896 + }, + { + "epoch": 0.9869030828128148, + "grad_norm": 0.05673222243785858, + "learning_rate": 9.417883055360048e-05, + "loss": 0.2017, + "step": 4898 + }, + { + "epoch": 0.9873060648801129, + "grad_norm": 0.04215675964951515, + "learning_rate": 9.417258835559931e-05, + "loss": 0.2031, + "step": 4900 + }, + { + "epoch": 0.9877090469474108, + "grad_norm": 0.03936833515763283, + "learning_rate": 9.41663430196481e-05, + "loss": 0.2285, + "step": 4902 + }, + { + "epoch": 0.9881120290147088, + "grad_norm": 0.05112620070576668, + "learning_rate": 9.416009454619053e-05, + "loss": 0.2145, + "step": 4904 + }, + { + "epoch": 0.9885150110820069, + "grad_norm": 0.052898604422807693, + "learning_rate": 9.415384293567045e-05, + "loss": 0.1976, + "step": 4906 + }, + { + "epoch": 0.9889179931493048, + "grad_norm": 0.04791862890124321, + "learning_rate": 9.414758818853198e-05, + "loss": 0.2208, + "step": 4908 + }, + { + "epoch": 0.9893209752166029, + "grad_norm": 0.057992760092020035, + "learning_rate": 9.414133030521946e-05, + "loss": 0.1953, + "step": 4910 + }, + { + "epoch": 0.9897239572839008, + "grad_norm": 0.045697104185819626, + "learning_rate": 9.413506928617744e-05, + "loss": 0.1606, + "step": 4912 + }, + { + "epoch": 0.9901269393511989, + "grad_norm": 0.05711643025279045, + "learning_rate": 9.412880513185065e-05, + "loss": 0.1584, + "step": 4914 + }, + { + "epoch": 0.9905299214184968, + "grad_norm": 0.06118881329894066, + "learning_rate": 9.412253784268414e-05, + "loss": 0.1672, + "step": 4916 + }, + { + "epoch": 0.9909329034857949, + "grad_norm": 0.048959724605083466, + "learning_rate": 9.411626741912309e-05, + "loss": 0.2152, + "step": 4918 + }, + { + "epoch": 0.9913358855530929, + "grad_norm": 0.06793203949928284, + "learning_rate": 9.410999386161297e-05, + "loss": 0.1952, + "step": 4920 + }, + { + "epoch": 0.9917388676203909, + "grad_norm": 0.04434814676642418, + "learning_rate": 9.410371717059943e-05, + "loss": 0.2083, + "step": 4922 + }, + { + "epoch": 0.9921418496876889, + "grad_norm": 0.05386332795023918, + "learning_rate": 9.409743734652834e-05, + "loss": 0.1835, + "step": 4924 + }, + { + "epoch": 0.9925448317549869, + "grad_norm": 0.05638045817613602, + "learning_rate": 9.409115438984584e-05, + "loss": 0.1961, + "step": 4926 + }, + { + "epoch": 0.9929478138222849, + "grad_norm": 0.08104156702756882, + "learning_rate": 9.408486830099824e-05, + "loss": 0.266, + "step": 4928 + }, + { + "epoch": 0.9933507958895829, + "grad_norm": 0.054531458765268326, + "learning_rate": 9.40785790804321e-05, + "loss": 0.2087, + "step": 4930 + }, + { + "epoch": 0.9937537779568809, + "grad_norm": 0.051535993814468384, + "learning_rate": 9.40722867285942e-05, + "loss": 0.2245, + "step": 4932 + }, + { + "epoch": 0.994156760024179, + "grad_norm": 0.06591679900884628, + "learning_rate": 9.406599124593152e-05, + "loss": 0.1696, + "step": 4934 + }, + { + "epoch": 0.9945597420914769, + "grad_norm": 0.04971354827284813, + "learning_rate": 9.405969263289131e-05, + "loss": 0.1651, + "step": 4936 + }, + { + "epoch": 0.994962724158775, + "grad_norm": 0.06742224842309952, + "learning_rate": 9.405339088992099e-05, + "loss": 0.2121, + "step": 4938 + }, + { + "epoch": 0.9953657062260729, + "grad_norm": 0.061622168868780136, + "learning_rate": 9.404708601746823e-05, + "loss": 0.2156, + "step": 4940 + }, + { + "epoch": 0.995768688293371, + "grad_norm": 0.04887613281607628, + "learning_rate": 9.404077801598093e-05, + "loss": 0.1869, + "step": 4942 + }, + { + "epoch": 0.996171670360669, + "grad_norm": 0.051833122968673706, + "learning_rate": 9.403446688590719e-05, + "loss": 0.1906, + "step": 4944 + }, + { + "epoch": 0.996574652427967, + "grad_norm": 0.04879339411854744, + "learning_rate": 9.402815262769536e-05, + "loss": 0.1643, + "step": 4946 + }, + { + "epoch": 0.996977634495265, + "grad_norm": 0.045783065259456635, + "learning_rate": 9.402183524179395e-05, + "loss": 0.2048, + "step": 4948 + }, + { + "epoch": 0.9973806165625629, + "grad_norm": 0.048180919140577316, + "learning_rate": 9.401551472865179e-05, + "loss": 0.1594, + "step": 4950 + }, + { + "epoch": 0.997783598629861, + "grad_norm": 0.04826292395591736, + "learning_rate": 9.400919108871783e-05, + "loss": 0.2254, + "step": 4952 + }, + { + "epoch": 0.9981865806971589, + "grad_norm": 0.0434853732585907, + "learning_rate": 9.400286432244135e-05, + "loss": 0.2108, + "step": 4954 + }, + { + "epoch": 0.998589562764457, + "grad_norm": 0.04653545469045639, + "learning_rate": 9.399653443027175e-05, + "loss": 0.2503, + "step": 4956 + }, + { + "epoch": 0.998992544831755, + "grad_norm": 0.05442295968532562, + "learning_rate": 9.399020141265871e-05, + "loss": 0.2082, + "step": 4958 + }, + { + "epoch": 0.999395526899053, + "grad_norm": 0.05237448215484619, + "learning_rate": 9.39838652700521e-05, + "loss": 0.1517, + "step": 4960 + }, + { + "epoch": 0.999798508966351, + "grad_norm": 0.04914592206478119, + "learning_rate": 9.397752600290205e-05, + "loss": 0.1774, + "step": 4962 + }, + { + "epoch": 1.000201491033649, + "grad_norm": 0.03976700082421303, + "learning_rate": 9.397118361165889e-05, + "loss": 0.1439, + "step": 4964 + }, + { + "epoch": 1.000604473100947, + "grad_norm": 0.048117585480213165, + "learning_rate": 9.396483809677316e-05, + "loss": 0.2467, + "step": 4966 + }, + { + "epoch": 1.001007455168245, + "grad_norm": 0.04981888830661774, + "learning_rate": 9.395848945869564e-05, + "loss": 0.2025, + "step": 4968 + }, + { + "epoch": 1.001410437235543, + "grad_norm": 0.04174968972802162, + "learning_rate": 9.395213769787734e-05, + "loss": 0.2232, + "step": 4970 + }, + { + "epoch": 1.001813419302841, + "grad_norm": 0.05808541178703308, + "learning_rate": 9.394578281476946e-05, + "loss": 0.1699, + "step": 4972 + }, + { + "epoch": 1.0022164013701391, + "grad_norm": 0.059681493788957596, + "learning_rate": 9.393942480982345e-05, + "loss": 0.2036, + "step": 4974 + }, + { + "epoch": 1.002619383437437, + "grad_norm": 0.04494740068912506, + "learning_rate": 9.393306368349099e-05, + "loss": 0.1592, + "step": 4976 + }, + { + "epoch": 1.003022365504735, + "grad_norm": 0.048827823251485825, + "learning_rate": 9.392669943622391e-05, + "loss": 0.2018, + "step": 4978 + }, + { + "epoch": 1.003425347572033, + "grad_norm": 0.05513651296496391, + "learning_rate": 9.39203320684744e-05, + "loss": 0.1701, + "step": 4980 + }, + { + "epoch": 1.003828329639331, + "grad_norm": 0.0650024265050888, + "learning_rate": 9.39139615806947e-05, + "loss": 0.2187, + "step": 4982 + }, + { + "epoch": 1.004231311706629, + "grad_norm": 0.05801888927817345, + "learning_rate": 9.390758797333742e-05, + "loss": 0.2212, + "step": 4984 + }, + { + "epoch": 1.004634293773927, + "grad_norm": 0.05577477440237999, + "learning_rate": 9.39012112468553e-05, + "loss": 0.2248, + "step": 4986 + }, + { + "epoch": 1.0050372758412252, + "grad_norm": 0.053429532796144485, + "learning_rate": 9.389483140170134e-05, + "loss": 0.2302, + "step": 4988 + }, + { + "epoch": 1.005440257908523, + "grad_norm": 0.04849427193403244, + "learning_rate": 9.388844843832878e-05, + "loss": 0.1972, + "step": 4990 + }, + { + "epoch": 1.005843239975821, + "grad_norm": 0.04170486330986023, + "learning_rate": 9.388206235719102e-05, + "loss": 0.2255, + "step": 4992 + }, + { + "epoch": 1.006246222043119, + "grad_norm": 0.0438525527715683, + "learning_rate": 9.387567315874171e-05, + "loss": 0.1649, + "step": 4994 + }, + { + "epoch": 1.0066492041104171, + "grad_norm": 0.041522156447172165, + "learning_rate": 9.386928084343478e-05, + "loss": 0.1447, + "step": 4996 + }, + { + "epoch": 1.007052186177715, + "grad_norm": 0.05541053041815758, + "learning_rate": 9.386288541172428e-05, + "loss": 0.231, + "step": 4998 + }, + { + "epoch": 1.007455168245013, + "grad_norm": 0.047914352267980576, + "learning_rate": 9.385648686406454e-05, + "loss": 0.1826, + "step": 5000 + }, + { + "epoch": 1.0078581503123112, + "grad_norm": 0.05127349868416786, + "learning_rate": 9.385008520091012e-05, + "loss": 0.1679, + "step": 5002 + }, + { + "epoch": 1.0082611323796091, + "grad_norm": 0.06173473596572876, + "learning_rate": 9.384368042271577e-05, + "loss": 0.1756, + "step": 5004 + }, + { + "epoch": 1.008664114446907, + "grad_norm": 0.053565483540296555, + "learning_rate": 9.383727252993649e-05, + "loss": 0.1678, + "step": 5006 + }, + { + "epoch": 1.009067096514205, + "grad_norm": 0.06874987483024597, + "learning_rate": 9.383086152302747e-05, + "loss": 0.1762, + "step": 5008 + }, + { + "epoch": 1.0094700785815032, + "grad_norm": 0.06190743297338486, + "learning_rate": 9.382444740244415e-05, + "loss": 0.2323, + "step": 5010 + }, + { + "epoch": 1.0098730606488011, + "grad_norm": 0.052252788096666336, + "learning_rate": 9.381803016864216e-05, + "loss": 0.1899, + "step": 5012 + }, + { + "epoch": 1.010276042716099, + "grad_norm": 0.05020655691623688, + "learning_rate": 9.38116098220774e-05, + "loss": 0.1517, + "step": 5014 + }, + { + "epoch": 1.0106790247833972, + "grad_norm": 0.05819331109523773, + "learning_rate": 9.380518636320594e-05, + "loss": 0.1582, + "step": 5016 + }, + { + "epoch": 1.0110820068506952, + "grad_norm": 0.04920068383216858, + "learning_rate": 9.37987597924841e-05, + "loss": 0.1482, + "step": 5018 + }, + { + "epoch": 1.011484988917993, + "grad_norm": 0.05984297767281532, + "learning_rate": 9.37923301103684e-05, + "loss": 0.2166, + "step": 5020 + }, + { + "epoch": 1.0118879709852913, + "grad_norm": 0.03330834209918976, + "learning_rate": 9.378589731731561e-05, + "loss": 0.1439, + "step": 5022 + }, + { + "epoch": 1.0122909530525892, + "grad_norm": 0.05390724912285805, + "learning_rate": 9.37794614137827e-05, + "loss": 0.1796, + "step": 5024 + }, + { + "epoch": 1.0126939351198871, + "grad_norm": 0.04868139326572418, + "learning_rate": 9.377302240022687e-05, + "loss": 0.1869, + "step": 5026 + }, + { + "epoch": 1.013096917187185, + "grad_norm": 0.04992436245083809, + "learning_rate": 9.376658027710552e-05, + "loss": 0.2336, + "step": 5028 + }, + { + "epoch": 1.0134998992544832, + "grad_norm": 0.0564495250582695, + "learning_rate": 9.37601350448763e-05, + "loss": 0.165, + "step": 5030 + }, + { + "epoch": 1.0139028813217812, + "grad_norm": 0.06212518364191055, + "learning_rate": 9.375368670399709e-05, + "loss": 0.1943, + "step": 5032 + }, + { + "epoch": 1.0143058633890791, + "grad_norm": 0.06465470790863037, + "learning_rate": 9.374723525492594e-05, + "loss": 0.1914, + "step": 5034 + }, + { + "epoch": 1.0147088454563773, + "grad_norm": 0.0705813467502594, + "learning_rate": 9.374078069812116e-05, + "loss": 0.1924, + "step": 5036 + }, + { + "epoch": 1.0151118275236752, + "grad_norm": 0.06170513108372688, + "learning_rate": 9.373432303404128e-05, + "loss": 0.2444, + "step": 5038 + }, + { + "epoch": 1.0155148095909732, + "grad_norm": 0.04834654927253723, + "learning_rate": 9.372786226314503e-05, + "loss": 0.1862, + "step": 5040 + }, + { + "epoch": 1.0159177916582711, + "grad_norm": 0.04867038130760193, + "learning_rate": 9.372139838589138e-05, + "loss": 0.252, + "step": 5042 + }, + { + "epoch": 1.0163207737255693, + "grad_norm": 0.03534555062651634, + "learning_rate": 9.37149314027395e-05, + "loss": 0.136, + "step": 5044 + }, + { + "epoch": 1.0167237557928672, + "grad_norm": 0.06922519952058792, + "learning_rate": 9.37084613141488e-05, + "loss": 0.176, + "step": 5046 + }, + { + "epoch": 1.0171267378601652, + "grad_norm": 0.044819701462984085, + "learning_rate": 9.370198812057893e-05, + "loss": 0.1751, + "step": 5048 + }, + { + "epoch": 1.0175297199274633, + "grad_norm": 0.049208372831344604, + "learning_rate": 9.36955118224897e-05, + "loss": 0.2234, + "step": 5050 + }, + { + "epoch": 1.0179327019947613, + "grad_norm": 0.048055894672870636, + "learning_rate": 9.368903242034121e-05, + "loss": 0.1297, + "step": 5052 + }, + { + "epoch": 1.0183356840620592, + "grad_norm": 0.04360407590866089, + "learning_rate": 9.368254991459371e-05, + "loss": 0.1929, + "step": 5054 + }, + { + "epoch": 1.0187386661293572, + "grad_norm": 0.06082189083099365, + "learning_rate": 9.367606430570772e-05, + "loss": 0.1798, + "step": 5056 + }, + { + "epoch": 1.0191416481966553, + "grad_norm": 0.062363721430301666, + "learning_rate": 9.366957559414399e-05, + "loss": 0.2117, + "step": 5058 + }, + { + "epoch": 1.0195446302639533, + "grad_norm": 0.07241443544626236, + "learning_rate": 9.366308378036344e-05, + "loss": 0.1657, + "step": 5060 + }, + { + "epoch": 1.0199476123312512, + "grad_norm": 0.05443087965250015, + "learning_rate": 9.365658886482725e-05, + "loss": 0.1419, + "step": 5062 + }, + { + "epoch": 1.0203505943985494, + "grad_norm": 0.05649973824620247, + "learning_rate": 9.365009084799678e-05, + "loss": 0.1858, + "step": 5064 + }, + { + "epoch": 1.0207535764658473, + "grad_norm": 0.05326389893889427, + "learning_rate": 9.36435897303337e-05, + "loss": 0.1853, + "step": 5066 + }, + { + "epoch": 1.0211565585331452, + "grad_norm": 0.05806842818856239, + "learning_rate": 9.363708551229978e-05, + "loss": 0.2173, + "step": 5068 + }, + { + "epoch": 1.0215595406004432, + "grad_norm": 0.0407080352306366, + "learning_rate": 9.36305781943571e-05, + "loss": 0.1853, + "step": 5070 + }, + { + "epoch": 1.0219625226677413, + "grad_norm": 0.07433430105447769, + "learning_rate": 9.362406777696793e-05, + "loss": 0.2404, + "step": 5072 + }, + { + "epoch": 1.0223655047350393, + "grad_norm": 0.04541980102658272, + "learning_rate": 9.361755426059473e-05, + "loss": 0.1299, + "step": 5074 + }, + { + "epoch": 1.0227684868023372, + "grad_norm": 0.057482391595840454, + "learning_rate": 9.361103764570025e-05, + "loss": 0.1706, + "step": 5076 + }, + { + "epoch": 1.0231714688696354, + "grad_norm": 0.06770235300064087, + "learning_rate": 9.36045179327474e-05, + "loss": 0.2241, + "step": 5078 + }, + { + "epoch": 1.0235744509369333, + "grad_norm": 0.058020222932100296, + "learning_rate": 9.359799512219932e-05, + "loss": 0.1901, + "step": 5080 + }, + { + "epoch": 1.0239774330042313, + "grad_norm": 0.0436263270676136, + "learning_rate": 9.35914692145194e-05, + "loss": 0.1821, + "step": 5082 + }, + { + "epoch": 1.0243804150715292, + "grad_norm": 0.0542580746114254, + "learning_rate": 9.358494021017121e-05, + "loss": 0.2041, + "step": 5084 + }, + { + "epoch": 1.0247833971388274, + "grad_norm": 0.05703788995742798, + "learning_rate": 9.35784081096186e-05, + "loss": 0.2051, + "step": 5086 + }, + { + "epoch": 1.0251863792061253, + "grad_norm": 0.035361260175704956, + "learning_rate": 9.357187291332554e-05, + "loss": 0.1488, + "step": 5088 + }, + { + "epoch": 1.0255893612734233, + "grad_norm": 0.05394968390464783, + "learning_rate": 9.356533462175632e-05, + "loss": 0.1925, + "step": 5090 + }, + { + "epoch": 1.0259923433407214, + "grad_norm": 0.056257717311382294, + "learning_rate": 9.35587932353754e-05, + "loss": 0.2336, + "step": 5092 + }, + { + "epoch": 1.0263953254080194, + "grad_norm": 0.05867587774991989, + "learning_rate": 9.355224875464748e-05, + "loss": 0.2295, + "step": 5094 + }, + { + "epoch": 1.0267983074753173, + "grad_norm": 0.05022086203098297, + "learning_rate": 9.354570118003745e-05, + "loss": 0.2216, + "step": 5096 + }, + { + "epoch": 1.0272012895426152, + "grad_norm": 0.045713771134614944, + "learning_rate": 9.353915051201046e-05, + "loss": 0.1787, + "step": 5098 + }, + { + "epoch": 1.0276042716099134, + "grad_norm": 0.043041035532951355, + "learning_rate": 9.353259675103185e-05, + "loss": 0.1466, + "step": 5100 + }, + { + "epoch": 1.0280072536772114, + "grad_norm": 0.053802333772182465, + "learning_rate": 9.352603989756717e-05, + "loss": 0.1919, + "step": 5102 + }, + { + "epoch": 1.0284102357445093, + "grad_norm": 0.049435701221227646, + "learning_rate": 9.351947995208224e-05, + "loss": 0.19, + "step": 5104 + }, + { + "epoch": 1.0288132178118075, + "grad_norm": 0.049988001585006714, + "learning_rate": 9.351291691504305e-05, + "loss": 0.2154, + "step": 5106 + }, + { + "epoch": 1.0292161998791054, + "grad_norm": 0.054042112082242966, + "learning_rate": 9.350635078691583e-05, + "loss": 0.153, + "step": 5108 + }, + { + "epoch": 1.0296191819464033, + "grad_norm": 0.04441095143556595, + "learning_rate": 9.349978156816702e-05, + "loss": 0.1965, + "step": 5110 + }, + { + "epoch": 1.0300221640137015, + "grad_norm": 0.04497204348444939, + "learning_rate": 9.34932092592633e-05, + "loss": 0.2281, + "step": 5112 + }, + { + "epoch": 1.0304251460809994, + "grad_norm": 0.0403619110584259, + "learning_rate": 9.348663386067156e-05, + "loss": 0.1855, + "step": 5114 + }, + { + "epoch": 1.0308281281482974, + "grad_norm": 0.05032897740602493, + "learning_rate": 9.348005537285889e-05, + "loss": 0.2216, + "step": 5116 + }, + { + "epoch": 1.0312311102155953, + "grad_norm": 0.058560241013765335, + "learning_rate": 9.347347379629262e-05, + "loss": 0.2346, + "step": 5118 + }, + { + "epoch": 1.0316340922828935, + "grad_norm": 0.04787188768386841, + "learning_rate": 9.346688913144031e-05, + "loss": 0.1653, + "step": 5120 + }, + { + "epoch": 1.0320370743501914, + "grad_norm": 0.07317479699850082, + "learning_rate": 9.34603013787697e-05, + "loss": 0.1305, + "step": 5122 + }, + { + "epoch": 1.0324400564174894, + "grad_norm": 0.07495363801717758, + "learning_rate": 9.345371053874878e-05, + "loss": 0.1945, + "step": 5124 + }, + { + "epoch": 1.0328430384847875, + "grad_norm": 0.05415000021457672, + "learning_rate": 9.344711661184575e-05, + "loss": 0.1667, + "step": 5126 + }, + { + "epoch": 1.0332460205520855, + "grad_norm": 0.05984153226017952, + "learning_rate": 9.344051959852907e-05, + "loss": 0.2495, + "step": 5128 + }, + { + "epoch": 1.0336490026193834, + "grad_norm": 0.055079031735658646, + "learning_rate": 9.343391949926732e-05, + "loss": 0.2101, + "step": 5130 + }, + { + "epoch": 1.0340519846866814, + "grad_norm": 0.05712849646806717, + "learning_rate": 9.342731631452942e-05, + "loss": 0.2127, + "step": 5132 + }, + { + "epoch": 1.0344549667539795, + "grad_norm": 0.04696709290146828, + "learning_rate": 9.342071004478439e-05, + "loss": 0.2077, + "step": 5134 + }, + { + "epoch": 1.0348579488212775, + "grad_norm": 0.05382629111409187, + "learning_rate": 9.341410069050159e-05, + "loss": 0.2282, + "step": 5136 + }, + { + "epoch": 1.0352609308885754, + "grad_norm": 0.05978304147720337, + "learning_rate": 9.340748825215047e-05, + "loss": 0.1724, + "step": 5138 + }, + { + "epoch": 1.0356639129558736, + "grad_norm": 0.0483553521335125, + "learning_rate": 9.340087273020084e-05, + "loss": 0.1775, + "step": 5140 + }, + { + "epoch": 1.0360668950231715, + "grad_norm": 0.06312581151723862, + "learning_rate": 9.339425412512259e-05, + "loss": 0.195, + "step": 5142 + }, + { + "epoch": 1.0364698770904694, + "grad_norm": 0.049008119851350784, + "learning_rate": 9.338763243738595e-05, + "loss": 0.1825, + "step": 5144 + }, + { + "epoch": 1.0368728591577674, + "grad_norm": 0.12464253604412079, + "learning_rate": 9.338100766746129e-05, + "loss": 0.1478, + "step": 5146 + }, + { + "epoch": 1.0372758412250656, + "grad_norm": 0.04872802644968033, + "learning_rate": 9.337437981581921e-05, + "loss": 0.2437, + "step": 5148 + }, + { + "epoch": 1.0376788232923635, + "grad_norm": 0.06108350679278374, + "learning_rate": 9.336774888293056e-05, + "loss": 0.1857, + "step": 5150 + }, + { + "epoch": 1.0380818053596614, + "grad_norm": 0.06063728407025337, + "learning_rate": 9.336111486926639e-05, + "loss": 0.1936, + "step": 5152 + }, + { + "epoch": 1.0384847874269596, + "grad_norm": 0.06168040260672569, + "learning_rate": 9.335447777529795e-05, + "loss": 0.205, + "step": 5154 + }, + { + "epoch": 1.0388877694942575, + "grad_norm": 0.058670446276664734, + "learning_rate": 9.334783760149677e-05, + "loss": 0.1868, + "step": 5156 + }, + { + "epoch": 1.0392907515615555, + "grad_norm": 0.0711679607629776, + "learning_rate": 9.334119434833452e-05, + "loss": 0.2227, + "step": 5158 + }, + { + "epoch": 1.0396937336288534, + "grad_norm": 0.05397256091237068, + "learning_rate": 9.333454801628313e-05, + "loss": 0.1852, + "step": 5160 + }, + { + "epoch": 1.0400967156961516, + "grad_norm": 0.06917382031679153, + "learning_rate": 9.332789860581475e-05, + "loss": 0.2189, + "step": 5162 + }, + { + "epoch": 1.0404996977634495, + "grad_norm": 0.05291181057691574, + "learning_rate": 9.332124611740176e-05, + "loss": 0.2203, + "step": 5164 + }, + { + "epoch": 1.0409026798307475, + "grad_norm": 0.046576354652643204, + "learning_rate": 9.331459055151673e-05, + "loss": 0.16, + "step": 5166 + }, + { + "epoch": 1.0413056618980456, + "grad_norm": 0.04400373622775078, + "learning_rate": 9.330793190863244e-05, + "loss": 0.1689, + "step": 5168 + }, + { + "epoch": 1.0417086439653436, + "grad_norm": 0.055641304701566696, + "learning_rate": 9.330127018922194e-05, + "loss": 0.1461, + "step": 5170 + }, + { + "epoch": 1.0421116260326415, + "grad_norm": 0.05043255165219307, + "learning_rate": 9.329460539375844e-05, + "loss": 0.2048, + "step": 5172 + }, + { + "epoch": 1.0425146080999395, + "grad_norm": 0.2573598325252533, + "learning_rate": 9.328793752271543e-05, + "loss": 0.1971, + "step": 5174 + }, + { + "epoch": 1.0429175901672376, + "grad_norm": 0.06544458121061325, + "learning_rate": 9.328126657656657e-05, + "loss": 0.2092, + "step": 5176 + }, + { + "epoch": 1.0433205722345356, + "grad_norm": 0.07152343541383743, + "learning_rate": 9.327459255578574e-05, + "loss": 0.2437, + "step": 5178 + }, + { + "epoch": 1.0437235543018335, + "grad_norm": 0.07631577551364899, + "learning_rate": 9.326791546084706e-05, + "loss": 0.199, + "step": 5180 + }, + { + "epoch": 1.0441265363691317, + "grad_norm": 0.060583699494600296, + "learning_rate": 9.326123529222489e-05, + "loss": 0.1864, + "step": 5182 + }, + { + "epoch": 1.0445295184364296, + "grad_norm": 0.04643869027495384, + "learning_rate": 9.325455205039372e-05, + "loss": 0.1652, + "step": 5184 + }, + { + "epoch": 1.0449325005037275, + "grad_norm": 0.2666691839694977, + "learning_rate": 9.324786573582836e-05, + "loss": 0.1951, + "step": 5186 + }, + { + "epoch": 1.0453354825710255, + "grad_norm": 0.060796648263931274, + "learning_rate": 9.324117634900378e-05, + "loss": 0.224, + "step": 5188 + }, + { + "epoch": 1.0457384646383237, + "grad_norm": 0.05381575971841812, + "learning_rate": 9.323448389039517e-05, + "loss": 0.2816, + "step": 5190 + }, + { + "epoch": 1.0461414467056216, + "grad_norm": 0.06347894668579102, + "learning_rate": 9.322778836047798e-05, + "loss": 0.2011, + "step": 5192 + }, + { + "epoch": 1.0465444287729195, + "grad_norm": 0.047166090458631516, + "learning_rate": 9.322108975972786e-05, + "loss": 0.2097, + "step": 5194 + }, + { + "epoch": 1.0469474108402177, + "grad_norm": 0.047421008348464966, + "learning_rate": 9.321438808862061e-05, + "loss": 0.2342, + "step": 5196 + }, + { + "epoch": 1.0473503929075156, + "grad_norm": 0.0566796138882637, + "learning_rate": 9.320768334763236e-05, + "loss": 0.2047, + "step": 5198 + }, + { + "epoch": 1.0477533749748136, + "grad_norm": 0.05123184621334076, + "learning_rate": 9.320097553723938e-05, + "loss": 0.2139, + "step": 5200 + }, + { + "epoch": 1.0481563570421115, + "grad_norm": 0.04648276045918465, + "learning_rate": 9.319426465791821e-05, + "loss": 0.2232, + "step": 5202 + }, + { + "epoch": 1.0485593391094097, + "grad_norm": 0.061085715889930725, + "learning_rate": 9.318755071014554e-05, + "loss": 0.1368, + "step": 5204 + }, + { + "epoch": 1.0489623211767076, + "grad_norm": 0.06661339849233627, + "learning_rate": 9.318083369439833e-05, + "loss": 0.1924, + "step": 5206 + }, + { + "epoch": 1.0493653032440056, + "grad_norm": 0.043311674147844315, + "learning_rate": 9.317411361115376e-05, + "loss": 0.196, + "step": 5208 + }, + { + "epoch": 1.0497682853113037, + "grad_norm": 0.07116144895553589, + "learning_rate": 9.31673904608892e-05, + "loss": 0.1658, + "step": 5210 + }, + { + "epoch": 1.0501712673786017, + "grad_norm": 0.054282158613204956, + "learning_rate": 9.316066424408225e-05, + "loss": 0.1705, + "step": 5212 + }, + { + "epoch": 1.0505742494458996, + "grad_norm": 0.057639311999082565, + "learning_rate": 9.315393496121075e-05, + "loss": 0.2219, + "step": 5214 + }, + { + "epoch": 1.0509772315131976, + "grad_norm": 0.06315790861845016, + "learning_rate": 9.314720261275273e-05, + "loss": 0.2211, + "step": 5216 + }, + { + "epoch": 1.0513802135804957, + "grad_norm": 0.07036170363426208, + "learning_rate": 9.314046719918644e-05, + "loss": 0.1876, + "step": 5218 + }, + { + "epoch": 1.0517831956477937, + "grad_norm": 0.24687032401561737, + "learning_rate": 9.313372872099033e-05, + "loss": 0.1613, + "step": 5220 + }, + { + "epoch": 1.0521861777150916, + "grad_norm": 0.05008988082408905, + "learning_rate": 9.312698717864314e-05, + "loss": 0.2066, + "step": 5222 + }, + { + "epoch": 1.0525891597823898, + "grad_norm": 0.06380771845579147, + "learning_rate": 9.312024257262373e-05, + "loss": 0.1926, + "step": 5224 + }, + { + "epoch": 1.0529921418496877, + "grad_norm": 0.06707505881786346, + "learning_rate": 9.311349490341126e-05, + "loss": 0.229, + "step": 5226 + }, + { + "epoch": 1.0533951239169856, + "grad_norm": 0.1017342135310173, + "learning_rate": 9.310674417148507e-05, + "loss": 0.2125, + "step": 5228 + }, + { + "epoch": 1.0537981059842838, + "grad_norm": 0.09565824270248413, + "learning_rate": 9.30999903773247e-05, + "loss": 0.2331, + "step": 5230 + }, + { + "epoch": 1.0542010880515817, + "grad_norm": 0.05370059609413147, + "learning_rate": 9.309323352140996e-05, + "loss": 0.1902, + "step": 5232 + }, + { + "epoch": 1.0546040701188797, + "grad_norm": 0.07205013185739517, + "learning_rate": 9.30864736042208e-05, + "loss": 0.2382, + "step": 5234 + }, + { + "epoch": 1.0550070521861776, + "grad_norm": 0.046149007976055145, + "learning_rate": 9.307971062623748e-05, + "loss": 0.1579, + "step": 5236 + }, + { + "epoch": 1.0554100342534758, + "grad_norm": 0.07567006349563599, + "learning_rate": 9.307294458794041e-05, + "loss": 0.1881, + "step": 5238 + }, + { + "epoch": 1.0558130163207737, + "grad_norm": 0.06721212714910507, + "learning_rate": 9.306617548981024e-05, + "loss": 0.1988, + "step": 5240 + }, + { + "epoch": 1.0562159983880717, + "grad_norm": 0.0702115148305893, + "learning_rate": 9.305940333232784e-05, + "loss": 0.1687, + "step": 5242 + }, + { + "epoch": 1.0566189804553698, + "grad_norm": 0.05544662848114967, + "learning_rate": 9.305262811597429e-05, + "loss": 0.1663, + "step": 5244 + }, + { + "epoch": 1.0570219625226678, + "grad_norm": 0.0851738229393959, + "learning_rate": 9.304584984123089e-05, + "loss": 0.2534, + "step": 5246 + }, + { + "epoch": 1.0574249445899657, + "grad_norm": 0.042979102581739426, + "learning_rate": 9.303906850857917e-05, + "loss": 0.145, + "step": 5248 + }, + { + "epoch": 1.0578279266572637, + "grad_norm": 0.07365533709526062, + "learning_rate": 9.303228411850085e-05, + "loss": 0.1765, + "step": 5250 + }, + { + "epoch": 1.0582309087245618, + "grad_norm": 0.07229702174663544, + "learning_rate": 9.302549667147787e-05, + "loss": 0.1969, + "step": 5252 + }, + { + "epoch": 1.0586338907918598, + "grad_norm": 0.11042525619268417, + "learning_rate": 9.301870616799242e-05, + "loss": 0.2076, + "step": 5254 + }, + { + "epoch": 1.0590368728591577, + "grad_norm": 0.06097995117306709, + "learning_rate": 9.301191260852688e-05, + "loss": 0.1772, + "step": 5256 + }, + { + "epoch": 1.0594398549264559, + "grad_norm": 0.08126639574766159, + "learning_rate": 9.300511599356387e-05, + "loss": 0.2103, + "step": 5258 + }, + { + "epoch": 1.0598428369937538, + "grad_norm": 0.06578926742076874, + "learning_rate": 9.29983163235862e-05, + "loss": 0.2008, + "step": 5260 + }, + { + "epoch": 1.0602458190610518, + "grad_norm": 0.05863117054104805, + "learning_rate": 9.299151359907689e-05, + "loss": 0.2324, + "step": 5262 + }, + { + "epoch": 1.0606488011283497, + "grad_norm": 0.06247061491012573, + "learning_rate": 9.29847078205192e-05, + "loss": 0.217, + "step": 5264 + }, + { + "epoch": 1.0610517831956479, + "grad_norm": 0.07186681032180786, + "learning_rate": 9.297789898839662e-05, + "loss": 0.2127, + "step": 5266 + }, + { + "epoch": 1.0614547652629458, + "grad_norm": 0.0571020282804966, + "learning_rate": 9.297108710319285e-05, + "loss": 0.1943, + "step": 5268 + }, + { + "epoch": 1.0618577473302437, + "grad_norm": 0.054367225617170334, + "learning_rate": 9.296427216539175e-05, + "loss": 0.2208, + "step": 5270 + }, + { + "epoch": 1.062260729397542, + "grad_norm": 0.05774039775133133, + "learning_rate": 9.295745417547747e-05, + "loss": 0.2079, + "step": 5272 + }, + { + "epoch": 1.0626637114648398, + "grad_norm": 0.05585348606109619, + "learning_rate": 9.295063313393435e-05, + "loss": 0.2041, + "step": 5274 + }, + { + "epoch": 1.0630666935321378, + "grad_norm": 0.056385934352874756, + "learning_rate": 9.294380904124693e-05, + "loss": 0.1697, + "step": 5276 + }, + { + "epoch": 1.0634696755994357, + "grad_norm": 0.059455644339323044, + "learning_rate": 9.293698189790002e-05, + "loss": 0.2062, + "step": 5278 + }, + { + "epoch": 1.063872657666734, + "grad_norm": 0.05323608219623566, + "learning_rate": 9.293015170437856e-05, + "loss": 0.1897, + "step": 5280 + }, + { + "epoch": 1.0642756397340318, + "grad_norm": 0.04466480761766434, + "learning_rate": 9.292331846116779e-05, + "loss": 0.2029, + "step": 5282 + }, + { + "epoch": 1.0646786218013298, + "grad_norm": 0.05269274860620499, + "learning_rate": 9.29164821687531e-05, + "loss": 0.2499, + "step": 5284 + }, + { + "epoch": 1.065081603868628, + "grad_norm": 0.07230303436517715, + "learning_rate": 9.290964282762018e-05, + "loss": 0.1982, + "step": 5286 + }, + { + "epoch": 1.0654845859359259, + "grad_norm": 0.03910969942808151, + "learning_rate": 9.290280043825486e-05, + "loss": 0.1533, + "step": 5288 + }, + { + "epoch": 1.0658875680032238, + "grad_norm": 0.06969325244426727, + "learning_rate": 9.289595500114319e-05, + "loss": 0.1881, + "step": 5290 + }, + { + "epoch": 1.0662905500705218, + "grad_norm": 0.04106011241674423, + "learning_rate": 9.288910651677149e-05, + "loss": 0.1504, + "step": 5292 + }, + { + "epoch": 1.06669353213782, + "grad_norm": 0.06589590013027191, + "learning_rate": 9.288225498562624e-05, + "loss": 0.2022, + "step": 5294 + }, + { + "epoch": 1.0670965142051179, + "grad_norm": 0.07294216752052307, + "learning_rate": 9.287540040819418e-05, + "loss": 0.2199, + "step": 5296 + }, + { + "epoch": 1.0674994962724158, + "grad_norm": 0.04226839542388916, + "learning_rate": 9.286854278496226e-05, + "loss": 0.1784, + "step": 5298 + }, + { + "epoch": 1.067902478339714, + "grad_norm": 0.06024034321308136, + "learning_rate": 9.286168211641762e-05, + "loss": 0.253, + "step": 5300 + }, + { + "epoch": 1.068305460407012, + "grad_norm": 0.052907638251781464, + "learning_rate": 9.28548184030476e-05, + "loss": 0.1703, + "step": 5302 + }, + { + "epoch": 1.0687084424743098, + "grad_norm": 0.06218164786696434, + "learning_rate": 9.284795164533984e-05, + "loss": 0.2072, + "step": 5304 + }, + { + "epoch": 1.069111424541608, + "grad_norm": 0.05378426983952522, + "learning_rate": 9.284108184378212e-05, + "loss": 0.1843, + "step": 5306 + }, + { + "epoch": 1.069514406608906, + "grad_norm": 0.04734671860933304, + "learning_rate": 9.283420899886245e-05, + "loss": 0.1888, + "step": 5308 + }, + { + "epoch": 1.069917388676204, + "grad_norm": 0.04951245337724686, + "learning_rate": 9.282733311106908e-05, + "loss": 0.228, + "step": 5310 + }, + { + "epoch": 1.0703203707435018, + "grad_norm": 0.06134570389986038, + "learning_rate": 9.282045418089047e-05, + "loss": 0.1987, + "step": 5312 + }, + { + "epoch": 1.0707233528108, + "grad_norm": 0.06411506235599518, + "learning_rate": 9.281357220881526e-05, + "loss": 0.214, + "step": 5314 + }, + { + "epoch": 1.071126334878098, + "grad_norm": 0.058260899037122726, + "learning_rate": 9.280668719533236e-05, + "loss": 0.2182, + "step": 5316 + }, + { + "epoch": 1.0715293169453959, + "grad_norm": 0.04684171453118324, + "learning_rate": 9.279979914093084e-05, + "loss": 0.1945, + "step": 5318 + }, + { + "epoch": 1.071932299012694, + "grad_norm": 0.0659032016992569, + "learning_rate": 9.279290804610005e-05, + "loss": 0.2218, + "step": 5320 + }, + { + "epoch": 1.072335281079992, + "grad_norm": 0.061241116374731064, + "learning_rate": 9.278601391132953e-05, + "loss": 0.2016, + "step": 5322 + }, + { + "epoch": 1.07273826314729, + "grad_norm": 0.056910622864961624, + "learning_rate": 9.2779116737109e-05, + "loss": 0.2354, + "step": 5324 + }, + { + "epoch": 1.0731412452145879, + "grad_norm": 0.04516435042023659, + "learning_rate": 9.277221652392841e-05, + "loss": 0.2081, + "step": 5326 + }, + { + "epoch": 1.073544227281886, + "grad_norm": 0.047263868153095245, + "learning_rate": 9.276531327227798e-05, + "loss": 0.2268, + "step": 5328 + }, + { + "epoch": 1.073947209349184, + "grad_norm": 0.07561453431844711, + "learning_rate": 9.275840698264808e-05, + "loss": 0.2418, + "step": 5330 + }, + { + "epoch": 1.074350191416482, + "grad_norm": 0.04668330028653145, + "learning_rate": 9.275149765552933e-05, + "loss": 0.2266, + "step": 5332 + }, + { + "epoch": 1.07475317348378, + "grad_norm": 0.042202576994895935, + "learning_rate": 9.274458529141256e-05, + "loss": 0.1451, + "step": 5334 + }, + { + "epoch": 1.075156155551078, + "grad_norm": 0.05034751817584038, + "learning_rate": 9.273766989078883e-05, + "loss": 0.2515, + "step": 5336 + }, + { + "epoch": 1.075559137618376, + "grad_norm": 0.054490312933921814, + "learning_rate": 9.273075145414935e-05, + "loss": 0.1836, + "step": 5338 + }, + { + "epoch": 1.075962119685674, + "grad_norm": 0.0491647832095623, + "learning_rate": 9.272382998198563e-05, + "loss": 0.1985, + "step": 5340 + }, + { + "epoch": 1.076365101752972, + "grad_norm": 0.06226971000432968, + "learning_rate": 9.271690547478937e-05, + "loss": 0.2063, + "step": 5342 + }, + { + "epoch": 1.07676808382027, + "grad_norm": 0.043259039521217346, + "learning_rate": 9.270997793305245e-05, + "loss": 0.169, + "step": 5344 + }, + { + "epoch": 1.077171065887568, + "grad_norm": 0.05381862819194794, + "learning_rate": 9.2703047357267e-05, + "loss": 0.1795, + "step": 5346 + }, + { + "epoch": 1.077574047954866, + "grad_norm": 0.05488457530736923, + "learning_rate": 9.269611374792537e-05, + "loss": 0.1761, + "step": 5348 + }, + { + "epoch": 1.077977030022164, + "grad_norm": 0.0528683140873909, + "learning_rate": 9.26891771055201e-05, + "loss": 0.1615, + "step": 5350 + }, + { + "epoch": 1.078380012089462, + "grad_norm": 0.05571812763810158, + "learning_rate": 9.268223743054394e-05, + "loss": 0.1655, + "step": 5352 + }, + { + "epoch": 1.07878299415676, + "grad_norm": 0.0493588000535965, + "learning_rate": 9.267529472348992e-05, + "loss": 0.155, + "step": 5354 + }, + { + "epoch": 1.079185976224058, + "grad_norm": 0.05553653463721275, + "learning_rate": 9.266834898485119e-05, + "loss": 0.2077, + "step": 5356 + }, + { + "epoch": 1.079588958291356, + "grad_norm": 0.07048819959163666, + "learning_rate": 9.26614002151212e-05, + "loss": 0.1946, + "step": 5358 + }, + { + "epoch": 1.079991940358654, + "grad_norm": 0.04235006496310234, + "learning_rate": 9.265444841479356e-05, + "loss": 0.1742, + "step": 5360 + }, + { + "epoch": 1.0803949224259521, + "grad_norm": 0.044974058866500854, + "learning_rate": 9.264749358436213e-05, + "loss": 0.1744, + "step": 5362 + }, + { + "epoch": 1.08079790449325, + "grad_norm": 0.05903393775224686, + "learning_rate": 9.264053572432094e-05, + "loss": 0.2158, + "step": 5364 + }, + { + "epoch": 1.081200886560548, + "grad_norm": 0.06140587851405144, + "learning_rate": 9.263357483516431e-05, + "loss": 0.1826, + "step": 5366 + }, + { + "epoch": 1.081603868627846, + "grad_norm": 0.044442079961299896, + "learning_rate": 9.262661091738668e-05, + "loss": 0.1985, + "step": 5368 + }, + { + "epoch": 1.0820068506951441, + "grad_norm": 0.04410601034760475, + "learning_rate": 9.261964397148279e-05, + "loss": 0.1989, + "step": 5370 + }, + { + "epoch": 1.082409832762442, + "grad_norm": 0.05235345661640167, + "learning_rate": 9.261267399794757e-05, + "loss": 0.1553, + "step": 5372 + }, + { + "epoch": 1.08281281482974, + "grad_norm": 0.042015720158815384, + "learning_rate": 9.260570099727612e-05, + "loss": 0.1862, + "step": 5374 + }, + { + "epoch": 1.0832157968970382, + "grad_norm": 0.05750071257352829, + "learning_rate": 9.259872496996382e-05, + "loss": 0.1873, + "step": 5376 + }, + { + "epoch": 1.0836187789643361, + "grad_norm": 0.07005587220191956, + "learning_rate": 9.259174591650621e-05, + "loss": 0.2258, + "step": 5378 + }, + { + "epoch": 1.084021761031634, + "grad_norm": 0.06074369698762894, + "learning_rate": 9.258476383739909e-05, + "loss": 0.1777, + "step": 5380 + }, + { + "epoch": 1.084424743098932, + "grad_norm": 0.048150282353162766, + "learning_rate": 9.257777873313847e-05, + "loss": 0.1437, + "step": 5382 + }, + { + "epoch": 1.0848277251662302, + "grad_norm": 0.04304511845111847, + "learning_rate": 9.257079060422051e-05, + "loss": 0.1864, + "step": 5384 + }, + { + "epoch": 1.085230707233528, + "grad_norm": 0.042921341955661774, + "learning_rate": 9.256379945114168e-05, + "loss": 0.1959, + "step": 5386 + }, + { + "epoch": 1.085633689300826, + "grad_norm": 0.06277068704366684, + "learning_rate": 9.255680527439862e-05, + "loss": 0.2267, + "step": 5388 + }, + { + "epoch": 1.0860366713681242, + "grad_norm": 0.04990841820836067, + "learning_rate": 9.254980807448818e-05, + "loss": 0.209, + "step": 5390 + }, + { + "epoch": 1.0864396534354221, + "grad_norm": 0.061241425573825836, + "learning_rate": 9.25428078519074e-05, + "loss": 0.1987, + "step": 5392 + }, + { + "epoch": 1.08684263550272, + "grad_norm": 0.08670193701982498, + "learning_rate": 9.25358046071536e-05, + "loss": 0.2124, + "step": 5394 + }, + { + "epoch": 1.087245617570018, + "grad_norm": 0.06139161437749863, + "learning_rate": 9.252879834072425e-05, + "loss": 0.1906, + "step": 5396 + }, + { + "epoch": 1.0876485996373162, + "grad_norm": 0.05512767285108566, + "learning_rate": 9.25217890531171e-05, + "loss": 0.1975, + "step": 5398 + }, + { + "epoch": 1.0880515817046141, + "grad_norm": 0.05780330300331116, + "learning_rate": 9.251477674483005e-05, + "loss": 0.1792, + "step": 5400 + }, + { + "epoch": 1.088454563771912, + "grad_norm": 0.044492777436971664, + "learning_rate": 9.250776141636126e-05, + "loss": 0.1552, + "step": 5402 + }, + { + "epoch": 1.0888575458392102, + "grad_norm": 0.044563908129930496, + "learning_rate": 9.250074306820907e-05, + "loss": 0.1773, + "step": 5404 + }, + { + "epoch": 1.0892605279065082, + "grad_norm": 0.042871829122304916, + "learning_rate": 9.249372170087208e-05, + "loss": 0.2186, + "step": 5406 + }, + { + "epoch": 1.0896635099738061, + "grad_norm": 0.05457884073257446, + "learning_rate": 9.248669731484903e-05, + "loss": 0.2273, + "step": 5408 + }, + { + "epoch": 1.090066492041104, + "grad_norm": 0.04954477399587631, + "learning_rate": 9.247966991063897e-05, + "loss": 0.2203, + "step": 5410 + }, + { + "epoch": 1.0904694741084022, + "grad_norm": 0.060218147933483124, + "learning_rate": 9.24726394887411e-05, + "loss": 0.2143, + "step": 5412 + }, + { + "epoch": 1.0908724561757002, + "grad_norm": 0.05448036640882492, + "learning_rate": 9.246560604965483e-05, + "loss": 0.1576, + "step": 5414 + }, + { + "epoch": 1.091275438242998, + "grad_norm": 0.06038915738463402, + "learning_rate": 9.245856959387984e-05, + "loss": 0.1761, + "step": 5416 + }, + { + "epoch": 1.0916784203102963, + "grad_norm": 0.06423316150903702, + "learning_rate": 9.245153012191594e-05, + "loss": 0.2227, + "step": 5418 + }, + { + "epoch": 1.0920814023775942, + "grad_norm": 0.04629231616854668, + "learning_rate": 9.244448763426325e-05, + "loss": 0.1706, + "step": 5420 + }, + { + "epoch": 1.0924843844448922, + "grad_norm": 0.046612486243247986, + "learning_rate": 9.243744213142203e-05, + "loss": 0.1885, + "step": 5422 + }, + { + "epoch": 1.09288736651219, + "grad_norm": 0.04658891260623932, + "learning_rate": 9.24303936138928e-05, + "loss": 0.2422, + "step": 5424 + }, + { + "epoch": 1.0932903485794883, + "grad_norm": 0.049364447593688965, + "learning_rate": 9.242334208217627e-05, + "loss": 0.1901, + "step": 5426 + }, + { + "epoch": 1.0936933306467862, + "grad_norm": 0.08126164227724075, + "learning_rate": 9.241628753677335e-05, + "loss": 0.2406, + "step": 5428 + }, + { + "epoch": 1.0940963127140841, + "grad_norm": 0.044822290539741516, + "learning_rate": 9.240922997818519e-05, + "loss": 0.2249, + "step": 5430 + }, + { + "epoch": 1.0944992947813823, + "grad_norm": 0.0389249213039875, + "learning_rate": 9.240216940691318e-05, + "loss": 0.178, + "step": 5432 + }, + { + "epoch": 1.0949022768486802, + "grad_norm": 0.043713074177503586, + "learning_rate": 9.239510582345885e-05, + "loss": 0.1621, + "step": 5434 + }, + { + "epoch": 1.0953052589159782, + "grad_norm": 0.0456581637263298, + "learning_rate": 9.238803922832402e-05, + "loss": 0.2192, + "step": 5436 + }, + { + "epoch": 1.0957082409832761, + "grad_norm": 0.057899102568626404, + "learning_rate": 9.238096962201066e-05, + "loss": 0.2148, + "step": 5438 + }, + { + "epoch": 1.0961112230505743, + "grad_norm": 0.06534866988658905, + "learning_rate": 9.237389700502099e-05, + "loss": 0.2245, + "step": 5440 + }, + { + "epoch": 1.0965142051178722, + "grad_norm": 0.05663755163550377, + "learning_rate": 9.236682137785746e-05, + "loss": 0.2097, + "step": 5442 + }, + { + "epoch": 1.0969171871851702, + "grad_norm": 0.07649657875299454, + "learning_rate": 9.23597427410227e-05, + "loss": 0.2231, + "step": 5444 + }, + { + "epoch": 1.0973201692524683, + "grad_norm": 0.056587040424346924, + "learning_rate": 9.235266109501955e-05, + "loss": 0.2006, + "step": 5446 + }, + { + "epoch": 1.0977231513197663, + "grad_norm": 0.04581359773874283, + "learning_rate": 9.234557644035108e-05, + "loss": 0.1974, + "step": 5448 + }, + { + "epoch": 1.0981261333870642, + "grad_norm": 0.04404517635703087, + "learning_rate": 9.233848877752058e-05, + "loss": 0.1691, + "step": 5450 + }, + { + "epoch": 1.0985291154543624, + "grad_norm": 0.03833284601569176, + "learning_rate": 9.233139810703156e-05, + "loss": 0.1831, + "step": 5452 + }, + { + "epoch": 1.0989320975216603, + "grad_norm": 0.04854941368103027, + "learning_rate": 9.232430442938771e-05, + "loss": 0.2112, + "step": 5454 + }, + { + "epoch": 1.0993350795889583, + "grad_norm": 0.051708050072193146, + "learning_rate": 9.231720774509297e-05, + "loss": 0.1781, + "step": 5456 + }, + { + "epoch": 1.0997380616562562, + "grad_norm": 0.038904573768377304, + "learning_rate": 9.231010805465145e-05, + "loss": 0.1538, + "step": 5458 + }, + { + "epoch": 1.1001410437235544, + "grad_norm": 0.03842492401599884, + "learning_rate": 9.230300535856755e-05, + "loss": 0.2175, + "step": 5460 + }, + { + "epoch": 1.1005440257908523, + "grad_norm": 0.058032989501953125, + "learning_rate": 9.229589965734577e-05, + "loss": 0.2119, + "step": 5462 + }, + { + "epoch": 1.1009470078581503, + "grad_norm": 0.04689335078001022, + "learning_rate": 9.228879095149094e-05, + "loss": 0.1421, + "step": 5464 + }, + { + "epoch": 1.1013499899254484, + "grad_norm": 0.056845102459192276, + "learning_rate": 9.228167924150803e-05, + "loss": 0.2294, + "step": 5466 + }, + { + "epoch": 1.1017529719927464, + "grad_norm": 0.054818570613861084, + "learning_rate": 9.227456452790224e-05, + "loss": 0.2249, + "step": 5468 + }, + { + "epoch": 1.1021559540600443, + "grad_norm": 0.09339629113674164, + "learning_rate": 9.2267446811179e-05, + "loss": 0.1741, + "step": 5470 + }, + { + "epoch": 1.1025589361273422, + "grad_norm": 0.06134527549147606, + "learning_rate": 9.226032609184394e-05, + "loss": 0.1906, + "step": 5472 + }, + { + "epoch": 1.1029619181946404, + "grad_norm": 0.056619711220264435, + "learning_rate": 9.225320237040289e-05, + "loss": 0.1955, + "step": 5474 + }, + { + "epoch": 1.1033649002619383, + "grad_norm": 0.041108060628175735, + "learning_rate": 9.224607564736192e-05, + "loss": 0.1855, + "step": 5476 + }, + { + "epoch": 1.1037678823292363, + "grad_norm": 0.04002273827791214, + "learning_rate": 9.22389459232273e-05, + "loss": 0.1777, + "step": 5478 + }, + { + "epoch": 1.1041708643965344, + "grad_norm": 0.0521574467420578, + "learning_rate": 9.223181319850551e-05, + "loss": 0.1726, + "step": 5480 + }, + { + "epoch": 1.1045738464638324, + "grad_norm": 0.07421046495437622, + "learning_rate": 9.222467747370325e-05, + "loss": 0.2221, + "step": 5482 + }, + { + "epoch": 1.1049768285311303, + "grad_norm": 0.05241503566503525, + "learning_rate": 9.221753874932743e-05, + "loss": 0.1346, + "step": 5484 + }, + { + "epoch": 1.1053798105984283, + "grad_norm": 0.05384815111756325, + "learning_rate": 9.221039702588519e-05, + "loss": 0.1861, + "step": 5486 + }, + { + "epoch": 1.1057827926657264, + "grad_norm": 0.047820959240198135, + "learning_rate": 9.220325230388382e-05, + "loss": 0.2723, + "step": 5488 + }, + { + "epoch": 1.1061857747330244, + "grad_norm": 0.04170846566557884, + "learning_rate": 9.219610458383092e-05, + "loss": 0.157, + "step": 5490 + }, + { + "epoch": 1.1065887568003223, + "grad_norm": 0.044703803956508636, + "learning_rate": 9.218895386623424e-05, + "loss": 0.2218, + "step": 5492 + }, + { + "epoch": 1.1069917388676205, + "grad_norm": 0.0474412739276886, + "learning_rate": 9.218180015160173e-05, + "loss": 0.1925, + "step": 5494 + }, + { + "epoch": 1.1073947209349184, + "grad_norm": 0.06012911722064018, + "learning_rate": 9.217464344044162e-05, + "loss": 0.1965, + "step": 5496 + }, + { + "epoch": 1.1077977030022164, + "grad_norm": 0.05040358379483223, + "learning_rate": 9.216748373326227e-05, + "loss": 0.2436, + "step": 5498 + }, + { + "epoch": 1.1082006850695145, + "grad_norm": 0.0459962859749794, + "learning_rate": 9.216032103057232e-05, + "loss": 0.1476, + "step": 5500 + }, + { + "epoch": 1.1086036671368125, + "grad_norm": 0.050243549048900604, + "learning_rate": 9.215315533288057e-05, + "loss": 0.2147, + "step": 5502 + }, + { + "epoch": 1.1090066492041104, + "grad_norm": 0.03991679847240448, + "learning_rate": 9.214598664069611e-05, + "loss": 0.1953, + "step": 5504 + }, + { + "epoch": 1.1094096312714083, + "grad_norm": 0.05045531690120697, + "learning_rate": 9.213881495452815e-05, + "loss": 0.2051, + "step": 5506 + }, + { + "epoch": 1.1098126133387065, + "grad_norm": 0.057895347476005554, + "learning_rate": 9.213164027488617e-05, + "loss": 0.179, + "step": 5508 + }, + { + "epoch": 1.1102155954060045, + "grad_norm": 0.047821614891290665, + "learning_rate": 9.212446260227982e-05, + "loss": 0.2077, + "step": 5510 + }, + { + "epoch": 1.1106185774733024, + "grad_norm": 0.051236581057310104, + "learning_rate": 9.211728193721904e-05, + "loss": 0.1889, + "step": 5512 + }, + { + "epoch": 1.1110215595406006, + "grad_norm": 0.062080029398202896, + "learning_rate": 9.211009828021391e-05, + "loss": 0.2335, + "step": 5514 + }, + { + "epoch": 1.1114245416078985, + "grad_norm": 0.04640135169029236, + "learning_rate": 9.210291163177474e-05, + "loss": 0.2103, + "step": 5516 + }, + { + "epoch": 1.1118275236751964, + "grad_norm": 0.043231479823589325, + "learning_rate": 9.209572199241206e-05, + "loss": 0.1743, + "step": 5518 + }, + { + "epoch": 1.1122305057424944, + "grad_norm": 0.07690000534057617, + "learning_rate": 9.20885293626366e-05, + "loss": 0.2403, + "step": 5520 + }, + { + "epoch": 1.1126334878097925, + "grad_norm": 0.033921536058187485, + "learning_rate": 9.208133374295934e-05, + "loss": 0.1443, + "step": 5522 + }, + { + "epoch": 1.1130364698770905, + "grad_norm": 0.054961346089839935, + "learning_rate": 9.207413513389141e-05, + "loss": 0.1989, + "step": 5524 + }, + { + "epoch": 1.1134394519443884, + "grad_norm": 0.04538585990667343, + "learning_rate": 9.206693353594422e-05, + "loss": 0.1754, + "step": 5526 + }, + { + "epoch": 1.1138424340116866, + "grad_norm": 0.0442589595913887, + "learning_rate": 9.205972894962936e-05, + "loss": 0.2155, + "step": 5528 + }, + { + "epoch": 1.1142454160789845, + "grad_norm": 0.052050501108169556, + "learning_rate": 9.205252137545861e-05, + "loss": 0.2069, + "step": 5530 + }, + { + "epoch": 1.1146483981462825, + "grad_norm": 0.062034666538238525, + "learning_rate": 9.204531081394399e-05, + "loss": 0.2195, + "step": 5532 + }, + { + "epoch": 1.1150513802135804, + "grad_norm": 0.04442548006772995, + "learning_rate": 9.203809726559773e-05, + "loss": 0.1445, + "step": 5534 + }, + { + "epoch": 1.1154543622808786, + "grad_norm": 0.04459670931100845, + "learning_rate": 9.203088073093227e-05, + "loss": 0.1902, + "step": 5536 + }, + { + "epoch": 1.1158573443481765, + "grad_norm": 0.041819144040346146, + "learning_rate": 9.202366121046027e-05, + "loss": 0.2043, + "step": 5538 + }, + { + "epoch": 1.1162603264154745, + "grad_norm": 0.03972846269607544, + "learning_rate": 9.201643870469458e-05, + "loss": 0.18, + "step": 5540 + }, + { + "epoch": 1.1166633084827726, + "grad_norm": 0.05215775966644287, + "learning_rate": 9.200921321414829e-05, + "loss": 0.1896, + "step": 5542 + }, + { + "epoch": 1.1170662905500706, + "grad_norm": 0.05705489218235016, + "learning_rate": 9.200198473933466e-05, + "loss": 0.2474, + "step": 5544 + }, + { + "epoch": 1.1174692726173685, + "grad_norm": 0.0615994967520237, + "learning_rate": 9.19947532807672e-05, + "loss": 0.2095, + "step": 5546 + }, + { + "epoch": 1.1178722546846664, + "grad_norm": 0.048454973846673965, + "learning_rate": 9.198751883895967e-05, + "loss": 0.1614, + "step": 5548 + }, + { + "epoch": 1.1182752367519646, + "grad_norm": 0.07016980648040771, + "learning_rate": 9.198028141442591e-05, + "loss": 0.1869, + "step": 5550 + }, + { + "epoch": 1.1186782188192625, + "grad_norm": 0.04643561318516731, + "learning_rate": 9.19730410076801e-05, + "loss": 0.2311, + "step": 5552 + }, + { + "epoch": 1.1190812008865605, + "grad_norm": 0.0610547810792923, + "learning_rate": 9.19657976192366e-05, + "loss": 0.2155, + "step": 5554 + }, + { + "epoch": 1.1194841829538587, + "grad_norm": 0.04994925484061241, + "learning_rate": 9.195855124960995e-05, + "loss": 0.249, + "step": 5556 + }, + { + "epoch": 1.1198871650211566, + "grad_norm": 0.055203359574079514, + "learning_rate": 9.19513018993149e-05, + "loss": 0.2251, + "step": 5558 + }, + { + "epoch": 1.1202901470884545, + "grad_norm": 0.05273060500621796, + "learning_rate": 9.194404956886648e-05, + "loss": 0.1437, + "step": 5560 + }, + { + "epoch": 1.1206931291557525, + "grad_norm": 0.04343268647789955, + "learning_rate": 9.193679425877983e-05, + "loss": 0.1886, + "step": 5562 + }, + { + "epoch": 1.1210961112230506, + "grad_norm": 0.04952103644609451, + "learning_rate": 9.192953596957041e-05, + "loss": 0.1578, + "step": 5564 + }, + { + "epoch": 1.1214990932903486, + "grad_norm": 0.06240951642394066, + "learning_rate": 9.192227470175381e-05, + "loss": 0.1606, + "step": 5566 + }, + { + "epoch": 1.1219020753576465, + "grad_norm": 0.04541534557938576, + "learning_rate": 9.191501045584586e-05, + "loss": 0.2096, + "step": 5568 + }, + { + "epoch": 1.1223050574249447, + "grad_norm": 0.04394442215561867, + "learning_rate": 9.190774323236258e-05, + "loss": 0.2232, + "step": 5570 + }, + { + "epoch": 1.1227080394922426, + "grad_norm": 0.06251212954521179, + "learning_rate": 9.190047303182025e-05, + "loss": 0.1937, + "step": 5572 + }, + { + "epoch": 1.1231110215595406, + "grad_norm": 0.04978261888027191, + "learning_rate": 9.189319985473532e-05, + "loss": 0.2086, + "step": 5574 + }, + { + "epoch": 1.1235140036268385, + "grad_norm": 0.042393967509269714, + "learning_rate": 9.18859237016245e-05, + "loss": 0.225, + "step": 5576 + }, + { + "epoch": 1.1239169856941367, + "grad_norm": 0.04710579290986061, + "learning_rate": 9.187864457300461e-05, + "loss": 0.2149, + "step": 5578 + }, + { + "epoch": 1.1243199677614346, + "grad_norm": 0.05299391224980354, + "learning_rate": 9.187136246939281e-05, + "loss": 0.221, + "step": 5580 + }, + { + "epoch": 1.1247229498287326, + "grad_norm": 0.08004529774188995, + "learning_rate": 9.186407739130638e-05, + "loss": 0.1848, + "step": 5582 + }, + { + "epoch": 1.1251259318960307, + "grad_norm": 0.05207962915301323, + "learning_rate": 9.185678933926284e-05, + "loss": 0.2194, + "step": 5584 + }, + { + "epoch": 1.1255289139633287, + "grad_norm": 0.05663863569498062, + "learning_rate": 9.184949831377992e-05, + "loss": 0.1864, + "step": 5586 + }, + { + "epoch": 1.1259318960306266, + "grad_norm": 0.06725175678730011, + "learning_rate": 9.184220431537558e-05, + "loss": 0.1973, + "step": 5588 + }, + { + "epoch": 1.1263348780979245, + "grad_norm": 0.050569623708724976, + "learning_rate": 9.183490734456794e-05, + "loss": 0.1651, + "step": 5590 + }, + { + "epoch": 1.1267378601652227, + "grad_norm": 0.05040304362773895, + "learning_rate": 9.182760740187542e-05, + "loss": 0.2196, + "step": 5592 + }, + { + "epoch": 1.1271408422325206, + "grad_norm": 0.0837257131934166, + "learning_rate": 9.182030448781654e-05, + "loss": 0.2125, + "step": 5594 + }, + { + "epoch": 1.1275438242998186, + "grad_norm": 0.056179650127887726, + "learning_rate": 9.181299860291011e-05, + "loss": 0.1813, + "step": 5596 + }, + { + "epoch": 1.1279468063671167, + "grad_norm": 0.059334564954042435, + "learning_rate": 9.180568974767513e-05, + "loss": 0.167, + "step": 5598 + }, + { + "epoch": 1.1283497884344147, + "grad_norm": 0.050333570688962936, + "learning_rate": 9.179837792263082e-05, + "loss": 0.2414, + "step": 5600 + }, + { + "epoch": 1.1287527705017126, + "grad_norm": 0.05855239927768707, + "learning_rate": 9.179106312829659e-05, + "loss": 0.2002, + "step": 5602 + }, + { + "epoch": 1.1291557525690106, + "grad_norm": 0.04838457331061363, + "learning_rate": 9.178374536519206e-05, + "loss": 0.2037, + "step": 5604 + }, + { + "epoch": 1.1295587346363087, + "grad_norm": 0.04486094042658806, + "learning_rate": 9.177642463383708e-05, + "loss": 0.1879, + "step": 5606 + }, + { + "epoch": 1.1299617167036067, + "grad_norm": 0.044857099652290344, + "learning_rate": 9.176910093475172e-05, + "loss": 0.1604, + "step": 5608 + }, + { + "epoch": 1.1303646987709046, + "grad_norm": 0.04508034139871597, + "learning_rate": 9.176177426845623e-05, + "loss": 0.1911, + "step": 5610 + }, + { + "epoch": 1.1307676808382028, + "grad_norm": 0.0480637326836586, + "learning_rate": 9.175444463547108e-05, + "loss": 0.2079, + "step": 5612 + }, + { + "epoch": 1.1311706629055007, + "grad_norm": 0.0715208500623703, + "learning_rate": 9.174711203631694e-05, + "loss": 0.1998, + "step": 5614 + }, + { + "epoch": 1.1315736449727987, + "grad_norm": 0.04499208182096481, + "learning_rate": 9.173977647151475e-05, + "loss": 0.1715, + "step": 5616 + }, + { + "epoch": 1.1319766270400966, + "grad_norm": 0.0567949116230011, + "learning_rate": 9.173243794158557e-05, + "loss": 0.1727, + "step": 5618 + }, + { + "epoch": 1.1323796091073948, + "grad_norm": 0.04465009644627571, + "learning_rate": 9.172509644705077e-05, + "loss": 0.2048, + "step": 5620 + }, + { + "epoch": 1.1327825911746927, + "grad_norm": 0.05172666534781456, + "learning_rate": 9.171775198843183e-05, + "loss": 0.1939, + "step": 5622 + }, + { + "epoch": 1.1331855732419907, + "grad_norm": 0.049424417316913605, + "learning_rate": 9.17104045662505e-05, + "loss": 0.1683, + "step": 5624 + }, + { + "epoch": 1.1335885553092888, + "grad_norm": 0.04293535649776459, + "learning_rate": 9.170305418102874e-05, + "loss": 0.2179, + "step": 5626 + }, + { + "epoch": 1.1339915373765868, + "grad_norm": 0.054131362587213516, + "learning_rate": 9.169570083328871e-05, + "loss": 0.175, + "step": 5628 + }, + { + "epoch": 1.1343945194438847, + "grad_norm": 0.054785408079624176, + "learning_rate": 9.168834452355277e-05, + "loss": 0.218, + "step": 5630 + }, + { + "epoch": 1.1347975015111826, + "grad_norm": 0.050389211624860764, + "learning_rate": 9.168098525234351e-05, + "loss": 0.2058, + "step": 5632 + }, + { + "epoch": 1.1352004835784808, + "grad_norm": 0.043904174119234085, + "learning_rate": 9.167362302018372e-05, + "loss": 0.1675, + "step": 5634 + }, + { + "epoch": 1.1356034656457787, + "grad_norm": 0.044593095779418945, + "learning_rate": 9.166625782759639e-05, + "loss": 0.1556, + "step": 5636 + }, + { + "epoch": 1.1360064477130767, + "grad_norm": 0.07273683696985245, + "learning_rate": 9.165888967510474e-05, + "loss": 0.2114, + "step": 5638 + }, + { + "epoch": 1.1364094297803748, + "grad_norm": 0.05269932374358177, + "learning_rate": 9.16515185632322e-05, + "loss": 0.2006, + "step": 5640 + }, + { + "epoch": 1.1368124118476728, + "grad_norm": 0.0487261600792408, + "learning_rate": 9.164414449250239e-05, + "loss": 0.2053, + "step": 5642 + }, + { + "epoch": 1.1372153939149707, + "grad_norm": 0.057701513171195984, + "learning_rate": 9.163676746343914e-05, + "loss": 0.2576, + "step": 5644 + }, + { + "epoch": 1.1376183759822687, + "grad_norm": 0.056653860956430435, + "learning_rate": 9.162938747656652e-05, + "loss": 0.2186, + "step": 5646 + }, + { + "epoch": 1.1380213580495668, + "grad_norm": 0.04430731385946274, + "learning_rate": 9.162200453240882e-05, + "loss": 0.1844, + "step": 5648 + }, + { + "epoch": 1.1384243401168648, + "grad_norm": 0.03643381595611572, + "learning_rate": 9.161461863149046e-05, + "loss": 0.1805, + "step": 5650 + }, + { + "epoch": 1.1388273221841627, + "grad_norm": 0.05398820340633392, + "learning_rate": 9.160722977433613e-05, + "loss": 0.2303, + "step": 5652 + }, + { + "epoch": 1.1392303042514609, + "grad_norm": 0.07164819538593292, + "learning_rate": 9.159983796147078e-05, + "loss": 0.2121, + "step": 5654 + }, + { + "epoch": 1.1396332863187588, + "grad_norm": 0.046827998012304306, + "learning_rate": 9.159244319341944e-05, + "loss": 0.2502, + "step": 5656 + }, + { + "epoch": 1.1400362683860568, + "grad_norm": 0.04478400945663452, + "learning_rate": 9.158504547070745e-05, + "loss": 0.2045, + "step": 5658 + }, + { + "epoch": 1.1404392504533547, + "grad_norm": 0.041052401065826416, + "learning_rate": 9.157764479386035e-05, + "loss": 0.1818, + "step": 5660 + }, + { + "epoch": 1.1408422325206529, + "grad_norm": 0.06413646787405014, + "learning_rate": 9.157024116340384e-05, + "loss": 0.2311, + "step": 5662 + }, + { + "epoch": 1.1412452145879508, + "grad_norm": 0.052323296666145325, + "learning_rate": 9.15628345798639e-05, + "loss": 0.1733, + "step": 5664 + }, + { + "epoch": 1.1416481966552487, + "grad_norm": 0.08062373101711273, + "learning_rate": 9.155542504376664e-05, + "loss": 0.1951, + "step": 5666 + }, + { + "epoch": 1.142051178722547, + "grad_norm": 0.04671994969248772, + "learning_rate": 9.154801255563845e-05, + "loss": 0.1823, + "step": 5668 + }, + { + "epoch": 1.1424541607898449, + "grad_norm": 0.04927929863333702, + "learning_rate": 9.154059711600591e-05, + "loss": 0.184, + "step": 5670 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.04393012821674347, + "learning_rate": 9.153317872539578e-05, + "loss": 0.1605, + "step": 5672 + }, + { + "epoch": 1.143260124924441, + "grad_norm": 0.05194476246833801, + "learning_rate": 9.152575738433505e-05, + "loss": 0.2001, + "step": 5674 + }, + { + "epoch": 1.143663106991739, + "grad_norm": 0.05223282799124718, + "learning_rate": 9.151833309335092e-05, + "loss": 0.1982, + "step": 5676 + }, + { + "epoch": 1.1440660890590368, + "grad_norm": 0.08013258129358292, + "learning_rate": 9.151090585297082e-05, + "loss": 0.2021, + "step": 5678 + }, + { + "epoch": 1.144469071126335, + "grad_norm": 0.03999164327979088, + "learning_rate": 9.150347566372234e-05, + "loss": 0.1648, + "step": 5680 + }, + { + "epoch": 1.144872053193633, + "grad_norm": 0.056218814104795456, + "learning_rate": 9.149604252613332e-05, + "loss": 0.1562, + "step": 5682 + }, + { + "epoch": 1.1452750352609309, + "grad_norm": 0.04334927350282669, + "learning_rate": 9.148860644073182e-05, + "loss": 0.2053, + "step": 5684 + }, + { + "epoch": 1.1456780173282288, + "grad_norm": 0.06221528723835945, + "learning_rate": 9.148116740804606e-05, + "loss": 0.2721, + "step": 5686 + }, + { + "epoch": 1.146080999395527, + "grad_norm": 0.04984398931264877, + "learning_rate": 9.147372542860451e-05, + "loss": 0.2652, + "step": 5688 + }, + { + "epoch": 1.146483981462825, + "grad_norm": 0.04859272390604019, + "learning_rate": 9.146628050293584e-05, + "loss": 0.1944, + "step": 5690 + }, + { + "epoch": 1.1468869635301229, + "grad_norm": 0.0628964751958847, + "learning_rate": 9.145883263156891e-05, + "loss": 0.1631, + "step": 5692 + }, + { + "epoch": 1.147289945597421, + "grad_norm": 0.05849664658308029, + "learning_rate": 9.145138181503281e-05, + "loss": 0.1846, + "step": 5694 + }, + { + "epoch": 1.147692927664719, + "grad_norm": 0.04668128490447998, + "learning_rate": 9.144392805385684e-05, + "loss": 0.2517, + "step": 5696 + }, + { + "epoch": 1.148095909732017, + "grad_norm": 0.047532372176647186, + "learning_rate": 9.14364713485705e-05, + "loss": 0.2122, + "step": 5698 + }, + { + "epoch": 1.1484988917993149, + "grad_norm": 0.04797205701470375, + "learning_rate": 9.14290116997035e-05, + "loss": 0.2055, + "step": 5700 + }, + { + "epoch": 1.148901873866613, + "grad_norm": 0.04008246585726738, + "learning_rate": 9.142154910778578e-05, + "loss": 0.2234, + "step": 5702 + }, + { + "epoch": 1.149304855933911, + "grad_norm": 0.04848853126168251, + "learning_rate": 9.141408357334744e-05, + "loss": 0.2334, + "step": 5704 + }, + { + "epoch": 1.149707838001209, + "grad_norm": 0.09612215310335159, + "learning_rate": 9.140661509691885e-05, + "loss": 0.1858, + "step": 5706 + }, + { + "epoch": 1.150110820068507, + "grad_norm": 0.048193544149398804, + "learning_rate": 9.139914367903053e-05, + "loss": 0.183, + "step": 5708 + }, + { + "epoch": 1.150513802135805, + "grad_norm": 0.04777916520833969, + "learning_rate": 9.139166932021326e-05, + "loss": 0.1848, + "step": 5710 + }, + { + "epoch": 1.150916784203103, + "grad_norm": 0.03997446224093437, + "learning_rate": 9.1384192020998e-05, + "loss": 0.1549, + "step": 5712 + }, + { + "epoch": 1.151319766270401, + "grad_norm": 0.053762342780828476, + "learning_rate": 9.137671178191592e-05, + "loss": 0.1862, + "step": 5714 + }, + { + "epoch": 1.151722748337699, + "grad_norm": 0.03450224921107292, + "learning_rate": 9.13692286034984e-05, + "loss": 0.154, + "step": 5716 + }, + { + "epoch": 1.152125730404997, + "grad_norm": 0.05368155613541603, + "learning_rate": 9.136174248627703e-05, + "loss": 0.2018, + "step": 5718 + }, + { + "epoch": 1.152528712472295, + "grad_norm": 0.043950121849775314, + "learning_rate": 9.135425343078364e-05, + "loss": 0.1567, + "step": 5720 + }, + { + "epoch": 1.152931694539593, + "grad_norm": 0.06447681039571762, + "learning_rate": 9.134676143755022e-05, + "loss": 0.1946, + "step": 5722 + }, + { + "epoch": 1.153334676606891, + "grad_norm": 0.051290884613990784, + "learning_rate": 9.133926650710898e-05, + "loss": 0.2268, + "step": 5724 + }, + { + "epoch": 1.153737658674189, + "grad_norm": 0.060127224773168564, + "learning_rate": 9.133176863999238e-05, + "loss": 0.2142, + "step": 5726 + }, + { + "epoch": 1.154140640741487, + "grad_norm": 0.040788453072309494, + "learning_rate": 9.132426783673303e-05, + "loss": 0.1696, + "step": 5728 + }, + { + "epoch": 1.154543622808785, + "grad_norm": 0.05721386522054672, + "learning_rate": 9.131676409786379e-05, + "loss": 0.2706, + "step": 5730 + }, + { + "epoch": 1.154946604876083, + "grad_norm": 0.04448957368731499, + "learning_rate": 9.130925742391767e-05, + "loss": 0.1767, + "step": 5732 + }, + { + "epoch": 1.155349586943381, + "grad_norm": 0.03977445885539055, + "learning_rate": 9.1301747815428e-05, + "loss": 0.1781, + "step": 5734 + }, + { + "epoch": 1.1557525690106791, + "grad_norm": 0.044697824865579605, + "learning_rate": 9.12942352729282e-05, + "loss": 0.1693, + "step": 5736 + }, + { + "epoch": 1.156155551077977, + "grad_norm": 0.06158357486128807, + "learning_rate": 9.128671979695198e-05, + "loss": 0.1666, + "step": 5738 + }, + { + "epoch": 1.156558533145275, + "grad_norm": 0.05893006548285484, + "learning_rate": 9.127920138803321e-05, + "loss": 0.1851, + "step": 5740 + }, + { + "epoch": 1.156961515212573, + "grad_norm": 0.04412064328789711, + "learning_rate": 9.127168004670599e-05, + "loss": 0.1425, + "step": 5742 + }, + { + "epoch": 1.1573644972798711, + "grad_norm": 0.04011744633316994, + "learning_rate": 9.126415577350461e-05, + "loss": 0.2082, + "step": 5744 + }, + { + "epoch": 1.157767479347169, + "grad_norm": 0.05388535559177399, + "learning_rate": 9.125662856896362e-05, + "loss": 0.1851, + "step": 5746 + }, + { + "epoch": 1.158170461414467, + "grad_norm": 0.049892496317625046, + "learning_rate": 9.12490984336177e-05, + "loss": 0.2151, + "step": 5748 + }, + { + "epoch": 1.1585734434817652, + "grad_norm": 0.046070702373981476, + "learning_rate": 9.12415653680018e-05, + "loss": 0.1925, + "step": 5750 + }, + { + "epoch": 1.158976425549063, + "grad_norm": 0.04378350451588631, + "learning_rate": 9.123402937265104e-05, + "loss": 0.2271, + "step": 5752 + }, + { + "epoch": 1.159379407616361, + "grad_norm": 0.04043138027191162, + "learning_rate": 9.12264904481008e-05, + "loss": 0.1627, + "step": 5754 + }, + { + "epoch": 1.159782389683659, + "grad_norm": 0.06803450733423233, + "learning_rate": 9.12189485948866e-05, + "loss": 0.2098, + "step": 5756 + }, + { + "epoch": 1.1601853717509572, + "grad_norm": 0.0573253408074379, + "learning_rate": 9.12114038135442e-05, + "loss": 0.1875, + "step": 5758 + }, + { + "epoch": 1.160588353818255, + "grad_norm": 0.049632567912340164, + "learning_rate": 9.12038561046096e-05, + "loss": 0.2009, + "step": 5760 + }, + { + "epoch": 1.160991335885553, + "grad_norm": 0.04781503975391388, + "learning_rate": 9.119630546861895e-05, + "loss": 0.2163, + "step": 5762 + }, + { + "epoch": 1.1613943179528512, + "grad_norm": 0.04843660071492195, + "learning_rate": 9.118875190610865e-05, + "loss": 0.1457, + "step": 5764 + }, + { + "epoch": 1.1617973000201491, + "grad_norm": 0.06017361208796501, + "learning_rate": 9.118119541761527e-05, + "loss": 0.1663, + "step": 5766 + }, + { + "epoch": 1.162200282087447, + "grad_norm": 0.057248305529356, + "learning_rate": 9.117363600367566e-05, + "loss": 0.2276, + "step": 5768 + }, + { + "epoch": 1.162603264154745, + "grad_norm": 0.07495728135108948, + "learning_rate": 9.116607366482676e-05, + "loss": 0.1746, + "step": 5770 + }, + { + "epoch": 1.1630062462220432, + "grad_norm": 0.061165809631347656, + "learning_rate": 9.115850840160583e-05, + "loss": 0.2502, + "step": 5772 + }, + { + "epoch": 1.1634092282893411, + "grad_norm": 0.042846087366342545, + "learning_rate": 9.11509402145503e-05, + "loss": 0.2142, + "step": 5774 + }, + { + "epoch": 1.163812210356639, + "grad_norm": 0.06059703230857849, + "learning_rate": 9.114336910419779e-05, + "loss": 0.2013, + "step": 5776 + }, + { + "epoch": 1.1642151924239372, + "grad_norm": 0.045994676649570465, + "learning_rate": 9.113579507108612e-05, + "loss": 0.2065, + "step": 5778 + }, + { + "epoch": 1.1646181744912352, + "grad_norm": 0.059071313589811325, + "learning_rate": 9.112821811575336e-05, + "loss": 0.1969, + "step": 5780 + }, + { + "epoch": 1.165021156558533, + "grad_norm": 0.050841737538576126, + "learning_rate": 9.112063823873776e-05, + "loss": 0.1471, + "step": 5782 + }, + { + "epoch": 1.165424138625831, + "grad_norm": 0.056296203285455704, + "learning_rate": 9.11130554405778e-05, + "loss": 0.2354, + "step": 5784 + }, + { + "epoch": 1.1658271206931292, + "grad_norm": 0.040048208087682724, + "learning_rate": 9.110546972181211e-05, + "loss": 0.2454, + "step": 5786 + }, + { + "epoch": 1.1662301027604272, + "grad_norm": 0.046090155839920044, + "learning_rate": 9.109788108297959e-05, + "loss": 0.1594, + "step": 5788 + }, + { + "epoch": 1.166633084827725, + "grad_norm": 0.05453578010201454, + "learning_rate": 9.109028952461934e-05, + "loss": 0.2359, + "step": 5790 + }, + { + "epoch": 1.1670360668950233, + "grad_norm": 0.039822839200496674, + "learning_rate": 9.108269504727063e-05, + "loss": 0.1897, + "step": 5792 + }, + { + "epoch": 1.1674390489623212, + "grad_norm": 0.054770614951848984, + "learning_rate": 9.107509765147294e-05, + "loss": 0.1906, + "step": 5794 + }, + { + "epoch": 1.1678420310296191, + "grad_norm": 0.0513848178088665, + "learning_rate": 9.106749733776605e-05, + "loss": 0.1986, + "step": 5796 + }, + { + "epoch": 1.168245013096917, + "grad_norm": 0.04624282568693161, + "learning_rate": 9.10598941066898e-05, + "loss": 0.2392, + "step": 5798 + }, + { + "epoch": 1.1686479951642152, + "grad_norm": 0.05323868989944458, + "learning_rate": 9.105228795878434e-05, + "loss": 0.2303, + "step": 5800 + }, + { + "epoch": 1.1690509772315132, + "grad_norm": 0.05079076439142227, + "learning_rate": 9.104467889458999e-05, + "loss": 0.192, + "step": 5802 + }, + { + "epoch": 1.1694539592988111, + "grad_norm": 0.04578608646988869, + "learning_rate": 9.10370669146473e-05, + "loss": 0.209, + "step": 5804 + }, + { + "epoch": 1.1698569413661093, + "grad_norm": 0.04013809934258461, + "learning_rate": 9.102945201949701e-05, + "loss": 0.1483, + "step": 5806 + }, + { + "epoch": 1.1702599234334072, + "grad_norm": 0.05745330825448036, + "learning_rate": 9.102183420968006e-05, + "loss": 0.1694, + "step": 5808 + }, + { + "epoch": 1.1706629055007052, + "grad_norm": 0.045204054564237595, + "learning_rate": 9.101421348573763e-05, + "loss": 0.2133, + "step": 5810 + }, + { + "epoch": 1.1710658875680031, + "grad_norm": 0.0566680021584034, + "learning_rate": 9.100658984821105e-05, + "loss": 0.199, + "step": 5812 + }, + { + "epoch": 1.1714688696353013, + "grad_norm": 0.052215367555618286, + "learning_rate": 9.09989632976419e-05, + "loss": 0.1555, + "step": 5814 + }, + { + "epoch": 1.1718718517025992, + "grad_norm": 0.051735419780015945, + "learning_rate": 9.099133383457196e-05, + "loss": 0.2188, + "step": 5816 + }, + { + "epoch": 1.1722748337698972, + "grad_norm": 0.05079289525747299, + "learning_rate": 9.098370145954325e-05, + "loss": 0.2182, + "step": 5818 + }, + { + "epoch": 1.1726778158371953, + "grad_norm": 0.04231395944952965, + "learning_rate": 9.097606617309792e-05, + "loss": 0.1908, + "step": 5820 + }, + { + "epoch": 1.1730807979044933, + "grad_norm": 0.05421389639377594, + "learning_rate": 9.096842797577838e-05, + "loss": 0.2239, + "step": 5822 + }, + { + "epoch": 1.1734837799717912, + "grad_norm": 0.06685738265514374, + "learning_rate": 9.096078686812724e-05, + "loss": 0.1902, + "step": 5824 + }, + { + "epoch": 1.1738867620390891, + "grad_norm": 0.07613111287355423, + "learning_rate": 9.095314285068729e-05, + "loss": 0.1891, + "step": 5826 + }, + { + "epoch": 1.1742897441063873, + "grad_norm": 0.04930075258016586, + "learning_rate": 9.094549592400156e-05, + "loss": 0.2344, + "step": 5828 + }, + { + "epoch": 1.1746927261736853, + "grad_norm": 0.054598815739154816, + "learning_rate": 9.093784608861332e-05, + "loss": 0.1918, + "step": 5830 + }, + { + "epoch": 1.1750957082409832, + "grad_norm": 0.05315352603793144, + "learning_rate": 9.093019334506594e-05, + "loss": 0.189, + "step": 5832 + }, + { + "epoch": 1.1754986903082814, + "grad_norm": 0.0629630908370018, + "learning_rate": 9.092253769390308e-05, + "loss": 0.2278, + "step": 5834 + }, + { + "epoch": 1.1759016723755793, + "grad_norm": 0.04600967466831207, + "learning_rate": 9.09148791356686e-05, + "loss": 0.1794, + "step": 5836 + }, + { + "epoch": 1.1763046544428772, + "grad_norm": 0.049229636788368225, + "learning_rate": 9.090721767090654e-05, + "loss": 0.2453, + "step": 5838 + }, + { + "epoch": 1.1767076365101752, + "grad_norm": 0.04404792562127113, + "learning_rate": 9.089955330016115e-05, + "loss": 0.2238, + "step": 5840 + }, + { + "epoch": 1.1771106185774733, + "grad_norm": 0.061983443796634674, + "learning_rate": 9.089188602397692e-05, + "loss": 0.2076, + "step": 5842 + }, + { + "epoch": 1.1775136006447713, + "grad_norm": 0.06978388875722885, + "learning_rate": 9.088421584289848e-05, + "loss": 0.1874, + "step": 5844 + }, + { + "epoch": 1.1779165827120692, + "grad_norm": 0.04500797390937805, + "learning_rate": 9.087654275747074e-05, + "loss": 0.2087, + "step": 5846 + }, + { + "epoch": 1.1783195647793674, + "grad_norm": 0.05657447502017021, + "learning_rate": 9.086886676823878e-05, + "loss": 0.1595, + "step": 5848 + }, + { + "epoch": 1.1787225468466653, + "grad_norm": 0.0397280678153038, + "learning_rate": 9.086118787574787e-05, + "loss": 0.1718, + "step": 5850 + }, + { + "epoch": 1.1791255289139633, + "grad_norm": 0.0470387302339077, + "learning_rate": 9.085350608054354e-05, + "loss": 0.2014, + "step": 5852 + }, + { + "epoch": 1.1795285109812612, + "grad_norm": 0.0703502893447876, + "learning_rate": 9.084582138317146e-05, + "loss": 0.1792, + "step": 5854 + }, + { + "epoch": 1.1799314930485594, + "grad_norm": 0.03431640937924385, + "learning_rate": 9.083813378417756e-05, + "loss": 0.1386, + "step": 5856 + }, + { + "epoch": 1.1803344751158573, + "grad_norm": 0.05815456807613373, + "learning_rate": 9.083044328410794e-05, + "loss": 0.1662, + "step": 5858 + }, + { + "epoch": 1.1807374571831553, + "grad_norm": 0.05745692178606987, + "learning_rate": 9.082274988350894e-05, + "loss": 0.2128, + "step": 5860 + }, + { + "epoch": 1.1811404392504534, + "grad_norm": 0.049616165459156036, + "learning_rate": 9.081505358292707e-05, + "loss": 0.1829, + "step": 5862 + }, + { + "epoch": 1.1815434213177514, + "grad_norm": 0.06410976499319077, + "learning_rate": 9.080735438290906e-05, + "loss": 0.2012, + "step": 5864 + }, + { + "epoch": 1.1819464033850493, + "grad_norm": 0.06344123184680939, + "learning_rate": 9.079965228400187e-05, + "loss": 0.1708, + "step": 5866 + }, + { + "epoch": 1.1823493854523472, + "grad_norm": 0.04813029244542122, + "learning_rate": 9.079194728675261e-05, + "loss": 0.191, + "step": 5868 + }, + { + "epoch": 1.1827523675196454, + "grad_norm": 0.06375063210725784, + "learning_rate": 9.078423939170868e-05, + "loss": 0.2086, + "step": 5870 + }, + { + "epoch": 1.1831553495869434, + "grad_norm": 0.06699992716312408, + "learning_rate": 9.077652859941759e-05, + "loss": 0.2428, + "step": 5872 + }, + { + "epoch": 1.1835583316542413, + "grad_norm": 0.08780429512262344, + "learning_rate": 9.076881491042711e-05, + "loss": 0.2224, + "step": 5874 + }, + { + "epoch": 1.1839613137215395, + "grad_norm": 0.05629020184278488, + "learning_rate": 9.076109832528523e-05, + "loss": 0.1999, + "step": 5876 + }, + { + "epoch": 1.1843642957888374, + "grad_norm": 0.0460069477558136, + "learning_rate": 9.075337884454012e-05, + "loss": 0.1994, + "step": 5878 + }, + { + "epoch": 1.1847672778561353, + "grad_norm": 0.06257259100675583, + "learning_rate": 9.074565646874014e-05, + "loss": 0.2424, + "step": 5880 + }, + { + "epoch": 1.1851702599234335, + "grad_norm": 0.04899228736758232, + "learning_rate": 9.07379311984339e-05, + "loss": 0.2154, + "step": 5882 + }, + { + "epoch": 1.1855732419907314, + "grad_norm": 0.048573628067970276, + "learning_rate": 9.073020303417017e-05, + "loss": 0.164, + "step": 5884 + }, + { + "epoch": 1.1859762240580294, + "grad_norm": 0.05294910818338394, + "learning_rate": 9.072247197649795e-05, + "loss": 0.2525, + "step": 5886 + }, + { + "epoch": 1.1863792061253275, + "grad_norm": 0.05634959414601326, + "learning_rate": 9.071473802596646e-05, + "loss": 0.1868, + "step": 5888 + }, + { + "epoch": 1.1867821881926255, + "grad_norm": 0.05087854340672493, + "learning_rate": 9.07070011831251e-05, + "loss": 0.2566, + "step": 5890 + }, + { + "epoch": 1.1871851702599234, + "grad_norm": 0.04623530060052872, + "learning_rate": 9.069926144852346e-05, + "loss": 0.2095, + "step": 5892 + }, + { + "epoch": 1.1875881523272214, + "grad_norm": 0.0693693682551384, + "learning_rate": 9.069151882271139e-05, + "loss": 0.2475, + "step": 5894 + }, + { + "epoch": 1.1879911343945195, + "grad_norm": 0.04673139005899429, + "learning_rate": 9.068377330623887e-05, + "loss": 0.2004, + "step": 5896 + }, + { + "epoch": 1.1883941164618175, + "grad_norm": 0.056024160236120224, + "learning_rate": 9.067602489965619e-05, + "loss": 0.1873, + "step": 5898 + }, + { + "epoch": 1.1887970985291154, + "grad_norm": 0.04089471325278282, + "learning_rate": 9.066827360351373e-05, + "loss": 0.1676, + "step": 5900 + }, + { + "epoch": 1.1892000805964136, + "grad_norm": 0.0679912269115448, + "learning_rate": 9.066051941836218e-05, + "loss": 0.189, + "step": 5902 + }, + { + "epoch": 1.1896030626637115, + "grad_norm": 0.04771299287676811, + "learning_rate": 9.065276234475233e-05, + "loss": 0.1804, + "step": 5904 + }, + { + "epoch": 1.1900060447310095, + "grad_norm": 0.04200606793165207, + "learning_rate": 9.064500238323528e-05, + "loss": 0.1495, + "step": 5906 + }, + { + "epoch": 1.1904090267983074, + "grad_norm": 0.05432324483990669, + "learning_rate": 9.063723953436225e-05, + "loss": 0.1868, + "step": 5908 + }, + { + "epoch": 1.1908120088656056, + "grad_norm": 0.05804635211825371, + "learning_rate": 9.062947379868472e-05, + "loss": 0.216, + "step": 5910 + }, + { + "epoch": 1.1912149909329035, + "grad_norm": 0.04404330998659134, + "learning_rate": 9.062170517675434e-05, + "loss": 0.2023, + "step": 5912 + }, + { + "epoch": 1.1916179730002014, + "grad_norm": 0.06136954203248024, + "learning_rate": 9.061393366912298e-05, + "loss": 0.2247, + "step": 5914 + }, + { + "epoch": 1.1920209550674996, + "grad_norm": 0.054422929883003235, + "learning_rate": 9.060615927634275e-05, + "loss": 0.1607, + "step": 5916 + }, + { + "epoch": 1.1924239371347976, + "grad_norm": 0.06473580747842789, + "learning_rate": 9.059838199896588e-05, + "loss": 0.2242, + "step": 5918 + }, + { + "epoch": 1.1928269192020955, + "grad_norm": 0.05070899799466133, + "learning_rate": 9.05906018375449e-05, + "loss": 0.1601, + "step": 5920 + }, + { + "epoch": 1.1932299012693934, + "grad_norm": 0.04901570454239845, + "learning_rate": 9.058281879263247e-05, + "loss": 0.1697, + "step": 5922 + }, + { + "epoch": 1.1936328833366916, + "grad_norm": 0.04864586517214775, + "learning_rate": 9.05750328647815e-05, + "loss": 0.1868, + "step": 5924 + }, + { + "epoch": 1.1940358654039895, + "grad_norm": 0.044245872646570206, + "learning_rate": 9.056724405454509e-05, + "loss": 0.201, + "step": 5926 + }, + { + "epoch": 1.1944388474712875, + "grad_norm": 0.050298575311899185, + "learning_rate": 9.055945236247654e-05, + "loss": 0.1613, + "step": 5928 + }, + { + "epoch": 1.1948418295385856, + "grad_norm": 0.054503366351127625, + "learning_rate": 9.055165778912934e-05, + "loss": 0.2154, + "step": 5930 + }, + { + "epoch": 1.1952448116058836, + "grad_norm": 0.04412766918540001, + "learning_rate": 9.054386033505724e-05, + "loss": 0.1519, + "step": 5932 + }, + { + "epoch": 1.1956477936731815, + "grad_norm": 0.03437204286456108, + "learning_rate": 9.053606000081413e-05, + "loss": 0.1507, + "step": 5934 + }, + { + "epoch": 1.1960507757404795, + "grad_norm": 0.05740992724895477, + "learning_rate": 9.052825678695417e-05, + "loss": 0.1893, + "step": 5936 + }, + { + "epoch": 1.1964537578077776, + "grad_norm": 0.06025834009051323, + "learning_rate": 9.052045069403165e-05, + "loss": 0.2163, + "step": 5938 + }, + { + "epoch": 1.1968567398750756, + "grad_norm": 0.06567951291799545, + "learning_rate": 9.05126417226011e-05, + "loss": 0.182, + "step": 5940 + }, + { + "epoch": 1.1972597219423735, + "grad_norm": 0.04909198358654976, + "learning_rate": 9.050482987321729e-05, + "loss": 0.1967, + "step": 5942 + }, + { + "epoch": 1.1976627040096717, + "grad_norm": 0.0788033977150917, + "learning_rate": 9.049701514643514e-05, + "loss": 0.2236, + "step": 5944 + }, + { + "epoch": 1.1980656860769696, + "grad_norm": 0.05334840714931488, + "learning_rate": 9.04891975428098e-05, + "loss": 0.1899, + "step": 5946 + }, + { + "epoch": 1.1984686681442676, + "grad_norm": 0.05208491533994675, + "learning_rate": 9.048137706289662e-05, + "loss": 0.1837, + "step": 5948 + }, + { + "epoch": 1.1988716502115655, + "grad_norm": 0.05720973014831543, + "learning_rate": 9.047355370725115e-05, + "loss": 0.2488, + "step": 5950 + }, + { + "epoch": 1.1992746322788637, + "grad_norm": 0.04644571989774704, + "learning_rate": 9.046572747642916e-05, + "loss": 0.2018, + "step": 5952 + }, + { + "epoch": 1.1996776143461616, + "grad_norm": 0.04885132610797882, + "learning_rate": 9.045789837098659e-05, + "loss": 0.217, + "step": 5954 + }, + { + "epoch": 1.2000805964134595, + "grad_norm": 0.053192708641290665, + "learning_rate": 9.045006639147964e-05, + "loss": 0.1962, + "step": 5956 + }, + { + "epoch": 1.2004835784807577, + "grad_norm": 0.04022448509931564, + "learning_rate": 9.044223153846466e-05, + "loss": 0.1872, + "step": 5958 + }, + { + "epoch": 1.2008865605480556, + "grad_norm": 0.046723004430532455, + "learning_rate": 9.043439381249823e-05, + "loss": 0.2085, + "step": 5960 + }, + { + "epoch": 1.2012895426153536, + "grad_norm": 0.058833975344896317, + "learning_rate": 9.042655321413712e-05, + "loss": 0.214, + "step": 5962 + }, + { + "epoch": 1.2016925246826515, + "grad_norm": 0.05478130653500557, + "learning_rate": 9.041870974393832e-05, + "loss": 0.2313, + "step": 5964 + }, + { + "epoch": 1.2020955067499497, + "grad_norm": 0.0757334753870964, + "learning_rate": 9.041086340245904e-05, + "loss": 0.1936, + "step": 5966 + }, + { + "epoch": 1.2024984888172476, + "grad_norm": 0.04867379739880562, + "learning_rate": 9.040301419025663e-05, + "loss": 0.1978, + "step": 5968 + }, + { + "epoch": 1.2029014708845456, + "grad_norm": 0.04826593026518822, + "learning_rate": 9.039516210788872e-05, + "loss": 0.1717, + "step": 5970 + }, + { + "epoch": 1.2033044529518437, + "grad_norm": 0.05275225266814232, + "learning_rate": 9.038730715591308e-05, + "loss": 0.1761, + "step": 5972 + }, + { + "epoch": 1.2037074350191417, + "grad_norm": 0.05469043180346489, + "learning_rate": 9.037944933488776e-05, + "loss": 0.2534, + "step": 5974 + }, + { + "epoch": 1.2041104170864396, + "grad_norm": 0.040808357298374176, + "learning_rate": 9.03715886453709e-05, + "loss": 0.1671, + "step": 5976 + }, + { + "epoch": 1.2045133991537376, + "grad_norm": 0.050247691571712494, + "learning_rate": 9.036372508792097e-05, + "loss": 0.1579, + "step": 5978 + }, + { + "epoch": 1.2049163812210357, + "grad_norm": 0.05738181248307228, + "learning_rate": 9.035585866309656e-05, + "loss": 0.1743, + "step": 5980 + }, + { + "epoch": 1.2053193632883337, + "grad_norm": 0.04823550209403038, + "learning_rate": 9.034798937145649e-05, + "loss": 0.1769, + "step": 5982 + }, + { + "epoch": 1.2057223453556316, + "grad_norm": 0.045579053461551666, + "learning_rate": 9.034011721355977e-05, + "loss": 0.1475, + "step": 5984 + }, + { + "epoch": 1.2061253274229298, + "grad_norm": 0.055633220821619034, + "learning_rate": 9.033224218996565e-05, + "loss": 0.2113, + "step": 5986 + }, + { + "epoch": 1.2065283094902277, + "grad_norm": 0.05092507600784302, + "learning_rate": 9.032436430123355e-05, + "loss": 0.2388, + "step": 5988 + }, + { + "epoch": 1.2069312915575257, + "grad_norm": 0.07090190052986145, + "learning_rate": 9.031648354792309e-05, + "loss": 0.1895, + "step": 5990 + }, + { + "epoch": 1.2073342736248236, + "grad_norm": 0.03843579441308975, + "learning_rate": 9.030859993059413e-05, + "loss": 0.156, + "step": 5992 + }, + { + "epoch": 1.2077372556921218, + "grad_norm": 0.04407760500907898, + "learning_rate": 9.030071344980668e-05, + "loss": 0.1831, + "step": 5994 + }, + { + "epoch": 1.2081402377594197, + "grad_norm": 0.049290142953395844, + "learning_rate": 9.0292824106121e-05, + "loss": 0.2357, + "step": 5996 + }, + { + "epoch": 1.2085432198267176, + "grad_norm": 0.05240347981452942, + "learning_rate": 9.028493190009754e-05, + "loss": 0.2431, + "step": 5998 + }, + { + "epoch": 1.2089462018940158, + "grad_norm": 0.044982656836509705, + "learning_rate": 9.027703683229694e-05, + "loss": 0.1933, + "step": 6000 + }, + { + "epoch": 1.2093491839613137, + "grad_norm": 0.08840004354715347, + "learning_rate": 9.026913890328004e-05, + "loss": 0.1292, + "step": 6002 + }, + { + "epoch": 1.2097521660286117, + "grad_norm": 0.04240783676505089, + "learning_rate": 9.026123811360794e-05, + "loss": 0.2281, + "step": 6004 + }, + { + "epoch": 1.2101551480959096, + "grad_norm": 0.05913504585623741, + "learning_rate": 9.025333446384187e-05, + "loss": 0.2185, + "step": 6006 + }, + { + "epoch": 1.2105581301632078, + "grad_norm": 0.04633234441280365, + "learning_rate": 9.024542795454328e-05, + "loss": 0.1623, + "step": 6008 + }, + { + "epoch": 1.2109611122305057, + "grad_norm": 0.03760692849755287, + "learning_rate": 9.023751858627387e-05, + "loss": 0.1773, + "step": 6010 + }, + { + "epoch": 1.2113640942978037, + "grad_norm": 0.06671061366796494, + "learning_rate": 9.022960635959548e-05, + "loss": 0.1771, + "step": 6012 + }, + { + "epoch": 1.2117670763651018, + "grad_norm": 0.06102992966771126, + "learning_rate": 9.022169127507019e-05, + "loss": 0.2618, + "step": 6014 + }, + { + "epoch": 1.2121700584323998, + "grad_norm": 0.05439795181155205, + "learning_rate": 9.021377333326027e-05, + "loss": 0.1914, + "step": 6016 + }, + { + "epoch": 1.2125730404996977, + "grad_norm": 0.039903391152620316, + "learning_rate": 9.020585253472822e-05, + "loss": 0.1595, + "step": 6018 + }, + { + "epoch": 1.2129760225669957, + "grad_norm": 0.056906476616859436, + "learning_rate": 9.019792888003671e-05, + "loss": 0.2301, + "step": 6020 + }, + { + "epoch": 1.2133790046342938, + "grad_norm": 0.05966992676258087, + "learning_rate": 9.019000236974859e-05, + "loss": 0.1885, + "step": 6022 + }, + { + "epoch": 1.2137819867015918, + "grad_norm": 0.04310747981071472, + "learning_rate": 9.0182073004427e-05, + "loss": 0.1629, + "step": 6024 + }, + { + "epoch": 1.2141849687688897, + "grad_norm": 0.05637119710445404, + "learning_rate": 9.01741407846352e-05, + "loss": 0.1677, + "step": 6026 + }, + { + "epoch": 1.2145879508361879, + "grad_norm": 0.06048206612467766, + "learning_rate": 9.01662057109367e-05, + "loss": 0.1987, + "step": 6028 + }, + { + "epoch": 1.2149909329034858, + "grad_norm": 0.06276671588420868, + "learning_rate": 9.015826778389517e-05, + "loss": 0.1893, + "step": 6030 + }, + { + "epoch": 1.2153939149707838, + "grad_norm": 0.05327733978629112, + "learning_rate": 9.015032700407452e-05, + "loss": 0.2143, + "step": 6032 + }, + { + "epoch": 1.2157968970380817, + "grad_norm": 0.06122863292694092, + "learning_rate": 9.014238337203885e-05, + "loss": 0.2751, + "step": 6034 + }, + { + "epoch": 1.2161998791053799, + "grad_norm": 0.043489329516887665, + "learning_rate": 9.013443688835246e-05, + "loss": 0.1712, + "step": 6036 + }, + { + "epoch": 1.2166028611726778, + "grad_norm": 0.04354552924633026, + "learning_rate": 9.012648755357986e-05, + "loss": 0.2167, + "step": 6038 + }, + { + "epoch": 1.2170058432399757, + "grad_norm": 0.0550360232591629, + "learning_rate": 9.011853536828576e-05, + "loss": 0.1874, + "step": 6040 + }, + { + "epoch": 1.217408825307274, + "grad_norm": 0.06048135086894035, + "learning_rate": 9.011058033303508e-05, + "loss": 0.1952, + "step": 6042 + }, + { + "epoch": 1.2178118073745718, + "grad_norm": 0.05317499861121178, + "learning_rate": 9.010262244839292e-05, + "loss": 0.1439, + "step": 6044 + }, + { + "epoch": 1.2182147894418698, + "grad_norm": 0.04616203531622887, + "learning_rate": 9.009466171492458e-05, + "loss": 0.1396, + "step": 6046 + }, + { + "epoch": 1.2186177715091677, + "grad_norm": 0.059253908693790436, + "learning_rate": 9.008669813319559e-05, + "loss": 0.2175, + "step": 6048 + }, + { + "epoch": 1.2190207535764659, + "grad_norm": 0.047273196280002594, + "learning_rate": 9.00787317037717e-05, + "loss": 0.1592, + "step": 6050 + }, + { + "epoch": 1.2194237356437638, + "grad_norm": 0.050518494099378586, + "learning_rate": 9.007076242721878e-05, + "loss": 0.2352, + "step": 6052 + }, + { + "epoch": 1.2198267177110618, + "grad_norm": 0.0745004266500473, + "learning_rate": 9.006279030410298e-05, + "loss": 0.2386, + "step": 6054 + }, + { + "epoch": 1.22022969977836, + "grad_norm": 0.06451984494924545, + "learning_rate": 9.005481533499065e-05, + "loss": 0.2335, + "step": 6056 + }, + { + "epoch": 1.2206326818456579, + "grad_norm": 0.06747965514659882, + "learning_rate": 9.004683752044828e-05, + "loss": 0.268, + "step": 6058 + }, + { + "epoch": 1.2210356639129558, + "grad_norm": 0.05588728189468384, + "learning_rate": 9.003885686104262e-05, + "loss": 0.2193, + "step": 6060 + }, + { + "epoch": 1.2214386459802538, + "grad_norm": 0.05268767848610878, + "learning_rate": 9.00308733573406e-05, + "loss": 0.2084, + "step": 6062 + }, + { + "epoch": 1.221841628047552, + "grad_norm": 0.04871091991662979, + "learning_rate": 9.002288700990937e-05, + "loss": 0.1493, + "step": 6064 + }, + { + "epoch": 1.2222446101148499, + "grad_norm": 0.06996570527553558, + "learning_rate": 9.001489781931624e-05, + "loss": 0.2083, + "step": 6066 + }, + { + "epoch": 1.2226475921821478, + "grad_norm": 0.04460399970412254, + "learning_rate": 9.000690578612877e-05, + "loss": 0.1949, + "step": 6068 + }, + { + "epoch": 1.223050574249446, + "grad_norm": 0.05490953475236893, + "learning_rate": 8.999891091091469e-05, + "loss": 0.2582, + "step": 6070 + }, + { + "epoch": 1.223453556316744, + "grad_norm": 0.0709412470459938, + "learning_rate": 8.999091319424196e-05, + "loss": 0.159, + "step": 6072 + }, + { + "epoch": 1.2238565383840418, + "grad_norm": 0.0431116484105587, + "learning_rate": 8.998291263667869e-05, + "loss": 0.1924, + "step": 6074 + }, + { + "epoch": 1.22425952045134, + "grad_norm": 0.03946799412369728, + "learning_rate": 8.997490923879327e-05, + "loss": 0.1695, + "step": 6076 + }, + { + "epoch": 1.224662502518638, + "grad_norm": 0.05423908308148384, + "learning_rate": 8.996690300115422e-05, + "loss": 0.2, + "step": 6078 + }, + { + "epoch": 1.225065484585936, + "grad_norm": 0.044834546744823456, + "learning_rate": 8.99588939243303e-05, + "loss": 0.2225, + "step": 6080 + }, + { + "epoch": 1.2254684666532338, + "grad_norm": 0.05637017637491226, + "learning_rate": 8.995088200889046e-05, + "loss": 0.2222, + "step": 6082 + }, + { + "epoch": 1.225871448720532, + "grad_norm": 0.0475136823952198, + "learning_rate": 8.994286725540384e-05, + "loss": 0.1824, + "step": 6084 + }, + { + "epoch": 1.22627443078783, + "grad_norm": 0.045652661472558975, + "learning_rate": 8.993484966443984e-05, + "loss": 0.2257, + "step": 6086 + }, + { + "epoch": 1.2266774128551279, + "grad_norm": 0.052571386098861694, + "learning_rate": 8.992682923656797e-05, + "loss": 0.204, + "step": 6088 + }, + { + "epoch": 1.227080394922426, + "grad_norm": 0.056858647614717484, + "learning_rate": 8.9918805972358e-05, + "loss": 0.2115, + "step": 6090 + }, + { + "epoch": 1.227483376989724, + "grad_norm": 0.05734502896666527, + "learning_rate": 8.991077987237989e-05, + "loss": 0.2268, + "step": 6092 + }, + { + "epoch": 1.227886359057022, + "grad_norm": 0.04611526057124138, + "learning_rate": 8.990275093720381e-05, + "loss": 0.1692, + "step": 6094 + }, + { + "epoch": 1.22828934112432, + "grad_norm": 0.03183615952730179, + "learning_rate": 8.989471916740013e-05, + "loss": 0.1486, + "step": 6096 + }, + { + "epoch": 1.228692323191618, + "grad_norm": 0.048749230802059174, + "learning_rate": 8.988668456353939e-05, + "loss": 0.2033, + "step": 6098 + }, + { + "epoch": 1.229095305258916, + "grad_norm": 0.04780949279665947, + "learning_rate": 8.987864712619238e-05, + "loss": 0.2068, + "step": 6100 + }, + { + "epoch": 1.229498287326214, + "grad_norm": 0.044503167271614075, + "learning_rate": 8.987060685593006e-05, + "loss": 0.2014, + "step": 6102 + }, + { + "epoch": 1.229901269393512, + "grad_norm": 0.044010862708091736, + "learning_rate": 8.986256375332355e-05, + "loss": 0.1874, + "step": 6104 + }, + { + "epoch": 1.23030425146081, + "grad_norm": 0.049143869429826736, + "learning_rate": 8.98545178189443e-05, + "loss": 0.1811, + "step": 6106 + }, + { + "epoch": 1.230707233528108, + "grad_norm": 0.05140808969736099, + "learning_rate": 8.984646905336384e-05, + "loss": 0.1646, + "step": 6108 + }, + { + "epoch": 1.2311102155954061, + "grad_norm": 0.0606461800634861, + "learning_rate": 8.983841745715393e-05, + "loss": 0.2466, + "step": 6110 + }, + { + "epoch": 1.231513197662704, + "grad_norm": 0.046162523329257965, + "learning_rate": 8.983036303088656e-05, + "loss": 0.2057, + "step": 6112 + }, + { + "epoch": 1.231916179730002, + "grad_norm": 0.04632925987243652, + "learning_rate": 8.982230577513391e-05, + "loss": 0.2076, + "step": 6114 + }, + { + "epoch": 1.2323191617973, + "grad_norm": 0.04459146410226822, + "learning_rate": 8.981424569046834e-05, + "loss": 0.2181, + "step": 6116 + }, + { + "epoch": 1.232722143864598, + "grad_norm": 0.05720078572630882, + "learning_rate": 8.980618277746242e-05, + "loss": 0.2284, + "step": 6118 + }, + { + "epoch": 1.233125125931896, + "grad_norm": 0.0529584176838398, + "learning_rate": 8.979811703668894e-05, + "loss": 0.2177, + "step": 6120 + }, + { + "epoch": 1.233528107999194, + "grad_norm": 0.0538485161960125, + "learning_rate": 8.979004846872088e-05, + "loss": 0.2025, + "step": 6122 + }, + { + "epoch": 1.2339310900664922, + "grad_norm": 0.06908518821001053, + "learning_rate": 8.97819770741314e-05, + "loss": 0.1802, + "step": 6124 + }, + { + "epoch": 1.23433407213379, + "grad_norm": 0.0569969080388546, + "learning_rate": 8.977390285349391e-05, + "loss": 0.1983, + "step": 6126 + }, + { + "epoch": 1.234737054201088, + "grad_norm": 0.07259988784790039, + "learning_rate": 8.976582580738195e-05, + "loss": 0.2202, + "step": 6128 + }, + { + "epoch": 1.235140036268386, + "grad_norm": 0.04598740115761757, + "learning_rate": 8.975774593636933e-05, + "loss": 0.1815, + "step": 6130 + }, + { + "epoch": 1.2355430183356841, + "grad_norm": 0.051774248480796814, + "learning_rate": 8.974966324103002e-05, + "loss": 0.2285, + "step": 6132 + }, + { + "epoch": 1.235946000402982, + "grad_norm": 0.05057157576084137, + "learning_rate": 8.974157772193821e-05, + "loss": 0.2287, + "step": 6134 + }, + { + "epoch": 1.23634898247028, + "grad_norm": 0.06012137979269028, + "learning_rate": 8.973348937966826e-05, + "loss": 0.2513, + "step": 6136 + }, + { + "epoch": 1.2367519645375782, + "grad_norm": 0.04695823788642883, + "learning_rate": 8.972539821479478e-05, + "loss": 0.1839, + "step": 6138 + }, + { + "epoch": 1.2371549466048761, + "grad_norm": 0.0549938790500164, + "learning_rate": 8.971730422789255e-05, + "loss": 0.2404, + "step": 6140 + }, + { + "epoch": 1.237557928672174, + "grad_norm": 0.04319198429584503, + "learning_rate": 8.970920741953652e-05, + "loss": 0.2047, + "step": 6142 + }, + { + "epoch": 1.237960910739472, + "grad_norm": 0.04579996317625046, + "learning_rate": 8.970110779030193e-05, + "loss": 0.1764, + "step": 6144 + }, + { + "epoch": 1.2383638928067702, + "grad_norm": 0.04213905707001686, + "learning_rate": 8.969300534076412e-05, + "loss": 0.2204, + "step": 6146 + }, + { + "epoch": 1.2387668748740681, + "grad_norm": 0.04355452582240105, + "learning_rate": 8.96849000714987e-05, + "loss": 0.1718, + "step": 6148 + }, + { + "epoch": 1.239169856941366, + "grad_norm": 0.041120558977127075, + "learning_rate": 8.967679198308144e-05, + "loss": 0.1902, + "step": 6150 + }, + { + "epoch": 1.2395728390086642, + "grad_norm": 0.04531228542327881, + "learning_rate": 8.966868107608832e-05, + "loss": 0.1738, + "step": 6152 + }, + { + "epoch": 1.2399758210759622, + "grad_norm": 0.04133576154708862, + "learning_rate": 8.966056735109555e-05, + "loss": 0.2365, + "step": 6154 + }, + { + "epoch": 1.24037880314326, + "grad_norm": 0.05090603604912758, + "learning_rate": 8.96524508086795e-05, + "loss": 0.1798, + "step": 6156 + }, + { + "epoch": 1.240781785210558, + "grad_norm": 0.044529419392347336, + "learning_rate": 8.964433144941675e-05, + "loss": 0.2278, + "step": 6158 + }, + { + "epoch": 1.2411847672778562, + "grad_norm": 0.04503866657614708, + "learning_rate": 8.963620927388412e-05, + "loss": 0.1504, + "step": 6160 + }, + { + "epoch": 1.2415877493451541, + "grad_norm": 0.04498367756605148, + "learning_rate": 8.962808428265855e-05, + "loss": 0.1835, + "step": 6162 + }, + { + "epoch": 1.241990731412452, + "grad_norm": 0.06172780320048332, + "learning_rate": 8.961995647631724e-05, + "loss": 0.1891, + "step": 6164 + }, + { + "epoch": 1.2423937134797503, + "grad_norm": 0.0528985895216465, + "learning_rate": 8.961182585543762e-05, + "loss": 0.2107, + "step": 6166 + }, + { + "epoch": 1.2427966955470482, + "grad_norm": 0.03874331712722778, + "learning_rate": 8.960369242059721e-05, + "loss": 0.2512, + "step": 6168 + }, + { + "epoch": 1.2431996776143461, + "grad_norm": 0.06559088826179504, + "learning_rate": 8.959555617237383e-05, + "loss": 0.2369, + "step": 6170 + }, + { + "epoch": 1.243602659681644, + "grad_norm": 0.0783800482749939, + "learning_rate": 8.958741711134548e-05, + "loss": 0.2538, + "step": 6172 + }, + { + "epoch": 1.2440056417489422, + "grad_norm": 0.04903315007686615, + "learning_rate": 8.957927523809033e-05, + "loss": 0.1955, + "step": 6174 + }, + { + "epoch": 1.2444086238162402, + "grad_norm": 0.0630233958363533, + "learning_rate": 8.957113055318674e-05, + "loss": 0.2164, + "step": 6176 + }, + { + "epoch": 1.2448116058835381, + "grad_norm": 0.09295551478862762, + "learning_rate": 8.956298305721333e-05, + "loss": 0.2149, + "step": 6178 + }, + { + "epoch": 1.2452145879508363, + "grad_norm": 0.05847977474331856, + "learning_rate": 8.95548327507489e-05, + "loss": 0.2074, + "step": 6180 + }, + { + "epoch": 1.2456175700181342, + "grad_norm": 0.04421735554933548, + "learning_rate": 8.954667963437238e-05, + "loss": 0.1598, + "step": 6182 + }, + { + "epoch": 1.2460205520854322, + "grad_norm": 0.046964842826128006, + "learning_rate": 8.953852370866299e-05, + "loss": 0.158, + "step": 6184 + }, + { + "epoch": 1.24642353415273, + "grad_norm": 0.03633161261677742, + "learning_rate": 8.95303649742001e-05, + "loss": 0.1394, + "step": 6186 + }, + { + "epoch": 1.2468265162200283, + "grad_norm": 0.061852145940065384, + "learning_rate": 8.952220343156332e-05, + "loss": 0.1814, + "step": 6188 + }, + { + "epoch": 1.2472294982873262, + "grad_norm": 0.045274149626493454, + "learning_rate": 8.951403908133242e-05, + "loss": 0.2286, + "step": 6190 + }, + { + "epoch": 1.2476324803546242, + "grad_norm": 0.05303020775318146, + "learning_rate": 8.950587192408737e-05, + "loss": 0.2134, + "step": 6192 + }, + { + "epoch": 1.2480354624219223, + "grad_norm": 0.05261223763227463, + "learning_rate": 8.949770196040834e-05, + "loss": 0.1796, + "step": 6194 + }, + { + "epoch": 1.2484384444892203, + "grad_norm": 0.051110610365867615, + "learning_rate": 8.948952919087575e-05, + "loss": 0.1984, + "step": 6196 + }, + { + "epoch": 1.2488414265565182, + "grad_norm": 0.05076858028769493, + "learning_rate": 8.948135361607016e-05, + "loss": 0.2459, + "step": 6198 + }, + { + "epoch": 1.2492444086238161, + "grad_norm": 0.06023627519607544, + "learning_rate": 8.947317523657235e-05, + "loss": 0.1624, + "step": 6200 + }, + { + "epoch": 1.2496473906911143, + "grad_norm": 0.0513908714056015, + "learning_rate": 8.946499405296328e-05, + "loss": 0.209, + "step": 6202 + }, + { + "epoch": 1.2500503727584122, + "grad_norm": 0.04183659330010414, + "learning_rate": 8.945681006582419e-05, + "loss": 0.2022, + "step": 6204 + }, + { + "epoch": 1.2504533548257102, + "grad_norm": 0.068643718957901, + "learning_rate": 8.944862327573638e-05, + "loss": 0.1795, + "step": 6206 + }, + { + "epoch": 1.2508563368930083, + "grad_norm": 0.05124150961637497, + "learning_rate": 8.944043368328145e-05, + "loss": 0.1887, + "step": 6208 + }, + { + "epoch": 1.2512593189603063, + "grad_norm": 0.048066116869449615, + "learning_rate": 8.943224128904122e-05, + "loss": 0.1641, + "step": 6210 + }, + { + "epoch": 1.2516623010276042, + "grad_norm": 0.06339994817972183, + "learning_rate": 8.942404609359761e-05, + "loss": 0.1863, + "step": 6212 + }, + { + "epoch": 1.2520652830949022, + "grad_norm": 0.04368380457162857, + "learning_rate": 8.941584809753283e-05, + "loss": 0.1979, + "step": 6214 + }, + { + "epoch": 1.2524682651622003, + "grad_norm": 0.05009367689490318, + "learning_rate": 8.940764730142922e-05, + "loss": 0.2354, + "step": 6216 + }, + { + "epoch": 1.2528712472294983, + "grad_norm": 0.058022335171699524, + "learning_rate": 8.939944370586938e-05, + "loss": 0.2433, + "step": 6218 + }, + { + "epoch": 1.2532742292967962, + "grad_norm": 0.06911885738372803, + "learning_rate": 8.939123731143606e-05, + "loss": 0.1599, + "step": 6220 + }, + { + "epoch": 1.2536772113640944, + "grad_norm": 0.04500617831945419, + "learning_rate": 8.938302811871225e-05, + "loss": 0.1972, + "step": 6222 + }, + { + "epoch": 1.2540801934313923, + "grad_norm": 0.04577852040529251, + "learning_rate": 8.93748161282811e-05, + "loss": 0.1829, + "step": 6224 + }, + { + "epoch": 1.2544831754986903, + "grad_norm": 0.05215632542967796, + "learning_rate": 8.936660134072599e-05, + "loss": 0.1789, + "step": 6226 + }, + { + "epoch": 1.2548861575659882, + "grad_norm": 0.046846963465213776, + "learning_rate": 8.935838375663047e-05, + "loss": 0.1722, + "step": 6228 + }, + { + "epoch": 1.2552891396332864, + "grad_norm": 0.06160841882228851, + "learning_rate": 8.935016337657831e-05, + "loss": 0.1877, + "step": 6230 + }, + { + "epoch": 1.2556921217005843, + "grad_norm": 0.03491106256842613, + "learning_rate": 8.934194020115349e-05, + "loss": 0.1862, + "step": 6232 + }, + { + "epoch": 1.2560951037678822, + "grad_norm": 0.046751853078603745, + "learning_rate": 8.933371423094014e-05, + "loss": 0.2105, + "step": 6234 + }, + { + "epoch": 1.2564980858351804, + "grad_norm": 0.056983157992362976, + "learning_rate": 8.932548546652264e-05, + "loss": 0.1568, + "step": 6236 + }, + { + "epoch": 1.2569010679024784, + "grad_norm": 0.04728730395436287, + "learning_rate": 8.931725390848556e-05, + "loss": 0.2299, + "step": 6238 + }, + { + "epoch": 1.2573040499697763, + "grad_norm": 0.06326276808977127, + "learning_rate": 8.930901955741363e-05, + "loss": 0.1922, + "step": 6240 + }, + { + "epoch": 1.2577070320370742, + "grad_norm": 0.03530817851424217, + "learning_rate": 8.93007824138918e-05, + "loss": 0.1629, + "step": 6242 + }, + { + "epoch": 1.2581100141043724, + "grad_norm": 0.055923838168382645, + "learning_rate": 8.929254247850526e-05, + "loss": 0.1664, + "step": 6244 + }, + { + "epoch": 1.2585129961716703, + "grad_norm": 0.06713879108428955, + "learning_rate": 8.928429975183934e-05, + "loss": 0.1592, + "step": 6246 + }, + { + "epoch": 1.2589159782389685, + "grad_norm": 0.04550889879465103, + "learning_rate": 8.927605423447958e-05, + "loss": 0.1682, + "step": 6248 + }, + { + "epoch": 1.2593189603062664, + "grad_norm": 0.05290335789322853, + "learning_rate": 8.926780592701176e-05, + "loss": 0.2486, + "step": 6250 + }, + { + "epoch": 1.2597219423735644, + "grad_norm": 0.04227093979716301, + "learning_rate": 8.925955483002178e-05, + "loss": 0.1991, + "step": 6252 + }, + { + "epoch": 1.2601249244408623, + "grad_norm": 0.07582451403141022, + "learning_rate": 8.925130094409582e-05, + "loss": 0.1773, + "step": 6254 + }, + { + "epoch": 1.2605279065081603, + "grad_norm": 0.061997395008802414, + "learning_rate": 8.924304426982022e-05, + "loss": 0.2172, + "step": 6256 + }, + { + "epoch": 1.2609308885754584, + "grad_norm": 0.07101260870695114, + "learning_rate": 8.923478480778151e-05, + "loss": 0.1832, + "step": 6258 + }, + { + "epoch": 1.2613338706427564, + "grad_norm": 0.038211528211832047, + "learning_rate": 8.922652255856645e-05, + "loss": 0.1695, + "step": 6260 + }, + { + "epoch": 1.2617368527100545, + "grad_norm": 0.05801432207226753, + "learning_rate": 8.921825752276194e-05, + "loss": 0.2165, + "step": 6262 + }, + { + "epoch": 1.2621398347773525, + "grad_norm": 0.04629329964518547, + "learning_rate": 8.920998970095515e-05, + "loss": 0.1839, + "step": 6264 + }, + { + "epoch": 1.2625428168446504, + "grad_norm": 0.05066737160086632, + "learning_rate": 8.920171909373339e-05, + "loss": 0.2055, + "step": 6266 + }, + { + "epoch": 1.2629457989119484, + "grad_norm": 0.04259856417775154, + "learning_rate": 8.91934457016842e-05, + "loss": 0.1638, + "step": 6268 + }, + { + "epoch": 1.2633487809792463, + "grad_norm": 0.061476513743400574, + "learning_rate": 8.918516952539532e-05, + "loss": 0.1923, + "step": 6270 + }, + { + "epoch": 1.2637517630465445, + "grad_norm": 0.049006011337041855, + "learning_rate": 8.917689056545463e-05, + "loss": 0.1802, + "step": 6272 + }, + { + "epoch": 1.2641547451138424, + "grad_norm": 0.05190061405301094, + "learning_rate": 8.916860882245032e-05, + "loss": 0.1723, + "step": 6274 + }, + { + "epoch": 1.2645577271811406, + "grad_norm": 0.06619302183389664, + "learning_rate": 8.916032429697069e-05, + "loss": 0.152, + "step": 6276 + }, + { + "epoch": 1.2649607092484385, + "grad_norm": 0.046162430197000504, + "learning_rate": 8.915203698960423e-05, + "loss": 0.2159, + "step": 6278 + }, + { + "epoch": 1.2653636913157364, + "grad_norm": 0.05554254725575447, + "learning_rate": 8.914374690093967e-05, + "loss": 0.1643, + "step": 6280 + }, + { + "epoch": 1.2657666733830344, + "grad_norm": 0.05046551302075386, + "learning_rate": 8.913545403156596e-05, + "loss": 0.1734, + "step": 6282 + }, + { + "epoch": 1.2661696554503323, + "grad_norm": 0.050692591816186905, + "learning_rate": 8.912715838207215e-05, + "loss": 0.1975, + "step": 6284 + }, + { + "epoch": 1.2665726375176305, + "grad_norm": 0.048872966319322586, + "learning_rate": 8.911885995304761e-05, + "loss": 0.2695, + "step": 6286 + }, + { + "epoch": 1.2669756195849284, + "grad_norm": 0.06595724821090698, + "learning_rate": 8.911055874508181e-05, + "loss": 0.2077, + "step": 6288 + }, + { + "epoch": 1.2673786016522266, + "grad_norm": 0.042650818824768066, + "learning_rate": 8.910225475876446e-05, + "loss": 0.2249, + "step": 6290 + }, + { + "epoch": 1.2677815837195245, + "grad_norm": 0.05222772806882858, + "learning_rate": 8.909394799468547e-05, + "loss": 0.2154, + "step": 6292 + }, + { + "epoch": 1.2681845657868225, + "grad_norm": 0.04049530252814293, + "learning_rate": 8.908563845343494e-05, + "loss": 0.1806, + "step": 6294 + }, + { + "epoch": 1.2685875478541204, + "grad_norm": 0.052481092512607574, + "learning_rate": 8.907732613560316e-05, + "loss": 0.2266, + "step": 6296 + }, + { + "epoch": 1.2689905299214184, + "grad_norm": 0.032652221620082855, + "learning_rate": 8.906901104178062e-05, + "loss": 0.1711, + "step": 6298 + }, + { + "epoch": 1.2693935119887165, + "grad_norm": 0.04992491006851196, + "learning_rate": 8.906069317255801e-05, + "loss": 0.192, + "step": 6300 + }, + { + "epoch": 1.2697964940560145, + "grad_norm": 0.04392782226204872, + "learning_rate": 8.905237252852624e-05, + "loss": 0.1647, + "step": 6302 + }, + { + "epoch": 1.2701994761233126, + "grad_norm": 0.04683556407690048, + "learning_rate": 8.904404911027638e-05, + "loss": 0.1929, + "step": 6304 + }, + { + "epoch": 1.2706024581906106, + "grad_norm": 0.048300039023160934, + "learning_rate": 8.903572291839971e-05, + "loss": 0.1956, + "step": 6306 + }, + { + "epoch": 1.2710054402579085, + "grad_norm": 0.058879271149635315, + "learning_rate": 8.902739395348771e-05, + "loss": 0.1853, + "step": 6308 + }, + { + "epoch": 1.2714084223252065, + "grad_norm": 0.05005578324198723, + "learning_rate": 8.901906221613206e-05, + "loss": 0.217, + "step": 6310 + }, + { + "epoch": 1.2718114043925044, + "grad_norm": 0.06488041579723358, + "learning_rate": 8.901072770692464e-05, + "loss": 0.2339, + "step": 6312 + }, + { + "epoch": 1.2722143864598026, + "grad_norm": 0.05080641061067581, + "learning_rate": 8.900239042645751e-05, + "loss": 0.1918, + "step": 6314 + }, + { + "epoch": 1.2726173685271005, + "grad_norm": 0.059111639857292175, + "learning_rate": 8.899405037532294e-05, + "loss": 0.2268, + "step": 6316 + }, + { + "epoch": 1.2730203505943987, + "grad_norm": 0.05126844719052315, + "learning_rate": 8.898570755411338e-05, + "loss": 0.1732, + "step": 6318 + }, + { + "epoch": 1.2734233326616966, + "grad_norm": 0.05296769365668297, + "learning_rate": 8.897736196342151e-05, + "loss": 0.2006, + "step": 6320 + }, + { + "epoch": 1.2738263147289945, + "grad_norm": 0.054364196956157684, + "learning_rate": 8.896901360384018e-05, + "loss": 0.2214, + "step": 6322 + }, + { + "epoch": 1.2742292967962925, + "grad_norm": 0.04188178852200508, + "learning_rate": 8.896066247596245e-05, + "loss": 0.2247, + "step": 6324 + }, + { + "epoch": 1.2746322788635907, + "grad_norm": 0.048656053841114044, + "learning_rate": 8.895230858038157e-05, + "loss": 0.1718, + "step": 6326 + }, + { + "epoch": 1.2750352609308886, + "grad_norm": 0.07030941545963287, + "learning_rate": 8.894395191769099e-05, + "loss": 0.2284, + "step": 6328 + }, + { + "epoch": 1.2754382429981865, + "grad_norm": 0.04568159952759743, + "learning_rate": 8.893559248848431e-05, + "loss": 0.1851, + "step": 6330 + }, + { + "epoch": 1.2758412250654847, + "grad_norm": 0.04896444454789162, + "learning_rate": 8.892723029335544e-05, + "loss": 0.1895, + "step": 6332 + }, + { + "epoch": 1.2762442071327826, + "grad_norm": 0.06214359775185585, + "learning_rate": 8.891886533289839e-05, + "loss": 0.14, + "step": 6334 + }, + { + "epoch": 1.2766471892000806, + "grad_norm": 0.062105149030685425, + "learning_rate": 8.891049760770737e-05, + "loss": 0.2003, + "step": 6336 + }, + { + "epoch": 1.2770501712673785, + "grad_norm": 0.06353707611560822, + "learning_rate": 8.890212711837684e-05, + "loss": 0.1847, + "step": 6338 + }, + { + "epoch": 1.2774531533346767, + "grad_norm": 0.05937555432319641, + "learning_rate": 8.88937538655014e-05, + "loss": 0.2148, + "step": 6340 + }, + { + "epoch": 1.2778561354019746, + "grad_norm": 0.055293429642915726, + "learning_rate": 8.88853778496759e-05, + "loss": 0.2006, + "step": 6342 + }, + { + "epoch": 1.2782591174692726, + "grad_norm": 0.06335525959730148, + "learning_rate": 8.887699907149534e-05, + "loss": 0.2237, + "step": 6344 + }, + { + "epoch": 1.2786620995365707, + "grad_norm": 0.05069897323846817, + "learning_rate": 8.886861753155495e-05, + "loss": 0.1777, + "step": 6346 + }, + { + "epoch": 1.2790650816038687, + "grad_norm": 0.04641510918736458, + "learning_rate": 8.886023323045012e-05, + "loss": 0.1996, + "step": 6348 + }, + { + "epoch": 1.2794680636711666, + "grad_norm": 0.0559643991291523, + "learning_rate": 8.885184616877647e-05, + "loss": 0.2127, + "step": 6350 + }, + { + "epoch": 1.2798710457384646, + "grad_norm": 0.055252932012081146, + "learning_rate": 8.88434563471298e-05, + "loss": 0.2204, + "step": 6352 + }, + { + "epoch": 1.2802740278057627, + "grad_norm": 0.0576675646007061, + "learning_rate": 8.883506376610612e-05, + "loss": 0.2209, + "step": 6354 + }, + { + "epoch": 1.2806770098730607, + "grad_norm": 0.06650307774543762, + "learning_rate": 8.882666842630162e-05, + "loss": 0.2201, + "step": 6356 + }, + { + "epoch": 1.2810799919403586, + "grad_norm": 0.060477275401353836, + "learning_rate": 8.881827032831268e-05, + "loss": 0.1611, + "step": 6358 + }, + { + "epoch": 1.2814829740076568, + "grad_norm": 0.0532105453312397, + "learning_rate": 8.880986947273591e-05, + "loss": 0.1582, + "step": 6360 + }, + { + "epoch": 1.2818859560749547, + "grad_norm": 0.04423583298921585, + "learning_rate": 8.880146586016806e-05, + "loss": 0.1612, + "step": 6362 + }, + { + "epoch": 1.2822889381422526, + "grad_norm": 0.06925471872091293, + "learning_rate": 8.879305949120613e-05, + "loss": 0.2171, + "step": 6364 + }, + { + "epoch": 1.2826919202095506, + "grad_norm": 0.03289978951215744, + "learning_rate": 8.878465036644732e-05, + "loss": 0.1585, + "step": 6366 + }, + { + "epoch": 1.2830949022768487, + "grad_norm": 0.06422306597232819, + "learning_rate": 8.877623848648894e-05, + "loss": 0.2094, + "step": 6368 + }, + { + "epoch": 1.2834978843441467, + "grad_norm": 0.06235311180353165, + "learning_rate": 8.876782385192861e-05, + "loss": 0.226, + "step": 6370 + }, + { + "epoch": 1.2839008664114446, + "grad_norm": 0.062130313366651535, + "learning_rate": 8.875940646336409e-05, + "loss": 0.1998, + "step": 6372 + }, + { + "epoch": 1.2843038484787428, + "grad_norm": 0.05406568944454193, + "learning_rate": 8.87509863213933e-05, + "loss": 0.2353, + "step": 6374 + }, + { + "epoch": 1.2847068305460407, + "grad_norm": 0.0408284068107605, + "learning_rate": 8.874256342661442e-05, + "loss": 0.2246, + "step": 6376 + }, + { + "epoch": 1.2851098126133387, + "grad_norm": 0.03897031396627426, + "learning_rate": 8.873413777962578e-05, + "loss": 0.1423, + "step": 6378 + }, + { + "epoch": 1.2855127946806366, + "grad_norm": 0.05661426857113838, + "learning_rate": 8.872570938102595e-05, + "loss": 0.2576, + "step": 6380 + }, + { + "epoch": 1.2859157767479348, + "grad_norm": 0.04744990915060043, + "learning_rate": 8.871727823141367e-05, + "loss": 0.2002, + "step": 6382 + }, + { + "epoch": 1.2863187588152327, + "grad_norm": 0.045208241790533066, + "learning_rate": 8.870884433138785e-05, + "loss": 0.2259, + "step": 6384 + }, + { + "epoch": 1.2867217408825307, + "grad_norm": 0.04680623859167099, + "learning_rate": 8.870040768154763e-05, + "loss": 0.1799, + "step": 6386 + }, + { + "epoch": 1.2871247229498288, + "grad_norm": 0.04088175669312477, + "learning_rate": 8.869196828249235e-05, + "loss": 0.1982, + "step": 6388 + }, + { + "epoch": 1.2875277050171268, + "grad_norm": 0.05093217268586159, + "learning_rate": 8.868352613482153e-05, + "loss": 0.2254, + "step": 6390 + }, + { + "epoch": 1.2879306870844247, + "grad_norm": 0.0655287578701973, + "learning_rate": 8.867508123913486e-05, + "loss": 0.1938, + "step": 6392 + }, + { + "epoch": 1.2883336691517226, + "grad_norm": 0.054968371987342834, + "learning_rate": 8.866663359603228e-05, + "loss": 0.2205, + "step": 6394 + }, + { + "epoch": 1.2887366512190208, + "grad_norm": 0.04670187085866928, + "learning_rate": 8.865818320611388e-05, + "loss": 0.1916, + "step": 6396 + }, + { + "epoch": 1.2891396332863188, + "grad_norm": 0.03799133747816086, + "learning_rate": 8.864973006997999e-05, + "loss": 0.1747, + "step": 6398 + }, + { + "epoch": 1.2895426153536167, + "grad_norm": 0.0437266044318676, + "learning_rate": 8.864127418823107e-05, + "loss": 0.1568, + "step": 6400 + }, + { + "epoch": 1.2899455974209149, + "grad_norm": 0.05466373264789581, + "learning_rate": 8.863281556146783e-05, + "loss": 0.2209, + "step": 6402 + }, + { + "epoch": 1.2903485794882128, + "grad_norm": 0.04808984696865082, + "learning_rate": 8.862435419029116e-05, + "loss": 0.2492, + "step": 6404 + }, + { + "epoch": 1.2907515615555107, + "grad_norm": 0.06633257120847702, + "learning_rate": 8.861589007530214e-05, + "loss": 0.1691, + "step": 6406 + }, + { + "epoch": 1.2911545436228087, + "grad_norm": 0.038611918687820435, + "learning_rate": 8.860742321710204e-05, + "loss": 0.1695, + "step": 6408 + }, + { + "epoch": 1.2915575256901068, + "grad_norm": 0.054731372743844986, + "learning_rate": 8.859895361629233e-05, + "loss": 0.172, + "step": 6410 + }, + { + "epoch": 1.2919605077574048, + "grad_norm": 0.04989850893616676, + "learning_rate": 8.859048127347472e-05, + "loss": 0.1281, + "step": 6412 + }, + { + "epoch": 1.2923634898247027, + "grad_norm": 0.052453186362981796, + "learning_rate": 8.8582006189251e-05, + "loss": 0.1744, + "step": 6414 + }, + { + "epoch": 1.292766471892001, + "grad_norm": 0.03481290861964226, + "learning_rate": 8.857352836422328e-05, + "loss": 0.1577, + "step": 6416 + }, + { + "epoch": 1.2931694539592988, + "grad_norm": 0.03524111956357956, + "learning_rate": 8.856504779899378e-05, + "loss": 0.1481, + "step": 6418 + }, + { + "epoch": 1.2935724360265968, + "grad_norm": 0.04402356222271919, + "learning_rate": 8.855656449416498e-05, + "loss": 0.1864, + "step": 6420 + }, + { + "epoch": 1.2939754180938947, + "grad_norm": 0.06525306403636932, + "learning_rate": 8.854807845033949e-05, + "loss": 0.201, + "step": 6422 + }, + { + "epoch": 1.2943784001611929, + "grad_norm": 0.04306092485785484, + "learning_rate": 8.853958966812015e-05, + "loss": 0.1546, + "step": 6424 + }, + { + "epoch": 1.2947813822284908, + "grad_norm": 0.045840419828891754, + "learning_rate": 8.853109814811e-05, + "loss": 0.2124, + "step": 6426 + }, + { + "epoch": 1.2951843642957888, + "grad_norm": 0.04774376004934311, + "learning_rate": 8.852260389091227e-05, + "loss": 0.1978, + "step": 6428 + }, + { + "epoch": 1.295587346363087, + "grad_norm": 0.06106032803654671, + "learning_rate": 8.851410689713036e-05, + "loss": 0.2433, + "step": 6430 + }, + { + "epoch": 1.2959903284303849, + "grad_norm": 0.05506075546145439, + "learning_rate": 8.850560716736789e-05, + "loss": 0.2102, + "step": 6432 + }, + { + "epoch": 1.2963933104976828, + "grad_norm": 0.047731101512908936, + "learning_rate": 8.849710470222865e-05, + "loss": 0.1948, + "step": 6434 + }, + { + "epoch": 1.2967962925649807, + "grad_norm": 0.058562930673360825, + "learning_rate": 8.848859950231668e-05, + "loss": 0.2239, + "step": 6436 + }, + { + "epoch": 1.297199274632279, + "grad_norm": 0.03972695767879486, + "learning_rate": 8.848009156823615e-05, + "loss": 0.1916, + "step": 6438 + }, + { + "epoch": 1.2976022566995769, + "grad_norm": 0.04205062612891197, + "learning_rate": 8.847158090059145e-05, + "loss": 0.165, + "step": 6440 + }, + { + "epoch": 1.2980052387668748, + "grad_norm": 0.054686374962329865, + "learning_rate": 8.846306749998719e-05, + "loss": 0.1919, + "step": 6442 + }, + { + "epoch": 1.298408220834173, + "grad_norm": 0.0542772077023983, + "learning_rate": 8.845455136702809e-05, + "loss": 0.225, + "step": 6444 + }, + { + "epoch": 1.298811202901471, + "grad_norm": 0.04445016384124756, + "learning_rate": 8.844603250231918e-05, + "loss": 0.2461, + "step": 6446 + }, + { + "epoch": 1.2992141849687688, + "grad_norm": 0.04408948868513107, + "learning_rate": 8.843751090646562e-05, + "loss": 0.1767, + "step": 6448 + }, + { + "epoch": 1.2996171670360668, + "grad_norm": 0.041693881154060364, + "learning_rate": 8.842898658007274e-05, + "loss": 0.1658, + "step": 6450 + }, + { + "epoch": 1.300020149103365, + "grad_norm": 0.04052754119038582, + "learning_rate": 8.842045952374612e-05, + "loss": 0.1824, + "step": 6452 + }, + { + "epoch": 1.3004231311706629, + "grad_norm": 0.05492265522480011, + "learning_rate": 8.841192973809149e-05, + "loss": 0.2019, + "step": 6454 + }, + { + "epoch": 1.300826113237961, + "grad_norm": 0.06872859597206116, + "learning_rate": 8.84033972237148e-05, + "loss": 0.2278, + "step": 6456 + }, + { + "epoch": 1.301229095305259, + "grad_norm": 0.048062894493341446, + "learning_rate": 8.83948619812222e-05, + "loss": 0.2154, + "step": 6458 + }, + { + "epoch": 1.301632077372557, + "grad_norm": 0.045196451246738434, + "learning_rate": 8.838632401122e-05, + "loss": 0.1845, + "step": 6460 + }, + { + "epoch": 1.3020350594398549, + "grad_norm": 0.04608435556292534, + "learning_rate": 8.837778331431475e-05, + "loss": 0.2171, + "step": 6462 + }, + { + "epoch": 1.3024380415071528, + "grad_norm": 0.04621405154466629, + "learning_rate": 8.836923989111313e-05, + "loss": 0.1573, + "step": 6464 + }, + { + "epoch": 1.302841023574451, + "grad_norm": 0.04945302754640579, + "learning_rate": 8.836069374222206e-05, + "loss": 0.1985, + "step": 6466 + }, + { + "epoch": 1.303244005641749, + "grad_norm": 0.04604468122124672, + "learning_rate": 8.835214486824869e-05, + "loss": 0.2051, + "step": 6468 + }, + { + "epoch": 1.303646987709047, + "grad_norm": 0.05772722512483597, + "learning_rate": 8.834359326980026e-05, + "loss": 0.1724, + "step": 6470 + }, + { + "epoch": 1.304049969776345, + "grad_norm": 0.045948565006256104, + "learning_rate": 8.833503894748429e-05, + "loss": 0.2082, + "step": 6472 + }, + { + "epoch": 1.304452951843643, + "grad_norm": 0.06069548428058624, + "learning_rate": 8.832648190190847e-05, + "loss": 0.2108, + "step": 6474 + }, + { + "epoch": 1.304855933910941, + "grad_norm": 0.05096980184316635, + "learning_rate": 8.831792213368065e-05, + "loss": 0.2311, + "step": 6476 + }, + { + "epoch": 1.3052589159782388, + "grad_norm": 0.054760102182626724, + "learning_rate": 8.830935964340894e-05, + "loss": 0.2228, + "step": 6478 + }, + { + "epoch": 1.305661898045537, + "grad_norm": 0.046445854008197784, + "learning_rate": 8.830079443170158e-05, + "loss": 0.2161, + "step": 6480 + }, + { + "epoch": 1.306064880112835, + "grad_norm": 0.06579455733299255, + "learning_rate": 8.829222649916704e-05, + "loss": 0.2356, + "step": 6482 + }, + { + "epoch": 1.306467862180133, + "grad_norm": 0.05317157134413719, + "learning_rate": 8.828365584641396e-05, + "loss": 0.1861, + "step": 6484 + }, + { + "epoch": 1.306870844247431, + "grad_norm": 0.045328740030527115, + "learning_rate": 8.82750824740512e-05, + "loss": 0.1509, + "step": 6486 + }, + { + "epoch": 1.307273826314729, + "grad_norm": 0.03549671545624733, + "learning_rate": 8.826650638268781e-05, + "loss": 0.1512, + "step": 6488 + }, + { + "epoch": 1.307676808382027, + "grad_norm": 0.045670825988054276, + "learning_rate": 8.825792757293299e-05, + "loss": 0.2287, + "step": 6490 + }, + { + "epoch": 1.3080797904493249, + "grad_norm": 0.11401376128196716, + "learning_rate": 8.824934604539617e-05, + "loss": 0.1898, + "step": 6492 + }, + { + "epoch": 1.308482772516623, + "grad_norm": 0.07366832345724106, + "learning_rate": 8.8240761800687e-05, + "loss": 0.1657, + "step": 6494 + }, + { + "epoch": 1.308885754583921, + "grad_norm": 0.06504294276237488, + "learning_rate": 8.823217483941524e-05, + "loss": 0.2702, + "step": 6496 + }, + { + "epoch": 1.3092887366512191, + "grad_norm": 0.052060194313526154, + "learning_rate": 8.822358516219093e-05, + "loss": 0.2161, + "step": 6498 + }, + { + "epoch": 1.309691718718517, + "grad_norm": 0.046481553465127945, + "learning_rate": 8.821499276962429e-05, + "loss": 0.2609, + "step": 6500 + }, + { + "epoch": 1.310094700785815, + "grad_norm": 0.06982485949993134, + "learning_rate": 8.820639766232565e-05, + "loss": 0.2059, + "step": 6502 + }, + { + "epoch": 1.310497682853113, + "grad_norm": 0.05630074441432953, + "learning_rate": 8.819779984090562e-05, + "loss": 0.207, + "step": 6504 + }, + { + "epoch": 1.310900664920411, + "grad_norm": 0.05257268622517586, + "learning_rate": 8.8189199305975e-05, + "loss": 0.179, + "step": 6506 + }, + { + "epoch": 1.311303646987709, + "grad_norm": 0.0464656688272953, + "learning_rate": 8.818059605814472e-05, + "loss": 0.1982, + "step": 6508 + }, + { + "epoch": 1.311706629055007, + "grad_norm": 0.05108407139778137, + "learning_rate": 8.817199009802595e-05, + "loss": 0.2116, + "step": 6510 + }, + { + "epoch": 1.3121096111223052, + "grad_norm": 0.0434158593416214, + "learning_rate": 8.816338142623007e-05, + "loss": 0.1984, + "step": 6512 + }, + { + "epoch": 1.3125125931896031, + "grad_norm": 0.04521339014172554, + "learning_rate": 8.815477004336858e-05, + "loss": 0.2046, + "step": 6514 + }, + { + "epoch": 1.312915575256901, + "grad_norm": 0.05215785652399063, + "learning_rate": 8.814615595005328e-05, + "loss": 0.1816, + "step": 6516 + }, + { + "epoch": 1.313318557324199, + "grad_norm": 0.050764963030815125, + "learning_rate": 8.813753914689605e-05, + "loss": 0.1906, + "step": 6518 + }, + { + "epoch": 1.313721539391497, + "grad_norm": 0.05948259308934212, + "learning_rate": 8.812891963450903e-05, + "loss": 0.194, + "step": 6520 + }, + { + "epoch": 1.314124521458795, + "grad_norm": 0.062436092644929886, + "learning_rate": 8.812029741350454e-05, + "loss": 0.2094, + "step": 6522 + }, + { + "epoch": 1.314527503526093, + "grad_norm": 0.045266736298799515, + "learning_rate": 8.811167248449508e-05, + "loss": 0.198, + "step": 6524 + }, + { + "epoch": 1.3149304855933912, + "grad_norm": 0.042582929134368896, + "learning_rate": 8.810304484809336e-05, + "loss": 0.1983, + "step": 6526 + }, + { + "epoch": 1.3153334676606891, + "grad_norm": 0.05839109048247337, + "learning_rate": 8.809441450491227e-05, + "loss": 0.2705, + "step": 6528 + }, + { + "epoch": 1.315736449727987, + "grad_norm": 0.05570756644010544, + "learning_rate": 8.80857814555649e-05, + "loss": 0.1926, + "step": 6530 + }, + { + "epoch": 1.316139431795285, + "grad_norm": 0.033902619034051895, + "learning_rate": 8.807714570066454e-05, + "loss": 0.1553, + "step": 6532 + }, + { + "epoch": 1.3165424138625832, + "grad_norm": 0.05374903604388237, + "learning_rate": 8.806850724082462e-05, + "loss": 0.1889, + "step": 6534 + }, + { + "epoch": 1.3169453959298811, + "grad_norm": 0.049184996634721756, + "learning_rate": 8.805986607665884e-05, + "loss": 0.1544, + "step": 6536 + }, + { + "epoch": 1.317348377997179, + "grad_norm": 0.03358803689479828, + "learning_rate": 8.805122220878104e-05, + "loss": 0.1384, + "step": 6538 + }, + { + "epoch": 1.3177513600644772, + "grad_norm": 0.04204018786549568, + "learning_rate": 8.804257563780525e-05, + "loss": 0.1581, + "step": 6540 + }, + { + "epoch": 1.3181543421317752, + "grad_norm": 0.04981936141848564, + "learning_rate": 8.803392636434575e-05, + "loss": 0.2046, + "step": 6542 + }, + { + "epoch": 1.3185573241990731, + "grad_norm": 0.04282686114311218, + "learning_rate": 8.802527438901693e-05, + "loss": 0.1827, + "step": 6544 + }, + { + "epoch": 1.318960306266371, + "grad_norm": 0.052251186221838, + "learning_rate": 8.801661971243345e-05, + "loss": 0.154, + "step": 6546 + }, + { + "epoch": 1.3193632883336692, + "grad_norm": 0.0645936131477356, + "learning_rate": 8.80079623352101e-05, + "loss": 0.2281, + "step": 6548 + }, + { + "epoch": 1.3197662704009672, + "grad_norm": 0.04686029627919197, + "learning_rate": 8.799930225796187e-05, + "loss": 0.2311, + "step": 6550 + }, + { + "epoch": 1.320169252468265, + "grad_norm": 0.05446161329746246, + "learning_rate": 8.7990639481304e-05, + "loss": 0.2203, + "step": 6552 + }, + { + "epoch": 1.3205722345355633, + "grad_norm": 0.043119318783283234, + "learning_rate": 8.798197400585185e-05, + "loss": 0.1367, + "step": 6554 + }, + { + "epoch": 1.3209752166028612, + "grad_norm": 0.03402850776910782, + "learning_rate": 8.7973305832221e-05, + "loss": 0.1273, + "step": 6556 + }, + { + "epoch": 1.3213781986701592, + "grad_norm": 0.05417346581816673, + "learning_rate": 8.796463496102725e-05, + "loss": 0.2275, + "step": 6558 + }, + { + "epoch": 1.321781180737457, + "grad_norm": 0.04006451368331909, + "learning_rate": 8.795596139288655e-05, + "loss": 0.2158, + "step": 6560 + }, + { + "epoch": 1.3221841628047553, + "grad_norm": 0.04239325597882271, + "learning_rate": 8.794728512841504e-05, + "loss": 0.1763, + "step": 6562 + }, + { + "epoch": 1.3225871448720532, + "grad_norm": 0.05095091834664345, + "learning_rate": 8.79386061682291e-05, + "loss": 0.2323, + "step": 6564 + }, + { + "epoch": 1.3229901269393511, + "grad_norm": 0.05127684772014618, + "learning_rate": 8.792992451294522e-05, + "loss": 0.2092, + "step": 6566 + }, + { + "epoch": 1.3233931090066493, + "grad_norm": 0.053673263639211655, + "learning_rate": 8.79212401631802e-05, + "loss": 0.1998, + "step": 6568 + }, + { + "epoch": 1.3237960910739472, + "grad_norm": 0.05202171579003334, + "learning_rate": 8.79125531195509e-05, + "loss": 0.2041, + "step": 6570 + }, + { + "epoch": 1.3241990731412452, + "grad_norm": 0.05307517573237419, + "learning_rate": 8.790386338267447e-05, + "loss": 0.182, + "step": 6572 + }, + { + "epoch": 1.3246020552085431, + "grad_norm": 0.04658079519867897, + "learning_rate": 8.789517095316819e-05, + "loss": 0.2143, + "step": 6574 + }, + { + "epoch": 1.3250050372758413, + "grad_norm": 0.04600522294640541, + "learning_rate": 8.788647583164959e-05, + "loss": 0.1783, + "step": 6576 + }, + { + "epoch": 1.3254080193431392, + "grad_norm": 0.045908063650131226, + "learning_rate": 8.787777801873632e-05, + "loss": 0.1593, + "step": 6578 + }, + { + "epoch": 1.3258110014104372, + "grad_norm": 0.04135711118578911, + "learning_rate": 8.786907751504628e-05, + "loss": 0.1651, + "step": 6580 + }, + { + "epoch": 1.3262139834777353, + "grad_norm": 0.07556430250406265, + "learning_rate": 8.786037432119754e-05, + "loss": 0.1893, + "step": 6582 + }, + { + "epoch": 1.3266169655450333, + "grad_norm": 0.04877334460616112, + "learning_rate": 8.785166843780837e-05, + "loss": 0.1789, + "step": 6584 + }, + { + "epoch": 1.3270199476123312, + "grad_norm": 0.055644772946834564, + "learning_rate": 8.784295986549717e-05, + "loss": 0.2269, + "step": 6586 + }, + { + "epoch": 1.3274229296796292, + "grad_norm": 0.04751408472657204, + "learning_rate": 8.783424860488266e-05, + "loss": 0.216, + "step": 6588 + }, + { + "epoch": 1.3278259117469273, + "grad_norm": 0.05544084310531616, + "learning_rate": 8.782553465658363e-05, + "loss": 0.1849, + "step": 6590 + }, + { + "epoch": 1.3282288938142253, + "grad_norm": 0.07215588539838791, + "learning_rate": 8.781681802121911e-05, + "loss": 0.2223, + "step": 6592 + }, + { + "epoch": 1.3286318758815232, + "grad_norm": 0.04705362394452095, + "learning_rate": 8.780809869940829e-05, + "loss": 0.1941, + "step": 6594 + }, + { + "epoch": 1.3290348579488214, + "grad_norm": 0.04929559677839279, + "learning_rate": 8.779937669177064e-05, + "loss": 0.2029, + "step": 6596 + }, + { + "epoch": 1.3294378400161193, + "grad_norm": 0.05044730007648468, + "learning_rate": 8.77906519989257e-05, + "loss": 0.1913, + "step": 6598 + }, + { + "epoch": 1.3298408220834173, + "grad_norm": 0.09016617387533188, + "learning_rate": 8.778192462149328e-05, + "loss": 0.1968, + "step": 6600 + }, + { + "epoch": 1.3302438041507152, + "grad_norm": 0.07022813707590103, + "learning_rate": 8.777319456009337e-05, + "loss": 0.2, + "step": 6602 + }, + { + "epoch": 1.3306467862180134, + "grad_norm": 0.0559733621776104, + "learning_rate": 8.776446181534612e-05, + "loss": 0.1923, + "step": 6604 + }, + { + "epoch": 1.3310497682853113, + "grad_norm": 0.06447982043027878, + "learning_rate": 8.775572638787189e-05, + "loss": 0.1711, + "step": 6606 + }, + { + "epoch": 1.3314527503526092, + "grad_norm": 0.06927803158760071, + "learning_rate": 8.774698827829126e-05, + "loss": 0.2384, + "step": 6608 + }, + { + "epoch": 1.3318557324199074, + "grad_norm": 0.062186673283576965, + "learning_rate": 8.773824748722492e-05, + "loss": 0.2065, + "step": 6610 + }, + { + "epoch": 1.3322587144872053, + "grad_norm": 0.06363217532634735, + "learning_rate": 8.772950401529386e-05, + "loss": 0.1999, + "step": 6612 + }, + { + "epoch": 1.3326616965545033, + "grad_norm": 0.05283069983124733, + "learning_rate": 8.772075786311916e-05, + "loss": 0.2548, + "step": 6614 + }, + { + "epoch": 1.3330646786218012, + "grad_norm": 0.045274145901203156, + "learning_rate": 8.771200903132215e-05, + "loss": 0.1845, + "step": 6616 + }, + { + "epoch": 1.3334676606890994, + "grad_norm": 0.05644421651959419, + "learning_rate": 8.770325752052432e-05, + "loss": 0.1792, + "step": 6618 + }, + { + "epoch": 1.3338706427563973, + "grad_norm": 0.04432598128914833, + "learning_rate": 8.769450333134739e-05, + "loss": 0.1934, + "step": 6620 + }, + { + "epoch": 1.3342736248236953, + "grad_norm": 0.05254184454679489, + "learning_rate": 8.768574646441323e-05, + "loss": 0.1675, + "step": 6622 + }, + { + "epoch": 1.3346766068909934, + "grad_norm": 0.05594053491950035, + "learning_rate": 8.767698692034389e-05, + "loss": 0.2387, + "step": 6624 + }, + { + "epoch": 1.3350795889582914, + "grad_norm": 0.07795904576778412, + "learning_rate": 8.766822469976167e-05, + "loss": 0.2225, + "step": 6626 + }, + { + "epoch": 1.3354825710255893, + "grad_norm": 0.05772830918431282, + "learning_rate": 8.7659459803289e-05, + "loss": 0.1746, + "step": 6628 + }, + { + "epoch": 1.3358855530928873, + "grad_norm": 0.04830396547913551, + "learning_rate": 8.765069223154853e-05, + "loss": 0.1767, + "step": 6630 + }, + { + "epoch": 1.3362885351601854, + "grad_norm": 0.04581188037991524, + "learning_rate": 8.764192198516313e-05, + "loss": 0.1916, + "step": 6632 + }, + { + "epoch": 1.3366915172274834, + "grad_norm": 0.06372931599617004, + "learning_rate": 8.763314906475574e-05, + "loss": 0.1885, + "step": 6634 + }, + { + "epoch": 1.3370944992947813, + "grad_norm": 0.047626253217458725, + "learning_rate": 8.762437347094965e-05, + "loss": 0.1751, + "step": 6636 + }, + { + "epoch": 1.3374974813620795, + "grad_norm": 0.03912975639104843, + "learning_rate": 8.761559520436826e-05, + "loss": 0.1827, + "step": 6638 + }, + { + "epoch": 1.3379004634293774, + "grad_norm": 0.07140173017978668, + "learning_rate": 8.760681426563512e-05, + "loss": 0.2485, + "step": 6640 + }, + { + "epoch": 1.3383034454966753, + "grad_norm": 0.050327908247709274, + "learning_rate": 8.759803065537404e-05, + "loss": 0.1755, + "step": 6642 + }, + { + "epoch": 1.3387064275639733, + "grad_norm": 0.047158148139715195, + "learning_rate": 8.758924437420898e-05, + "loss": 0.1858, + "step": 6644 + }, + { + "epoch": 1.3391094096312715, + "grad_norm": 0.037591077387332916, + "learning_rate": 8.758045542276414e-05, + "loss": 0.1511, + "step": 6646 + }, + { + "epoch": 1.3395123916985694, + "grad_norm": 0.045494113117456436, + "learning_rate": 8.757166380166384e-05, + "loss": 0.2192, + "step": 6648 + }, + { + "epoch": 1.3399153737658676, + "grad_norm": 0.058958739042282104, + "learning_rate": 8.756286951153263e-05, + "loss": 0.2174, + "step": 6650 + }, + { + "epoch": 1.3403183558331655, + "grad_norm": 0.07330820709466934, + "learning_rate": 8.755407255299524e-05, + "loss": 0.2206, + "step": 6652 + }, + { + "epoch": 1.3407213379004634, + "grad_norm": 0.12760120630264282, + "learning_rate": 8.75452729266766e-05, + "loss": 0.2324, + "step": 6654 + }, + { + "epoch": 1.3411243199677614, + "grad_norm": 0.08199845999479294, + "learning_rate": 8.75364706332018e-05, + "loss": 0.1753, + "step": 6656 + }, + { + "epoch": 1.3415273020350593, + "grad_norm": 0.0407002717256546, + "learning_rate": 8.752766567319616e-05, + "loss": 0.1774, + "step": 6658 + }, + { + "epoch": 1.3419302841023575, + "grad_norm": 0.043844059109687805, + "learning_rate": 8.751885804728519e-05, + "loss": 0.1779, + "step": 6660 + }, + { + "epoch": 1.3423332661696554, + "grad_norm": 0.06086337938904762, + "learning_rate": 8.751004775609452e-05, + "loss": 0.1774, + "step": 6662 + }, + { + "epoch": 1.3427362482369536, + "grad_norm": 0.048990312963724136, + "learning_rate": 8.750123480025007e-05, + "loss": 0.1705, + "step": 6664 + }, + { + "epoch": 1.3431392303042515, + "grad_norm": 0.03980618715286255, + "learning_rate": 8.749241918037788e-05, + "loss": 0.1758, + "step": 6666 + }, + { + "epoch": 1.3435422123715495, + "grad_norm": 0.051659561693668365, + "learning_rate": 8.748360089710416e-05, + "loss": 0.196, + "step": 6668 + }, + { + "epoch": 1.3439451944388474, + "grad_norm": 0.05354244261980057, + "learning_rate": 8.74747799510554e-05, + "loss": 0.121, + "step": 6670 + }, + { + "epoch": 1.3443481765061454, + "grad_norm": 0.055109504610300064, + "learning_rate": 8.74659563428582e-05, + "loss": 0.2251, + "step": 6672 + }, + { + "epoch": 1.3447511585734435, + "grad_norm": 0.043078888207674026, + "learning_rate": 8.745713007313937e-05, + "loss": 0.169, + "step": 6674 + }, + { + "epoch": 1.3451541406407415, + "grad_norm": 0.05727505311369896, + "learning_rate": 8.744830114252592e-05, + "loss": 0.2085, + "step": 6676 + }, + { + "epoch": 1.3455571227080396, + "grad_norm": 0.043431010097265244, + "learning_rate": 8.743946955164506e-05, + "loss": 0.1945, + "step": 6678 + }, + { + "epoch": 1.3459601047753376, + "grad_norm": 0.04095043987035751, + "learning_rate": 8.743063530112416e-05, + "loss": 0.2139, + "step": 6680 + }, + { + "epoch": 1.3463630868426355, + "grad_norm": 0.05373978987336159, + "learning_rate": 8.742179839159077e-05, + "loss": 0.1522, + "step": 6682 + }, + { + "epoch": 1.3467660689099334, + "grad_norm": 0.05662880837917328, + "learning_rate": 8.741295882367269e-05, + "loss": 0.1871, + "step": 6684 + }, + { + "epoch": 1.3471690509772314, + "grad_norm": 0.04732615128159523, + "learning_rate": 8.740411659799785e-05, + "loss": 0.2025, + "step": 6686 + }, + { + "epoch": 1.3475720330445295, + "grad_norm": 0.043686844408512115, + "learning_rate": 8.739527171519437e-05, + "loss": 0.2166, + "step": 6688 + }, + { + "epoch": 1.3479750151118275, + "grad_norm": 0.039230071008205414, + "learning_rate": 8.73864241758906e-05, + "loss": 0.1561, + "step": 6690 + }, + { + "epoch": 1.3483779971791257, + "grad_norm": 0.04516879841685295, + "learning_rate": 8.737757398071505e-05, + "loss": 0.2342, + "step": 6692 + }, + { + "epoch": 1.3487809792464236, + "grad_norm": 0.03462446853518486, + "learning_rate": 8.736872113029642e-05, + "loss": 0.1728, + "step": 6694 + }, + { + "epoch": 1.3491839613137215, + "grad_norm": 0.045789361000061035, + "learning_rate": 8.735986562526361e-05, + "loss": 0.1634, + "step": 6696 + }, + { + "epoch": 1.3495869433810195, + "grad_norm": 0.045005571097135544, + "learning_rate": 8.735100746624568e-05, + "loss": 0.1449, + "step": 6698 + }, + { + "epoch": 1.3499899254483174, + "grad_norm": 0.0720299556851387, + "learning_rate": 8.734214665387193e-05, + "loss": 0.1818, + "step": 6700 + }, + { + "epoch": 1.3503929075156156, + "grad_norm": 0.04918527230620384, + "learning_rate": 8.733328318877179e-05, + "loss": 0.1818, + "step": 6702 + }, + { + "epoch": 1.3507958895829135, + "grad_norm": 0.04074351117014885, + "learning_rate": 8.73244170715749e-05, + "loss": 0.1968, + "step": 6704 + }, + { + "epoch": 1.3511988716502117, + "grad_norm": 0.055268727242946625, + "learning_rate": 8.731554830291114e-05, + "loss": 0.2063, + "step": 6706 + }, + { + "epoch": 1.3516018537175096, + "grad_norm": 0.037626102566719055, + "learning_rate": 8.73066768834105e-05, + "loss": 0.1903, + "step": 6708 + }, + { + "epoch": 1.3520048357848076, + "grad_norm": 0.05037368834018707, + "learning_rate": 8.72978028137032e-05, + "loss": 0.1652, + "step": 6710 + }, + { + "epoch": 1.3524078178521055, + "grad_norm": 0.07378649711608887, + "learning_rate": 8.728892609441964e-05, + "loss": 0.1998, + "step": 6712 + }, + { + "epoch": 1.3528107999194035, + "grad_norm": 0.040671806782484055, + "learning_rate": 8.728004672619039e-05, + "loss": 0.1678, + "step": 6714 + }, + { + "epoch": 1.3532137819867016, + "grad_norm": 0.04682549834251404, + "learning_rate": 8.727116470964624e-05, + "loss": 0.2063, + "step": 6716 + }, + { + "epoch": 1.3536167640539996, + "grad_norm": 0.06274406611919403, + "learning_rate": 8.726228004541818e-05, + "loss": 0.1976, + "step": 6718 + }, + { + "epoch": 1.3540197461212977, + "grad_norm": 0.049354761838912964, + "learning_rate": 8.725339273413731e-05, + "loss": 0.2151, + "step": 6720 + }, + { + "epoch": 1.3544227281885957, + "grad_norm": 0.06009558215737343, + "learning_rate": 8.724450277643501e-05, + "loss": 0.2396, + "step": 6722 + }, + { + "epoch": 1.3548257102558936, + "grad_norm": 0.042581889778375626, + "learning_rate": 8.72356101729428e-05, + "loss": 0.2058, + "step": 6724 + }, + { + "epoch": 1.3552286923231915, + "grad_norm": 0.05875520780682564, + "learning_rate": 8.72267149242924e-05, + "loss": 0.2039, + "step": 6726 + }, + { + "epoch": 1.3556316743904897, + "grad_norm": 0.04886787384748459, + "learning_rate": 8.721781703111568e-05, + "loss": 0.1559, + "step": 6728 + }, + { + "epoch": 1.3560346564577876, + "grad_norm": 0.04805205762386322, + "learning_rate": 8.72089164940448e-05, + "loss": 0.1894, + "step": 6730 + }, + { + "epoch": 1.3564376385250856, + "grad_norm": 0.0584242157638073, + "learning_rate": 8.720001331371197e-05, + "loss": 0.1765, + "step": 6732 + }, + { + "epoch": 1.3568406205923838, + "grad_norm": 0.043926917016506195, + "learning_rate": 8.719110749074969e-05, + "loss": 0.1696, + "step": 6734 + }, + { + "epoch": 1.3572436026596817, + "grad_norm": 0.07441367954015732, + "learning_rate": 8.71821990257906e-05, + "loss": 0.2257, + "step": 6736 + }, + { + "epoch": 1.3576465847269796, + "grad_norm": 0.052764154970645905, + "learning_rate": 8.717328791946758e-05, + "loss": 0.161, + "step": 6738 + }, + { + "epoch": 1.3580495667942776, + "grad_norm": 0.05789724364876747, + "learning_rate": 8.716437417241363e-05, + "loss": 0.1846, + "step": 6740 + }, + { + "epoch": 1.3584525488615757, + "grad_norm": 0.05074714124202728, + "learning_rate": 8.715545778526197e-05, + "loss": 0.1703, + "step": 6742 + }, + { + "epoch": 1.3588555309288737, + "grad_norm": 0.057830099016427994, + "learning_rate": 8.714653875864601e-05, + "loss": 0.1658, + "step": 6744 + }, + { + "epoch": 1.3592585129961716, + "grad_norm": 0.06288470327854156, + "learning_rate": 8.713761709319934e-05, + "loss": 0.1938, + "step": 6746 + }, + { + "epoch": 1.3596614950634698, + "grad_norm": 0.0460391566157341, + "learning_rate": 8.712869278955575e-05, + "loss": 0.2288, + "step": 6748 + }, + { + "epoch": 1.3600644771307677, + "grad_norm": 0.04402383044362068, + "learning_rate": 8.71197658483492e-05, + "loss": 0.182, + "step": 6750 + }, + { + "epoch": 1.3604674591980657, + "grad_norm": 0.04419035091996193, + "learning_rate": 8.711083627021386e-05, + "loss": 0.169, + "step": 6752 + }, + { + "epoch": 1.3608704412653636, + "grad_norm": 0.040202751755714417, + "learning_rate": 8.710190405578404e-05, + "loss": 0.1829, + "step": 6754 + }, + { + "epoch": 1.3612734233326618, + "grad_norm": 0.04582332819700241, + "learning_rate": 8.709296920569432e-05, + "loss": 0.2035, + "step": 6756 + }, + { + "epoch": 1.3616764053999597, + "grad_norm": 0.06750451028347015, + "learning_rate": 8.708403172057936e-05, + "loss": 0.2098, + "step": 6758 + }, + { + "epoch": 1.3620793874672577, + "grad_norm": 0.05921706184744835, + "learning_rate": 8.707509160107411e-05, + "loss": 0.1745, + "step": 6760 + }, + { + "epoch": 1.3624823695345558, + "grad_norm": 0.05832752212882042, + "learning_rate": 8.706614884781363e-05, + "loss": 0.1917, + "step": 6762 + }, + { + "epoch": 1.3628853516018538, + "grad_norm": 0.03713718429207802, + "learning_rate": 8.705720346143325e-05, + "loss": 0.1709, + "step": 6764 + }, + { + "epoch": 1.3632883336691517, + "grad_norm": 0.057797905057668686, + "learning_rate": 8.704825544256837e-05, + "loss": 0.2044, + "step": 6766 + }, + { + "epoch": 1.3636913157364496, + "grad_norm": 0.04862739145755768, + "learning_rate": 8.703930479185467e-05, + "loss": 0.217, + "step": 6768 + }, + { + "epoch": 1.3640942978037478, + "grad_norm": 0.05247446522116661, + "learning_rate": 8.703035150992802e-05, + "loss": 0.2, + "step": 6770 + }, + { + "epoch": 1.3644972798710457, + "grad_norm": 0.05208409205079079, + "learning_rate": 8.70213955974244e-05, + "loss": 0.1663, + "step": 6772 + }, + { + "epoch": 1.3649002619383437, + "grad_norm": 0.0481448620557785, + "learning_rate": 8.701243705498003e-05, + "loss": 0.1961, + "step": 6774 + }, + { + "epoch": 1.3653032440056418, + "grad_norm": 0.044661637395620346, + "learning_rate": 8.700347588323135e-05, + "loss": 0.1508, + "step": 6776 + }, + { + "epoch": 1.3657062260729398, + "grad_norm": 0.06829683482646942, + "learning_rate": 8.69945120828149e-05, + "loss": 0.1964, + "step": 6778 + }, + { + "epoch": 1.3661092081402377, + "grad_norm": 0.06598050892353058, + "learning_rate": 8.69855456543675e-05, + "loss": 0.1932, + "step": 6780 + }, + { + "epoch": 1.3665121902075357, + "grad_norm": 0.04563366621732712, + "learning_rate": 8.697657659852608e-05, + "loss": 0.2021, + "step": 6782 + }, + { + "epoch": 1.3669151722748338, + "grad_norm": 0.049588385969400406, + "learning_rate": 8.696760491592778e-05, + "loss": 0.2342, + "step": 6784 + }, + { + "epoch": 1.3673181543421318, + "grad_norm": 0.06191490218043327, + "learning_rate": 8.695863060720995e-05, + "loss": 0.2053, + "step": 6786 + }, + { + "epoch": 1.3677211364094297, + "grad_norm": 0.04240760579705238, + "learning_rate": 8.694965367301013e-05, + "loss": 0.2053, + "step": 6788 + }, + { + "epoch": 1.3681241184767279, + "grad_norm": 0.0616329126060009, + "learning_rate": 8.694067411396599e-05, + "loss": 0.1861, + "step": 6790 + }, + { + "epoch": 1.3685271005440258, + "grad_norm": 0.04297064244747162, + "learning_rate": 8.693169193071543e-05, + "loss": 0.1687, + "step": 6792 + }, + { + "epoch": 1.3689300826113238, + "grad_norm": 0.06308623403310776, + "learning_rate": 8.692270712389654e-05, + "loss": 0.2375, + "step": 6794 + }, + { + "epoch": 1.3693330646786217, + "grad_norm": 0.04142848029732704, + "learning_rate": 8.691371969414759e-05, + "loss": 0.1678, + "step": 6796 + }, + { + "epoch": 1.3697360467459199, + "grad_norm": 0.055353887379169464, + "learning_rate": 8.690472964210703e-05, + "loss": 0.1774, + "step": 6798 + }, + { + "epoch": 1.3701390288132178, + "grad_norm": 0.05628889426589012, + "learning_rate": 8.689573696841351e-05, + "loss": 0.1804, + "step": 6800 + }, + { + "epoch": 1.3705420108805157, + "grad_norm": 0.0353449322283268, + "learning_rate": 8.688674167370583e-05, + "loss": 0.1727, + "step": 6802 + }, + { + "epoch": 1.370944992947814, + "grad_norm": 0.060777902603149414, + "learning_rate": 8.687774375862301e-05, + "loss": 0.1996, + "step": 6804 + }, + { + "epoch": 1.3713479750151119, + "grad_norm": 0.05757424980401993, + "learning_rate": 8.686874322380425e-05, + "loss": 0.2303, + "step": 6806 + }, + { + "epoch": 1.3717509570824098, + "grad_norm": 0.05264467000961304, + "learning_rate": 8.685974006988893e-05, + "loss": 0.2153, + "step": 6808 + }, + { + "epoch": 1.3721539391497077, + "grad_norm": 0.05513720214366913, + "learning_rate": 8.685073429751663e-05, + "loss": 0.2005, + "step": 6810 + }, + { + "epoch": 1.372556921217006, + "grad_norm": 0.06795477867126465, + "learning_rate": 8.68417259073271e-05, + "loss": 0.1603, + "step": 6812 + }, + { + "epoch": 1.3729599032843038, + "grad_norm": 0.04166838526725769, + "learning_rate": 8.683271489996029e-05, + "loss": 0.2005, + "step": 6814 + }, + { + "epoch": 1.3733628853516018, + "grad_norm": 0.05827517807483673, + "learning_rate": 8.68237012760563e-05, + "loss": 0.2218, + "step": 6816 + }, + { + "epoch": 1.3737658674189, + "grad_norm": 0.07584039121866226, + "learning_rate": 8.681468503625548e-05, + "loss": 0.2158, + "step": 6818 + }, + { + "epoch": 1.3741688494861979, + "grad_norm": 0.05099467188119888, + "learning_rate": 8.680566618119829e-05, + "loss": 0.221, + "step": 6820 + }, + { + "epoch": 1.3745718315534958, + "grad_norm": 0.06616433709859848, + "learning_rate": 8.679664471152546e-05, + "loss": 0.1717, + "step": 6822 + }, + { + "epoch": 1.3749748136207938, + "grad_norm": 0.048993200063705444, + "learning_rate": 8.678762062787782e-05, + "loss": 0.1877, + "step": 6824 + }, + { + "epoch": 1.375377795688092, + "grad_norm": 0.059043314307928085, + "learning_rate": 8.677859393089646e-05, + "loss": 0.2321, + "step": 6826 + }, + { + "epoch": 1.3757807777553899, + "grad_norm": 0.06745105981826782, + "learning_rate": 8.676956462122259e-05, + "loss": 0.1757, + "step": 6828 + }, + { + "epoch": 1.3761837598226878, + "grad_norm": 0.03870021179318428, + "learning_rate": 8.676053269949766e-05, + "loss": 0.1822, + "step": 6830 + }, + { + "epoch": 1.376586741889986, + "grad_norm": 0.03666188195347786, + "learning_rate": 8.675149816636327e-05, + "loss": 0.1847, + "step": 6832 + }, + { + "epoch": 1.376989723957284, + "grad_norm": 0.04749156907200813, + "learning_rate": 8.674246102246125e-05, + "loss": 0.2007, + "step": 6834 + }, + { + "epoch": 1.3773927060245819, + "grad_norm": 0.044996339827775955, + "learning_rate": 8.673342126843353e-05, + "loss": 0.2072, + "step": 6836 + }, + { + "epoch": 1.3777956880918798, + "grad_norm": 0.047479066997766495, + "learning_rate": 8.672437890492234e-05, + "loss": 0.1748, + "step": 6838 + }, + { + "epoch": 1.378198670159178, + "grad_norm": 0.049724042415618896, + "learning_rate": 8.671533393256998e-05, + "loss": 0.1375, + "step": 6840 + }, + { + "epoch": 1.378601652226476, + "grad_norm": 0.05517043545842171, + "learning_rate": 8.670628635201901e-05, + "loss": 0.1694, + "step": 6842 + }, + { + "epoch": 1.3790046342937738, + "grad_norm": 0.04162873700261116, + "learning_rate": 8.669723616391217e-05, + "loss": 0.1945, + "step": 6844 + }, + { + "epoch": 1.379407616361072, + "grad_norm": 0.05656104534864426, + "learning_rate": 8.668818336889237e-05, + "loss": 0.1662, + "step": 6846 + }, + { + "epoch": 1.37981059842837, + "grad_norm": 0.04506509006023407, + "learning_rate": 8.667912796760269e-05, + "loss": 0.2062, + "step": 6848 + }, + { + "epoch": 1.380213580495668, + "grad_norm": 0.06399524211883545, + "learning_rate": 8.667006996068642e-05, + "loss": 0.2252, + "step": 6850 + }, + { + "epoch": 1.3806165625629658, + "grad_norm": 0.0532291941344738, + "learning_rate": 8.666100934878702e-05, + "loss": 0.1534, + "step": 6852 + }, + { + "epoch": 1.381019544630264, + "grad_norm": 0.06636221706867218, + "learning_rate": 8.665194613254814e-05, + "loss": 0.2373, + "step": 6854 + }, + { + "epoch": 1.381422526697562, + "grad_norm": 0.051087286323308945, + "learning_rate": 8.664288031261365e-05, + "loss": 0.2099, + "step": 6856 + }, + { + "epoch": 1.38182550876486, + "grad_norm": 0.07028748840093613, + "learning_rate": 8.663381188962753e-05, + "loss": 0.1864, + "step": 6858 + }, + { + "epoch": 1.382228490832158, + "grad_norm": 0.0513707660138607, + "learning_rate": 8.6624740864234e-05, + "loss": 0.2527, + "step": 6860 + }, + { + "epoch": 1.382631472899456, + "grad_norm": 0.041053254157304764, + "learning_rate": 8.661566723707745e-05, + "loss": 0.1516, + "step": 6862 + }, + { + "epoch": 1.383034454966754, + "grad_norm": 0.029478134587407112, + "learning_rate": 8.660659100880246e-05, + "loss": 0.1596, + "step": 6864 + }, + { + "epoch": 1.3834374370340519, + "grad_norm": 0.05876408517360687, + "learning_rate": 8.659751218005379e-05, + "loss": 0.1817, + "step": 6866 + }, + { + "epoch": 1.38384041910135, + "grad_norm": 0.06829272210597992, + "learning_rate": 8.658843075147636e-05, + "loss": 0.1947, + "step": 6868 + }, + { + "epoch": 1.384243401168648, + "grad_norm": 0.049879446625709534, + "learning_rate": 8.657934672371534e-05, + "loss": 0.2194, + "step": 6870 + }, + { + "epoch": 1.3846463832359461, + "grad_norm": 0.03202884644269943, + "learning_rate": 8.657026009741605e-05, + "loss": 0.1768, + "step": 6872 + }, + { + "epoch": 1.385049365303244, + "grad_norm": 0.06864320486783981, + "learning_rate": 8.656117087322395e-05, + "loss": 0.2288, + "step": 6874 + }, + { + "epoch": 1.385452347370542, + "grad_norm": 0.05002441629767418, + "learning_rate": 8.655207905178474e-05, + "loss": 0.2283, + "step": 6876 + }, + { + "epoch": 1.38585532943784, + "grad_norm": 0.06078348681330681, + "learning_rate": 8.654298463374429e-05, + "loss": 0.1693, + "step": 6878 + }, + { + "epoch": 1.386258311505138, + "grad_norm": 0.050742197781801224, + "learning_rate": 8.653388761974865e-05, + "loss": 0.1947, + "step": 6880 + }, + { + "epoch": 1.386661293572436, + "grad_norm": 0.03963219001889229, + "learning_rate": 8.652478801044407e-05, + "loss": 0.161, + "step": 6882 + }, + { + "epoch": 1.387064275639734, + "grad_norm": 0.04068451747298241, + "learning_rate": 8.651568580647698e-05, + "loss": 0.1831, + "step": 6884 + }, + { + "epoch": 1.3874672577070322, + "grad_norm": 0.05580208823084831, + "learning_rate": 8.650658100849394e-05, + "loss": 0.1938, + "step": 6886 + }, + { + "epoch": 1.38787023977433, + "grad_norm": 0.06083298847079277, + "learning_rate": 8.649747361714178e-05, + "loss": 0.2187, + "step": 6888 + }, + { + "epoch": 1.388273221841628, + "grad_norm": 0.06060599535703659, + "learning_rate": 8.648836363306745e-05, + "loss": 0.186, + "step": 6890 + }, + { + "epoch": 1.388676203908926, + "grad_norm": 0.05033830180764198, + "learning_rate": 8.647925105691814e-05, + "loss": 0.1962, + "step": 6892 + }, + { + "epoch": 1.389079185976224, + "grad_norm": 0.052239105105400085, + "learning_rate": 8.647013588934117e-05, + "loss": 0.2469, + "step": 6894 + }, + { + "epoch": 1.389482168043522, + "grad_norm": 0.06745372712612152, + "learning_rate": 8.646101813098407e-05, + "loss": 0.2453, + "step": 6896 + }, + { + "epoch": 1.38988515011082, + "grad_norm": 0.05417153611779213, + "learning_rate": 8.645189778249456e-05, + "loss": 0.1875, + "step": 6898 + }, + { + "epoch": 1.3902881321781182, + "grad_norm": 0.05408632755279541, + "learning_rate": 8.644277484452052e-05, + "loss": 0.1902, + "step": 6900 + }, + { + "epoch": 1.3906911142454161, + "grad_norm": 0.05546940863132477, + "learning_rate": 8.643364931771004e-05, + "loss": 0.2093, + "step": 6902 + }, + { + "epoch": 1.391094096312714, + "grad_norm": 0.041996292769908905, + "learning_rate": 8.642452120271137e-05, + "loss": 0.2097, + "step": 6904 + }, + { + "epoch": 1.391497078380012, + "grad_norm": 0.03974250331521034, + "learning_rate": 8.641539050017297e-05, + "loss": 0.1665, + "step": 6906 + }, + { + "epoch": 1.39190006044731, + "grad_norm": 0.04208201915025711, + "learning_rate": 8.640625721074347e-05, + "loss": 0.1463, + "step": 6908 + }, + { + "epoch": 1.3923030425146081, + "grad_norm": 0.07863224297761917, + "learning_rate": 8.639712133507169e-05, + "loss": 0.2162, + "step": 6910 + }, + { + "epoch": 1.392706024581906, + "grad_norm": 0.042710717767477036, + "learning_rate": 8.63879828738066e-05, + "loss": 0.1739, + "step": 6912 + }, + { + "epoch": 1.3931090066492042, + "grad_norm": 0.06020447984337807, + "learning_rate": 8.637884182759741e-05, + "loss": 0.2377, + "step": 6914 + }, + { + "epoch": 1.3935119887165022, + "grad_norm": 0.05448845028877258, + "learning_rate": 8.636969819709348e-05, + "loss": 0.1784, + "step": 6916 + }, + { + "epoch": 1.3939149707838001, + "grad_norm": 0.04206893965601921, + "learning_rate": 8.636055198294434e-05, + "loss": 0.2199, + "step": 6918 + }, + { + "epoch": 1.394317952851098, + "grad_norm": 0.06374957412481308, + "learning_rate": 8.635140318579976e-05, + "loss": 0.1851, + "step": 6920 + }, + { + "epoch": 1.394720934918396, + "grad_norm": 0.04802941903471947, + "learning_rate": 8.634225180630962e-05, + "loss": 0.1895, + "step": 6922 + }, + { + "epoch": 1.3951239169856942, + "grad_norm": 0.062166936695575714, + "learning_rate": 8.633309784512403e-05, + "loss": 0.245, + "step": 6924 + }, + { + "epoch": 1.395526899052992, + "grad_norm": 0.0570203997194767, + "learning_rate": 8.632394130289328e-05, + "loss": 0.2336, + "step": 6926 + }, + { + "epoch": 1.3959298811202903, + "grad_norm": 0.046736180782318115, + "learning_rate": 8.631478218026782e-05, + "loss": 0.2514, + "step": 6928 + }, + { + "epoch": 1.3963328631875882, + "grad_norm": 0.0648045614361763, + "learning_rate": 8.630562047789833e-05, + "loss": 0.1536, + "step": 6930 + }, + { + "epoch": 1.3967358452548861, + "grad_norm": 0.06284670531749725, + "learning_rate": 8.629645619643561e-05, + "loss": 0.2086, + "step": 6932 + }, + { + "epoch": 1.397138827322184, + "grad_norm": 0.05843097344040871, + "learning_rate": 8.62872893365307e-05, + "loss": 0.2045, + "step": 6934 + }, + { + "epoch": 1.3975418093894822, + "grad_norm": 0.03313927724957466, + "learning_rate": 8.627811989883479e-05, + "loss": 0.1621, + "step": 6936 + }, + { + "epoch": 1.3979447914567802, + "grad_norm": 0.05084552243351936, + "learning_rate": 8.626894788399925e-05, + "loss": 0.245, + "step": 6938 + }, + { + "epoch": 1.3983477735240781, + "grad_norm": 0.0713992714881897, + "learning_rate": 8.625977329267565e-05, + "loss": 0.2607, + "step": 6940 + }, + { + "epoch": 1.3987507555913763, + "grad_norm": 0.0683627724647522, + "learning_rate": 8.625059612551575e-05, + "loss": 0.1904, + "step": 6942 + }, + { + "epoch": 1.3991537376586742, + "grad_norm": 0.03592758998274803, + "learning_rate": 8.624141638317149e-05, + "loss": 0.203, + "step": 6944 + }, + { + "epoch": 1.3995567197259722, + "grad_norm": 0.04698161408305168, + "learning_rate": 8.623223406629495e-05, + "loss": 0.1839, + "step": 6946 + }, + { + "epoch": 1.3999597017932701, + "grad_norm": 0.05979125574231148, + "learning_rate": 8.622304917553846e-05, + "loss": 0.1984, + "step": 6948 + }, + { + "epoch": 1.4003626838605683, + "grad_norm": 0.042240679264068604, + "learning_rate": 8.621386171155448e-05, + "loss": 0.186, + "step": 6950 + }, + { + "epoch": 1.4007656659278662, + "grad_norm": 0.06413422524929047, + "learning_rate": 8.620467167499568e-05, + "loss": 0.1845, + "step": 6952 + }, + { + "epoch": 1.4011686479951642, + "grad_norm": 0.0588700994849205, + "learning_rate": 8.61954790665149e-05, + "loss": 0.1922, + "step": 6954 + }, + { + "epoch": 1.4015716300624623, + "grad_norm": 0.06049403175711632, + "learning_rate": 8.61862838867652e-05, + "loss": 0.2095, + "step": 6956 + }, + { + "epoch": 1.4019746121297603, + "grad_norm": 0.049626272171735764, + "learning_rate": 8.617708613639973e-05, + "loss": 0.179, + "step": 6958 + }, + { + "epoch": 1.4023775941970582, + "grad_norm": 0.04943282529711723, + "learning_rate": 8.616788581607193e-05, + "loss": 0.1887, + "step": 6960 + }, + { + "epoch": 1.4027805762643561, + "grad_norm": 0.043147701770067215, + "learning_rate": 8.615868292643536e-05, + "loss": 0.1697, + "step": 6962 + }, + { + "epoch": 1.4031835583316543, + "grad_norm": 0.04488087072968483, + "learning_rate": 8.614947746814379e-05, + "loss": 0.1726, + "step": 6964 + }, + { + "epoch": 1.4035865403989523, + "grad_norm": 0.05846525728702545, + "learning_rate": 8.614026944185117e-05, + "loss": 0.1389, + "step": 6966 + }, + { + "epoch": 1.4039895224662502, + "grad_norm": 0.03812554106116295, + "learning_rate": 8.613105884821157e-05, + "loss": 0.1742, + "step": 6968 + }, + { + "epoch": 1.4043925045335484, + "grad_norm": 0.06095854192972183, + "learning_rate": 8.612184568787936e-05, + "loss": 0.1818, + "step": 6970 + }, + { + "epoch": 1.4047954866008463, + "grad_norm": 0.05667596682906151, + "learning_rate": 8.611262996150899e-05, + "loss": 0.2194, + "step": 6972 + }, + { + "epoch": 1.4051984686681442, + "grad_norm": 0.06258885562419891, + "learning_rate": 8.610341166975513e-05, + "loss": 0.2332, + "step": 6974 + }, + { + "epoch": 1.4056014507354422, + "grad_norm": 0.05343182757496834, + "learning_rate": 8.609419081327266e-05, + "loss": 0.1721, + "step": 6976 + }, + { + "epoch": 1.4060044328027403, + "grad_norm": 0.0530022569000721, + "learning_rate": 8.608496739271659e-05, + "loss": 0.21, + "step": 6978 + }, + { + "epoch": 1.4064074148700383, + "grad_norm": 0.05478819087147713, + "learning_rate": 8.607574140874214e-05, + "loss": 0.1335, + "step": 6980 + }, + { + "epoch": 1.4068103969373362, + "grad_norm": 0.047832515090703964, + "learning_rate": 8.606651286200474e-05, + "loss": 0.1664, + "step": 6982 + }, + { + "epoch": 1.4072133790046344, + "grad_norm": 0.030942801386117935, + "learning_rate": 8.605728175315993e-05, + "loss": 0.1981, + "step": 6984 + }, + { + "epoch": 1.4076163610719323, + "grad_norm": 0.047290656715631485, + "learning_rate": 8.604804808286348e-05, + "loss": 0.195, + "step": 6986 + }, + { + "epoch": 1.4080193431392303, + "grad_norm": 0.061841338872909546, + "learning_rate": 8.603881185177136e-05, + "loss": 0.202, + "step": 6988 + }, + { + "epoch": 1.4084223252065282, + "grad_norm": 0.054422467947006226, + "learning_rate": 8.602957306053968e-05, + "loss": 0.2143, + "step": 6990 + }, + { + "epoch": 1.4088253072738264, + "grad_norm": 0.04802738130092621, + "learning_rate": 8.602033170982475e-05, + "loss": 0.1735, + "step": 6992 + }, + { + "epoch": 1.4092282893411243, + "grad_norm": 0.07405896484851837, + "learning_rate": 8.601108780028306e-05, + "loss": 0.2378, + "step": 6994 + }, + { + "epoch": 1.4096312714084223, + "grad_norm": 0.07153638452291489, + "learning_rate": 8.600184133257127e-05, + "loss": 0.2177, + "step": 6996 + }, + { + "epoch": 1.4100342534757204, + "grad_norm": 0.11758548021316528, + "learning_rate": 8.599259230734626e-05, + "loss": 0.2042, + "step": 6998 + }, + { + "epoch": 1.4104372355430184, + "grad_norm": 0.050041794776916504, + "learning_rate": 8.598334072526507e-05, + "loss": 0.1478, + "step": 7000 + }, + { + "epoch": 1.4108402176103163, + "grad_norm": 0.044110897928476334, + "learning_rate": 8.597408658698488e-05, + "loss": 0.1432, + "step": 7002 + }, + { + "epoch": 1.4112431996776142, + "grad_norm": 0.06034010276198387, + "learning_rate": 8.596482989316312e-05, + "loss": 0.1976, + "step": 7004 + }, + { + "epoch": 1.4116461817449124, + "grad_norm": 0.07794831693172455, + "learning_rate": 8.595557064445736e-05, + "loss": 0.2376, + "step": 7006 + }, + { + "epoch": 1.4120491638122104, + "grad_norm": 0.07392967492341995, + "learning_rate": 8.594630884152537e-05, + "loss": 0.2335, + "step": 7008 + }, + { + "epoch": 1.4124521458795083, + "grad_norm": 0.06214665621519089, + "learning_rate": 8.593704448502507e-05, + "loss": 0.1827, + "step": 7010 + }, + { + "epoch": 1.4128551279468065, + "grad_norm": 0.05099937692284584, + "learning_rate": 8.59277775756146e-05, + "loss": 0.1647, + "step": 7012 + }, + { + "epoch": 1.4132581100141044, + "grad_norm": 0.08212552219629288, + "learning_rate": 8.591850811395231e-05, + "loss": 0.2181, + "step": 7014 + }, + { + "epoch": 1.4136610920814023, + "grad_norm": 0.11238207668066025, + "learning_rate": 8.59092361006966e-05, + "loss": 0.2202, + "step": 7016 + }, + { + "epoch": 1.4140640741487003, + "grad_norm": 0.0554063580930233, + "learning_rate": 8.589996153650622e-05, + "loss": 0.1892, + "step": 7018 + }, + { + "epoch": 1.4144670562159984, + "grad_norm": 0.045140497386455536, + "learning_rate": 8.589068442203996e-05, + "loss": 0.1798, + "step": 7020 + }, + { + "epoch": 1.4148700382832964, + "grad_norm": 0.04684115946292877, + "learning_rate": 8.58814047579569e-05, + "loss": 0.181, + "step": 7022 + }, + { + "epoch": 1.4152730203505943, + "grad_norm": 0.06891334801912308, + "learning_rate": 8.587212254491621e-05, + "loss": 0.2023, + "step": 7024 + }, + { + "epoch": 1.4156760024178925, + "grad_norm": 0.0646648108959198, + "learning_rate": 8.58628377835773e-05, + "loss": 0.1956, + "step": 7026 + }, + { + "epoch": 1.4160789844851904, + "grad_norm": 0.06795363128185272, + "learning_rate": 8.585355047459976e-05, + "loss": 0.1768, + "step": 7028 + }, + { + "epoch": 1.4164819665524884, + "grad_norm": 0.062433384358882904, + "learning_rate": 8.584426061864335e-05, + "loss": 0.2723, + "step": 7030 + }, + { + "epoch": 1.4168849486197863, + "grad_norm": 0.030506597831845284, + "learning_rate": 8.583496821636797e-05, + "loss": 0.1273, + "step": 7032 + }, + { + "epoch": 1.4172879306870845, + "grad_norm": 0.04489409551024437, + "learning_rate": 8.582567326843376e-05, + "loss": 0.151, + "step": 7034 + }, + { + "epoch": 1.4176909127543824, + "grad_norm": 0.043156519532203674, + "learning_rate": 8.581637577550101e-05, + "loss": 0.222, + "step": 7036 + }, + { + "epoch": 1.4180938948216804, + "grad_norm": 0.05898886173963547, + "learning_rate": 8.580707573823021e-05, + "loss": 0.2044, + "step": 7038 + }, + { + "epoch": 1.4184968768889785, + "grad_norm": 0.05552120506763458, + "learning_rate": 8.579777315728202e-05, + "loss": 0.192, + "step": 7040 + }, + { + "epoch": 1.4188998589562765, + "grad_norm": 0.07700909674167633, + "learning_rate": 8.578846803331726e-05, + "loss": 0.1817, + "step": 7042 + }, + { + "epoch": 1.4193028410235744, + "grad_norm": 0.05442452058196068, + "learning_rate": 8.577916036699698e-05, + "loss": 0.1612, + "step": 7044 + }, + { + "epoch": 1.4197058230908723, + "grad_norm": 0.03962863236665726, + "learning_rate": 8.576985015898237e-05, + "loss": 0.1251, + "step": 7046 + }, + { + "epoch": 1.4201088051581705, + "grad_norm": 0.07024955004453659, + "learning_rate": 8.57605374099348e-05, + "loss": 0.2465, + "step": 7048 + }, + { + "epoch": 1.4205117872254684, + "grad_norm": 0.06219443678855896, + "learning_rate": 8.575122212051585e-05, + "loss": 0.173, + "step": 7050 + }, + { + "epoch": 1.4209147692927664, + "grad_norm": 0.05334271490573883, + "learning_rate": 8.574190429138726e-05, + "loss": 0.1995, + "step": 7052 + }, + { + "epoch": 1.4213177513600646, + "grad_norm": 0.05093487352132797, + "learning_rate": 8.573258392321093e-05, + "loss": 0.1886, + "step": 7054 + }, + { + "epoch": 1.4217207334273625, + "grad_norm": 0.07703381776809692, + "learning_rate": 8.5723261016649e-05, + "loss": 0.196, + "step": 7056 + }, + { + "epoch": 1.4221237154946604, + "grad_norm": 0.05135168135166168, + "learning_rate": 8.571393557236373e-05, + "loss": 0.1924, + "step": 7058 + }, + { + "epoch": 1.4225266975619584, + "grad_norm": 0.05865864083170891, + "learning_rate": 8.570460759101761e-05, + "loss": 0.2021, + "step": 7060 + }, + { + "epoch": 1.4229296796292565, + "grad_norm": 0.05785017088055611, + "learning_rate": 8.569527707327325e-05, + "loss": 0.1633, + "step": 7062 + }, + { + "epoch": 1.4233326616965545, + "grad_norm": 0.05694718286395073, + "learning_rate": 8.56859440197935e-05, + "loss": 0.1682, + "step": 7064 + }, + { + "epoch": 1.4237356437638526, + "grad_norm": 0.0642189010977745, + "learning_rate": 8.567660843124135e-05, + "loss": 0.1794, + "step": 7066 + }, + { + "epoch": 1.4241386258311506, + "grad_norm": 0.05178860202431679, + "learning_rate": 8.566727030828001e-05, + "loss": 0.1915, + "step": 7068 + }, + { + "epoch": 1.4245416078984485, + "grad_norm": 0.06810785830020905, + "learning_rate": 8.565792965157281e-05, + "loss": 0.2163, + "step": 7070 + }, + { + "epoch": 1.4249445899657465, + "grad_norm": 0.05047709494829178, + "learning_rate": 8.564858646178333e-05, + "loss": 0.2125, + "step": 7072 + }, + { + "epoch": 1.4253475720330444, + "grad_norm": 0.057600557804107666, + "learning_rate": 8.563924073957527e-05, + "loss": 0.1998, + "step": 7074 + }, + { + "epoch": 1.4257505541003426, + "grad_norm": 0.04869558662176132, + "learning_rate": 8.562989248561256e-05, + "loss": 0.1805, + "step": 7076 + }, + { + "epoch": 1.4261535361676405, + "grad_norm": 0.06351305544376373, + "learning_rate": 8.562054170055924e-05, + "loss": 0.1748, + "step": 7078 + }, + { + "epoch": 1.4265565182349387, + "grad_norm": 0.046501412987709045, + "learning_rate": 8.561118838507962e-05, + "loss": 0.2176, + "step": 7080 + }, + { + "epoch": 1.4269595003022366, + "grad_norm": 0.057603869587183, + "learning_rate": 8.560183253983813e-05, + "loss": 0.2286, + "step": 7082 + }, + { + "epoch": 1.4273624823695346, + "grad_norm": 0.07590744644403458, + "learning_rate": 8.55924741654994e-05, + "loss": 0.201, + "step": 7084 + }, + { + "epoch": 1.4277654644368325, + "grad_norm": 0.049121540039777756, + "learning_rate": 8.558311326272821e-05, + "loss": 0.1838, + "step": 7086 + }, + { + "epoch": 1.4281684465041304, + "grad_norm": 0.04728762432932854, + "learning_rate": 8.557374983218957e-05, + "loss": 0.1613, + "step": 7088 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.05571918934583664, + "learning_rate": 8.556438387454864e-05, + "loss": 0.231, + "step": 7090 + }, + { + "epoch": 1.4289744106387265, + "grad_norm": 0.04902351647615433, + "learning_rate": 8.555501539047075e-05, + "loss": 0.2, + "step": 7092 + }, + { + "epoch": 1.4293773927060247, + "grad_norm": 0.05943556874990463, + "learning_rate": 8.554564438062142e-05, + "loss": 0.1962, + "step": 7094 + }, + { + "epoch": 1.4297803747733226, + "grad_norm": 0.04188118875026703, + "learning_rate": 8.553627084566637e-05, + "loss": 0.1585, + "step": 7096 + }, + { + "epoch": 1.4301833568406206, + "grad_norm": 0.058290161192417145, + "learning_rate": 8.552689478627147e-05, + "loss": 0.1897, + "step": 7098 + }, + { + "epoch": 1.4305863389079185, + "grad_norm": 0.05757759138941765, + "learning_rate": 8.551751620310279e-05, + "loss": 0.2302, + "step": 7100 + }, + { + "epoch": 1.4309893209752165, + "grad_norm": 0.06541424989700317, + "learning_rate": 8.550813509682654e-05, + "loss": 0.1784, + "step": 7102 + }, + { + "epoch": 1.4313923030425146, + "grad_norm": 0.05995183438062668, + "learning_rate": 8.549875146810918e-05, + "loss": 0.2322, + "step": 7104 + }, + { + "epoch": 1.4317952851098126, + "grad_norm": 0.050467684864997864, + "learning_rate": 8.548936531761727e-05, + "loss": 0.1946, + "step": 7106 + }, + { + "epoch": 1.4321982671771107, + "grad_norm": 0.05367492884397507, + "learning_rate": 8.547997664601763e-05, + "loss": 0.1872, + "step": 7108 + }, + { + "epoch": 1.4326012492444087, + "grad_norm": 0.07544504106044769, + "learning_rate": 8.547058545397717e-05, + "loss": 0.2195, + "step": 7110 + }, + { + "epoch": 1.4330042313117066, + "grad_norm": 0.07416324317455292, + "learning_rate": 8.546119174216305e-05, + "loss": 0.1982, + "step": 7112 + }, + { + "epoch": 1.4334072133790046, + "grad_norm": 0.10676012933254242, + "learning_rate": 8.545179551124258e-05, + "loss": 0.1391, + "step": 7114 + }, + { + "epoch": 1.4338101954463025, + "grad_norm": 0.04345235228538513, + "learning_rate": 8.544239676188326e-05, + "loss": 0.1398, + "step": 7116 + }, + { + "epoch": 1.4342131775136007, + "grad_norm": 0.039523664861917496, + "learning_rate": 8.543299549475274e-05, + "loss": 0.1978, + "step": 7118 + }, + { + "epoch": 1.4346161595808986, + "grad_norm": 0.0516582652926445, + "learning_rate": 8.54235917105189e-05, + "loss": 0.2099, + "step": 7120 + }, + { + "epoch": 1.4350191416481968, + "grad_norm": 0.0456339530646801, + "learning_rate": 8.541418540984975e-05, + "loss": 0.2016, + "step": 7122 + }, + { + "epoch": 1.4354221237154947, + "grad_norm": 0.05590183287858963, + "learning_rate": 8.54047765934135e-05, + "loss": 0.1898, + "step": 7124 + }, + { + "epoch": 1.4358251057827927, + "grad_norm": 0.05954501032829285, + "learning_rate": 8.539536526187857e-05, + "loss": 0.198, + "step": 7126 + }, + { + "epoch": 1.4362280878500906, + "grad_norm": 0.09433047473430634, + "learning_rate": 8.538595141591348e-05, + "loss": 0.2157, + "step": 7128 + }, + { + "epoch": 1.4366310699173888, + "grad_norm": 0.06842294335365295, + "learning_rate": 8.5376535056187e-05, + "loss": 0.2662, + "step": 7130 + }, + { + "epoch": 1.4370340519846867, + "grad_norm": 0.0586603581905365, + "learning_rate": 8.536711618336802e-05, + "loss": 0.1726, + "step": 7132 + }, + { + "epoch": 1.4374370340519846, + "grad_norm": 0.04986822232604027, + "learning_rate": 8.535769479812569e-05, + "loss": 0.1851, + "step": 7134 + }, + { + "epoch": 1.4378400161192828, + "grad_norm": 0.05392155423760414, + "learning_rate": 8.534827090112927e-05, + "loss": 0.1937, + "step": 7136 + }, + { + "epoch": 1.4382429981865807, + "grad_norm": 0.033999454230070114, + "learning_rate": 8.53388444930482e-05, + "loss": 0.1801, + "step": 7138 + }, + { + "epoch": 1.4386459802538787, + "grad_norm": 0.04176968336105347, + "learning_rate": 8.532941557455214e-05, + "loss": 0.1373, + "step": 7140 + }, + { + "epoch": 1.4390489623211766, + "grad_norm": 0.05936324968934059, + "learning_rate": 8.53199841463109e-05, + "loss": 0.1359, + "step": 7142 + }, + { + "epoch": 1.4394519443884748, + "grad_norm": 0.04897318407893181, + "learning_rate": 8.531055020899448e-05, + "loss": 0.1773, + "step": 7144 + }, + { + "epoch": 1.4398549264557727, + "grad_norm": 0.05380289629101753, + "learning_rate": 8.530111376327304e-05, + "loss": 0.1658, + "step": 7146 + }, + { + "epoch": 1.4402579085230707, + "grad_norm": 0.056560710072517395, + "learning_rate": 8.529167480981693e-05, + "loss": 0.1902, + "step": 7148 + }, + { + "epoch": 1.4406608905903688, + "grad_norm": 0.05610078573226929, + "learning_rate": 8.528223334929669e-05, + "loss": 0.1944, + "step": 7150 + }, + { + "epoch": 1.4410638726576668, + "grad_norm": 0.11309154331684113, + "learning_rate": 8.5272789382383e-05, + "loss": 0.2735, + "step": 7152 + }, + { + "epoch": 1.4414668547249647, + "grad_norm": 0.05021926388144493, + "learning_rate": 8.52633429097468e-05, + "loss": 0.195, + "step": 7154 + }, + { + "epoch": 1.4418698367922627, + "grad_norm": 0.07295999675989151, + "learning_rate": 8.525389393205906e-05, + "loss": 0.2276, + "step": 7156 + }, + { + "epoch": 1.4422728188595608, + "grad_norm": 0.07204556465148926, + "learning_rate": 8.524444244999113e-05, + "loss": 0.1838, + "step": 7158 + }, + { + "epoch": 1.4426758009268588, + "grad_norm": 0.04845462366938591, + "learning_rate": 8.523498846421435e-05, + "loss": 0.1711, + "step": 7160 + }, + { + "epoch": 1.4430787829941567, + "grad_norm": 0.049779243767261505, + "learning_rate": 8.522553197540033e-05, + "loss": 0.195, + "step": 7162 + }, + { + "epoch": 1.4434817650614549, + "grad_norm": 0.06310156732797623, + "learning_rate": 8.521607298422087e-05, + "loss": 0.1704, + "step": 7164 + }, + { + "epoch": 1.4438847471287528, + "grad_norm": 0.04404526203870773, + "learning_rate": 8.52066114913479e-05, + "loss": 0.1512, + "step": 7166 + }, + { + "epoch": 1.4442877291960508, + "grad_norm": 0.05154913291335106, + "learning_rate": 8.519714749745356e-05, + "loss": 0.2179, + "step": 7168 + }, + { + "epoch": 1.4446907112633487, + "grad_norm": 0.04693300276994705, + "learning_rate": 8.518768100321013e-05, + "loss": 0.1985, + "step": 7170 + }, + { + "epoch": 1.4450936933306469, + "grad_norm": 0.05438321828842163, + "learning_rate": 8.517821200929013e-05, + "loss": 0.2356, + "step": 7172 + }, + { + "epoch": 1.4454966753979448, + "grad_norm": 0.05669800192117691, + "learning_rate": 8.516874051636621e-05, + "loss": 0.2373, + "step": 7174 + }, + { + "epoch": 1.4458996574652427, + "grad_norm": 0.051683925092220306, + "learning_rate": 8.51592665251112e-05, + "loss": 0.2001, + "step": 7176 + }, + { + "epoch": 1.446302639532541, + "grad_norm": 0.03920593857765198, + "learning_rate": 8.514979003619814e-05, + "loss": 0.1738, + "step": 7178 + }, + { + "epoch": 1.4467056215998388, + "grad_norm": 0.0427895151078701, + "learning_rate": 8.51403110503002e-05, + "loss": 0.162, + "step": 7180 + }, + { + "epoch": 1.4471086036671368, + "grad_norm": 0.06468407064676285, + "learning_rate": 8.513082956809075e-05, + "loss": 0.2271, + "step": 7182 + }, + { + "epoch": 1.4475115857344347, + "grad_norm": 0.039048973470926285, + "learning_rate": 8.512134559024337e-05, + "loss": 0.1726, + "step": 7184 + }, + { + "epoch": 1.4479145678017329, + "grad_norm": 0.04582072049379349, + "learning_rate": 8.511185911743176e-05, + "loss": 0.2126, + "step": 7186 + }, + { + "epoch": 1.4483175498690308, + "grad_norm": 0.04817645624279976, + "learning_rate": 8.510237015032982e-05, + "loss": 0.218, + "step": 7188 + }, + { + "epoch": 1.4487205319363288, + "grad_norm": 0.055014822632074356, + "learning_rate": 8.509287868961166e-05, + "loss": 0.1736, + "step": 7190 + }, + { + "epoch": 1.449123514003627, + "grad_norm": 0.04446453973650932, + "learning_rate": 8.508338473595152e-05, + "loss": 0.2389, + "step": 7192 + }, + { + "epoch": 1.4495264960709249, + "grad_norm": 0.04078453779220581, + "learning_rate": 8.507388829002383e-05, + "loss": 0.2143, + "step": 7194 + }, + { + "epoch": 1.4499294781382228, + "grad_norm": 0.04532262682914734, + "learning_rate": 8.50643893525032e-05, + "loss": 0.1713, + "step": 7196 + }, + { + "epoch": 1.4503324602055208, + "grad_norm": 0.05296036973595619, + "learning_rate": 8.505488792406444e-05, + "loss": 0.1575, + "step": 7198 + }, + { + "epoch": 1.450735442272819, + "grad_norm": 0.06389618664979935, + "learning_rate": 8.504538400538252e-05, + "loss": 0.2189, + "step": 7200 + }, + { + "epoch": 1.4511384243401169, + "grad_norm": 0.04667343944311142, + "learning_rate": 8.503587759713253e-05, + "loss": 0.2219, + "step": 7202 + }, + { + "epoch": 1.4515414064074148, + "grad_norm": 0.06371932476758957, + "learning_rate": 8.502636869998986e-05, + "loss": 0.1854, + "step": 7204 + }, + { + "epoch": 1.451944388474713, + "grad_norm": 0.04117533937096596, + "learning_rate": 8.501685731462995e-05, + "loss": 0.21, + "step": 7206 + }, + { + "epoch": 1.452347370542011, + "grad_norm": 0.043234214186668396, + "learning_rate": 8.500734344172849e-05, + "loss": 0.2015, + "step": 7208 + }, + { + "epoch": 1.4527503526093088, + "grad_norm": 0.05427609011530876, + "learning_rate": 8.499782708196136e-05, + "loss": 0.1649, + "step": 7210 + }, + { + "epoch": 1.4531533346766068, + "grad_norm": 0.05856523662805557, + "learning_rate": 8.498830823600457e-05, + "loss": 0.1776, + "step": 7212 + }, + { + "epoch": 1.453556316743905, + "grad_norm": 0.061823341995477676, + "learning_rate": 8.49787869045343e-05, + "loss": 0.178, + "step": 7214 + }, + { + "epoch": 1.453959298811203, + "grad_norm": 0.06578311324119568, + "learning_rate": 8.496926308822696e-05, + "loss": 0.2282, + "step": 7216 + }, + { + "epoch": 1.4543622808785008, + "grad_norm": 0.040528710931539536, + "learning_rate": 8.49597367877591e-05, + "loss": 0.2129, + "step": 7218 + }, + { + "epoch": 1.454765262945799, + "grad_norm": 0.05041544884443283, + "learning_rate": 8.495020800380742e-05, + "loss": 0.1878, + "step": 7220 + }, + { + "epoch": 1.455168245013097, + "grad_norm": 0.05043159797787666, + "learning_rate": 8.494067673704888e-05, + "loss": 0.2189, + "step": 7222 + }, + { + "epoch": 1.4555712270803949, + "grad_norm": 0.04681938886642456, + "learning_rate": 8.493114298816055e-05, + "loss": 0.199, + "step": 7224 + }, + { + "epoch": 1.4559742091476928, + "grad_norm": 0.055137768387794495, + "learning_rate": 8.492160675781967e-05, + "loss": 0.1672, + "step": 7226 + }, + { + "epoch": 1.456377191214991, + "grad_norm": 0.04934654012322426, + "learning_rate": 8.49120680467037e-05, + "loss": 0.2264, + "step": 7228 + }, + { + "epoch": 1.456780173282289, + "grad_norm": 0.0343373566865921, + "learning_rate": 8.490252685549026e-05, + "loss": 0.1736, + "step": 7230 + }, + { + "epoch": 1.4571831553495869, + "grad_norm": 0.051517315208911896, + "learning_rate": 8.489298318485712e-05, + "loss": 0.1566, + "step": 7232 + }, + { + "epoch": 1.457586137416885, + "grad_norm": 0.05607752874493599, + "learning_rate": 8.488343703548226e-05, + "loss": 0.1562, + "step": 7234 + }, + { + "epoch": 1.457989119484183, + "grad_norm": 0.049427617341279984, + "learning_rate": 8.487388840804383e-05, + "loss": 0.2388, + "step": 7236 + }, + { + "epoch": 1.458392101551481, + "grad_norm": 0.05366994068026543, + "learning_rate": 8.486433730322012e-05, + "loss": 0.1983, + "step": 7238 + }, + { + "epoch": 1.4587950836187789, + "grad_norm": 0.05468441918492317, + "learning_rate": 8.485478372168966e-05, + "loss": 0.2348, + "step": 7240 + }, + { + "epoch": 1.459198065686077, + "grad_norm": 0.05302935093641281, + "learning_rate": 8.48452276641311e-05, + "loss": 0.2082, + "step": 7242 + }, + { + "epoch": 1.459601047753375, + "grad_norm": 0.04598622024059296, + "learning_rate": 8.48356691312233e-05, + "loss": 0.1921, + "step": 7244 + }, + { + "epoch": 1.460004029820673, + "grad_norm": 0.06511973589658737, + "learning_rate": 8.482610812364527e-05, + "loss": 0.1838, + "step": 7246 + }, + { + "epoch": 1.460407011887971, + "grad_norm": 0.060067228972911835, + "learning_rate": 8.481654464207623e-05, + "loss": 0.16, + "step": 7248 + }, + { + "epoch": 1.460809993955269, + "grad_norm": 0.04335297271609306, + "learning_rate": 8.480697868719551e-05, + "loss": 0.1295, + "step": 7250 + }, + { + "epoch": 1.461212976022567, + "grad_norm": 0.056406330317258835, + "learning_rate": 8.47974102596827e-05, + "loss": 0.1607, + "step": 7252 + }, + { + "epoch": 1.4616159580898649, + "grad_norm": 0.06859233975410461, + "learning_rate": 8.478783936021753e-05, + "loss": 0.208, + "step": 7254 + }, + { + "epoch": 1.462018940157163, + "grad_norm": 0.0544903390109539, + "learning_rate": 8.477826598947989e-05, + "loss": 0.1753, + "step": 7256 + }, + { + "epoch": 1.462421922224461, + "grad_norm": 0.06715361773967743, + "learning_rate": 8.476869014814984e-05, + "loss": 0.1487, + "step": 7258 + }, + { + "epoch": 1.4628249042917592, + "grad_norm": 0.06288928538560867, + "learning_rate": 8.475911183690765e-05, + "loss": 0.1728, + "step": 7260 + }, + { + "epoch": 1.463227886359057, + "grad_norm": 0.0655706375837326, + "learning_rate": 8.474953105643374e-05, + "loss": 0.2087, + "step": 7262 + }, + { + "epoch": 1.463630868426355, + "grad_norm": 0.045362550765275955, + "learning_rate": 8.473994780740873e-05, + "loss": 0.162, + "step": 7264 + }, + { + "epoch": 1.464033850493653, + "grad_norm": 0.05193858593702316, + "learning_rate": 8.473036209051337e-05, + "loss": 0.2173, + "step": 7266 + }, + { + "epoch": 1.464436832560951, + "grad_norm": 0.0538729652762413, + "learning_rate": 8.472077390642864e-05, + "loss": 0.1997, + "step": 7268 + }, + { + "epoch": 1.464839814628249, + "grad_norm": 0.04861455410718918, + "learning_rate": 8.471118325583565e-05, + "loss": 0.2214, + "step": 7270 + }, + { + "epoch": 1.465242796695547, + "grad_norm": 0.04979060962796211, + "learning_rate": 8.470159013941572e-05, + "loss": 0.1745, + "step": 7272 + }, + { + "epoch": 1.4656457787628452, + "grad_norm": 0.04076359421014786, + "learning_rate": 8.469199455785032e-05, + "loss": 0.182, + "step": 7274 + }, + { + "epoch": 1.4660487608301431, + "grad_norm": 0.03544747456908226, + "learning_rate": 8.46823965118211e-05, + "loss": 0.1535, + "step": 7276 + }, + { + "epoch": 1.466451742897441, + "grad_norm": 0.0503484271466732, + "learning_rate": 8.467279600200993e-05, + "loss": 0.2295, + "step": 7278 + }, + { + "epoch": 1.466854724964739, + "grad_norm": 0.061403173953294754, + "learning_rate": 8.466319302909875e-05, + "loss": 0.1619, + "step": 7280 + }, + { + "epoch": 1.467257707032037, + "grad_norm": 0.048420753329992294, + "learning_rate": 8.465358759376979e-05, + "loss": 0.1717, + "step": 7282 + }, + { + "epoch": 1.4676606890993351, + "grad_norm": 0.04460752010345459, + "learning_rate": 8.464397969670538e-05, + "loss": 0.1453, + "step": 7284 + }, + { + "epoch": 1.468063671166633, + "grad_norm": 0.05327368900179863, + "learning_rate": 8.463436933858806e-05, + "loss": 0.2189, + "step": 7286 + }, + { + "epoch": 1.4684666532339312, + "grad_norm": 0.0458202101290226, + "learning_rate": 8.462475652010053e-05, + "loss": 0.1892, + "step": 7288 + }, + { + "epoch": 1.4688696353012292, + "grad_norm": 0.047448720782995224, + "learning_rate": 8.461514124192567e-05, + "loss": 0.2031, + "step": 7290 + }, + { + "epoch": 1.469272617368527, + "grad_norm": 0.05255912244319916, + "learning_rate": 8.460552350474654e-05, + "loss": 0.1929, + "step": 7292 + }, + { + "epoch": 1.469675599435825, + "grad_norm": 0.05827522650361061, + "learning_rate": 8.459590330924636e-05, + "loss": 0.2482, + "step": 7294 + }, + { + "epoch": 1.470078581503123, + "grad_norm": 0.08329940587282181, + "learning_rate": 8.458628065610853e-05, + "loss": 0.2222, + "step": 7296 + }, + { + "epoch": 1.4704815635704211, + "grad_norm": 0.0485442690551281, + "learning_rate": 8.457665554601667e-05, + "loss": 0.1998, + "step": 7298 + }, + { + "epoch": 1.470884545637719, + "grad_norm": 0.05297599732875824, + "learning_rate": 8.456702797965446e-05, + "loss": 0.2179, + "step": 7300 + }, + { + "epoch": 1.4712875277050173, + "grad_norm": 0.05084371566772461, + "learning_rate": 8.455739795770588e-05, + "loss": 0.2063, + "step": 7302 + }, + { + "epoch": 1.4716905097723152, + "grad_norm": 0.044608235359191895, + "learning_rate": 8.454776548085499e-05, + "loss": 0.1736, + "step": 7304 + }, + { + "epoch": 1.4720934918396131, + "grad_norm": 0.04484618082642555, + "learning_rate": 8.453813054978612e-05, + "loss": 0.2138, + "step": 7306 + }, + { + "epoch": 1.472496473906911, + "grad_norm": 0.04494086280465126, + "learning_rate": 8.452849316518367e-05, + "loss": 0.1769, + "step": 7308 + }, + { + "epoch": 1.472899455974209, + "grad_norm": 0.04433842748403549, + "learning_rate": 8.451885332773231e-05, + "loss": 0.196, + "step": 7310 + }, + { + "epoch": 1.4733024380415072, + "grad_norm": 0.06002422794699669, + "learning_rate": 8.450921103811679e-05, + "loss": 0.2003, + "step": 7312 + }, + { + "epoch": 1.4737054201088051, + "grad_norm": 0.0462716743350029, + "learning_rate": 8.449956629702214e-05, + "loss": 0.207, + "step": 7314 + }, + { + "epoch": 1.4741084021761033, + "grad_norm": 0.06718819588422775, + "learning_rate": 8.448991910513344e-05, + "loss": 0.1779, + "step": 7316 + }, + { + "epoch": 1.4745113842434012, + "grad_norm": 0.04861169680953026, + "learning_rate": 8.448026946313607e-05, + "loss": 0.1674, + "step": 7318 + }, + { + "epoch": 1.4749143663106992, + "grad_norm": 0.04479601979255676, + "learning_rate": 8.447061737171549e-05, + "loss": 0.1838, + "step": 7320 + }, + { + "epoch": 1.475317348377997, + "grad_norm": 0.05224357545375824, + "learning_rate": 8.446096283155736e-05, + "loss": 0.2221, + "step": 7322 + }, + { + "epoch": 1.475720330445295, + "grad_norm": 0.03974277153611183, + "learning_rate": 8.445130584334758e-05, + "loss": 0.1639, + "step": 7324 + }, + { + "epoch": 1.4761233125125932, + "grad_norm": 0.05455762520432472, + "learning_rate": 8.44416464077721e-05, + "loss": 0.2433, + "step": 7326 + }, + { + "epoch": 1.4765262945798912, + "grad_norm": 0.04934287071228027, + "learning_rate": 8.443198452551715e-05, + "loss": 0.1937, + "step": 7328 + }, + { + "epoch": 1.4769292766471893, + "grad_norm": 0.041016895323991776, + "learning_rate": 8.442232019726909e-05, + "loss": 0.1981, + "step": 7330 + }, + { + "epoch": 1.4773322587144873, + "grad_norm": 0.037635620683431625, + "learning_rate": 8.441265342371445e-05, + "loss": 0.1531, + "step": 7332 + }, + { + "epoch": 1.4777352407817852, + "grad_norm": 0.06261298060417175, + "learning_rate": 8.440298420553995e-05, + "loss": 0.1733, + "step": 7334 + }, + { + "epoch": 1.4781382228490831, + "grad_norm": 0.05150596424937248, + "learning_rate": 8.439331254343246e-05, + "loss": 0.1854, + "step": 7336 + }, + { + "epoch": 1.4785412049163813, + "grad_norm": 0.04876361042261124, + "learning_rate": 8.438363843807906e-05, + "loss": 0.1419, + "step": 7338 + }, + { + "epoch": 1.4789441869836792, + "grad_norm": 0.060993921011686325, + "learning_rate": 8.437396189016698e-05, + "loss": 0.1656, + "step": 7340 + }, + { + "epoch": 1.4793471690509772, + "grad_norm": 0.04993096739053726, + "learning_rate": 8.43642829003836e-05, + "loss": 0.2243, + "step": 7342 + }, + { + "epoch": 1.4797501511182753, + "grad_norm": 0.0681760385632515, + "learning_rate": 8.435460146941653e-05, + "loss": 0.2071, + "step": 7344 + }, + { + "epoch": 1.4801531331855733, + "grad_norm": 0.04877663403749466, + "learning_rate": 8.434491759795353e-05, + "loss": 0.2088, + "step": 7346 + }, + { + "epoch": 1.4805561152528712, + "grad_norm": 0.04369872435927391, + "learning_rate": 8.43352312866825e-05, + "loss": 0.2126, + "step": 7348 + }, + { + "epoch": 1.4809590973201692, + "grad_norm": 0.0733921229839325, + "learning_rate": 8.432554253629154e-05, + "loss": 0.1975, + "step": 7350 + }, + { + "epoch": 1.4813620793874673, + "grad_norm": 0.04548421502113342, + "learning_rate": 8.431585134746894e-05, + "loss": 0.1712, + "step": 7352 + }, + { + "epoch": 1.4817650614547653, + "grad_norm": 0.052208177745342255, + "learning_rate": 8.430615772090314e-05, + "loss": 0.1925, + "step": 7354 + }, + { + "epoch": 1.4821680435220632, + "grad_norm": 0.05872802808880806, + "learning_rate": 8.429646165728275e-05, + "loss": 0.1987, + "step": 7356 + }, + { + "epoch": 1.4825710255893614, + "grad_norm": 0.053144607692956924, + "learning_rate": 8.42867631572966e-05, + "loss": 0.2145, + "step": 7358 + }, + { + "epoch": 1.4829740076566593, + "grad_norm": 0.036344993859529495, + "learning_rate": 8.427706222163361e-05, + "loss": 0.1606, + "step": 7360 + }, + { + "epoch": 1.4833769897239573, + "grad_norm": 0.045181769877672195, + "learning_rate": 8.426735885098293e-05, + "loss": 0.1919, + "step": 7362 + }, + { + "epoch": 1.4837799717912552, + "grad_norm": 0.049181703478097916, + "learning_rate": 8.42576530460339e-05, + "loss": 0.2088, + "step": 7364 + }, + { + "epoch": 1.4841829538585534, + "grad_norm": 0.052120842039585114, + "learning_rate": 8.424794480747597e-05, + "loss": 0.1863, + "step": 7366 + }, + { + "epoch": 1.4845859359258513, + "grad_norm": 0.04675266519188881, + "learning_rate": 8.423823413599883e-05, + "loss": 0.1956, + "step": 7368 + }, + { + "epoch": 1.4849889179931492, + "grad_norm": 0.04355587065219879, + "learning_rate": 8.422852103229228e-05, + "loss": 0.2108, + "step": 7370 + }, + { + "epoch": 1.4853919000604474, + "grad_norm": 0.05784691497683525, + "learning_rate": 8.421880549704634e-05, + "loss": 0.2315, + "step": 7372 + }, + { + "epoch": 1.4857948821277454, + "grad_norm": 0.03782470524311066, + "learning_rate": 8.420908753095118e-05, + "loss": 0.1504, + "step": 7374 + }, + { + "epoch": 1.4861978641950433, + "grad_norm": 0.04663468152284622, + "learning_rate": 8.419936713469714e-05, + "loss": 0.2133, + "step": 7376 + }, + { + "epoch": 1.4866008462623412, + "grad_norm": 0.05277375504374504, + "learning_rate": 8.418964430897477e-05, + "loss": 0.1913, + "step": 7378 + }, + { + "epoch": 1.4870038283296394, + "grad_norm": 0.04334205016493797, + "learning_rate": 8.417991905447473e-05, + "loss": 0.1672, + "step": 7380 + }, + { + "epoch": 1.4874068103969373, + "grad_norm": 0.05344159156084061, + "learning_rate": 8.417019137188792e-05, + "loss": 0.225, + "step": 7382 + }, + { + "epoch": 1.4878097924642353, + "grad_norm": 0.045284245163202286, + "learning_rate": 8.416046126190536e-05, + "loss": 0.1946, + "step": 7384 + }, + { + "epoch": 1.4882127745315334, + "grad_norm": 0.047893647104501724, + "learning_rate": 8.415072872521826e-05, + "loss": 0.1857, + "step": 7386 + }, + { + "epoch": 1.4886157565988314, + "grad_norm": 0.041971355676651, + "learning_rate": 8.4140993762518e-05, + "loss": 0.1719, + "step": 7388 + }, + { + "epoch": 1.4890187386661293, + "grad_norm": 0.06434407830238342, + "learning_rate": 8.413125637449615e-05, + "loss": 0.1602, + "step": 7390 + }, + { + "epoch": 1.4894217207334273, + "grad_norm": 0.07025641947984695, + "learning_rate": 8.412151656184444e-05, + "loss": 0.2188, + "step": 7392 + }, + { + "epoch": 1.4898247028007254, + "grad_norm": 0.0681864395737648, + "learning_rate": 8.411177432525475e-05, + "loss": 0.2231, + "step": 7394 + }, + { + "epoch": 1.4902276848680234, + "grad_norm": 0.055996041744947433, + "learning_rate": 8.410202966541917e-05, + "loss": 0.1936, + "step": 7396 + }, + { + "epoch": 1.4906306669353213, + "grad_norm": 0.03989433869719505, + "learning_rate": 8.409228258302994e-05, + "loss": 0.1496, + "step": 7398 + }, + { + "epoch": 1.4910336490026195, + "grad_norm": 0.062053125351667404, + "learning_rate": 8.408253307877947e-05, + "loss": 0.2498, + "step": 7400 + }, + { + "epoch": 1.4914366310699174, + "grad_norm": 0.04868367314338684, + "learning_rate": 8.407278115336037e-05, + "loss": 0.2198, + "step": 7402 + }, + { + "epoch": 1.4918396131372154, + "grad_norm": 0.05877026170492172, + "learning_rate": 8.406302680746538e-05, + "loss": 0.2521, + "step": 7404 + }, + { + "epoch": 1.4922425952045133, + "grad_norm": 0.04489947110414505, + "learning_rate": 8.405327004178745e-05, + "loss": 0.2219, + "step": 7406 + }, + { + "epoch": 1.4926455772718115, + "grad_norm": 0.05375202000141144, + "learning_rate": 8.404351085701967e-05, + "loss": 0.2286, + "step": 7408 + }, + { + "epoch": 1.4930485593391094, + "grad_norm": 0.04449663311243057, + "learning_rate": 8.403374925385532e-05, + "loss": 0.1823, + "step": 7410 + }, + { + "epoch": 1.4934515414064073, + "grad_norm": 0.04015576094388962, + "learning_rate": 8.402398523298786e-05, + "loss": 0.1567, + "step": 7412 + }, + { + "epoch": 1.4938545234737055, + "grad_norm": 0.047074656933546066, + "learning_rate": 8.40142187951109e-05, + "loss": 0.1762, + "step": 7414 + }, + { + "epoch": 1.4942575055410035, + "grad_norm": 0.06970192492008209, + "learning_rate": 8.400444994091823e-05, + "loss": 0.1826, + "step": 7416 + }, + { + "epoch": 1.4946604876083014, + "grad_norm": 0.04595296084880829, + "learning_rate": 8.399467867110382e-05, + "loss": 0.2068, + "step": 7418 + }, + { + "epoch": 1.4950634696755993, + "grad_norm": 0.05146399140357971, + "learning_rate": 8.398490498636181e-05, + "loss": 0.1942, + "step": 7420 + }, + { + "epoch": 1.4954664517428975, + "grad_norm": 0.05432479828596115, + "learning_rate": 8.39751288873865e-05, + "loss": 0.2029, + "step": 7422 + }, + { + "epoch": 1.4958694338101954, + "grad_norm": 0.04735802486538887, + "learning_rate": 8.396535037487236e-05, + "loss": 0.1889, + "step": 7424 + }, + { + "epoch": 1.4962724158774934, + "grad_norm": 0.053364839404821396, + "learning_rate": 8.395556944951406e-05, + "loss": 0.1904, + "step": 7426 + }, + { + "epoch": 1.4966753979447915, + "grad_norm": 0.04619657248258591, + "learning_rate": 8.394578611200639e-05, + "loss": 0.17, + "step": 7428 + }, + { + "epoch": 1.4970783800120895, + "grad_norm": 0.04902162775397301, + "learning_rate": 8.393600036304438e-05, + "loss": 0.2198, + "step": 7430 + }, + { + "epoch": 1.4974813620793874, + "grad_norm": 0.0800173282623291, + "learning_rate": 8.392621220332317e-05, + "loss": 0.2009, + "step": 7432 + }, + { + "epoch": 1.4978843441466854, + "grad_norm": 0.0478094182908535, + "learning_rate": 8.391642163353812e-05, + "loss": 0.2335, + "step": 7434 + }, + { + "epoch": 1.4982873262139835, + "grad_norm": 0.05949089676141739, + "learning_rate": 8.39066286543847e-05, + "loss": 0.209, + "step": 7436 + }, + { + "epoch": 1.4986903082812815, + "grad_norm": 0.043826907873153687, + "learning_rate": 8.389683326655862e-05, + "loss": 0.1955, + "step": 7438 + }, + { + "epoch": 1.4990932903485794, + "grad_norm": 0.05305158719420433, + "learning_rate": 8.388703547075569e-05, + "loss": 0.224, + "step": 7440 + }, + { + "epoch": 1.4994962724158776, + "grad_norm": 0.04061686992645264, + "learning_rate": 8.387723526767197e-05, + "loss": 0.1754, + "step": 7442 + }, + { + "epoch": 1.4998992544831755, + "grad_norm": 0.05251790210604668, + "learning_rate": 8.386743265800364e-05, + "loss": 0.1544, + "step": 7444 + }, + { + "epoch": 1.5003022365504735, + "grad_norm": 0.0496913306415081, + "learning_rate": 8.385762764244704e-05, + "loss": 0.179, + "step": 7446 + }, + { + "epoch": 1.5007052186177714, + "grad_norm": 0.03873209282755852, + "learning_rate": 8.384782022169875e-05, + "loss": 0.2258, + "step": 7448 + }, + { + "epoch": 1.5011082006850696, + "grad_norm": 0.050427861511707306, + "learning_rate": 8.383801039645542e-05, + "loss": 0.2088, + "step": 7450 + }, + { + "epoch": 1.5015111827523675, + "grad_norm": 0.08442942798137665, + "learning_rate": 8.382819816741394e-05, + "loss": 0.2172, + "step": 7452 + }, + { + "epoch": 1.5019141648196657, + "grad_norm": 0.058537282049655914, + "learning_rate": 8.381838353527139e-05, + "loss": 0.1713, + "step": 7454 + }, + { + "epoch": 1.5023171468869636, + "grad_norm": 0.050120122730731964, + "learning_rate": 8.380856650072493e-05, + "loss": 0.1936, + "step": 7456 + }, + { + "epoch": 1.5027201289542615, + "grad_norm": 0.06258103996515274, + "learning_rate": 8.3798747064472e-05, + "loss": 0.2222, + "step": 7458 + }, + { + "epoch": 1.5031231110215595, + "grad_norm": 0.060956165194511414, + "learning_rate": 8.378892522721012e-05, + "loss": 0.2083, + "step": 7460 + }, + { + "epoch": 1.5035260930888574, + "grad_norm": 0.04213668778538704, + "learning_rate": 8.377910098963702e-05, + "loss": 0.1859, + "step": 7462 + }, + { + "epoch": 1.5039290751561556, + "grad_norm": 0.04656725376844406, + "learning_rate": 8.37692743524506e-05, + "loss": 0.1791, + "step": 7464 + }, + { + "epoch": 1.5043320572234535, + "grad_norm": 0.056995708495378494, + "learning_rate": 8.375944531634896e-05, + "loss": 0.21, + "step": 7466 + }, + { + "epoch": 1.5047350392907517, + "grad_norm": 0.0539989173412323, + "learning_rate": 8.37496138820303e-05, + "loss": 0.1635, + "step": 7468 + }, + { + "epoch": 1.5051380213580496, + "grad_norm": 0.039313316345214844, + "learning_rate": 8.373978005019306e-05, + "loss": 0.211, + "step": 7470 + }, + { + "epoch": 1.5055410034253476, + "grad_norm": 0.05129539594054222, + "learning_rate": 8.372994382153579e-05, + "loss": 0.1601, + "step": 7472 + }, + { + "epoch": 1.5059439854926455, + "grad_norm": 0.058646250516176224, + "learning_rate": 8.372010519675726e-05, + "loss": 0.173, + "step": 7474 + }, + { + "epoch": 1.5063469675599435, + "grad_norm": 0.04545162245631218, + "learning_rate": 8.371026417655639e-05, + "loss": 0.1635, + "step": 7476 + }, + { + "epoch": 1.5067499496272416, + "grad_norm": 0.048731524497270584, + "learning_rate": 8.370042076163224e-05, + "loss": 0.2135, + "step": 7478 + }, + { + "epoch": 1.5071529316945396, + "grad_norm": 0.06594579666852951, + "learning_rate": 8.369057495268413e-05, + "loss": 0.1718, + "step": 7480 + }, + { + "epoch": 1.5075559137618377, + "grad_norm": 0.05998741090297699, + "learning_rate": 8.368072675041144e-05, + "loss": 0.1815, + "step": 7482 + }, + { + "epoch": 1.5079588958291357, + "grad_norm": 0.06421762704849243, + "learning_rate": 8.367087615551377e-05, + "loss": 0.1649, + "step": 7484 + }, + { + "epoch": 1.5083618778964336, + "grad_norm": 0.05000369995832443, + "learning_rate": 8.366102316869094e-05, + "loss": 0.193, + "step": 7486 + }, + { + "epoch": 1.5087648599637316, + "grad_norm": 0.0470392070710659, + "learning_rate": 8.365116779064283e-05, + "loss": 0.2088, + "step": 7488 + }, + { + "epoch": 1.5091678420310295, + "grad_norm": 0.056860171258449554, + "learning_rate": 8.364131002206959e-05, + "loss": 0.2176, + "step": 7490 + }, + { + "epoch": 1.5095708240983277, + "grad_norm": 0.045696549117565155, + "learning_rate": 8.363144986367146e-05, + "loss": 0.1933, + "step": 7492 + }, + { + "epoch": 1.5099738061656256, + "grad_norm": 0.04788528010249138, + "learning_rate": 8.362158731614895e-05, + "loss": 0.1648, + "step": 7494 + }, + { + "epoch": 1.5103767882329238, + "grad_norm": 0.05643216148018837, + "learning_rate": 8.361172238020264e-05, + "loss": 0.1823, + "step": 7496 + }, + { + "epoch": 1.5107797703002217, + "grad_norm": 0.04428640007972717, + "learning_rate": 8.360185505653332e-05, + "loss": 0.2071, + "step": 7498 + }, + { + "epoch": 1.5111827523675196, + "grad_norm": 0.0673820972442627, + "learning_rate": 8.359198534584197e-05, + "loss": 0.1674, + "step": 7500 + }, + { + "epoch": 1.5115857344348176, + "grad_norm": 0.07432591915130615, + "learning_rate": 8.358211324882968e-05, + "loss": 0.2074, + "step": 7502 + }, + { + "epoch": 1.5119887165021155, + "grad_norm": 0.05492899566888809, + "learning_rate": 8.357223876619778e-05, + "loss": 0.2054, + "step": 7504 + }, + { + "epoch": 1.5123916985694137, + "grad_norm": 0.04310237616300583, + "learning_rate": 8.356236189864772e-05, + "loss": 0.1929, + "step": 7506 + }, + { + "epoch": 1.5127946806367116, + "grad_norm": 0.051562558859586716, + "learning_rate": 8.355248264688116e-05, + "loss": 0.1941, + "step": 7508 + }, + { + "epoch": 1.5131976627040098, + "grad_norm": 0.06979775428771973, + "learning_rate": 8.35426010115999e-05, + "loss": 0.2007, + "step": 7510 + }, + { + "epoch": 1.5136006447713077, + "grad_norm": 0.049285538494586945, + "learning_rate": 8.35327169935059e-05, + "loss": 0.1474, + "step": 7512 + }, + { + "epoch": 1.5140036268386057, + "grad_norm": 0.06848005950450897, + "learning_rate": 8.352283059330131e-05, + "loss": 0.2531, + "step": 7514 + }, + { + "epoch": 1.5144066089059036, + "grad_norm": 0.05815625190734863, + "learning_rate": 8.351294181168843e-05, + "loss": 0.2505, + "step": 7516 + }, + { + "epoch": 1.5148095909732016, + "grad_norm": 0.08098626136779785, + "learning_rate": 8.350305064936978e-05, + "loss": 0.2006, + "step": 7518 + }, + { + "epoch": 1.5152125730404997, + "grad_norm": 0.04606293886899948, + "learning_rate": 8.349315710704799e-05, + "loss": 0.218, + "step": 7520 + }, + { + "epoch": 1.5156155551077977, + "grad_norm": 0.04414965957403183, + "learning_rate": 8.348326118542588e-05, + "loss": 0.2248, + "step": 7522 + }, + { + "epoch": 1.5160185371750958, + "grad_norm": 0.049683474004268646, + "learning_rate": 8.347336288520644e-05, + "loss": 0.2149, + "step": 7524 + }, + { + "epoch": 1.5164215192423938, + "grad_norm": 0.05648723617196083, + "learning_rate": 8.346346220709284e-05, + "loss": 0.2169, + "step": 7526 + }, + { + "epoch": 1.5168245013096917, + "grad_norm": 0.05552398040890694, + "learning_rate": 8.34535591517884e-05, + "loss": 0.1927, + "step": 7528 + }, + { + "epoch": 1.5172274833769896, + "grad_norm": 0.03558554872870445, + "learning_rate": 8.344365371999661e-05, + "loss": 0.2295, + "step": 7530 + }, + { + "epoch": 1.5176304654442876, + "grad_norm": 0.04220812767744064, + "learning_rate": 8.343374591242117e-05, + "loss": 0.1748, + "step": 7532 + }, + { + "epoch": 1.5180334475115858, + "grad_norm": 0.046235982328653336, + "learning_rate": 8.342383572976586e-05, + "loss": 0.1707, + "step": 7534 + }, + { + "epoch": 1.5184364295788837, + "grad_norm": 0.04958495497703552, + "learning_rate": 8.341392317273473e-05, + "loss": 0.195, + "step": 7536 + }, + { + "epoch": 1.5188394116461819, + "grad_norm": 0.04524749517440796, + "learning_rate": 8.340400824203194e-05, + "loss": 0.1947, + "step": 7538 + }, + { + "epoch": 1.5192423937134798, + "grad_norm": 0.050707731395959854, + "learning_rate": 8.339409093836182e-05, + "loss": 0.1692, + "step": 7540 + }, + { + "epoch": 1.5196453757807777, + "grad_norm": 0.05605167895555496, + "learning_rate": 8.338417126242888e-05, + "loss": 0.185, + "step": 7542 + }, + { + "epoch": 1.5200483578480757, + "grad_norm": 0.04622466117143631, + "learning_rate": 8.337424921493781e-05, + "loss": 0.1966, + "step": 7544 + }, + { + "epoch": 1.5204513399153736, + "grad_norm": 0.04225487262010574, + "learning_rate": 8.336432479659344e-05, + "loss": 0.1821, + "step": 7546 + }, + { + "epoch": 1.5208543219826718, + "grad_norm": 0.04641425609588623, + "learning_rate": 8.33543980081008e-05, + "loss": 0.2138, + "step": 7548 + }, + { + "epoch": 1.5212573040499697, + "grad_norm": 0.10159898549318314, + "learning_rate": 8.334446885016507e-05, + "loss": 0.2255, + "step": 7550 + }, + { + "epoch": 1.521660286117268, + "grad_norm": 0.05289870873093605, + "learning_rate": 8.333453732349161e-05, + "loss": 0.1797, + "step": 7552 + }, + { + "epoch": 1.5220632681845658, + "grad_norm": 0.04265284165740013, + "learning_rate": 8.33246034287859e-05, + "loss": 0.1829, + "step": 7554 + }, + { + "epoch": 1.5224662502518638, + "grad_norm": 0.05362769961357117, + "learning_rate": 8.33146671667537e-05, + "loss": 0.1789, + "step": 7556 + }, + { + "epoch": 1.5228692323191617, + "grad_norm": 0.06503970921039581, + "learning_rate": 8.330472853810078e-05, + "loss": 0.2332, + "step": 7558 + }, + { + "epoch": 1.5232722143864597, + "grad_norm": 0.04482598602771759, + "learning_rate": 8.329478754353324e-05, + "loss": 0.1456, + "step": 7560 + }, + { + "epoch": 1.5236751964537578, + "grad_norm": 0.046206843107938766, + "learning_rate": 8.328484418375721e-05, + "loss": 0.1937, + "step": 7562 + }, + { + "epoch": 1.5240781785210558, + "grad_norm": 0.05199309438467026, + "learning_rate": 8.327489845947911e-05, + "loss": 0.1884, + "step": 7564 + }, + { + "epoch": 1.524481160588354, + "grad_norm": 0.036253806203603745, + "learning_rate": 8.326495037140543e-05, + "loss": 0.149, + "step": 7566 + }, + { + "epoch": 1.5248841426556519, + "grad_norm": 0.04791543632745743, + "learning_rate": 8.325499992024286e-05, + "loss": 0.1886, + "step": 7568 + }, + { + "epoch": 1.5252871247229498, + "grad_norm": 0.04419293627142906, + "learning_rate": 8.32450471066983e-05, + "loss": 0.1618, + "step": 7570 + }, + { + "epoch": 1.5256901067902477, + "grad_norm": 0.0419553704559803, + "learning_rate": 8.323509193147876e-05, + "loss": 0.1759, + "step": 7572 + }, + { + "epoch": 1.5260930888575457, + "grad_norm": 0.04536983743309975, + "learning_rate": 8.322513439529142e-05, + "loss": 0.1887, + "step": 7574 + }, + { + "epoch": 1.5264960709248439, + "grad_norm": 0.06128111481666565, + "learning_rate": 8.321517449884369e-05, + "loss": 0.2233, + "step": 7576 + }, + { + "epoch": 1.526899052992142, + "grad_norm": 0.04495418816804886, + "learning_rate": 8.320521224284308e-05, + "loss": 0.1392, + "step": 7578 + }, + { + "epoch": 1.52730203505944, + "grad_norm": 0.05643144249916077, + "learning_rate": 8.319524762799728e-05, + "loss": 0.248, + "step": 7580 + }, + { + "epoch": 1.527705017126738, + "grad_norm": 0.044090867042541504, + "learning_rate": 8.318528065501419e-05, + "loss": 0.179, + "step": 7582 + }, + { + "epoch": 1.5281079991940358, + "grad_norm": 0.050828419625759125, + "learning_rate": 8.317531132460183e-05, + "loss": 0.2008, + "step": 7584 + }, + { + "epoch": 1.5285109812613338, + "grad_norm": 0.05592074245214462, + "learning_rate": 8.316533963746841e-05, + "loss": 0.1993, + "step": 7586 + }, + { + "epoch": 1.5289139633286317, + "grad_norm": 0.08189481496810913, + "learning_rate": 8.315536559432231e-05, + "loss": 0.2118, + "step": 7588 + }, + { + "epoch": 1.5293169453959299, + "grad_norm": 0.06498509645462036, + "learning_rate": 8.314538919587205e-05, + "loss": 0.1956, + "step": 7590 + }, + { + "epoch": 1.529719927463228, + "grad_norm": 0.048073191195726395, + "learning_rate": 8.313541044282636e-05, + "loss": 0.1888, + "step": 7592 + }, + { + "epoch": 1.530122909530526, + "grad_norm": 0.04223538562655449, + "learning_rate": 8.31254293358941e-05, + "loss": 0.2121, + "step": 7594 + }, + { + "epoch": 1.530525891597824, + "grad_norm": 0.04270782321691513, + "learning_rate": 8.311544587578431e-05, + "loss": 0.1862, + "step": 7596 + }, + { + "epoch": 1.5309288736651219, + "grad_norm": 0.04466244578361511, + "learning_rate": 8.310546006320623e-05, + "loss": 0.2138, + "step": 7598 + }, + { + "epoch": 1.5313318557324198, + "grad_norm": 0.045204807072877884, + "learning_rate": 8.309547189886917e-05, + "loss": 0.1847, + "step": 7600 + }, + { + "epoch": 1.5317348377997178, + "grad_norm": 0.03572971746325493, + "learning_rate": 8.308548138348274e-05, + "loss": 0.1684, + "step": 7602 + }, + { + "epoch": 1.532137819867016, + "grad_norm": 0.05988677218556404, + "learning_rate": 8.307548851775663e-05, + "loss": 0.1984, + "step": 7604 + }, + { + "epoch": 1.532540801934314, + "grad_norm": 0.04439981281757355, + "learning_rate": 8.30654933024007e-05, + "loss": 0.1932, + "step": 7606 + }, + { + "epoch": 1.532943784001612, + "grad_norm": 0.042620521038770676, + "learning_rate": 8.305549573812501e-05, + "loss": 0.1615, + "step": 7608 + }, + { + "epoch": 1.53334676606891, + "grad_norm": 0.04275134205818176, + "learning_rate": 8.304549582563977e-05, + "loss": 0.1826, + "step": 7610 + }, + { + "epoch": 1.533749748136208, + "grad_norm": 0.04868233948945999, + "learning_rate": 8.303549356565535e-05, + "loss": 0.2086, + "step": 7612 + }, + { + "epoch": 1.5341527302035058, + "grad_norm": 0.04241720587015152, + "learning_rate": 8.302548895888232e-05, + "loss": 0.2049, + "step": 7614 + }, + { + "epoch": 1.5345557122708038, + "grad_norm": 0.040512893348932266, + "learning_rate": 8.301548200603134e-05, + "loss": 0.2209, + "step": 7616 + }, + { + "epoch": 1.534958694338102, + "grad_norm": 0.05638391524553299, + "learning_rate": 8.300547270781333e-05, + "loss": 0.214, + "step": 7618 + }, + { + "epoch": 1.5353616764054, + "grad_norm": 0.034663956612348557, + "learning_rate": 8.299546106493933e-05, + "loss": 0.2244, + "step": 7620 + }, + { + "epoch": 1.535764658472698, + "grad_norm": 0.0562388077378273, + "learning_rate": 8.298544707812054e-05, + "loss": 0.1999, + "step": 7622 + }, + { + "epoch": 1.536167640539996, + "grad_norm": 0.03829832375049591, + "learning_rate": 8.297543074806834e-05, + "loss": 0.2152, + "step": 7624 + }, + { + "epoch": 1.536570622607294, + "grad_norm": 0.05909931659698486, + "learning_rate": 8.296541207549428e-05, + "loss": 0.1943, + "step": 7626 + }, + { + "epoch": 1.5369736046745919, + "grad_norm": 0.049946270883083344, + "learning_rate": 8.295539106111007e-05, + "loss": 0.1866, + "step": 7628 + }, + { + "epoch": 1.53737658674189, + "grad_norm": 0.048141710460186005, + "learning_rate": 8.294536770562757e-05, + "loss": 0.1953, + "step": 7630 + }, + { + "epoch": 1.537779568809188, + "grad_norm": 0.05740709975361824, + "learning_rate": 8.293534200975886e-05, + "loss": 0.1717, + "step": 7632 + }, + { + "epoch": 1.5381825508764861, + "grad_norm": 0.049014464020729065, + "learning_rate": 8.29253139742161e-05, + "loss": 0.1926, + "step": 7634 + }, + { + "epoch": 1.538585532943784, + "grad_norm": 0.05644163116812706, + "learning_rate": 8.29152835997117e-05, + "loss": 0.1373, + "step": 7636 + }, + { + "epoch": 1.538988515011082, + "grad_norm": 0.06402353942394257, + "learning_rate": 8.29052508869582e-05, + "loss": 0.1903, + "step": 7638 + }, + { + "epoch": 1.53939149707838, + "grad_norm": 0.04729839786887169, + "learning_rate": 8.289521583666829e-05, + "loss": 0.196, + "step": 7640 + }, + { + "epoch": 1.539794479145678, + "grad_norm": 0.0404207780957222, + "learning_rate": 8.288517844955487e-05, + "loss": 0.1727, + "step": 7642 + }, + { + "epoch": 1.540197461212976, + "grad_norm": 0.060075223445892334, + "learning_rate": 8.287513872633094e-05, + "loss": 0.2051, + "step": 7644 + }, + { + "epoch": 1.540600443280274, + "grad_norm": 0.06109379976987839, + "learning_rate": 8.286509666770977e-05, + "loss": 0.2241, + "step": 7646 + }, + { + "epoch": 1.5410034253475722, + "grad_norm": 0.04271155223250389, + "learning_rate": 8.285505227440466e-05, + "loss": 0.2364, + "step": 7648 + }, + { + "epoch": 1.5414064074148701, + "grad_norm": 0.05317099019885063, + "learning_rate": 8.28450055471292e-05, + "loss": 0.209, + "step": 7650 + }, + { + "epoch": 1.541809389482168, + "grad_norm": 0.04707867652177811, + "learning_rate": 8.283495648659705e-05, + "loss": 0.1701, + "step": 7652 + }, + { + "epoch": 1.542212371549466, + "grad_norm": 0.04844583570957184, + "learning_rate": 8.282490509352212e-05, + "loss": 0.2099, + "step": 7654 + }, + { + "epoch": 1.542615353616764, + "grad_norm": 0.05662320926785469, + "learning_rate": 8.281485136861842e-05, + "loss": 0.2517, + "step": 7656 + }, + { + "epoch": 1.543018335684062, + "grad_norm": 0.05386023968458176, + "learning_rate": 8.280479531260018e-05, + "loss": 0.1859, + "step": 7658 + }, + { + "epoch": 1.54342131775136, + "grad_norm": 0.03194073960185051, + "learning_rate": 8.279473692618172e-05, + "loss": 0.1292, + "step": 7660 + }, + { + "epoch": 1.5438242998186582, + "grad_norm": 0.05544869229197502, + "learning_rate": 8.27846762100776e-05, + "loss": 0.2069, + "step": 7662 + }, + { + "epoch": 1.5442272818859561, + "grad_norm": 0.05823779106140137, + "learning_rate": 8.277461316500253e-05, + "loss": 0.2354, + "step": 7664 + }, + { + "epoch": 1.544630263953254, + "grad_norm": 0.05206362158060074, + "learning_rate": 8.276454779167133e-05, + "loss": 0.2107, + "step": 7666 + }, + { + "epoch": 1.545033246020552, + "grad_norm": 0.04913756251335144, + "learning_rate": 8.275448009079907e-05, + "loss": 0.2, + "step": 7668 + }, + { + "epoch": 1.54543622808785, + "grad_norm": 0.04635776951909065, + "learning_rate": 8.274441006310091e-05, + "loss": 0.2175, + "step": 7670 + }, + { + "epoch": 1.5458392101551481, + "grad_norm": 0.058479227125644684, + "learning_rate": 8.273433770929225e-05, + "loss": 0.2039, + "step": 7672 + }, + { + "epoch": 1.546242192222446, + "grad_norm": 0.042080748826265335, + "learning_rate": 8.272426303008858e-05, + "loss": 0.2154, + "step": 7674 + }, + { + "epoch": 1.5466451742897442, + "grad_norm": 0.0478924922645092, + "learning_rate": 8.27141860262056e-05, + "loss": 0.1844, + "step": 7676 + }, + { + "epoch": 1.5470481563570422, + "grad_norm": 0.048292964696884155, + "learning_rate": 8.270410669835917e-05, + "loss": 0.2174, + "step": 7678 + }, + { + "epoch": 1.5474511384243401, + "grad_norm": 0.05052988976240158, + "learning_rate": 8.269402504726529e-05, + "loss": 0.1876, + "step": 7680 + }, + { + "epoch": 1.547854120491638, + "grad_norm": 0.04319053515791893, + "learning_rate": 8.268394107364017e-05, + "loss": 0.1592, + "step": 7682 + }, + { + "epoch": 1.548257102558936, + "grad_norm": 0.05351201072335243, + "learning_rate": 8.267385477820014e-05, + "loss": 0.2157, + "step": 7684 + }, + { + "epoch": 1.5486600846262342, + "grad_norm": 0.04633256420493126, + "learning_rate": 8.266376616166172e-05, + "loss": 0.248, + "step": 7686 + }, + { + "epoch": 1.549063066693532, + "grad_norm": 0.052991047501564026, + "learning_rate": 8.26536752247416e-05, + "loss": 0.2038, + "step": 7688 + }, + { + "epoch": 1.5494660487608303, + "grad_norm": 0.07514969259500504, + "learning_rate": 8.26435819681566e-05, + "loss": 0.2306, + "step": 7690 + }, + { + "epoch": 1.5498690308281282, + "grad_norm": 0.041682012379169464, + "learning_rate": 8.263348639262373e-05, + "loss": 0.2014, + "step": 7692 + }, + { + "epoch": 1.5502720128954262, + "grad_norm": 0.04360057786107063, + "learning_rate": 8.26233884988602e-05, + "loss": 0.1914, + "step": 7694 + }, + { + "epoch": 1.550674994962724, + "grad_norm": 0.029803669080138206, + "learning_rate": 8.261328828758333e-05, + "loss": 0.1599, + "step": 7696 + }, + { + "epoch": 1.551077977030022, + "grad_norm": 0.046820033341646194, + "learning_rate": 8.260318575951059e-05, + "loss": 0.1899, + "step": 7698 + }, + { + "epoch": 1.5514809590973202, + "grad_norm": 0.06097986549139023, + "learning_rate": 8.259308091535969e-05, + "loss": 0.2214, + "step": 7700 + }, + { + "epoch": 1.5518839411646181, + "grad_norm": 0.0608573816716671, + "learning_rate": 8.258297375584845e-05, + "loss": 0.1848, + "step": 7702 + }, + { + "epoch": 1.5522869232319163, + "grad_norm": 0.050830453634262085, + "learning_rate": 8.257286428169486e-05, + "loss": 0.1871, + "step": 7704 + }, + { + "epoch": 1.5526899052992142, + "grad_norm": 0.056992385536432266, + "learning_rate": 8.256275249361707e-05, + "loss": 0.1706, + "step": 7706 + }, + { + "epoch": 1.5530928873665122, + "grad_norm": 0.0413503423333168, + "learning_rate": 8.255263839233345e-05, + "loss": 0.1977, + "step": 7708 + }, + { + "epoch": 1.5534958694338101, + "grad_norm": 0.04261191189289093, + "learning_rate": 8.254252197856242e-05, + "loss": 0.2117, + "step": 7710 + }, + { + "epoch": 1.553898851501108, + "grad_norm": 0.03962111100554466, + "learning_rate": 8.253240325302272e-05, + "loss": 0.1648, + "step": 7712 + }, + { + "epoch": 1.5543018335684062, + "grad_norm": 0.04593179374933243, + "learning_rate": 8.252228221643308e-05, + "loss": 0.1734, + "step": 7714 + }, + { + "epoch": 1.5547048156357042, + "grad_norm": 0.056593868881464005, + "learning_rate": 8.251215886951253e-05, + "loss": 0.2084, + "step": 7716 + }, + { + "epoch": 1.5551077977030023, + "grad_norm": 0.06353504210710526, + "learning_rate": 8.250203321298022e-05, + "loss": 0.2008, + "step": 7718 + }, + { + "epoch": 1.5555107797703003, + "grad_norm": 0.044093504548072815, + "learning_rate": 8.249190524755546e-05, + "loss": 0.1992, + "step": 7720 + }, + { + "epoch": 1.5559137618375982, + "grad_norm": 0.058922551572322845, + "learning_rate": 8.24817749739577e-05, + "loss": 0.1842, + "step": 7722 + }, + { + "epoch": 1.5563167439048962, + "grad_norm": 0.03649704158306122, + "learning_rate": 8.247164239290659e-05, + "loss": 0.1879, + "step": 7724 + }, + { + "epoch": 1.556719725972194, + "grad_norm": 0.047487691044807434, + "learning_rate": 8.246150750512193e-05, + "loss": 0.135, + "step": 7726 + }, + { + "epoch": 1.5571227080394923, + "grad_norm": 0.04881270229816437, + "learning_rate": 8.24513703113237e-05, + "loss": 0.2149, + "step": 7728 + }, + { + "epoch": 1.5575256901067902, + "grad_norm": 0.0515984371304512, + "learning_rate": 8.244123081223203e-05, + "loss": 0.2158, + "step": 7730 + }, + { + "epoch": 1.5579286721740884, + "grad_norm": 0.044214099645614624, + "learning_rate": 8.24310890085672e-05, + "loss": 0.195, + "step": 7732 + }, + { + "epoch": 1.5583316542413863, + "grad_norm": 0.0432279109954834, + "learning_rate": 8.242094490104967e-05, + "loss": 0.1775, + "step": 7734 + }, + { + "epoch": 1.5587346363086843, + "grad_norm": 0.05913606658577919, + "learning_rate": 8.241079849040007e-05, + "loss": 0.2104, + "step": 7736 + }, + { + "epoch": 1.5591376183759822, + "grad_norm": 0.05148633196949959, + "learning_rate": 8.240064977733916e-05, + "loss": 0.1648, + "step": 7738 + }, + { + "epoch": 1.5595406004432801, + "grad_norm": 0.054173316806554794, + "learning_rate": 8.239049876258793e-05, + "loss": 0.2285, + "step": 7740 + }, + { + "epoch": 1.5599435825105783, + "grad_norm": 0.0662137120962143, + "learning_rate": 8.238034544686746e-05, + "loss": 0.2163, + "step": 7742 + }, + { + "epoch": 1.5603465645778762, + "grad_norm": 0.061622608453035355, + "learning_rate": 8.237018983089902e-05, + "loss": 0.2353, + "step": 7744 + }, + { + "epoch": 1.5607495466451744, + "grad_norm": 0.07295259088277817, + "learning_rate": 8.236003191540408e-05, + "loss": 0.2081, + "step": 7746 + }, + { + "epoch": 1.5611525287124723, + "grad_norm": 0.04859080910682678, + "learning_rate": 8.234987170110422e-05, + "loss": 0.1894, + "step": 7748 + }, + { + "epoch": 1.5615555107797703, + "grad_norm": 0.053113147616386414, + "learning_rate": 8.233970918872122e-05, + "loss": 0.2037, + "step": 7750 + }, + { + "epoch": 1.5619584928470682, + "grad_norm": 0.060238417237997055, + "learning_rate": 8.232954437897697e-05, + "loss": 0.1833, + "step": 7752 + }, + { + "epoch": 1.5623614749143662, + "grad_norm": 0.04282419756054878, + "learning_rate": 8.231937727259363e-05, + "loss": 0.2228, + "step": 7754 + }, + { + "epoch": 1.5627644569816643, + "grad_norm": 0.04955311492085457, + "learning_rate": 8.23092078702934e-05, + "loss": 0.1778, + "step": 7756 + }, + { + "epoch": 1.5631674390489623, + "grad_norm": 0.05403996258974075, + "learning_rate": 8.229903617279869e-05, + "loss": 0.2412, + "step": 7758 + }, + { + "epoch": 1.5635704211162604, + "grad_norm": 0.05369593948125839, + "learning_rate": 8.228886218083214e-05, + "loss": 0.2124, + "step": 7760 + }, + { + "epoch": 1.5639734031835584, + "grad_norm": 0.04346174746751785, + "learning_rate": 8.227868589511643e-05, + "loss": 0.1902, + "step": 7762 + }, + { + "epoch": 1.5643763852508563, + "grad_norm": 0.05239605903625488, + "learning_rate": 8.226850731637452e-05, + "loss": 0.2224, + "step": 7764 + }, + { + "epoch": 1.5647793673181543, + "grad_norm": 0.0549917034804821, + "learning_rate": 8.225832644532945e-05, + "loss": 0.1699, + "step": 7766 + }, + { + "epoch": 1.5651823493854522, + "grad_norm": 0.05196690559387207, + "learning_rate": 8.224814328270444e-05, + "loss": 0.2061, + "step": 7768 + }, + { + "epoch": 1.5655853314527504, + "grad_norm": 0.03883376717567444, + "learning_rate": 8.223795782922292e-05, + "loss": 0.1644, + "step": 7770 + }, + { + "epoch": 1.5659883135200483, + "grad_norm": 0.04878639429807663, + "learning_rate": 8.222777008560845e-05, + "loss": 0.2061, + "step": 7772 + }, + { + "epoch": 1.5663912955873465, + "grad_norm": 0.04065984860062599, + "learning_rate": 8.22175800525847e-05, + "loss": 0.1986, + "step": 7774 + }, + { + "epoch": 1.5667942776546444, + "grad_norm": 0.05418518930673599, + "learning_rate": 8.220738773087561e-05, + "loss": 0.1841, + "step": 7776 + }, + { + "epoch": 1.5671972597219423, + "grad_norm": 0.10020854324102402, + "learning_rate": 8.21971931212052e-05, + "loss": 0.2231, + "step": 7778 + }, + { + "epoch": 1.5676002417892403, + "grad_norm": 0.057610880583524704, + "learning_rate": 8.218699622429768e-05, + "loss": 0.2071, + "step": 7780 + }, + { + "epoch": 1.5680032238565382, + "grad_norm": 0.04992236942052841, + "learning_rate": 8.217679704087742e-05, + "loss": 0.2034, + "step": 7782 + }, + { + "epoch": 1.5684062059238364, + "grad_norm": 0.04503064975142479, + "learning_rate": 8.216659557166895e-05, + "loss": 0.1688, + "step": 7784 + }, + { + "epoch": 1.5688091879911346, + "grad_norm": 0.06223779916763306, + "learning_rate": 8.2156391817397e-05, + "loss": 0.2852, + "step": 7786 + }, + { + "epoch": 1.5692121700584325, + "grad_norm": 0.042011719197034836, + "learning_rate": 8.21461857787864e-05, + "loss": 0.1581, + "step": 7788 + }, + { + "epoch": 1.5696151521257304, + "grad_norm": 0.044512271881103516, + "learning_rate": 8.213597745656214e-05, + "loss": 0.1814, + "step": 7790 + }, + { + "epoch": 1.5700181341930284, + "grad_norm": 0.036685794591903687, + "learning_rate": 8.212576685144946e-05, + "loss": 0.1937, + "step": 7792 + }, + { + "epoch": 1.5704211162603263, + "grad_norm": 0.04587550461292267, + "learning_rate": 8.211555396417367e-05, + "loss": 0.1991, + "step": 7794 + }, + { + "epoch": 1.5708240983276243, + "grad_norm": 0.04192047566175461, + "learning_rate": 8.21053387954603e-05, + "loss": 0.1697, + "step": 7796 + }, + { + "epoch": 1.5712270803949224, + "grad_norm": 0.04757276922464371, + "learning_rate": 8.209512134603499e-05, + "loss": 0.2251, + "step": 7798 + }, + { + "epoch": 1.5716300624622206, + "grad_norm": 0.03852641209959984, + "learning_rate": 8.20849016166236e-05, + "loss": 0.1986, + "step": 7800 + }, + { + "epoch": 1.5720330445295185, + "grad_norm": 0.04816350340843201, + "learning_rate": 8.20746796079521e-05, + "loss": 0.2065, + "step": 7802 + }, + { + "epoch": 1.5724360265968165, + "grad_norm": 0.0453607551753521, + "learning_rate": 8.206445532074667e-05, + "loss": 0.2135, + "step": 7804 + }, + { + "epoch": 1.5728390086641144, + "grad_norm": 0.05096454173326492, + "learning_rate": 8.20542287557336e-05, + "loss": 0.1533, + "step": 7806 + }, + { + "epoch": 1.5732419907314124, + "grad_norm": 0.04808863624930382, + "learning_rate": 8.20439999136394e-05, + "loss": 0.1495, + "step": 7808 + }, + { + "epoch": 1.5736449727987103, + "grad_norm": 0.04415539652109146, + "learning_rate": 8.20337687951907e-05, + "loss": 0.1631, + "step": 7810 + }, + { + "epoch": 1.5740479548660085, + "grad_norm": 0.040588121861219406, + "learning_rate": 8.202353540111426e-05, + "loss": 0.1696, + "step": 7812 + }, + { + "epoch": 1.5744509369333066, + "grad_norm": 0.05861745402216911, + "learning_rate": 8.201329973213709e-05, + "loss": 0.2119, + "step": 7814 + }, + { + "epoch": 1.5748539190006046, + "grad_norm": 0.09671831130981445, + "learning_rate": 8.200306178898633e-05, + "loss": 0.2133, + "step": 7816 + }, + { + "epoch": 1.5752569010679025, + "grad_norm": 0.06667297333478928, + "learning_rate": 8.19928215723892e-05, + "loss": 0.1915, + "step": 7818 + }, + { + "epoch": 1.5756598831352004, + "grad_norm": 0.05069169029593468, + "learning_rate": 8.198257908307323e-05, + "loss": 0.1999, + "step": 7820 + }, + { + "epoch": 1.5760628652024984, + "grad_norm": 0.04458223283290863, + "learning_rate": 8.197233432176597e-05, + "loss": 0.2094, + "step": 7822 + }, + { + "epoch": 1.5764658472697963, + "grad_norm": 0.04166608303785324, + "learning_rate": 8.196208728919523e-05, + "loss": 0.2149, + "step": 7824 + }, + { + "epoch": 1.5768688293370945, + "grad_norm": 0.04502653703093529, + "learning_rate": 8.195183798608891e-05, + "loss": 0.1799, + "step": 7826 + }, + { + "epoch": 1.5772718114043927, + "grad_norm": 0.04423825815320015, + "learning_rate": 8.194158641317512e-05, + "loss": 0.2456, + "step": 7828 + }, + { + "epoch": 1.5776747934716906, + "grad_norm": 0.05600183457136154, + "learning_rate": 8.193133257118211e-05, + "loss": 0.2021, + "step": 7830 + }, + { + "epoch": 1.5780777755389885, + "grad_norm": 0.049919243901968, + "learning_rate": 8.19210764608383e-05, + "loss": 0.2717, + "step": 7832 + }, + { + "epoch": 1.5784807576062865, + "grad_norm": 0.04759713634848595, + "learning_rate": 8.191081808287229e-05, + "loss": 0.2121, + "step": 7834 + }, + { + "epoch": 1.5788837396735844, + "grad_norm": 0.03916984051465988, + "learning_rate": 8.190055743801278e-05, + "loss": 0.1255, + "step": 7836 + }, + { + "epoch": 1.5792867217408826, + "grad_norm": 0.06487026810646057, + "learning_rate": 8.189029452698868e-05, + "loss": 0.2221, + "step": 7838 + }, + { + "epoch": 1.5796897038081805, + "grad_norm": 0.04453813284635544, + "learning_rate": 8.188002935052907e-05, + "loss": 0.1533, + "step": 7840 + }, + { + "epoch": 1.5800926858754787, + "grad_norm": 0.04084227234125137, + "learning_rate": 8.186976190936317e-05, + "loss": 0.1486, + "step": 7842 + }, + { + "epoch": 1.5804956679427766, + "grad_norm": 0.05148351192474365, + "learning_rate": 8.185949220422034e-05, + "loss": 0.2247, + "step": 7844 + }, + { + "epoch": 1.5808986500100746, + "grad_norm": 0.03696500509977341, + "learning_rate": 8.184922023583012e-05, + "loss": 0.1468, + "step": 7846 + }, + { + "epoch": 1.5813016320773725, + "grad_norm": 0.05501485615968704, + "learning_rate": 8.183894600492225e-05, + "loss": 0.1728, + "step": 7848 + }, + { + "epoch": 1.5817046141446705, + "grad_norm": 0.05600307881832123, + "learning_rate": 8.182866951222656e-05, + "loss": 0.2311, + "step": 7850 + }, + { + "epoch": 1.5821075962119686, + "grad_norm": 0.07867880910634995, + "learning_rate": 8.181839075847311e-05, + "loss": 0.1805, + "step": 7852 + }, + { + "epoch": 1.5825105782792666, + "grad_norm": 0.06224419176578522, + "learning_rate": 8.180810974439205e-05, + "loss": 0.1766, + "step": 7854 + }, + { + "epoch": 1.5829135603465647, + "grad_norm": 0.052312593907117844, + "learning_rate": 8.179782647071374e-05, + "loss": 0.2158, + "step": 7856 + }, + { + "epoch": 1.5833165424138627, + "grad_norm": 0.05663037672638893, + "learning_rate": 8.178754093816871e-05, + "loss": 0.2584, + "step": 7858 + }, + { + "epoch": 1.5837195244811606, + "grad_norm": 0.0479075126349926, + "learning_rate": 8.17772531474876e-05, + "loss": 0.195, + "step": 7860 + }, + { + "epoch": 1.5841225065484585, + "grad_norm": 0.043161310255527496, + "learning_rate": 8.176696309940124e-05, + "loss": 0.1939, + "step": 7862 + }, + { + "epoch": 1.5845254886157565, + "grad_norm": 0.050313014537096024, + "learning_rate": 8.175667079464063e-05, + "loss": 0.1988, + "step": 7864 + }, + { + "epoch": 1.5849284706830546, + "grad_norm": 0.0593692846596241, + "learning_rate": 8.174637623393692e-05, + "loss": 0.2183, + "step": 7866 + }, + { + "epoch": 1.5853314527503526, + "grad_norm": 0.0714186280965805, + "learning_rate": 8.17360794180214e-05, + "loss": 0.1856, + "step": 7868 + }, + { + "epoch": 1.5857344348176508, + "grad_norm": 0.05151008814573288, + "learning_rate": 8.172578034762557e-05, + "loss": 0.2213, + "step": 7870 + }, + { + "epoch": 1.5861374168849487, + "grad_norm": 0.0600179061293602, + "learning_rate": 8.171547902348102e-05, + "loss": 0.2162, + "step": 7872 + }, + { + "epoch": 1.5865403989522466, + "grad_norm": 0.03982606157660484, + "learning_rate": 8.170517544631957e-05, + "loss": 0.1737, + "step": 7874 + }, + { + "epoch": 1.5869433810195446, + "grad_norm": 0.04170709848403931, + "learning_rate": 8.169486961687318e-05, + "loss": 0.1791, + "step": 7876 + }, + { + "epoch": 1.5873463630868425, + "grad_norm": 0.0685291662812233, + "learning_rate": 8.168456153587391e-05, + "loss": 0.1808, + "step": 7878 + }, + { + "epoch": 1.5877493451541407, + "grad_norm": 0.03968818858265877, + "learning_rate": 8.167425120405408e-05, + "loss": 0.1677, + "step": 7880 + }, + { + "epoch": 1.5881523272214386, + "grad_norm": 0.039801955223083496, + "learning_rate": 8.166393862214609e-05, + "loss": 0.2136, + "step": 7882 + }, + { + "epoch": 1.5885553092887368, + "grad_norm": 0.0453149750828743, + "learning_rate": 8.165362379088255e-05, + "loss": 0.2101, + "step": 7884 + }, + { + "epoch": 1.5889582913560347, + "grad_norm": 0.04556663706898689, + "learning_rate": 8.16433067109962e-05, + "loss": 0.1948, + "step": 7886 + }, + { + "epoch": 1.5893612734233327, + "grad_norm": 0.05905143544077873, + "learning_rate": 8.163298738321994e-05, + "loss": 0.1713, + "step": 7888 + }, + { + "epoch": 1.5897642554906306, + "grad_norm": 0.053225282579660416, + "learning_rate": 8.162266580828684e-05, + "loss": 0.1932, + "step": 7890 + }, + { + "epoch": 1.5901672375579285, + "grad_norm": 0.05515950173139572, + "learning_rate": 8.161234198693014e-05, + "loss": 0.1598, + "step": 7892 + }, + { + "epoch": 1.5905702196252267, + "grad_norm": 0.0776933878660202, + "learning_rate": 8.160201591988322e-05, + "loss": 0.1912, + "step": 7894 + }, + { + "epoch": 1.5909732016925247, + "grad_norm": 0.06278533488512039, + "learning_rate": 8.159168760787964e-05, + "loss": 0.2172, + "step": 7896 + }, + { + "epoch": 1.5913761837598228, + "grad_norm": 0.05375386029481888, + "learning_rate": 8.158135705165309e-05, + "loss": 0.1618, + "step": 7898 + }, + { + "epoch": 1.5917791658271208, + "grad_norm": 0.04502701014280319, + "learning_rate": 8.157102425193744e-05, + "loss": 0.1778, + "step": 7900 + }, + { + "epoch": 1.5921821478944187, + "grad_norm": 0.07836294174194336, + "learning_rate": 8.156068920946672e-05, + "loss": 0.2136, + "step": 7902 + }, + { + "epoch": 1.5925851299617166, + "grad_norm": 0.0926329493522644, + "learning_rate": 8.155035192497509e-05, + "loss": 0.2013, + "step": 7904 + }, + { + "epoch": 1.5929881120290146, + "grad_norm": 0.052508678287267685, + "learning_rate": 8.154001239919694e-05, + "loss": 0.2187, + "step": 7906 + }, + { + "epoch": 1.5933910940963127, + "grad_norm": 0.060862090438604355, + "learning_rate": 8.152967063286674e-05, + "loss": 0.1776, + "step": 7908 + }, + { + "epoch": 1.5937940761636107, + "grad_norm": 0.09297354519367218, + "learning_rate": 8.151932662671918e-05, + "loss": 0.171, + "step": 7910 + }, + { + "epoch": 1.5941970582309088, + "grad_norm": 0.05653372034430504, + "learning_rate": 8.150898038148904e-05, + "loss": 0.2042, + "step": 7912 + }, + { + "epoch": 1.5946000402982068, + "grad_norm": 0.07802308350801468, + "learning_rate": 8.149863189791134e-05, + "loss": 0.2464, + "step": 7914 + }, + { + "epoch": 1.5950030223655047, + "grad_norm": 0.046020377427339554, + "learning_rate": 8.14882811767212e-05, + "loss": 0.2178, + "step": 7916 + }, + { + "epoch": 1.5954060044328027, + "grad_norm": 0.05628088861703873, + "learning_rate": 8.147792821865392e-05, + "loss": 0.2068, + "step": 7918 + }, + { + "epoch": 1.5958089865001006, + "grad_norm": 0.05584513023495674, + "learning_rate": 8.146757302444496e-05, + "loss": 0.1852, + "step": 7920 + }, + { + "epoch": 1.5962119685673988, + "grad_norm": 0.05673876032233238, + "learning_rate": 8.145721559482996e-05, + "loss": 0.2101, + "step": 7922 + }, + { + "epoch": 1.5966149506346967, + "grad_norm": 0.05031610652804375, + "learning_rate": 8.144685593054465e-05, + "loss": 0.2067, + "step": 7924 + }, + { + "epoch": 1.5970179327019949, + "grad_norm": 0.05931559205055237, + "learning_rate": 8.143649403232499e-05, + "loss": 0.1883, + "step": 7926 + }, + { + "epoch": 1.5974209147692928, + "grad_norm": 0.056502941995859146, + "learning_rate": 8.142612990090708e-05, + "loss": 0.2533, + "step": 7928 + }, + { + "epoch": 1.5978238968365908, + "grad_norm": 0.06808840483427048, + "learning_rate": 8.141576353702715e-05, + "loss": 0.2198, + "step": 7930 + }, + { + "epoch": 1.5982268789038887, + "grad_norm": 0.056388791650533676, + "learning_rate": 8.14053949414216e-05, + "loss": 0.244, + "step": 7932 + }, + { + "epoch": 1.5986298609711866, + "grad_norm": 0.04594703018665314, + "learning_rate": 8.139502411482705e-05, + "loss": 0.2231, + "step": 7934 + }, + { + "epoch": 1.5990328430384848, + "grad_norm": 0.07403270155191422, + "learning_rate": 8.138465105798018e-05, + "loss": 0.224, + "step": 7936 + }, + { + "epoch": 1.5994358251057827, + "grad_norm": 0.06828963756561279, + "learning_rate": 8.137427577161791e-05, + "loss": 0.2217, + "step": 7938 + }, + { + "epoch": 1.599838807173081, + "grad_norm": 0.04627368599176407, + "learning_rate": 8.136389825647726e-05, + "loss": 0.2574, + "step": 7940 + }, + { + "epoch": 1.6002417892403789, + "grad_norm": 0.043321721255779266, + "learning_rate": 8.135351851329543e-05, + "loss": 0.1637, + "step": 7942 + }, + { + "epoch": 1.6006447713076768, + "grad_norm": 0.06069161742925644, + "learning_rate": 8.134313654280978e-05, + "loss": 0.1878, + "step": 7944 + }, + { + "epoch": 1.6010477533749747, + "grad_norm": 0.05267888680100441, + "learning_rate": 8.133275234575784e-05, + "loss": 0.2145, + "step": 7946 + }, + { + "epoch": 1.6014507354422727, + "grad_norm": 0.03687436878681183, + "learning_rate": 8.132236592287729e-05, + "loss": 0.1785, + "step": 7948 + }, + { + "epoch": 1.6018537175095708, + "grad_norm": 0.04854239895939827, + "learning_rate": 8.131197727490596e-05, + "loss": 0.2081, + "step": 7950 + }, + { + "epoch": 1.6022566995768688, + "grad_norm": 0.05398216098546982, + "learning_rate": 8.130158640258182e-05, + "loss": 0.1895, + "step": 7952 + }, + { + "epoch": 1.602659681644167, + "grad_norm": 0.05339875444769859, + "learning_rate": 8.129119330664305e-05, + "loss": 0.1989, + "step": 7954 + }, + { + "epoch": 1.6030626637114649, + "grad_norm": 0.0771302655339241, + "learning_rate": 8.128079798782798e-05, + "loss": 0.2012, + "step": 7956 + }, + { + "epoch": 1.6034656457787628, + "grad_norm": 0.04624316468834877, + "learning_rate": 8.1270400446875e-05, + "loss": 0.209, + "step": 7958 + }, + { + "epoch": 1.6038686278460608, + "grad_norm": 0.04548148810863495, + "learning_rate": 8.126000068452281e-05, + "loss": 0.1821, + "step": 7960 + }, + { + "epoch": 1.6042716099133587, + "grad_norm": 0.0673830509185791, + "learning_rate": 8.124959870151017e-05, + "loss": 0.252, + "step": 7962 + }, + { + "epoch": 1.6046745919806569, + "grad_norm": 0.04191439598798752, + "learning_rate": 8.1239194498576e-05, + "loss": 0.1672, + "step": 7964 + }, + { + "epoch": 1.6050775740479548, + "grad_norm": 0.044628314673900604, + "learning_rate": 8.122878807645941e-05, + "loss": 0.1743, + "step": 7966 + }, + { + "epoch": 1.605480556115253, + "grad_norm": 0.06855150312185287, + "learning_rate": 8.121837943589967e-05, + "loss": 0.1659, + "step": 7968 + }, + { + "epoch": 1.605883538182551, + "grad_norm": 0.04463125020265579, + "learning_rate": 8.120796857763617e-05, + "loss": 0.1596, + "step": 7970 + }, + { + "epoch": 1.6062865202498489, + "grad_norm": 0.05310614034533501, + "learning_rate": 8.119755550240849e-05, + "loss": 0.1873, + "step": 7972 + }, + { + "epoch": 1.6066895023171468, + "grad_norm": 0.07126377522945404, + "learning_rate": 8.118714021095636e-05, + "loss": 0.1796, + "step": 7974 + }, + { + "epoch": 1.6070924843844447, + "grad_norm": 0.05647788941860199, + "learning_rate": 8.117672270401969e-05, + "loss": 0.1719, + "step": 7976 + }, + { + "epoch": 1.607495466451743, + "grad_norm": 0.05399641394615173, + "learning_rate": 8.116630298233847e-05, + "loss": 0.1828, + "step": 7978 + }, + { + "epoch": 1.607898448519041, + "grad_norm": 0.04089745134115219, + "learning_rate": 8.115588104665294e-05, + "loss": 0.1624, + "step": 7980 + }, + { + "epoch": 1.608301430586339, + "grad_norm": 0.09755789488554001, + "learning_rate": 8.114545689770345e-05, + "loss": 0.2119, + "step": 7982 + }, + { + "epoch": 1.608704412653637, + "grad_norm": 0.045670583844184875, + "learning_rate": 8.113503053623051e-05, + "loss": 0.1697, + "step": 7984 + }, + { + "epoch": 1.609107394720935, + "grad_norm": 0.0721781849861145, + "learning_rate": 8.11246019629748e-05, + "loss": 0.2267, + "step": 7986 + }, + { + "epoch": 1.6095103767882328, + "grad_norm": 0.04462890326976776, + "learning_rate": 8.111417117867715e-05, + "loss": 0.1754, + "step": 7988 + }, + { + "epoch": 1.6099133588555308, + "grad_norm": 0.06390615552663803, + "learning_rate": 8.110373818407852e-05, + "loss": 0.2028, + "step": 7990 + }, + { + "epoch": 1.610316340922829, + "grad_norm": 0.07268303632736206, + "learning_rate": 8.109330297992009e-05, + "loss": 0.2216, + "step": 7992 + }, + { + "epoch": 1.610719322990127, + "grad_norm": 0.051655374467372894, + "learning_rate": 8.10828655669431e-05, + "loss": 0.1643, + "step": 7994 + }, + { + "epoch": 1.611122305057425, + "grad_norm": 0.05964389815926552, + "learning_rate": 8.10724259458891e-05, + "loss": 0.1843, + "step": 7996 + }, + { + "epoch": 1.611525287124723, + "grad_norm": 0.04912543296813965, + "learning_rate": 8.106198411749964e-05, + "loss": 0.2204, + "step": 7998 + }, + { + "epoch": 1.611928269192021, + "grad_norm": 0.056842729449272156, + "learning_rate": 8.10515400825165e-05, + "loss": 0.2453, + "step": 8000 + }, + { + "epoch": 1.6123312512593189, + "grad_norm": 0.05034971982240677, + "learning_rate": 8.104109384168162e-05, + "loss": 0.213, + "step": 8002 + }, + { + "epoch": 1.6127342333266168, + "grad_norm": 0.042848989367485046, + "learning_rate": 8.103064539573706e-05, + "loss": 0.1757, + "step": 8004 + }, + { + "epoch": 1.613137215393915, + "grad_norm": 0.04389515891671181, + "learning_rate": 8.102019474542509e-05, + "loss": 0.1823, + "step": 8006 + }, + { + "epoch": 1.6135401974612131, + "grad_norm": 0.05041665956377983, + "learning_rate": 8.100974189148809e-05, + "loss": 0.2086, + "step": 8008 + }, + { + "epoch": 1.613943179528511, + "grad_norm": 0.06093117967247963, + "learning_rate": 8.099928683466861e-05, + "loss": 0.2188, + "step": 8010 + }, + { + "epoch": 1.614346161595809, + "grad_norm": 0.05891212821006775, + "learning_rate": 8.098882957570937e-05, + "loss": 0.1658, + "step": 8012 + }, + { + "epoch": 1.614749143663107, + "grad_norm": 0.03832251578569412, + "learning_rate": 8.097837011535325e-05, + "loss": 0.1527, + "step": 8014 + }, + { + "epoch": 1.615152125730405, + "grad_norm": 0.03451972082257271, + "learning_rate": 8.096790845434326e-05, + "loss": 0.1676, + "step": 8016 + }, + { + "epoch": 1.6155551077977028, + "grad_norm": 0.04633672162890434, + "learning_rate": 8.095744459342257e-05, + "loss": 0.2141, + "step": 8018 + }, + { + "epoch": 1.615958089865001, + "grad_norm": 0.037216588854789734, + "learning_rate": 8.094697853333453e-05, + "loss": 0.1653, + "step": 8020 + }, + { + "epoch": 1.6163610719322992, + "grad_norm": 0.05595245212316513, + "learning_rate": 8.093651027482263e-05, + "loss": 0.2069, + "step": 8022 + }, + { + "epoch": 1.616764053999597, + "grad_norm": 0.04852914810180664, + "learning_rate": 8.092603981863051e-05, + "loss": 0.1542, + "step": 8024 + }, + { + "epoch": 1.617167036066895, + "grad_norm": 0.048641812056303024, + "learning_rate": 8.091556716550198e-05, + "loss": 0.2141, + "step": 8026 + }, + { + "epoch": 1.617570018134193, + "grad_norm": 0.05191672965884209, + "learning_rate": 8.0905092316181e-05, + "loss": 0.2048, + "step": 8028 + }, + { + "epoch": 1.617973000201491, + "grad_norm": 0.06447257846593857, + "learning_rate": 8.089461527141169e-05, + "loss": 0.1651, + "step": 8030 + }, + { + "epoch": 1.618375982268789, + "grad_norm": 0.05003985017538071, + "learning_rate": 8.088413603193831e-05, + "loss": 0.1688, + "step": 8032 + }, + { + "epoch": 1.618778964336087, + "grad_norm": 0.04257288575172424, + "learning_rate": 8.087365459850531e-05, + "loss": 0.194, + "step": 8034 + }, + { + "epoch": 1.6191819464033852, + "grad_norm": 0.04626917093992233, + "learning_rate": 8.086317097185727e-05, + "loss": 0.229, + "step": 8036 + }, + { + "epoch": 1.6195849284706831, + "grad_norm": 0.04448797181248665, + "learning_rate": 8.085268515273891e-05, + "loss": 0.1372, + "step": 8038 + }, + { + "epoch": 1.619987910537981, + "grad_norm": 0.06380314379930496, + "learning_rate": 8.084219714189514e-05, + "loss": 0.1552, + "step": 8040 + }, + { + "epoch": 1.620390892605279, + "grad_norm": 0.08029188960790634, + "learning_rate": 8.083170694007102e-05, + "loss": 0.2016, + "step": 8042 + }, + { + "epoch": 1.620793874672577, + "grad_norm": 0.05923297628760338, + "learning_rate": 8.082121454801174e-05, + "loss": 0.2585, + "step": 8044 + }, + { + "epoch": 1.6211968567398751, + "grad_norm": 0.06981176137924194, + "learning_rate": 8.081071996646266e-05, + "loss": 0.1868, + "step": 8046 + }, + { + "epoch": 1.621599838807173, + "grad_norm": 0.0497167594730854, + "learning_rate": 8.080022319616931e-05, + "loss": 0.1648, + "step": 8048 + }, + { + "epoch": 1.6220028208744712, + "grad_norm": 0.047734495252370834, + "learning_rate": 8.078972423787738e-05, + "loss": 0.1367, + "step": 8050 + }, + { + "epoch": 1.6224058029417692, + "grad_norm": 0.05451458692550659, + "learning_rate": 8.077922309233267e-05, + "loss": 0.2072, + "step": 8052 + }, + { + "epoch": 1.6228087850090671, + "grad_norm": 0.04716449975967407, + "learning_rate": 8.076871976028117e-05, + "loss": 0.2184, + "step": 8054 + }, + { + "epoch": 1.623211767076365, + "grad_norm": 0.057981863617897034, + "learning_rate": 8.075821424246904e-05, + "loss": 0.1972, + "step": 8056 + }, + { + "epoch": 1.623614749143663, + "grad_norm": 0.056067463010549545, + "learning_rate": 8.074770653964254e-05, + "loss": 0.2298, + "step": 8058 + }, + { + "epoch": 1.6240177312109612, + "grad_norm": 0.05075881630182266, + "learning_rate": 8.073719665254815e-05, + "loss": 0.1309, + "step": 8060 + }, + { + "epoch": 1.624420713278259, + "grad_norm": 0.057302046567201614, + "learning_rate": 8.072668458193247e-05, + "loss": 0.2061, + "step": 8062 + }, + { + "epoch": 1.6248236953455573, + "grad_norm": 0.059380508959293365, + "learning_rate": 8.071617032854226e-05, + "loss": 0.1945, + "step": 8064 + }, + { + "epoch": 1.6252266774128552, + "grad_norm": 0.058844953775405884, + "learning_rate": 8.070565389312443e-05, + "loss": 0.2018, + "step": 8066 + }, + { + "epoch": 1.6256296594801531, + "grad_norm": 0.051923152059316635, + "learning_rate": 8.069513527642605e-05, + "loss": 0.1757, + "step": 8068 + }, + { + "epoch": 1.626032641547451, + "grad_norm": 0.05240127071738243, + "learning_rate": 8.068461447919435e-05, + "loss": 0.2156, + "step": 8070 + }, + { + "epoch": 1.626435623614749, + "grad_norm": 0.04726002365350723, + "learning_rate": 8.06740915021767e-05, + "loss": 0.2025, + "step": 8072 + }, + { + "epoch": 1.6268386056820472, + "grad_norm": 0.048952165991067886, + "learning_rate": 8.066356634612067e-05, + "loss": 0.219, + "step": 8074 + }, + { + "epoch": 1.6272415877493451, + "grad_norm": 0.04405834153294563, + "learning_rate": 8.065303901177392e-05, + "loss": 0.1825, + "step": 8076 + }, + { + "epoch": 1.6276445698166433, + "grad_norm": 0.0471949465572834, + "learning_rate": 8.064250949988429e-05, + "loss": 0.1616, + "step": 8078 + }, + { + "epoch": 1.6280475518839412, + "grad_norm": 0.05710560828447342, + "learning_rate": 8.06319778111998e-05, + "loss": 0.1621, + "step": 8080 + }, + { + "epoch": 1.6284505339512392, + "grad_norm": 0.05007312446832657, + "learning_rate": 8.062144394646858e-05, + "loss": 0.2078, + "step": 8082 + }, + { + "epoch": 1.6288535160185371, + "grad_norm": 0.043250661343336105, + "learning_rate": 8.061090790643897e-05, + "loss": 0.2212, + "step": 8084 + }, + { + "epoch": 1.629256498085835, + "grad_norm": 0.10560256987810135, + "learning_rate": 8.060036969185941e-05, + "loss": 0.156, + "step": 8086 + }, + { + "epoch": 1.6296594801531332, + "grad_norm": 0.048674967139959335, + "learning_rate": 8.058982930347852e-05, + "loss": 0.1581, + "step": 8088 + }, + { + "epoch": 1.6300624622204312, + "grad_norm": 0.05943009629845619, + "learning_rate": 8.05792867420451e-05, + "loss": 0.2013, + "step": 8090 + }, + { + "epoch": 1.6304654442877293, + "grad_norm": 0.043210141360759735, + "learning_rate": 8.056874200830803e-05, + "loss": 0.1863, + "step": 8092 + }, + { + "epoch": 1.6308684263550273, + "grad_norm": 0.05146521329879761, + "learning_rate": 8.055819510301642e-05, + "loss": 0.2236, + "step": 8094 + }, + { + "epoch": 1.6312714084223252, + "grad_norm": 0.0610971599817276, + "learning_rate": 8.054764602691951e-05, + "loss": 0.2232, + "step": 8096 + }, + { + "epoch": 1.6316743904896231, + "grad_norm": 0.04712080955505371, + "learning_rate": 8.053709478076668e-05, + "loss": 0.2287, + "step": 8098 + }, + { + "epoch": 1.632077372556921, + "grad_norm": 0.03435961529612541, + "learning_rate": 8.052654136530746e-05, + "loss": 0.1468, + "step": 8100 + }, + { + "epoch": 1.6324803546242193, + "grad_norm": 0.07136266678571701, + "learning_rate": 8.051598578129157e-05, + "loss": 0.227, + "step": 8102 + }, + { + "epoch": 1.6328833366915172, + "grad_norm": 0.05471991002559662, + "learning_rate": 8.050542802946886e-05, + "loss": 0.1854, + "step": 8104 + }, + { + "epoch": 1.6332863187588154, + "grad_norm": 0.046276554465293884, + "learning_rate": 8.04948681105893e-05, + "loss": 0.1891, + "step": 8106 + }, + { + "epoch": 1.6336893008261133, + "grad_norm": 0.050421275198459625, + "learning_rate": 8.048430602540311e-05, + "loss": 0.1964, + "step": 8108 + }, + { + "epoch": 1.6340922828934112, + "grad_norm": 0.04822089895606041, + "learning_rate": 8.047374177466056e-05, + "loss": 0.2245, + "step": 8110 + }, + { + "epoch": 1.6344952649607092, + "grad_norm": 0.055037956684827805, + "learning_rate": 8.046317535911214e-05, + "loss": 0.1881, + "step": 8112 + }, + { + "epoch": 1.6348982470280071, + "grad_norm": 0.05557211488485336, + "learning_rate": 8.045260677950846e-05, + "loss": 0.1872, + "step": 8114 + }, + { + "epoch": 1.6353012290953053, + "grad_norm": 0.1812106966972351, + "learning_rate": 8.044203603660027e-05, + "loss": 0.2067, + "step": 8116 + }, + { + "epoch": 1.6357042111626032, + "grad_norm": 0.05329279601573944, + "learning_rate": 8.043146313113854e-05, + "loss": 0.2216, + "step": 8118 + }, + { + "epoch": 1.6361071932299014, + "grad_norm": 0.03556535392999649, + "learning_rate": 8.042088806387436e-05, + "loss": 0.1244, + "step": 8120 + }, + { + "epoch": 1.6365101752971993, + "grad_norm": 0.0379241406917572, + "learning_rate": 8.041031083555892e-05, + "loss": 0.2192, + "step": 8122 + }, + { + "epoch": 1.6369131573644973, + "grad_norm": 0.07962547242641449, + "learning_rate": 8.039973144694364e-05, + "loss": 0.2132, + "step": 8124 + }, + { + "epoch": 1.6373161394317952, + "grad_norm": 0.05779522284865379, + "learning_rate": 8.038914989878005e-05, + "loss": 0.1916, + "step": 8126 + }, + { + "epoch": 1.6377191214990932, + "grad_norm": 0.06630147248506546, + "learning_rate": 8.037856619181985e-05, + "loss": 0.2015, + "step": 8128 + }, + { + "epoch": 1.6381221035663913, + "grad_norm": 0.0491768941283226, + "learning_rate": 8.03679803268149e-05, + "loss": 0.2045, + "step": 8130 + }, + { + "epoch": 1.6385250856336893, + "grad_norm": 0.04900708422064781, + "learning_rate": 8.035739230451719e-05, + "loss": 0.177, + "step": 8132 + }, + { + "epoch": 1.6389280677009874, + "grad_norm": 0.033459994941949844, + "learning_rate": 8.034680212567887e-05, + "loss": 0.1713, + "step": 8134 + }, + { + "epoch": 1.6393310497682854, + "grad_norm": 0.04298264533281326, + "learning_rate": 8.033620979105227e-05, + "loss": 0.2095, + "step": 8136 + }, + { + "epoch": 1.6397340318355833, + "grad_norm": 0.04192821681499481, + "learning_rate": 8.032561530138985e-05, + "loss": 0.1744, + "step": 8138 + }, + { + "epoch": 1.6401370139028812, + "grad_norm": 0.05017191916704178, + "learning_rate": 8.03150186574442e-05, + "loss": 0.1874, + "step": 8140 + }, + { + "epoch": 1.6405399959701792, + "grad_norm": 0.04265674576163292, + "learning_rate": 8.030441985996812e-05, + "loss": 0.1553, + "step": 8142 + }, + { + "epoch": 1.6409429780374774, + "grad_norm": 0.03842853009700775, + "learning_rate": 8.02938189097145e-05, + "loss": 0.1578, + "step": 8144 + }, + { + "epoch": 1.6413459601047753, + "grad_norm": 0.04175066202878952, + "learning_rate": 8.028321580743645e-05, + "loss": 0.2033, + "step": 8146 + }, + { + "epoch": 1.6417489421720735, + "grad_norm": 0.04019409045577049, + "learning_rate": 8.027261055388717e-05, + "loss": 0.1894, + "step": 8148 + }, + { + "epoch": 1.6421519242393714, + "grad_norm": 0.04538341239094734, + "learning_rate": 8.026200314982007e-05, + "loss": 0.1652, + "step": 8150 + }, + { + "epoch": 1.6425549063066693, + "grad_norm": 0.05302810296416283, + "learning_rate": 8.025139359598863e-05, + "loss": 0.2195, + "step": 8152 + }, + { + "epoch": 1.6429578883739673, + "grad_norm": 0.06037846952676773, + "learning_rate": 8.024078189314659e-05, + "loss": 0.2235, + "step": 8154 + }, + { + "epoch": 1.6433608704412652, + "grad_norm": 0.04620375484228134, + "learning_rate": 8.023016804204777e-05, + "loss": 0.1934, + "step": 8156 + }, + { + "epoch": 1.6437638525085634, + "grad_norm": 0.04581563174724579, + "learning_rate": 8.021955204344615e-05, + "loss": 0.1899, + "step": 8158 + }, + { + "epoch": 1.6441668345758613, + "grad_norm": 0.03631268069148064, + "learning_rate": 8.020893389809589e-05, + "loss": 0.1662, + "step": 8160 + }, + { + "epoch": 1.6445698166431595, + "grad_norm": 0.048094492405653, + "learning_rate": 8.019831360675127e-05, + "loss": 0.2303, + "step": 8162 + }, + { + "epoch": 1.6449727987104574, + "grad_norm": 0.04222455993294716, + "learning_rate": 8.018769117016675e-05, + "loss": 0.175, + "step": 8164 + }, + { + "epoch": 1.6453757807777554, + "grad_norm": 0.05244714021682739, + "learning_rate": 8.017706658909692e-05, + "loss": 0.1874, + "step": 8166 + }, + { + "epoch": 1.6457787628450533, + "grad_norm": 0.052215490490198135, + "learning_rate": 8.016643986429655e-05, + "loss": 0.1876, + "step": 8168 + }, + { + "epoch": 1.6461817449123513, + "grad_norm": 0.12432952225208282, + "learning_rate": 8.015581099652053e-05, + "loss": 0.1976, + "step": 8170 + }, + { + "epoch": 1.6465847269796494, + "grad_norm": 0.05211934074759483, + "learning_rate": 8.014517998652393e-05, + "loss": 0.2118, + "step": 8172 + }, + { + "epoch": 1.6469877090469474, + "grad_norm": 0.050590209662914276, + "learning_rate": 8.013454683506193e-05, + "loss": 0.1852, + "step": 8174 + }, + { + "epoch": 1.6473906911142455, + "grad_norm": 0.060968901962041855, + "learning_rate": 8.012391154288995e-05, + "loss": 0.208, + "step": 8176 + }, + { + "epoch": 1.6477936731815435, + "grad_norm": 0.051988277584314346, + "learning_rate": 8.011327411076346e-05, + "loss": 0.2507, + "step": 8178 + }, + { + "epoch": 1.6481966552488414, + "grad_norm": 0.03496647998690605, + "learning_rate": 8.010263453943814e-05, + "loss": 0.1696, + "step": 8180 + }, + { + "epoch": 1.6485996373161393, + "grad_norm": 0.05741781368851662, + "learning_rate": 8.00919928296698e-05, + "loss": 0.1829, + "step": 8182 + }, + { + "epoch": 1.6490026193834373, + "grad_norm": 0.04385066404938698, + "learning_rate": 8.00813489822144e-05, + "loss": 0.1922, + "step": 8184 + }, + { + "epoch": 1.6494056014507354, + "grad_norm": 0.06760777533054352, + "learning_rate": 8.007070299782808e-05, + "loss": 0.2192, + "step": 8186 + }, + { + "epoch": 1.6498085835180336, + "grad_norm": 0.03948403149843216, + "learning_rate": 8.006005487726713e-05, + "loss": 0.1596, + "step": 8188 + }, + { + "epoch": 1.6502115655853316, + "grad_norm": 0.05976463481783867, + "learning_rate": 8.004940462128794e-05, + "loss": 0.196, + "step": 8190 + }, + { + "epoch": 1.6506145476526295, + "grad_norm": 0.03693857043981552, + "learning_rate": 8.003875223064711e-05, + "loss": 0.1452, + "step": 8192 + }, + { + "epoch": 1.6510175297199274, + "grad_norm": 0.04535726457834244, + "learning_rate": 8.002809770610136e-05, + "loss": 0.1711, + "step": 8194 + }, + { + "epoch": 1.6514205117872254, + "grad_norm": 0.041834503412246704, + "learning_rate": 8.001744104840756e-05, + "loss": 0.1813, + "step": 8196 + }, + { + "epoch": 1.6518234938545233, + "grad_norm": 0.055996619164943695, + "learning_rate": 8.000678225832275e-05, + "loss": 0.1748, + "step": 8198 + }, + { + "epoch": 1.6522264759218215, + "grad_norm": 0.05592850595712662, + "learning_rate": 7.999612133660413e-05, + "loss": 0.2037, + "step": 8200 + }, + { + "epoch": 1.6526294579891196, + "grad_norm": 0.04505985602736473, + "learning_rate": 7.998545828400904e-05, + "loss": 0.2294, + "step": 8202 + }, + { + "epoch": 1.6530324400564176, + "grad_norm": 0.06555044651031494, + "learning_rate": 7.997479310129491e-05, + "loss": 0.2113, + "step": 8204 + }, + { + "epoch": 1.6534354221237155, + "grad_norm": 0.051124464720487595, + "learning_rate": 7.996412578921945e-05, + "loss": 0.1841, + "step": 8206 + }, + { + "epoch": 1.6538384041910135, + "grad_norm": 0.054283417761325836, + "learning_rate": 7.995345634854039e-05, + "loss": 0.1611, + "step": 8208 + }, + { + "epoch": 1.6542413862583114, + "grad_norm": 0.04832978919148445, + "learning_rate": 7.994278478001571e-05, + "loss": 0.181, + "step": 8210 + }, + { + "epoch": 1.6546443683256093, + "grad_norm": 0.05636703222990036, + "learning_rate": 7.993211108440348e-05, + "loss": 0.2443, + "step": 8212 + }, + { + "epoch": 1.6550473503929075, + "grad_norm": 0.0764659196138382, + "learning_rate": 7.992143526246195e-05, + "loss": 0.2322, + "step": 8214 + }, + { + "epoch": 1.6554503324602057, + "grad_norm": 0.0435616709291935, + "learning_rate": 7.99107573149495e-05, + "loss": 0.2397, + "step": 8216 + }, + { + "epoch": 1.6558533145275036, + "grad_norm": 0.04968470335006714, + "learning_rate": 7.99000772426247e-05, + "loss": 0.2146, + "step": 8218 + }, + { + "epoch": 1.6562562965948016, + "grad_norm": 0.039583925157785416, + "learning_rate": 7.988939504624622e-05, + "loss": 0.1396, + "step": 8220 + }, + { + "epoch": 1.6566592786620995, + "grad_norm": 0.06294752657413483, + "learning_rate": 7.987871072657293e-05, + "loss": 0.1602, + "step": 8222 + }, + { + "epoch": 1.6570622607293974, + "grad_norm": 0.04718125984072685, + "learning_rate": 7.98680242843638e-05, + "loss": 0.1643, + "step": 8224 + }, + { + "epoch": 1.6574652427966954, + "grad_norm": 0.04380892589688301, + "learning_rate": 7.985733572037802e-05, + "loss": 0.1804, + "step": 8226 + }, + { + "epoch": 1.6578682248639935, + "grad_norm": 0.05865201726555824, + "learning_rate": 7.984664503537483e-05, + "loss": 0.1857, + "step": 8228 + }, + { + "epoch": 1.6582712069312917, + "grad_norm": 0.05377458035945892, + "learning_rate": 7.983595223011371e-05, + "loss": 0.2507, + "step": 8230 + }, + { + "epoch": 1.6586741889985896, + "grad_norm": 0.05341102182865143, + "learning_rate": 7.982525730535426e-05, + "loss": 0.219, + "step": 8232 + }, + { + "epoch": 1.6590771710658876, + "grad_norm": 0.04187353700399399, + "learning_rate": 7.981456026185625e-05, + "loss": 0.2516, + "step": 8234 + }, + { + "epoch": 1.6594801531331855, + "grad_norm": 0.04068870469927788, + "learning_rate": 7.980386110037954e-05, + "loss": 0.2234, + "step": 8236 + }, + { + "epoch": 1.6598831352004835, + "grad_norm": 0.04932933673262596, + "learning_rate": 7.979315982168421e-05, + "loss": 0.1944, + "step": 8238 + }, + { + "epoch": 1.6602861172677816, + "grad_norm": 0.04017691686749458, + "learning_rate": 7.978245642653044e-05, + "loss": 0.168, + "step": 8240 + }, + { + "epoch": 1.6606890993350796, + "grad_norm": 0.05114683881402016, + "learning_rate": 7.977175091567862e-05, + "loss": 0.1944, + "step": 8242 + }, + { + "epoch": 1.6610920814023777, + "grad_norm": 0.06419112533330917, + "learning_rate": 7.976104328988921e-05, + "loss": 0.2418, + "step": 8244 + }, + { + "epoch": 1.6614950634696757, + "grad_norm": 0.0576966255903244, + "learning_rate": 7.97503335499229e-05, + "loss": 0.2162, + "step": 8246 + }, + { + "epoch": 1.6618980455369736, + "grad_norm": 0.044565919786691666, + "learning_rate": 7.973962169654044e-05, + "loss": 0.2053, + "step": 8248 + }, + { + "epoch": 1.6623010276042716, + "grad_norm": 0.046498291194438934, + "learning_rate": 7.972890773050284e-05, + "loss": 0.2392, + "step": 8250 + }, + { + "epoch": 1.6627040096715695, + "grad_norm": 0.058883845806121826, + "learning_rate": 7.971819165257117e-05, + "loss": 0.1683, + "step": 8252 + }, + { + "epoch": 1.6631069917388677, + "grad_norm": 0.03590350225567818, + "learning_rate": 7.97074734635067e-05, + "loss": 0.1825, + "step": 8254 + }, + { + "epoch": 1.6635099738061656, + "grad_norm": 0.04365016892552376, + "learning_rate": 7.969675316407083e-05, + "loss": 0.1716, + "step": 8256 + }, + { + "epoch": 1.6639129558734638, + "grad_norm": 0.038824837654829025, + "learning_rate": 7.96860307550251e-05, + "loss": 0.1692, + "step": 8258 + }, + { + "epoch": 1.6643159379407617, + "grad_norm": 0.0652041882276535, + "learning_rate": 7.967530623713122e-05, + "loss": 0.1787, + "step": 8260 + }, + { + "epoch": 1.6647189200080597, + "grad_norm": 0.05665358901023865, + "learning_rate": 7.966457961115104e-05, + "loss": 0.2032, + "step": 8262 + }, + { + "epoch": 1.6651219020753576, + "grad_norm": 0.04727930948138237, + "learning_rate": 7.965385087784657e-05, + "loss": 0.1644, + "step": 8264 + }, + { + "epoch": 1.6655248841426555, + "grad_norm": 0.035444822162389755, + "learning_rate": 7.964312003797996e-05, + "loss": 0.1738, + "step": 8266 + }, + { + "epoch": 1.6659278662099537, + "grad_norm": 0.045218631625175476, + "learning_rate": 7.963238709231351e-05, + "loss": 0.1681, + "step": 8268 + }, + { + "epoch": 1.6663308482772516, + "grad_norm": 0.07512037456035614, + "learning_rate": 7.962165204160966e-05, + "loss": 0.2618, + "step": 8270 + }, + { + "epoch": 1.6667338303445498, + "grad_norm": 0.03968672454357147, + "learning_rate": 7.961091488663105e-05, + "loss": 0.1868, + "step": 8272 + }, + { + "epoch": 1.6671368124118477, + "grad_norm": 0.03970741853117943, + "learning_rate": 7.960017562814038e-05, + "loss": 0.1676, + "step": 8274 + }, + { + "epoch": 1.6675397944791457, + "grad_norm": 0.0486428327858448, + "learning_rate": 7.958943426690056e-05, + "loss": 0.2157, + "step": 8276 + }, + { + "epoch": 1.6679427765464436, + "grad_norm": 0.0547938346862793, + "learning_rate": 7.957869080367466e-05, + "loss": 0.1734, + "step": 8278 + }, + { + "epoch": 1.6683457586137416, + "grad_norm": 0.03951823711395264, + "learning_rate": 7.956794523922589e-05, + "loss": 0.1634, + "step": 8280 + }, + { + "epoch": 1.6687487406810397, + "grad_norm": 0.06093864515423775, + "learning_rate": 7.955719757431755e-05, + "loss": 0.1867, + "step": 8282 + }, + { + "epoch": 1.6691517227483377, + "grad_norm": 0.04404790699481964, + "learning_rate": 7.95464478097132e-05, + "loss": 0.1829, + "step": 8284 + }, + { + "epoch": 1.6695547048156358, + "grad_norm": 0.05264603719115257, + "learning_rate": 7.95356959461764e-05, + "loss": 0.1814, + "step": 8286 + }, + { + "epoch": 1.6699576868829338, + "grad_norm": 0.05429494380950928, + "learning_rate": 7.952494198447102e-05, + "loss": 0.1782, + "step": 8288 + }, + { + "epoch": 1.6703606689502317, + "grad_norm": 0.04684220254421234, + "learning_rate": 7.9514185925361e-05, + "loss": 0.2271, + "step": 8290 + }, + { + "epoch": 1.6707636510175297, + "grad_norm": 0.04970792680978775, + "learning_rate": 7.950342776961038e-05, + "loss": 0.1925, + "step": 8292 + }, + { + "epoch": 1.6711666330848276, + "grad_norm": 0.061497241258621216, + "learning_rate": 7.949266751798345e-05, + "loss": 0.2345, + "step": 8294 + }, + { + "epoch": 1.6715696151521258, + "grad_norm": 0.0509205162525177, + "learning_rate": 7.948190517124459e-05, + "loss": 0.1747, + "step": 8296 + }, + { + "epoch": 1.6719725972194237, + "grad_norm": 0.05777128040790558, + "learning_rate": 7.947114073015833e-05, + "loss": 0.2091, + "step": 8298 + }, + { + "epoch": 1.6723755792867219, + "grad_norm": 0.03959937021136284, + "learning_rate": 7.946037419548936e-05, + "loss": 0.1686, + "step": 8300 + }, + { + "epoch": 1.6727785613540198, + "grad_norm": 0.0511389821767807, + "learning_rate": 7.944960556800254e-05, + "loss": 0.2136, + "step": 8302 + }, + { + "epoch": 1.6731815434213178, + "grad_norm": 0.05945146456360817, + "learning_rate": 7.943883484846282e-05, + "loss": 0.1785, + "step": 8304 + }, + { + "epoch": 1.6735845254886157, + "grad_norm": 0.0385262668132782, + "learning_rate": 7.942806203763535e-05, + "loss": 0.1676, + "step": 8306 + }, + { + "epoch": 1.6739875075559136, + "grad_norm": 0.05760958045721054, + "learning_rate": 7.941728713628544e-05, + "loss": 0.1924, + "step": 8308 + }, + { + "epoch": 1.6743904896232118, + "grad_norm": 0.05645795539021492, + "learning_rate": 7.940651014517848e-05, + "loss": 0.1755, + "step": 8310 + }, + { + "epoch": 1.6747934716905097, + "grad_norm": 0.04871126636862755, + "learning_rate": 7.939573106508008e-05, + "loss": 0.1909, + "step": 8312 + }, + { + "epoch": 1.675196453757808, + "grad_norm": 0.043447740375995636, + "learning_rate": 7.938494989675594e-05, + "loss": 0.2446, + "step": 8314 + }, + { + "epoch": 1.6755994358251058, + "grad_norm": 0.038542792201042175, + "learning_rate": 7.937416664097195e-05, + "loss": 0.1607, + "step": 8316 + }, + { + "epoch": 1.6760024178924038, + "grad_norm": 0.05882372334599495, + "learning_rate": 7.936338129849415e-05, + "loss": 0.1976, + "step": 8318 + }, + { + "epoch": 1.6764053999597017, + "grad_norm": 0.031100820749998093, + "learning_rate": 7.935259387008871e-05, + "loss": 0.1232, + "step": 8320 + }, + { + "epoch": 1.6768083820269997, + "grad_norm": 0.04631880298256874, + "learning_rate": 7.934180435652194e-05, + "loss": 0.1579, + "step": 8322 + }, + { + "epoch": 1.6772113640942978, + "grad_norm": 0.04775720462203026, + "learning_rate": 7.93310127585603e-05, + "loss": 0.1832, + "step": 8324 + }, + { + "epoch": 1.6776143461615958, + "grad_norm": 0.04856050759553909, + "learning_rate": 7.932021907697044e-05, + "loss": 0.1944, + "step": 8326 + }, + { + "epoch": 1.678017328228894, + "grad_norm": 0.04146652668714523, + "learning_rate": 7.93094233125191e-05, + "loss": 0.1789, + "step": 8328 + }, + { + "epoch": 1.6784203102961919, + "grad_norm": 0.039745211601257324, + "learning_rate": 7.92986254659732e-05, + "loss": 0.2204, + "step": 8330 + }, + { + "epoch": 1.6788232923634898, + "grad_norm": 0.047939032316207886, + "learning_rate": 7.92878255380998e-05, + "loss": 0.2244, + "step": 8332 + }, + { + "epoch": 1.6792262744307878, + "grad_norm": 0.06156514957547188, + "learning_rate": 7.927702352966611e-05, + "loss": 0.2438, + "step": 8334 + }, + { + "epoch": 1.6796292564980857, + "grad_norm": 0.049319975078105927, + "learning_rate": 7.92662194414395e-05, + "loss": 0.1942, + "step": 8336 + }, + { + "epoch": 1.6800322385653839, + "grad_norm": 0.06015758216381073, + "learning_rate": 7.925541327418747e-05, + "loss": 0.197, + "step": 8338 + }, + { + "epoch": 1.6804352206326818, + "grad_norm": 0.0507478229701519, + "learning_rate": 7.924460502867766e-05, + "loss": 0.2398, + "step": 8340 + }, + { + "epoch": 1.68083820269998, + "grad_norm": 0.04949882626533508, + "learning_rate": 7.923379470567787e-05, + "loss": 0.2022, + "step": 8342 + }, + { + "epoch": 1.681241184767278, + "grad_norm": 0.0555623322725296, + "learning_rate": 7.922298230595607e-05, + "loss": 0.2119, + "step": 8344 + }, + { + "epoch": 1.6816441668345758, + "grad_norm": 0.04433630779385567, + "learning_rate": 7.921216783028034e-05, + "loss": 0.2112, + "step": 8346 + }, + { + "epoch": 1.6820471489018738, + "grad_norm": 0.051720310002565384, + "learning_rate": 7.920135127941893e-05, + "loss": 0.2542, + "step": 8348 + }, + { + "epoch": 1.6824501309691717, + "grad_norm": 0.05196801573038101, + "learning_rate": 7.91905326541402e-05, + "loss": 0.2263, + "step": 8350 + }, + { + "epoch": 1.68285311303647, + "grad_norm": 0.03836916387081146, + "learning_rate": 7.917971195521274e-05, + "loss": 0.1765, + "step": 8352 + }, + { + "epoch": 1.6832560951037678, + "grad_norm": 0.04000631347298622, + "learning_rate": 7.916888918340521e-05, + "loss": 0.2161, + "step": 8354 + }, + { + "epoch": 1.683659077171066, + "grad_norm": 0.06888539344072342, + "learning_rate": 7.915806433948643e-05, + "loss": 0.2373, + "step": 8356 + }, + { + "epoch": 1.684062059238364, + "grad_norm": 0.05565977841615677, + "learning_rate": 7.914723742422539e-05, + "loss": 0.1777, + "step": 8358 + }, + { + "epoch": 1.6844650413056619, + "grad_norm": 0.047493912279605865, + "learning_rate": 7.913640843839122e-05, + "loss": 0.2043, + "step": 8360 + }, + { + "epoch": 1.6848680233729598, + "grad_norm": 0.04517120495438576, + "learning_rate": 7.912557738275319e-05, + "loss": 0.1706, + "step": 8362 + }, + { + "epoch": 1.6852710054402578, + "grad_norm": 0.07502885162830353, + "learning_rate": 7.911474425808072e-05, + "loss": 0.1983, + "step": 8364 + }, + { + "epoch": 1.685673987507556, + "grad_norm": 0.06703903526067734, + "learning_rate": 7.910390906514338e-05, + "loss": 0.1789, + "step": 8366 + }, + { + "epoch": 1.6860769695748539, + "grad_norm": 0.06102665886282921, + "learning_rate": 7.90930718047109e-05, + "loss": 0.2034, + "step": 8368 + }, + { + "epoch": 1.686479951642152, + "grad_norm": 0.08277782052755356, + "learning_rate": 7.90822324775531e-05, + "loss": 0.2115, + "step": 8370 + }, + { + "epoch": 1.68688293370945, + "grad_norm": 0.04407776519656181, + "learning_rate": 7.907139108444004e-05, + "loss": 0.1716, + "step": 8372 + }, + { + "epoch": 1.687285915776748, + "grad_norm": 0.05880500376224518, + "learning_rate": 7.906054762614184e-05, + "loss": 0.1869, + "step": 8374 + }, + { + "epoch": 1.6876888978440459, + "grad_norm": 0.055312976241111755, + "learning_rate": 7.904970210342882e-05, + "loss": 0.1981, + "step": 8376 + }, + { + "epoch": 1.6880918799113438, + "grad_norm": 0.05081368237733841, + "learning_rate": 7.90388545170714e-05, + "loss": 0.1713, + "step": 8378 + }, + { + "epoch": 1.688494861978642, + "grad_norm": 0.03774468973278999, + "learning_rate": 7.902800486784021e-05, + "loss": 0.1554, + "step": 8380 + }, + { + "epoch": 1.6888978440459401, + "grad_norm": 0.04858670011162758, + "learning_rate": 7.901715315650597e-05, + "loss": 0.1874, + "step": 8382 + }, + { + "epoch": 1.689300826113238, + "grad_norm": 0.03782944008708, + "learning_rate": 7.900629938383959e-05, + "loss": 0.1464, + "step": 8384 + }, + { + "epoch": 1.689703808180536, + "grad_norm": 0.04310869425535202, + "learning_rate": 7.899544355061209e-05, + "loss": 0.1399, + "step": 8386 + }, + { + "epoch": 1.690106790247834, + "grad_norm": 0.06209326907992363, + "learning_rate": 7.898458565759463e-05, + "loss": 0.1511, + "step": 8388 + }, + { + "epoch": 1.6905097723151319, + "grad_norm": 0.05323124676942825, + "learning_rate": 7.897372570555858e-05, + "loss": 0.2171, + "step": 8390 + }, + { + "epoch": 1.6909127543824298, + "grad_norm": 0.0539429634809494, + "learning_rate": 7.89628636952754e-05, + "loss": 0.2134, + "step": 8392 + }, + { + "epoch": 1.691315736449728, + "grad_norm": 0.039286866784095764, + "learning_rate": 7.895199962751668e-05, + "loss": 0.1713, + "step": 8394 + }, + { + "epoch": 1.6917187185170262, + "grad_norm": 0.0580926276743412, + "learning_rate": 7.894113350305421e-05, + "loss": 0.2415, + "step": 8396 + }, + { + "epoch": 1.692121700584324, + "grad_norm": 0.056929927319288254, + "learning_rate": 7.893026532265992e-05, + "loss": 0.2097, + "step": 8398 + }, + { + "epoch": 1.692524682651622, + "grad_norm": 0.039593666791915894, + "learning_rate": 7.891939508710583e-05, + "loss": 0.1637, + "step": 8400 + }, + { + "epoch": 1.69292766471892, + "grad_norm": 0.052947916090488434, + "learning_rate": 7.890852279716416e-05, + "loss": 0.1893, + "step": 8402 + }, + { + "epoch": 1.693330646786218, + "grad_norm": 0.0409802608191967, + "learning_rate": 7.889764845360727e-05, + "loss": 0.1594, + "step": 8404 + }, + { + "epoch": 1.6937336288535159, + "grad_norm": 0.04634273424744606, + "learning_rate": 7.888677205720767e-05, + "loss": 0.2099, + "step": 8406 + }, + { + "epoch": 1.694136610920814, + "grad_norm": 0.060330476611852646, + "learning_rate": 7.887589360873794e-05, + "loss": 0.2091, + "step": 8408 + }, + { + "epoch": 1.6945395929881122, + "grad_norm": 0.05978460609912872, + "learning_rate": 7.886501310897094e-05, + "loss": 0.2053, + "step": 8410 + }, + { + "epoch": 1.6949425750554101, + "grad_norm": 0.0454874113202095, + "learning_rate": 7.885413055867956e-05, + "loss": 0.1795, + "step": 8412 + }, + { + "epoch": 1.695345557122708, + "grad_norm": 0.05972779542207718, + "learning_rate": 7.884324595863688e-05, + "loss": 0.2175, + "step": 8414 + }, + { + "epoch": 1.695748539190006, + "grad_norm": 0.054317884147167206, + "learning_rate": 7.883235930961617e-05, + "loss": 0.1653, + "step": 8416 + }, + { + "epoch": 1.696151521257304, + "grad_norm": 0.05237840861082077, + "learning_rate": 7.882147061239074e-05, + "loss": 0.192, + "step": 8418 + }, + { + "epoch": 1.696554503324602, + "grad_norm": 0.04566609114408493, + "learning_rate": 7.881057986773412e-05, + "loss": 0.2368, + "step": 8420 + }, + { + "epoch": 1.6969574853919, + "grad_norm": 0.05352894216775894, + "learning_rate": 7.879968707642e-05, + "loss": 0.2011, + "step": 8422 + }, + { + "epoch": 1.6973604674591982, + "grad_norm": 0.051115117967128754, + "learning_rate": 7.878879223922215e-05, + "loss": 0.1936, + "step": 8424 + }, + { + "epoch": 1.6977634495264962, + "grad_norm": 0.07551829516887665, + "learning_rate": 7.877789535691455e-05, + "loss": 0.2121, + "step": 8426 + }, + { + "epoch": 1.698166431593794, + "grad_norm": 0.048008158802986145, + "learning_rate": 7.87669964302713e-05, + "loss": 0.1934, + "step": 8428 + }, + { + "epoch": 1.698569413661092, + "grad_norm": 0.04184304550290108, + "learning_rate": 7.875609546006661e-05, + "loss": 0.1513, + "step": 8430 + }, + { + "epoch": 1.69897239572839, + "grad_norm": 0.06528772413730621, + "learning_rate": 7.87451924470749e-05, + "loss": 0.1937, + "step": 8432 + }, + { + "epoch": 1.6993753777956881, + "grad_norm": 0.06352069228887558, + "learning_rate": 7.87342873920707e-05, + "loss": 0.188, + "step": 8434 + }, + { + "epoch": 1.699778359862986, + "grad_norm": 0.06633667647838593, + "learning_rate": 7.872338029582867e-05, + "loss": 0.1779, + "step": 8436 + }, + { + "epoch": 1.7001813419302843, + "grad_norm": 0.04314619302749634, + "learning_rate": 7.871247115912361e-05, + "loss": 0.1713, + "step": 8438 + }, + { + "epoch": 1.7005843239975822, + "grad_norm": 0.07472950220108032, + "learning_rate": 7.870155998273055e-05, + "loss": 0.1738, + "step": 8440 + }, + { + "epoch": 1.7009873060648801, + "grad_norm": 0.09097413718700409, + "learning_rate": 7.869064676742456e-05, + "loss": 0.2039, + "step": 8442 + }, + { + "epoch": 1.701390288132178, + "grad_norm": 0.052758365869522095, + "learning_rate": 7.867973151398091e-05, + "loss": 0.1801, + "step": 8444 + }, + { + "epoch": 1.701793270199476, + "grad_norm": 0.05923820286989212, + "learning_rate": 7.866881422317501e-05, + "loss": 0.2125, + "step": 8446 + }, + { + "epoch": 1.7021962522667742, + "grad_norm": 0.06283913552761078, + "learning_rate": 7.865789489578239e-05, + "loss": 0.1614, + "step": 8448 + }, + { + "epoch": 1.7025992343340721, + "grad_norm": 0.045529961585998535, + "learning_rate": 7.864697353257872e-05, + "loss": 0.1559, + "step": 8450 + }, + { + "epoch": 1.7030022164013703, + "grad_norm": 0.06094701588153839, + "learning_rate": 7.86360501343399e-05, + "loss": 0.1766, + "step": 8452 + }, + { + "epoch": 1.7034051984686682, + "grad_norm": 0.10539116710424423, + "learning_rate": 7.862512470184187e-05, + "loss": 0.213, + "step": 8454 + }, + { + "epoch": 1.7038081805359662, + "grad_norm": 0.06393177807331085, + "learning_rate": 7.861419723586074e-05, + "loss": 0.1821, + "step": 8456 + }, + { + "epoch": 1.704211162603264, + "grad_norm": 0.053990524262189865, + "learning_rate": 7.860326773717281e-05, + "loss": 0.1965, + "step": 8458 + }, + { + "epoch": 1.704614144670562, + "grad_norm": 0.06103832647204399, + "learning_rate": 7.85923362065545e-05, + "loss": 0.1929, + "step": 8460 + }, + { + "epoch": 1.7050171267378602, + "grad_norm": 0.05043712630867958, + "learning_rate": 7.858140264478233e-05, + "loss": 0.2383, + "step": 8462 + }, + { + "epoch": 1.7054201088051582, + "grad_norm": 0.058011483401060104, + "learning_rate": 7.857046705263305e-05, + "loss": 0.1691, + "step": 8464 + }, + { + "epoch": 1.7058230908724563, + "grad_norm": 0.07188203185796738, + "learning_rate": 7.855952943088346e-05, + "loss": 0.2316, + "step": 8466 + }, + { + "epoch": 1.7062260729397543, + "grad_norm": 0.0951547920703888, + "learning_rate": 7.854858978031057e-05, + "loss": 0.2202, + "step": 8468 + }, + { + "epoch": 1.7066290550070522, + "grad_norm": 0.045940931886434555, + "learning_rate": 7.853764810169153e-05, + "loss": 0.1967, + "step": 8470 + }, + { + "epoch": 1.7070320370743501, + "grad_norm": 0.06384126842021942, + "learning_rate": 7.852670439580362e-05, + "loss": 0.2567, + "step": 8472 + }, + { + "epoch": 1.707435019141648, + "grad_norm": 0.05669160187244415, + "learning_rate": 7.851575866342424e-05, + "loss": 0.2404, + "step": 8474 + }, + { + "epoch": 1.7078380012089462, + "grad_norm": 0.04451169818639755, + "learning_rate": 7.850481090533097e-05, + "loss": 0.1893, + "step": 8476 + }, + { + "epoch": 1.7082409832762442, + "grad_norm": 0.06958547234535217, + "learning_rate": 7.84938611223015e-05, + "loss": 0.2434, + "step": 8478 + }, + { + "epoch": 1.7086439653435423, + "grad_norm": 0.05522121861577034, + "learning_rate": 7.848290931511372e-05, + "loss": 0.217, + "step": 8480 + }, + { + "epoch": 1.7090469474108403, + "grad_norm": 0.056487396359443665, + "learning_rate": 7.847195548454564e-05, + "loss": 0.212, + "step": 8482 + }, + { + "epoch": 1.7094499294781382, + "grad_norm": 0.058997754007577896, + "learning_rate": 7.846099963137535e-05, + "loss": 0.2267, + "step": 8484 + }, + { + "epoch": 1.7098529115454362, + "grad_norm": 0.042182806879282, + "learning_rate": 7.845004175638116e-05, + "loss": 0.2063, + "step": 8486 + }, + { + "epoch": 1.7102558936127341, + "grad_norm": 0.05909043177962303, + "learning_rate": 7.843908186034152e-05, + "loss": 0.1918, + "step": 8488 + }, + { + "epoch": 1.7106588756800323, + "grad_norm": 0.04273051396012306, + "learning_rate": 7.842811994403496e-05, + "loss": 0.2098, + "step": 8490 + }, + { + "epoch": 1.7110618577473302, + "grad_norm": 0.04888763651251793, + "learning_rate": 7.841715600824024e-05, + "loss": 0.1562, + "step": 8492 + }, + { + "epoch": 1.7114648398146284, + "grad_norm": 0.048436980694532394, + "learning_rate": 7.840619005373621e-05, + "loss": 0.2039, + "step": 8494 + }, + { + "epoch": 1.7118678218819263, + "grad_norm": 0.03377070277929306, + "learning_rate": 7.839522208130186e-05, + "loss": 0.19, + "step": 8496 + }, + { + "epoch": 1.7122708039492243, + "grad_norm": 0.04444821551442146, + "learning_rate": 7.838425209171633e-05, + "loss": 0.2197, + "step": 8498 + }, + { + "epoch": 1.7126737860165222, + "grad_norm": 0.03357812389731407, + "learning_rate": 7.837328008575895e-05, + "loss": 0.1413, + "step": 8500 + }, + { + "epoch": 1.7130767680838201, + "grad_norm": 0.045264676213264465, + "learning_rate": 7.836230606420911e-05, + "loss": 0.1612, + "step": 8502 + }, + { + "epoch": 1.7134797501511183, + "grad_norm": 0.043273936957120895, + "learning_rate": 7.835133002784642e-05, + "loss": 0.1633, + "step": 8504 + }, + { + "epoch": 1.7138827322184162, + "grad_norm": 0.0491492860019207, + "learning_rate": 7.834035197745059e-05, + "loss": 0.1686, + "step": 8506 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.08964493870735168, + "learning_rate": 7.832937191380147e-05, + "loss": 0.1782, + "step": 8508 + }, + { + "epoch": 1.7146886963530124, + "grad_norm": 0.04667485132813454, + "learning_rate": 7.831838983767907e-05, + "loss": 0.2296, + "step": 8510 + }, + { + "epoch": 1.7150916784203103, + "grad_norm": 0.06557079404592514, + "learning_rate": 7.830740574986355e-05, + "loss": 0.1757, + "step": 8512 + }, + { + "epoch": 1.7154946604876082, + "grad_norm": 0.03733866289258003, + "learning_rate": 7.82964196511352e-05, + "loss": 0.191, + "step": 8514 + }, + { + "epoch": 1.7158976425549062, + "grad_norm": 0.04955060034990311, + "learning_rate": 7.828543154227445e-05, + "loss": 0.1708, + "step": 8516 + }, + { + "epoch": 1.7163006246222043, + "grad_norm": 0.04552415385842323, + "learning_rate": 7.827444142406188e-05, + "loss": 0.1344, + "step": 8518 + }, + { + "epoch": 1.7167036066895023, + "grad_norm": 0.0272463858127594, + "learning_rate": 7.826344929727821e-05, + "loss": 0.1609, + "step": 8520 + }, + { + "epoch": 1.7171065887568004, + "grad_norm": 0.04317507520318031, + "learning_rate": 7.82524551627043e-05, + "loss": 0.1573, + "step": 8522 + }, + { + "epoch": 1.7175095708240984, + "grad_norm": 0.05297327786684036, + "learning_rate": 7.824145902112115e-05, + "loss": 0.1895, + "step": 8524 + }, + { + "epoch": 1.7179125528913963, + "grad_norm": 0.0684426873922348, + "learning_rate": 7.823046087330992e-05, + "loss": 0.2023, + "step": 8526 + }, + { + "epoch": 1.7183155349586943, + "grad_norm": 0.05523200333118439, + "learning_rate": 7.82194607200519e-05, + "loss": 0.1781, + "step": 8528 + }, + { + "epoch": 1.7187185170259922, + "grad_norm": 0.07310648262500763, + "learning_rate": 7.820845856212853e-05, + "loss": 0.2673, + "step": 8530 + }, + { + "epoch": 1.7191214990932904, + "grad_norm": 0.05852164700627327, + "learning_rate": 7.819745440032136e-05, + "loss": 0.1842, + "step": 8532 + }, + { + "epoch": 1.7195244811605883, + "grad_norm": 0.05170690268278122, + "learning_rate": 7.818644823541215e-05, + "loss": 0.1529, + "step": 8534 + }, + { + "epoch": 1.7199274632278865, + "grad_norm": 0.04495406523346901, + "learning_rate": 7.817544006818272e-05, + "loss": 0.1847, + "step": 8536 + }, + { + "epoch": 1.7203304452951844, + "grad_norm": 0.06066835671663284, + "learning_rate": 7.816442989941508e-05, + "loss": 0.2595, + "step": 8538 + }, + { + "epoch": 1.7207334273624824, + "grad_norm": 0.04806002229452133, + "learning_rate": 7.815341772989138e-05, + "loss": 0.2052, + "step": 8540 + }, + { + "epoch": 1.7211364094297803, + "grad_norm": 0.060100167989730835, + "learning_rate": 7.814240356039392e-05, + "loss": 0.2143, + "step": 8542 + }, + { + "epoch": 1.7215393914970782, + "grad_norm": 0.04579418525099754, + "learning_rate": 7.813138739170511e-05, + "loss": 0.2379, + "step": 8544 + }, + { + "epoch": 1.7219423735643764, + "grad_norm": 0.041652414947748184, + "learning_rate": 7.812036922460754e-05, + "loss": 0.1726, + "step": 8546 + }, + { + "epoch": 1.7223453556316743, + "grad_norm": 0.051688678562641144, + "learning_rate": 7.810934905988392e-05, + "loss": 0.1802, + "step": 8548 + }, + { + "epoch": 1.7227483376989725, + "grad_norm": 0.05031515657901764, + "learning_rate": 7.809832689831707e-05, + "loss": 0.2386, + "step": 8550 + }, + { + "epoch": 1.7231513197662705, + "grad_norm": 0.04816931113600731, + "learning_rate": 7.808730274069003e-05, + "loss": 0.1636, + "step": 8552 + }, + { + "epoch": 1.7235543018335684, + "grad_norm": 0.06899578869342804, + "learning_rate": 7.807627658778592e-05, + "loss": 0.1706, + "step": 8554 + }, + { + "epoch": 1.7239572839008663, + "grad_norm": 0.03780083358287811, + "learning_rate": 7.806524844038803e-05, + "loss": 0.2101, + "step": 8556 + }, + { + "epoch": 1.7243602659681643, + "grad_norm": 0.04826981946825981, + "learning_rate": 7.805421829927977e-05, + "loss": 0.1997, + "step": 8558 + }, + { + "epoch": 1.7247632480354624, + "grad_norm": 0.050643905997276306, + "learning_rate": 7.80431861652447e-05, + "loss": 0.2093, + "step": 8560 + }, + { + "epoch": 1.7251662301027604, + "grad_norm": 0.048531509935855865, + "learning_rate": 7.803215203906655e-05, + "loss": 0.1975, + "step": 8562 + }, + { + "epoch": 1.7255692121700585, + "grad_norm": 0.0731339156627655, + "learning_rate": 7.802111592152913e-05, + "loss": 0.195, + "step": 8564 + }, + { + "epoch": 1.7259721942373565, + "grad_norm": 0.05814792215824127, + "learning_rate": 7.801007781341644e-05, + "loss": 0.2539, + "step": 8566 + }, + { + "epoch": 1.7263751763046544, + "grad_norm": 0.05993201211094856, + "learning_rate": 7.799903771551265e-05, + "loss": 0.1668, + "step": 8568 + }, + { + "epoch": 1.7267781583719524, + "grad_norm": 0.04430992528796196, + "learning_rate": 7.798799562860198e-05, + "loss": 0.1859, + "step": 8570 + }, + { + "epoch": 1.7271811404392503, + "grad_norm": 0.04470387473702431, + "learning_rate": 7.797695155346887e-05, + "loss": 0.1571, + "step": 8572 + }, + { + "epoch": 1.7275841225065485, + "grad_norm": 0.03621676564216614, + "learning_rate": 7.796590549089786e-05, + "loss": 0.1829, + "step": 8574 + }, + { + "epoch": 1.7279871045738464, + "grad_norm": 0.05174838379025459, + "learning_rate": 7.795485744167365e-05, + "loss": 0.2106, + "step": 8576 + }, + { + "epoch": 1.7283900866411446, + "grad_norm": 0.042110104113817215, + "learning_rate": 7.794380740658107e-05, + "loss": 0.2247, + "step": 8578 + }, + { + "epoch": 1.7287930687084425, + "grad_norm": 0.05196274816989899, + "learning_rate": 7.79327553864051e-05, + "loss": 0.1762, + "step": 8580 + }, + { + "epoch": 1.7291960507757405, + "grad_norm": 0.051834408193826675, + "learning_rate": 7.792170138193086e-05, + "loss": 0.204, + "step": 8582 + }, + { + "epoch": 1.7295990328430384, + "grad_norm": 0.036789216101169586, + "learning_rate": 7.79106453939436e-05, + "loss": 0.1595, + "step": 8584 + }, + { + "epoch": 1.7300020149103363, + "grad_norm": 0.05568787083029747, + "learning_rate": 7.789958742322873e-05, + "loss": 0.1751, + "step": 8586 + }, + { + "epoch": 1.7304049969776345, + "grad_norm": 0.061141084879636765, + "learning_rate": 7.78885274705718e-05, + "loss": 0.1976, + "step": 8588 + }, + { + "epoch": 1.7308079790449327, + "grad_norm": 0.05548600107431412, + "learning_rate": 7.787746553675848e-05, + "loss": 0.1615, + "step": 8590 + }, + { + "epoch": 1.7312109611122306, + "grad_norm": 0.05701598897576332, + "learning_rate": 7.78664016225746e-05, + "loss": 0.2356, + "step": 8592 + }, + { + "epoch": 1.7316139431795285, + "grad_norm": 0.06706813722848892, + "learning_rate": 7.785533572880609e-05, + "loss": 0.2213, + "step": 8594 + }, + { + "epoch": 1.7320169252468265, + "grad_norm": 0.05262723192572594, + "learning_rate": 7.784426785623908e-05, + "loss": 0.162, + "step": 8596 + }, + { + "epoch": 1.7324199073141244, + "grad_norm": 0.03426643833518028, + "learning_rate": 7.783319800565984e-05, + "loss": 0.1828, + "step": 8598 + }, + { + "epoch": 1.7328228893814224, + "grad_norm": 0.05540724843740463, + "learning_rate": 7.782212617785469e-05, + "loss": 0.2208, + "step": 8600 + }, + { + "epoch": 1.7332258714487205, + "grad_norm": 0.06078488752245903, + "learning_rate": 7.781105237361021e-05, + "loss": 0.2496, + "step": 8602 + }, + { + "epoch": 1.7336288535160187, + "grad_norm": 0.06109081953763962, + "learning_rate": 7.779997659371305e-05, + "loss": 0.1905, + "step": 8604 + }, + { + "epoch": 1.7340318355833166, + "grad_norm": 0.07080352306365967, + "learning_rate": 7.778889883895001e-05, + "loss": 0.2268, + "step": 8606 + }, + { + "epoch": 1.7344348176506146, + "grad_norm": 0.05866062641143799, + "learning_rate": 7.777781911010804e-05, + "loss": 0.2183, + "step": 8608 + }, + { + "epoch": 1.7348377997179125, + "grad_norm": 0.06478538364171982, + "learning_rate": 7.776673740797422e-05, + "loss": 0.2019, + "step": 8610 + }, + { + "epoch": 1.7352407817852105, + "grad_norm": 0.04579712077975273, + "learning_rate": 7.775565373333578e-05, + "loss": 0.2049, + "step": 8612 + }, + { + "epoch": 1.7356437638525084, + "grad_norm": 0.06760866940021515, + "learning_rate": 7.774456808698008e-05, + "loss": 0.2231, + "step": 8614 + }, + { + "epoch": 1.7360467459198066, + "grad_norm": 0.047087687999010086, + "learning_rate": 7.773348046969465e-05, + "loss": 0.1354, + "step": 8616 + }, + { + "epoch": 1.7364497279871047, + "grad_norm": 0.0526084341108799, + "learning_rate": 7.772239088226712e-05, + "loss": 0.2409, + "step": 8618 + }, + { + "epoch": 1.7368527100544027, + "grad_norm": 0.06447000801563263, + "learning_rate": 7.771129932548527e-05, + "loss": 0.2616, + "step": 8620 + }, + { + "epoch": 1.7372556921217006, + "grad_norm": 0.06044398248195648, + "learning_rate": 7.770020580013703e-05, + "loss": 0.2157, + "step": 8622 + }, + { + "epoch": 1.7376586741889986, + "grad_norm": 0.03387543931603432, + "learning_rate": 7.768911030701047e-05, + "loss": 0.164, + "step": 8624 + }, + { + "epoch": 1.7380616562562965, + "grad_norm": 0.043008893728256226, + "learning_rate": 7.76780128468938e-05, + "loss": 0.1349, + "step": 8626 + }, + { + "epoch": 1.7384646383235944, + "grad_norm": 0.06238474324345589, + "learning_rate": 7.766691342057537e-05, + "loss": 0.1753, + "step": 8628 + }, + { + "epoch": 1.7388676203908926, + "grad_norm": 0.04955996945500374, + "learning_rate": 7.765581202884365e-05, + "loss": 0.2607, + "step": 8630 + }, + { + "epoch": 1.7392706024581908, + "grad_norm": 0.03956250473856926, + "learning_rate": 7.764470867248726e-05, + "loss": 0.1929, + "step": 8632 + }, + { + "epoch": 1.7396735845254887, + "grad_norm": 0.04237693175673485, + "learning_rate": 7.7633603352295e-05, + "loss": 0.2227, + "step": 8634 + }, + { + "epoch": 1.7400765665927866, + "grad_norm": 0.04121999070048332, + "learning_rate": 7.762249606905574e-05, + "loss": 0.1772, + "step": 8636 + }, + { + "epoch": 1.7404795486600846, + "grad_norm": 0.04537238925695419, + "learning_rate": 7.761138682355854e-05, + "loss": 0.1972, + "step": 8638 + }, + { + "epoch": 1.7408825307273825, + "grad_norm": 0.04738117754459381, + "learning_rate": 7.760027561659255e-05, + "loss": 0.1604, + "step": 8640 + }, + { + "epoch": 1.7412855127946807, + "grad_norm": 0.05630556121468544, + "learning_rate": 7.758916244894716e-05, + "loss": 0.2037, + "step": 8642 + }, + { + "epoch": 1.7416884948619786, + "grad_norm": 0.05333465337753296, + "learning_rate": 7.757804732141177e-05, + "loss": 0.1861, + "step": 8644 + }, + { + "epoch": 1.7420914769292768, + "grad_norm": 0.04941607639193535, + "learning_rate": 7.7566930234776e-05, + "loss": 0.1992, + "step": 8646 + }, + { + "epoch": 1.7424944589965747, + "grad_norm": 0.04240027070045471, + "learning_rate": 7.755581118982961e-05, + "loss": 0.1641, + "step": 8648 + }, + { + "epoch": 1.7428974410638727, + "grad_norm": 0.05365744233131409, + "learning_rate": 7.754469018736245e-05, + "loss": 0.2032, + "step": 8650 + }, + { + "epoch": 1.7433004231311706, + "grad_norm": 0.052164193242788315, + "learning_rate": 7.753356722816455e-05, + "loss": 0.1564, + "step": 8652 + }, + { + "epoch": 1.7437034051984686, + "grad_norm": 0.03998196870088577, + "learning_rate": 7.752244231302608e-05, + "loss": 0.1512, + "step": 8654 + }, + { + "epoch": 1.7441063872657667, + "grad_norm": 0.03845955803990364, + "learning_rate": 7.75113154427373e-05, + "loss": 0.1582, + "step": 8656 + }, + { + "epoch": 1.7445093693330647, + "grad_norm": 0.038581348955631256, + "learning_rate": 7.750018661808869e-05, + "loss": 0.2324, + "step": 8658 + }, + { + "epoch": 1.7449123514003628, + "grad_norm": 0.05808325111865997, + "learning_rate": 7.748905583987079e-05, + "loss": 0.236, + "step": 8660 + }, + { + "epoch": 1.7453153334676608, + "grad_norm": 0.06073828786611557, + "learning_rate": 7.747792310887434e-05, + "loss": 0.2215, + "step": 8662 + }, + { + "epoch": 1.7457183155349587, + "grad_norm": 0.05565022677183151, + "learning_rate": 7.746678842589017e-05, + "loss": 0.2051, + "step": 8664 + }, + { + "epoch": 1.7461212976022567, + "grad_norm": 0.07590685784816742, + "learning_rate": 7.745565179170927e-05, + "loss": 0.2145, + "step": 8666 + }, + { + "epoch": 1.7465242796695546, + "grad_norm": 0.04306831210851669, + "learning_rate": 7.744451320712278e-05, + "loss": 0.1701, + "step": 8668 + }, + { + "epoch": 1.7469272617368528, + "grad_norm": 0.052143242210149765, + "learning_rate": 7.743337267292197e-05, + "loss": 0.1892, + "step": 8670 + }, + { + "epoch": 1.7473302438041507, + "grad_norm": 0.040976159274578094, + "learning_rate": 7.742223018989822e-05, + "loss": 0.1982, + "step": 8672 + }, + { + "epoch": 1.7477332258714489, + "grad_norm": 0.04762955382466316, + "learning_rate": 7.741108575884311e-05, + "loss": 0.2214, + "step": 8674 + }, + { + "epoch": 1.7481362079387468, + "grad_norm": 0.047516968101263046, + "learning_rate": 7.73999393805483e-05, + "loss": 0.167, + "step": 8676 + }, + { + "epoch": 1.7485391900060447, + "grad_norm": 0.05364019423723221, + "learning_rate": 7.738879105580562e-05, + "loss": 0.1911, + "step": 8678 + }, + { + "epoch": 1.7489421720733427, + "grad_norm": 0.036976758390665054, + "learning_rate": 7.737764078540701e-05, + "loss": 0.134, + "step": 8680 + }, + { + "epoch": 1.7493451541406406, + "grad_norm": 0.062130097299814224, + "learning_rate": 7.73664885701446e-05, + "loss": 0.1687, + "step": 8682 + }, + { + "epoch": 1.7497481362079388, + "grad_norm": 0.04362349212169647, + "learning_rate": 7.73553344108106e-05, + "loss": 0.1953, + "step": 8684 + }, + { + "epoch": 1.7501511182752367, + "grad_norm": 0.045134756714105606, + "learning_rate": 7.73441783081974e-05, + "loss": 0.2189, + "step": 8686 + }, + { + "epoch": 1.750554100342535, + "grad_norm": 0.04244035482406616, + "learning_rate": 7.73330202630975e-05, + "loss": 0.2119, + "step": 8688 + }, + { + "epoch": 1.7509570824098328, + "grad_norm": 0.06525809317827225, + "learning_rate": 7.732186027630355e-05, + "loss": 0.2336, + "step": 8690 + }, + { + "epoch": 1.7513600644771308, + "grad_norm": 0.04660410434007645, + "learning_rate": 7.731069834860833e-05, + "loss": 0.1688, + "step": 8692 + }, + { + "epoch": 1.7517630465444287, + "grad_norm": 0.05042422562837601, + "learning_rate": 7.729953448080481e-05, + "loss": 0.2028, + "step": 8694 + }, + { + "epoch": 1.7521660286117267, + "grad_norm": 0.05255540832877159, + "learning_rate": 7.728836867368599e-05, + "loss": 0.2158, + "step": 8696 + }, + { + "epoch": 1.7525690106790248, + "grad_norm": 0.04372533783316612, + "learning_rate": 7.72772009280451e-05, + "loss": 0.1658, + "step": 8698 + }, + { + "epoch": 1.7529719927463228, + "grad_norm": 0.054151248186826706, + "learning_rate": 7.726603124467548e-05, + "loss": 0.2391, + "step": 8700 + }, + { + "epoch": 1.753374974813621, + "grad_norm": 0.05295997112989426, + "learning_rate": 7.725485962437062e-05, + "loss": 0.2183, + "step": 8702 + }, + { + "epoch": 1.7537779568809189, + "grad_norm": 0.044501155614852905, + "learning_rate": 7.724368606792412e-05, + "loss": 0.2243, + "step": 8704 + }, + { + "epoch": 1.7541809389482168, + "grad_norm": 0.04965506121516228, + "learning_rate": 7.723251057612972e-05, + "loss": 0.1998, + "step": 8706 + }, + { + "epoch": 1.7545839210155147, + "grad_norm": 0.06588796526193619, + "learning_rate": 7.722133314978133e-05, + "loss": 0.196, + "step": 8708 + }, + { + "epoch": 1.7549869030828127, + "grad_norm": 0.057937197387218475, + "learning_rate": 7.721015378967296e-05, + "loss": 0.2054, + "step": 8710 + }, + { + "epoch": 1.7553898851501109, + "grad_norm": 0.04531354829668999, + "learning_rate": 7.719897249659878e-05, + "loss": 0.1873, + "step": 8712 + }, + { + "epoch": 1.7557928672174088, + "grad_norm": 0.05281313881278038, + "learning_rate": 7.71877892713531e-05, + "loss": 0.2363, + "step": 8714 + }, + { + "epoch": 1.756195849284707, + "grad_norm": 0.052877333015203476, + "learning_rate": 7.717660411473035e-05, + "loss": 0.2576, + "step": 8716 + }, + { + "epoch": 1.756598831352005, + "grad_norm": 0.046693529933691025, + "learning_rate": 7.71654170275251e-05, + "loss": 0.1911, + "step": 8718 + }, + { + "epoch": 1.7570018134193028, + "grad_norm": 0.045685358345508575, + "learning_rate": 7.715422801053207e-05, + "loss": 0.1662, + "step": 8720 + }, + { + "epoch": 1.7574047954866008, + "grad_norm": 0.05251702293753624, + "learning_rate": 7.714303706454611e-05, + "loss": 0.217, + "step": 8722 + }, + { + "epoch": 1.7578077775538987, + "grad_norm": 0.04720870032906532, + "learning_rate": 7.713184419036222e-05, + "loss": 0.2178, + "step": 8724 + }, + { + "epoch": 1.7582107596211969, + "grad_norm": 0.04424897953867912, + "learning_rate": 7.712064938877548e-05, + "loss": 0.1485, + "step": 8726 + }, + { + "epoch": 1.7586137416884948, + "grad_norm": 0.04398166388273239, + "learning_rate": 7.71094526605812e-05, + "loss": 0.1699, + "step": 8728 + }, + { + "epoch": 1.759016723755793, + "grad_norm": 0.05234614759683609, + "learning_rate": 7.709825400657475e-05, + "loss": 0.1842, + "step": 8730 + }, + { + "epoch": 1.759419705823091, + "grad_norm": 0.0670747384428978, + "learning_rate": 7.708705342755169e-05, + "loss": 0.1806, + "step": 8732 + }, + { + "epoch": 1.7598226878903889, + "grad_norm": 0.03887630254030228, + "learning_rate": 7.707585092430765e-05, + "loss": 0.1773, + "step": 8734 + }, + { + "epoch": 1.7602256699576868, + "grad_norm": 0.056282076984643936, + "learning_rate": 7.706464649763847e-05, + "loss": 0.1901, + "step": 8736 + }, + { + "epoch": 1.7606286520249848, + "grad_norm": 0.04681971296668053, + "learning_rate": 7.705344014834011e-05, + "loss": 0.2137, + "step": 8738 + }, + { + "epoch": 1.761031634092283, + "grad_norm": 0.05810059607028961, + "learning_rate": 7.704223187720861e-05, + "loss": 0.1969, + "step": 8740 + }, + { + "epoch": 1.7614346161595809, + "grad_norm": 0.05052733048796654, + "learning_rate": 7.70310216850402e-05, + "loss": 0.2087, + "step": 8742 + }, + { + "epoch": 1.761837598226879, + "grad_norm": 0.04968501254916191, + "learning_rate": 7.701980957263123e-05, + "loss": 0.1928, + "step": 8744 + }, + { + "epoch": 1.762240580294177, + "grad_norm": 0.0517900288105011, + "learning_rate": 7.700859554077821e-05, + "loss": 0.2056, + "step": 8746 + }, + { + "epoch": 1.762643562361475, + "grad_norm": 0.052969422191381454, + "learning_rate": 7.699737959027776e-05, + "loss": 0.2397, + "step": 8748 + }, + { + "epoch": 1.7630465444287728, + "grad_norm": 0.07398225367069244, + "learning_rate": 7.698616172192663e-05, + "loss": 0.19, + "step": 8750 + }, + { + "epoch": 1.7634495264960708, + "grad_norm": 0.04895591363310814, + "learning_rate": 7.697494193652174e-05, + "loss": 0.1756, + "step": 8752 + }, + { + "epoch": 1.763852508563369, + "grad_norm": 0.058382175862789154, + "learning_rate": 7.696372023486012e-05, + "loss": 0.2415, + "step": 8754 + }, + { + "epoch": 1.764255490630667, + "grad_norm": 0.044502172619104385, + "learning_rate": 7.695249661773892e-05, + "loss": 0.1885, + "step": 8756 + }, + { + "epoch": 1.764658472697965, + "grad_norm": 0.04054013267159462, + "learning_rate": 7.694127108595548e-05, + "loss": 0.172, + "step": 8758 + }, + { + "epoch": 1.765061454765263, + "grad_norm": 0.05540904030203819, + "learning_rate": 7.693004364030723e-05, + "loss": 0.1634, + "step": 8760 + }, + { + "epoch": 1.765464436832561, + "grad_norm": 0.052257224917411804, + "learning_rate": 7.691881428159172e-05, + "loss": 0.2097, + "step": 8762 + }, + { + "epoch": 1.7658674188998589, + "grad_norm": 0.051800746470689774, + "learning_rate": 7.690758301060672e-05, + "loss": 0.1682, + "step": 8764 + }, + { + "epoch": 1.7662704009671568, + "grad_norm": 0.05630108341574669, + "learning_rate": 7.689634982815005e-05, + "loss": 0.1729, + "step": 8766 + }, + { + "epoch": 1.766673383034455, + "grad_norm": 0.05631199851632118, + "learning_rate": 7.68851147350197e-05, + "loss": 0.1729, + "step": 8768 + }, + { + "epoch": 1.767076365101753, + "grad_norm": 0.05669247731566429, + "learning_rate": 7.687387773201379e-05, + "loss": 0.1933, + "step": 8770 + }, + { + "epoch": 1.767479347169051, + "grad_norm": 0.051117509603500366, + "learning_rate": 7.686263881993059e-05, + "loss": 0.1638, + "step": 8772 + }, + { + "epoch": 1.767882329236349, + "grad_norm": 0.041834667325019836, + "learning_rate": 7.685139799956848e-05, + "loss": 0.1576, + "step": 8774 + }, + { + "epoch": 1.768285311303647, + "grad_norm": 0.044466495513916016, + "learning_rate": 7.684015527172601e-05, + "loss": 0.2271, + "step": 8776 + }, + { + "epoch": 1.768688293370945, + "grad_norm": 0.06349855661392212, + "learning_rate": 7.682891063720184e-05, + "loss": 0.2129, + "step": 8778 + }, + { + "epoch": 1.7690912754382428, + "grad_norm": 0.06009896472096443, + "learning_rate": 7.681766409679476e-05, + "loss": 0.2256, + "step": 8780 + }, + { + "epoch": 1.769494257505541, + "grad_norm": 0.056756600737571716, + "learning_rate": 7.680641565130371e-05, + "loss": 0.1993, + "step": 8782 + }, + { + "epoch": 1.769897239572839, + "grad_norm": 0.05109809339046478, + "learning_rate": 7.679516530152775e-05, + "loss": 0.2196, + "step": 8784 + }, + { + "epoch": 1.7703002216401371, + "grad_norm": 0.05256953090429306, + "learning_rate": 7.67839130482661e-05, + "loss": 0.19, + "step": 8786 + }, + { + "epoch": 1.770703203707435, + "grad_norm": 0.045628610998392105, + "learning_rate": 7.677265889231812e-05, + "loss": 0.1846, + "step": 8788 + }, + { + "epoch": 1.771106185774733, + "grad_norm": 0.049970727413892746, + "learning_rate": 7.676140283448328e-05, + "loss": 0.2571, + "step": 8790 + }, + { + "epoch": 1.771509167842031, + "grad_norm": 0.06309273838996887, + "learning_rate": 7.675014487556114e-05, + "loss": 0.247, + "step": 8792 + }, + { + "epoch": 1.7719121499093289, + "grad_norm": 0.04105915129184723, + "learning_rate": 7.673888501635153e-05, + "loss": 0.2053, + "step": 8794 + }, + { + "epoch": 1.772315131976627, + "grad_norm": 0.052911341190338135, + "learning_rate": 7.672762325765425e-05, + "loss": 0.2354, + "step": 8796 + }, + { + "epoch": 1.7727181140439252, + "grad_norm": 0.03674977645277977, + "learning_rate": 7.671635960026939e-05, + "loss": 0.1794, + "step": 8798 + }, + { + "epoch": 1.7731210961112231, + "grad_norm": 0.051963258534669876, + "learning_rate": 7.670509404499706e-05, + "loss": 0.1442, + "step": 8800 + }, + { + "epoch": 1.773524078178521, + "grad_norm": 0.07293074578046799, + "learning_rate": 7.669382659263755e-05, + "loss": 0.2144, + "step": 8802 + }, + { + "epoch": 1.773927060245819, + "grad_norm": 0.03861695155501366, + "learning_rate": 7.66825572439913e-05, + "loss": 0.2062, + "step": 8804 + }, + { + "epoch": 1.774330042313117, + "grad_norm": 0.04328519478440285, + "learning_rate": 7.667128599985887e-05, + "loss": 0.2074, + "step": 8806 + }, + { + "epoch": 1.774733024380415, + "grad_norm": 0.08353982120752335, + "learning_rate": 7.666001286104091e-05, + "loss": 0.2031, + "step": 8808 + }, + { + "epoch": 1.775136006447713, + "grad_norm": 0.03887191042304039, + "learning_rate": 7.664873782833828e-05, + "loss": 0.1916, + "step": 8810 + }, + { + "epoch": 1.7755389885150112, + "grad_norm": 0.035796571522951126, + "learning_rate": 7.663746090255194e-05, + "loss": 0.2134, + "step": 8812 + }, + { + "epoch": 1.7759419705823092, + "grad_norm": 0.07863356918096542, + "learning_rate": 7.662618208448297e-05, + "loss": 0.1853, + "step": 8814 + }, + { + "epoch": 1.7763449526496071, + "grad_norm": 0.05820539966225624, + "learning_rate": 7.66149013749326e-05, + "loss": 0.2169, + "step": 8816 + }, + { + "epoch": 1.776747934716905, + "grad_norm": 0.050097983330488205, + "learning_rate": 7.660361877470221e-05, + "loss": 0.2093, + "step": 8818 + }, + { + "epoch": 1.777150916784203, + "grad_norm": 0.04237944260239601, + "learning_rate": 7.65923342845933e-05, + "loss": 0.2109, + "step": 8820 + }, + { + "epoch": 1.777553898851501, + "grad_norm": 0.047271978110075, + "learning_rate": 7.658104790540748e-05, + "loss": 0.1822, + "step": 8822 + }, + { + "epoch": 1.777956880918799, + "grad_norm": 0.045994047075510025, + "learning_rate": 7.656975963794653e-05, + "loss": 0.2027, + "step": 8824 + }, + { + "epoch": 1.7783598629860973, + "grad_norm": 0.05817195773124695, + "learning_rate": 7.655846948301233e-05, + "loss": 0.1503, + "step": 8826 + }, + { + "epoch": 1.7787628450533952, + "grad_norm": 0.08411876112222672, + "learning_rate": 7.654717744140694e-05, + "loss": 0.2695, + "step": 8828 + }, + { + "epoch": 1.7791658271206932, + "grad_norm": 0.05115870013833046, + "learning_rate": 7.653588351393255e-05, + "loss": 0.2016, + "step": 8830 + }, + { + "epoch": 1.779568809187991, + "grad_norm": 0.03854476660490036, + "learning_rate": 7.652458770139139e-05, + "loss": 0.1893, + "step": 8832 + }, + { + "epoch": 1.779971791255289, + "grad_norm": 0.041677094995975494, + "learning_rate": 7.651329000458596e-05, + "loss": 0.19, + "step": 8834 + }, + { + "epoch": 1.780374773322587, + "grad_norm": 0.04914252087473869, + "learning_rate": 7.650199042431883e-05, + "loss": 0.1929, + "step": 8836 + }, + { + "epoch": 1.7807777553898851, + "grad_norm": 0.058383334428071976, + "learning_rate": 7.649068896139264e-05, + "loss": 0.2562, + "step": 8838 + }, + { + "epoch": 1.7811807374571833, + "grad_norm": 0.03746351599693298, + "learning_rate": 7.64793856166103e-05, + "loss": 0.1706, + "step": 8840 + }, + { + "epoch": 1.7815837195244812, + "grad_norm": 0.03564516454935074, + "learning_rate": 7.646808039077475e-05, + "loss": 0.114, + "step": 8842 + }, + { + "epoch": 1.7819867015917792, + "grad_norm": 0.05532078444957733, + "learning_rate": 7.64567732846891e-05, + "loss": 0.1985, + "step": 8844 + }, + { + "epoch": 1.7823896836590771, + "grad_norm": 0.061978355050086975, + "learning_rate": 7.644546429915658e-05, + "loss": 0.1941, + "step": 8846 + }, + { + "epoch": 1.782792665726375, + "grad_norm": 0.05426686629652977, + "learning_rate": 7.643415343498058e-05, + "loss": 0.2307, + "step": 8848 + }, + { + "epoch": 1.7831956477936732, + "grad_norm": 0.05495776608586311, + "learning_rate": 7.642284069296458e-05, + "loss": 0.2268, + "step": 8850 + }, + { + "epoch": 1.7835986298609712, + "grad_norm": 0.047012731432914734, + "learning_rate": 7.641152607391224e-05, + "loss": 0.1788, + "step": 8852 + }, + { + "epoch": 1.7840016119282693, + "grad_norm": 0.05379527434706688, + "learning_rate": 7.640020957862733e-05, + "loss": 0.1869, + "step": 8854 + }, + { + "epoch": 1.7844045939955673, + "grad_norm": 0.04625250771641731, + "learning_rate": 7.638889120791374e-05, + "loss": 0.2064, + "step": 8856 + }, + { + "epoch": 1.7848075760628652, + "grad_norm": 0.04690789431333542, + "learning_rate": 7.637757096257554e-05, + "loss": 0.2008, + "step": 8858 + }, + { + "epoch": 1.7852105581301632, + "grad_norm": 0.06040222942829132, + "learning_rate": 7.636624884341688e-05, + "loss": 0.2081, + "step": 8860 + }, + { + "epoch": 1.785613540197461, + "grad_norm": 0.04381508380174637, + "learning_rate": 7.635492485124207e-05, + "loss": 0.1608, + "step": 8862 + }, + { + "epoch": 1.7860165222647593, + "grad_norm": 0.041126348078250885, + "learning_rate": 7.634359898685554e-05, + "loss": 0.1963, + "step": 8864 + }, + { + "epoch": 1.7864195043320572, + "grad_norm": 0.04755943641066551, + "learning_rate": 7.633227125106187e-05, + "loss": 0.2042, + "step": 8866 + }, + { + "epoch": 1.7868224863993554, + "grad_norm": 0.046007703989744186, + "learning_rate": 7.632094164466577e-05, + "loss": 0.1998, + "step": 8868 + }, + { + "epoch": 1.7872254684666533, + "grad_norm": 0.047473929822444916, + "learning_rate": 7.630961016847207e-05, + "loss": 0.1677, + "step": 8870 + }, + { + "epoch": 1.7876284505339513, + "grad_norm": 0.04671850427985191, + "learning_rate": 7.629827682328572e-05, + "loss": 0.2158, + "step": 8872 + }, + { + "epoch": 1.7880314326012492, + "grad_norm": 0.05689868703484535, + "learning_rate": 7.628694160991185e-05, + "loss": 0.1887, + "step": 8874 + }, + { + "epoch": 1.7884344146685471, + "grad_norm": 0.03737175464630127, + "learning_rate": 7.62756045291557e-05, + "loss": 0.2039, + "step": 8876 + }, + { + "epoch": 1.7888373967358453, + "grad_norm": 0.10977429151535034, + "learning_rate": 7.626426558182262e-05, + "loss": 0.2176, + "step": 8878 + }, + { + "epoch": 1.7892403788031432, + "grad_norm": 0.04432012513279915, + "learning_rate": 7.62529247687181e-05, + "loss": 0.2089, + "step": 8880 + }, + { + "epoch": 1.7896433608704414, + "grad_norm": 0.04522010684013367, + "learning_rate": 7.624158209064782e-05, + "loss": 0.1346, + "step": 8882 + }, + { + "epoch": 1.7900463429377393, + "grad_norm": 0.07001801580190659, + "learning_rate": 7.62302375484175e-05, + "loss": 0.1907, + "step": 8884 + }, + { + "epoch": 1.7904493250050373, + "grad_norm": 0.05941377207636833, + "learning_rate": 7.621889114283305e-05, + "loss": 0.2282, + "step": 8886 + }, + { + "epoch": 1.7908523070723352, + "grad_norm": 0.11536096781492233, + "learning_rate": 7.620754287470051e-05, + "loss": 0.1884, + "step": 8888 + }, + { + "epoch": 1.7912552891396332, + "grad_norm": 0.044333018362522125, + "learning_rate": 7.619619274482603e-05, + "loss": 0.1956, + "step": 8890 + }, + { + "epoch": 1.7916582712069313, + "grad_norm": 0.06309880316257477, + "learning_rate": 7.618484075401591e-05, + "loss": 0.2062, + "step": 8892 + }, + { + "epoch": 1.7920612532742293, + "grad_norm": 0.034674037247896194, + "learning_rate": 7.617348690307659e-05, + "loss": 0.1751, + "step": 8894 + }, + { + "epoch": 1.7924642353415274, + "grad_norm": 0.35374215245246887, + "learning_rate": 7.616213119281462e-05, + "loss": 0.1593, + "step": 8896 + }, + { + "epoch": 1.7928672174088254, + "grad_norm": 0.043103184551000595, + "learning_rate": 7.615077362403669e-05, + "loss": 0.1558, + "step": 8898 + }, + { + "epoch": 1.7932701994761233, + "grad_norm": 0.04136328399181366, + "learning_rate": 7.613941419754961e-05, + "loss": 0.1753, + "step": 8900 + }, + { + "epoch": 1.7936731815434213, + "grad_norm": 0.04389548674225807, + "learning_rate": 7.612805291416036e-05, + "loss": 0.1998, + "step": 8902 + }, + { + "epoch": 1.7940761636107192, + "grad_norm": 0.04551811143755913, + "learning_rate": 7.6116689774676e-05, + "loss": 0.1881, + "step": 8904 + }, + { + "epoch": 1.7944791456780174, + "grad_norm": 0.05296977981925011, + "learning_rate": 7.61053247799038e-05, + "loss": 0.1489, + "step": 8906 + }, + { + "epoch": 1.7948821277453153, + "grad_norm": 0.09822306036949158, + "learning_rate": 7.609395793065107e-05, + "loss": 0.2469, + "step": 8908 + }, + { + "epoch": 1.7952851098126135, + "grad_norm": 0.04015516862273216, + "learning_rate": 7.608258922772527e-05, + "loss": 0.1482, + "step": 8910 + }, + { + "epoch": 1.7956880918799114, + "grad_norm": 0.05408100038766861, + "learning_rate": 7.607121867193407e-05, + "loss": 0.2308, + "step": 8912 + }, + { + "epoch": 1.7960910739472093, + "grad_norm": 0.04759914055466652, + "learning_rate": 7.605984626408517e-05, + "loss": 0.1598, + "step": 8914 + }, + { + "epoch": 1.7964940560145073, + "grad_norm": 0.05472610890865326, + "learning_rate": 7.604847200498649e-05, + "loss": 0.1849, + "step": 8916 + }, + { + "epoch": 1.7968970380818052, + "grad_norm": 0.043661490082740784, + "learning_rate": 7.603709589544601e-05, + "loss": 0.1537, + "step": 8918 + }, + { + "epoch": 1.7973000201491034, + "grad_norm": 0.05160943791270256, + "learning_rate": 7.602571793627187e-05, + "loss": 0.1813, + "step": 8920 + }, + { + "epoch": 1.7977030022164013, + "grad_norm": 0.05473243072628975, + "learning_rate": 7.601433812827235e-05, + "loss": 0.208, + "step": 8922 + }, + { + "epoch": 1.7981059842836995, + "grad_norm": 0.04076346382498741, + "learning_rate": 7.600295647225586e-05, + "loss": 0.1423, + "step": 8924 + }, + { + "epoch": 1.7985089663509974, + "grad_norm": 0.06232528015971184, + "learning_rate": 7.599157296903092e-05, + "loss": 0.2041, + "step": 8926 + }, + { + "epoch": 1.7989119484182954, + "grad_norm": 0.07285811007022858, + "learning_rate": 7.598018761940622e-05, + "loss": 0.1965, + "step": 8928 + }, + { + "epoch": 1.7993149304855933, + "grad_norm": 0.058572810143232346, + "learning_rate": 7.596880042419053e-05, + "loss": 0.1957, + "step": 8930 + }, + { + "epoch": 1.7997179125528913, + "grad_norm": 0.06865711510181427, + "learning_rate": 7.595741138419279e-05, + "loss": 0.1719, + "step": 8932 + }, + { + "epoch": 1.8001208946201894, + "grad_norm": 0.043107353150844574, + "learning_rate": 7.594602050022207e-05, + "loss": 0.1416, + "step": 8934 + }, + { + "epoch": 1.8005238766874874, + "grad_norm": 0.047849707305431366, + "learning_rate": 7.593462777308752e-05, + "loss": 0.1889, + "step": 8936 + }, + { + "epoch": 1.8009268587547855, + "grad_norm": 0.053915441036224365, + "learning_rate": 7.592323320359849e-05, + "loss": 0.2267, + "step": 8938 + }, + { + "epoch": 1.8013298408220835, + "grad_norm": 0.04879898205399513, + "learning_rate": 7.591183679256447e-05, + "loss": 0.2082, + "step": 8940 + }, + { + "epoch": 1.8017328228893814, + "grad_norm": 0.04230300709605217, + "learning_rate": 7.590043854079496e-05, + "loss": 0.2072, + "step": 8942 + }, + { + "epoch": 1.8021358049566794, + "grad_norm": 0.05137898400425911, + "learning_rate": 7.588903844909973e-05, + "loss": 0.2026, + "step": 8944 + }, + { + "epoch": 1.8025387870239773, + "grad_norm": 0.037783313542604446, + "learning_rate": 7.587763651828863e-05, + "loss": 0.1661, + "step": 8946 + }, + { + "epoch": 1.8029417690912755, + "grad_norm": 0.055004462599754333, + "learning_rate": 7.58662327491716e-05, + "loss": 0.2076, + "step": 8948 + }, + { + "epoch": 1.8033447511585734, + "grad_norm": 0.056807104498147964, + "learning_rate": 7.585482714255877e-05, + "loss": 0.1984, + "step": 8950 + }, + { + "epoch": 1.8037477332258716, + "grad_norm": 0.03771530091762543, + "learning_rate": 7.584341969926037e-05, + "loss": 0.1587, + "step": 8952 + }, + { + "epoch": 1.8041507152931695, + "grad_norm": 0.049856383353471756, + "learning_rate": 7.583201042008677e-05, + "loss": 0.2049, + "step": 8954 + }, + { + "epoch": 1.8045536973604674, + "grad_norm": 0.051321081817150116, + "learning_rate": 7.582059930584844e-05, + "loss": 0.1883, + "step": 8956 + }, + { + "epoch": 1.8049566794277654, + "grad_norm": 0.045879822224378586, + "learning_rate": 7.580918635735605e-05, + "loss": 0.1875, + "step": 8958 + }, + { + "epoch": 1.8053596614950633, + "grad_norm": 0.0529901348054409, + "learning_rate": 7.579777157542034e-05, + "loss": 0.2356, + "step": 8960 + }, + { + "epoch": 1.8057626435623615, + "grad_norm": 0.04600401595234871, + "learning_rate": 7.578635496085218e-05, + "loss": 0.2251, + "step": 8962 + }, + { + "epoch": 1.8061656256296594, + "grad_norm": 0.2081703096628189, + "learning_rate": 7.577493651446261e-05, + "loss": 0.265, + "step": 8964 + }, + { + "epoch": 1.8065686076969576, + "grad_norm": 0.0612410344183445, + "learning_rate": 7.576351623706277e-05, + "loss": 0.2312, + "step": 8966 + }, + { + "epoch": 1.8069715897642555, + "grad_norm": 0.04466860368847847, + "learning_rate": 7.575209412946394e-05, + "loss": 0.1786, + "step": 8968 + }, + { + "epoch": 1.8073745718315535, + "grad_norm": 0.10041309893131256, + "learning_rate": 7.574067019247753e-05, + "loss": 0.2351, + "step": 8970 + }, + { + "epoch": 1.8077775538988514, + "grad_norm": 0.06266580522060394, + "learning_rate": 7.572924442691505e-05, + "loss": 0.2368, + "step": 8972 + }, + { + "epoch": 1.8081805359661494, + "grad_norm": 0.04474787414073944, + "learning_rate": 7.571781683358822e-05, + "loss": 0.1473, + "step": 8974 + }, + { + "epoch": 1.8085835180334475, + "grad_norm": 0.043090350925922394, + "learning_rate": 7.57063874133088e-05, + "loss": 0.1662, + "step": 8976 + }, + { + "epoch": 1.8089865001007455, + "grad_norm": 0.22436150908470154, + "learning_rate": 7.569495616688873e-05, + "loss": 0.2034, + "step": 8978 + }, + { + "epoch": 1.8093894821680436, + "grad_norm": 0.05931695178151131, + "learning_rate": 7.568352309514008e-05, + "loss": 0.233, + "step": 8980 + }, + { + "epoch": 1.8097924642353416, + "grad_norm": 0.05479630082845688, + "learning_rate": 7.567208819887502e-05, + "loss": 0.1908, + "step": 8982 + }, + { + "epoch": 1.8101954463026395, + "grad_norm": 0.04764629527926445, + "learning_rate": 7.566065147890586e-05, + "loss": 0.1856, + "step": 8984 + }, + { + "epoch": 1.8105984283699375, + "grad_norm": 0.046009134501218796, + "learning_rate": 7.564921293604508e-05, + "loss": 0.167, + "step": 8986 + }, + { + "epoch": 1.8110014104372354, + "grad_norm": 0.06495673209428787, + "learning_rate": 7.56377725711052e-05, + "loss": 0.1937, + "step": 8988 + }, + { + "epoch": 1.8114043925045336, + "grad_norm": 0.049228839576244354, + "learning_rate": 7.562633038489897e-05, + "loss": 0.1917, + "step": 8990 + }, + { + "epoch": 1.8118073745718317, + "grad_norm": 0.04694323614239693, + "learning_rate": 7.561488637823924e-05, + "loss": 0.1727, + "step": 8992 + }, + { + "epoch": 1.8122103566391297, + "grad_norm": 0.04595312848687172, + "learning_rate": 7.560344055193891e-05, + "loss": 0.2031, + "step": 8994 + }, + { + "epoch": 1.8126133387064276, + "grad_norm": 0.043079208582639694, + "learning_rate": 7.559199290681112e-05, + "loss": 0.2117, + "step": 8996 + }, + { + "epoch": 1.8130163207737255, + "grad_norm": 0.052916523069143295, + "learning_rate": 7.55805434436691e-05, + "loss": 0.1722, + "step": 8998 + }, + { + "epoch": 1.8134193028410235, + "grad_norm": 0.06618094444274902, + "learning_rate": 7.556909216332617e-05, + "loss": 0.1934, + "step": 9000 + }, + { + "epoch": 1.8138222849083214, + "grad_norm": 0.059367429465055466, + "learning_rate": 7.555763906659582e-05, + "loss": 0.2204, + "step": 9002 + }, + { + "epoch": 1.8142252669756196, + "grad_norm": 0.05537892505526543, + "learning_rate": 7.554618415429168e-05, + "loss": 0.1491, + "step": 9004 + }, + { + "epoch": 1.8146282490429178, + "grad_norm": 0.05505523830652237, + "learning_rate": 7.553472742722745e-05, + "loss": 0.2061, + "step": 9006 + }, + { + "epoch": 1.8150312311102157, + "grad_norm": 0.05641672760248184, + "learning_rate": 7.552326888621703e-05, + "loss": 0.1719, + "step": 9008 + }, + { + "epoch": 1.8154342131775136, + "grad_norm": 0.04747428745031357, + "learning_rate": 7.551180853207442e-05, + "loss": 0.2033, + "step": 9010 + }, + { + "epoch": 1.8158371952448116, + "grad_norm": 0.07291791588068008, + "learning_rate": 7.550034636561371e-05, + "loss": 0.2159, + "step": 9012 + }, + { + "epoch": 1.8162401773121095, + "grad_norm": 0.06158110871911049, + "learning_rate": 7.54888823876492e-05, + "loss": 0.1543, + "step": 9014 + }, + { + "epoch": 1.8166431593794075, + "grad_norm": 0.05462907254695892, + "learning_rate": 7.547741659899523e-05, + "loss": 0.1869, + "step": 9016 + }, + { + "epoch": 1.8170461414467056, + "grad_norm": 0.09753762930631638, + "learning_rate": 7.546594900046633e-05, + "loss": 0.1726, + "step": 9018 + }, + { + "epoch": 1.8174491235140038, + "grad_norm": 0.12133771181106567, + "learning_rate": 7.545447959287714e-05, + "loss": 0.1845, + "step": 9020 + }, + { + "epoch": 1.8178521055813017, + "grad_norm": 0.05773913860321045, + "learning_rate": 7.544300837704244e-05, + "loss": 0.194, + "step": 9022 + }, + { + "epoch": 1.8182550876485997, + "grad_norm": 0.049177125096321106, + "learning_rate": 7.543153535377711e-05, + "loss": 0.2086, + "step": 9024 + }, + { + "epoch": 1.8186580697158976, + "grad_norm": 0.05512756481766701, + "learning_rate": 7.542006052389619e-05, + "loss": 0.2001, + "step": 9026 + }, + { + "epoch": 1.8190610517831955, + "grad_norm": 0.04927990213036537, + "learning_rate": 7.540858388821482e-05, + "loss": 0.1936, + "step": 9028 + }, + { + "epoch": 1.8194640338504935, + "grad_norm": 0.06231493130326271, + "learning_rate": 7.539710544754826e-05, + "loss": 0.2342, + "step": 9030 + }, + { + "epoch": 1.8198670159177917, + "grad_norm": 0.040557861328125, + "learning_rate": 7.538562520271197e-05, + "loss": 0.1761, + "step": 9032 + }, + { + "epoch": 1.8202699979850898, + "grad_norm": 0.05428093299269676, + "learning_rate": 7.537414315452145e-05, + "loss": 0.2394, + "step": 9034 + }, + { + "epoch": 1.8206729800523878, + "grad_norm": 0.041098061949014664, + "learning_rate": 7.536265930379239e-05, + "loss": 0.2154, + "step": 9036 + }, + { + "epoch": 1.8210759621196857, + "grad_norm": 0.04267246648669243, + "learning_rate": 7.535117365134058e-05, + "loss": 0.1659, + "step": 9038 + }, + { + "epoch": 1.8214789441869836, + "grad_norm": 0.04948339983820915, + "learning_rate": 7.533968619798193e-05, + "loss": 0.2019, + "step": 9040 + }, + { + "epoch": 1.8218819262542816, + "grad_norm": 0.08953138440847397, + "learning_rate": 7.53281969445325e-05, + "loss": 0.3046, + "step": 9042 + }, + { + "epoch": 1.8222849083215797, + "grad_norm": 0.05375578626990318, + "learning_rate": 7.531670589180846e-05, + "loss": 0.1992, + "step": 9044 + }, + { + "epoch": 1.8226878903888777, + "grad_norm": 0.0598725751042366, + "learning_rate": 7.530521304062613e-05, + "loss": 0.204, + "step": 9046 + }, + { + "epoch": 1.8230908724561758, + "grad_norm": 0.06039509177207947, + "learning_rate": 7.529371839180191e-05, + "loss": 0.2076, + "step": 9048 + }, + { + "epoch": 1.8234938545234738, + "grad_norm": 0.04848941043019295, + "learning_rate": 7.528222194615242e-05, + "loss": 0.1768, + "step": 9050 + }, + { + "epoch": 1.8238968365907717, + "grad_norm": 0.04888831824064255, + "learning_rate": 7.52707237044943e-05, + "loss": 0.1861, + "step": 9052 + }, + { + "epoch": 1.8242998186580697, + "grad_norm": 0.040995147079229355, + "learning_rate": 7.525922366764437e-05, + "loss": 0.2125, + "step": 9054 + }, + { + "epoch": 1.8247028007253676, + "grad_norm": 0.051442645490169525, + "learning_rate": 7.524772183641961e-05, + "loss": 0.2137, + "step": 9056 + }, + { + "epoch": 1.8251057827926658, + "grad_norm": 0.04963943362236023, + "learning_rate": 7.523621821163707e-05, + "loss": 0.2418, + "step": 9058 + }, + { + "epoch": 1.8255087648599637, + "grad_norm": 0.04928060993552208, + "learning_rate": 7.522471279411393e-05, + "loss": 0.227, + "step": 9060 + }, + { + "epoch": 1.8259117469272619, + "grad_norm": 0.04120678827166557, + "learning_rate": 7.521320558466755e-05, + "loss": 0.1801, + "step": 9062 + }, + { + "epoch": 1.8263147289945598, + "grad_norm": 0.05170593783259392, + "learning_rate": 7.520169658411535e-05, + "loss": 0.2048, + "step": 9064 + }, + { + "epoch": 1.8267177110618578, + "grad_norm": 0.05370767042040825, + "learning_rate": 7.519018579327493e-05, + "loss": 0.1983, + "step": 9066 + }, + { + "epoch": 1.8271206931291557, + "grad_norm": 0.050350531935691833, + "learning_rate": 7.517867321296402e-05, + "loss": 0.219, + "step": 9068 + }, + { + "epoch": 1.8275236751964536, + "grad_norm": 0.04988570138812065, + "learning_rate": 7.51671588440004e-05, + "loss": 0.1792, + "step": 9070 + }, + { + "epoch": 1.8279266572637518, + "grad_norm": 0.04610089212656021, + "learning_rate": 7.51556426872021e-05, + "loss": 0.2189, + "step": 9072 + }, + { + "epoch": 1.8283296393310497, + "grad_norm": 0.03649386391043663, + "learning_rate": 7.514412474338715e-05, + "loss": 0.1567, + "step": 9074 + }, + { + "epoch": 1.828732621398348, + "grad_norm": 0.05270843580365181, + "learning_rate": 7.51326050133738e-05, + "loss": 0.1914, + "step": 9076 + }, + { + "epoch": 1.8291356034656459, + "grad_norm": 0.05600623041391373, + "learning_rate": 7.512108349798037e-05, + "loss": 0.2287, + "step": 9078 + }, + { + "epoch": 1.8295385855329438, + "grad_norm": 0.05488625913858414, + "learning_rate": 7.510956019802537e-05, + "loss": 0.2209, + "step": 9080 + }, + { + "epoch": 1.8299415676002417, + "grad_norm": 0.0471048466861248, + "learning_rate": 7.509803511432734e-05, + "loss": 0.2048, + "step": 9082 + }, + { + "epoch": 1.8303445496675397, + "grad_norm": 0.05616595223546028, + "learning_rate": 7.508650824770505e-05, + "loss": 0.1831, + "step": 9084 + }, + { + "epoch": 1.8307475317348378, + "grad_norm": 0.05483395606279373, + "learning_rate": 7.507497959897734e-05, + "loss": 0.1852, + "step": 9086 + }, + { + "epoch": 1.8311505138021358, + "grad_norm": 0.05591282621026039, + "learning_rate": 7.506344916896317e-05, + "loss": 0.1974, + "step": 9088 + }, + { + "epoch": 1.831553495869434, + "grad_norm": 0.046104494482278824, + "learning_rate": 7.505191695848165e-05, + "loss": 0.215, + "step": 9090 + }, + { + "epoch": 1.8319564779367319, + "grad_norm": 0.06968837231397629, + "learning_rate": 7.504038296835203e-05, + "loss": 0.2492, + "step": 9092 + }, + { + "epoch": 1.8323594600040298, + "grad_norm": 0.048819273710250854, + "learning_rate": 7.502884719939363e-05, + "loss": 0.2016, + "step": 9094 + }, + { + "epoch": 1.8327624420713278, + "grad_norm": 0.06848305463790894, + "learning_rate": 7.501730965242598e-05, + "loss": 0.1702, + "step": 9096 + }, + { + "epoch": 1.8331654241386257, + "grad_norm": 0.05366634204983711, + "learning_rate": 7.500577032826863e-05, + "loss": 0.1789, + "step": 9098 + }, + { + "epoch": 1.8335684062059239, + "grad_norm": 0.04202277213335037, + "learning_rate": 7.499422922774137e-05, + "loss": 0.1796, + "step": 9100 + }, + { + "epoch": 1.8339713882732218, + "grad_norm": 0.04547832906246185, + "learning_rate": 7.498268635166403e-05, + "loss": 0.1709, + "step": 9102 + }, + { + "epoch": 1.83437437034052, + "grad_norm": 0.04689347371459007, + "learning_rate": 7.497114170085661e-05, + "loss": 0.175, + "step": 9104 + }, + { + "epoch": 1.834777352407818, + "grad_norm": 0.043508078902959824, + "learning_rate": 7.495959527613921e-05, + "loss": 0.2098, + "step": 9106 + }, + { + "epoch": 1.8351803344751159, + "grad_norm": 0.05118690803647041, + "learning_rate": 7.494804707833208e-05, + "loss": 0.159, + "step": 9108 + }, + { + "epoch": 1.8355833165424138, + "grad_norm": 0.037060126662254333, + "learning_rate": 7.493649710825559e-05, + "loss": 0.1626, + "step": 9110 + }, + { + "epoch": 1.8359862986097117, + "grad_norm": 0.05963043123483658, + "learning_rate": 7.492494536673021e-05, + "loss": 0.1906, + "step": 9112 + }, + { + "epoch": 1.83638928067701, + "grad_norm": 0.044435955584049225, + "learning_rate": 7.49133918545766e-05, + "loss": 0.1564, + "step": 9114 + }, + { + "epoch": 1.8367922627443078, + "grad_norm": 0.058326657861471176, + "learning_rate": 7.490183657261546e-05, + "loss": 0.1771, + "step": 9116 + }, + { + "epoch": 1.837195244811606, + "grad_norm": 0.06040395423769951, + "learning_rate": 7.489027952166768e-05, + "loss": 0.2254, + "step": 9118 + }, + { + "epoch": 1.837598226878904, + "grad_norm": 0.04052739217877388, + "learning_rate": 7.487872070255425e-05, + "loss": 0.135, + "step": 9120 + }, + { + "epoch": 1.838001208946202, + "grad_norm": 0.05821429565548897, + "learning_rate": 7.486716011609627e-05, + "loss": 0.1784, + "step": 9122 + }, + { + "epoch": 1.8384041910134998, + "grad_norm": 0.05247338116168976, + "learning_rate": 7.485559776311501e-05, + "loss": 0.2273, + "step": 9124 + }, + { + "epoch": 1.8388071730807978, + "grad_norm": 0.05803043395280838, + "learning_rate": 7.484403364443185e-05, + "loss": 0.2082, + "step": 9126 + }, + { + "epoch": 1.839210155148096, + "grad_norm": 0.051874928176403046, + "learning_rate": 7.483246776086827e-05, + "loss": 0.2019, + "step": 9128 + }, + { + "epoch": 1.8396131372153939, + "grad_norm": 0.048445116728544235, + "learning_rate": 7.482090011324588e-05, + "loss": 0.1848, + "step": 9130 + }, + { + "epoch": 1.840016119282692, + "grad_norm": 0.06173400208353996, + "learning_rate": 7.480933070238645e-05, + "loss": 0.2716, + "step": 9132 + }, + { + "epoch": 1.84041910134999, + "grad_norm": 0.04523913934826851, + "learning_rate": 7.479775952911184e-05, + "loss": 0.1917, + "step": 9134 + }, + { + "epoch": 1.840822083417288, + "grad_norm": 0.054122623056173325, + "learning_rate": 7.478618659424406e-05, + "loss": 0.2317, + "step": 9136 + }, + { + "epoch": 1.8412250654845859, + "grad_norm": 0.050042774528265, + "learning_rate": 7.477461189860522e-05, + "loss": 0.2069, + "step": 9138 + }, + { + "epoch": 1.8416280475518838, + "grad_norm": 0.04367179423570633, + "learning_rate": 7.476303544301757e-05, + "loss": 0.1848, + "step": 9140 + }, + { + "epoch": 1.842031029619182, + "grad_norm": 0.054118309170007706, + "learning_rate": 7.475145722830348e-05, + "loss": 0.1706, + "step": 9142 + }, + { + "epoch": 1.84243401168648, + "grad_norm": 0.04461648315191269, + "learning_rate": 7.473987725528547e-05, + "loss": 0.2188, + "step": 9144 + }, + { + "epoch": 1.842836993753778, + "grad_norm": 0.0384267196059227, + "learning_rate": 7.472829552478613e-05, + "loss": 0.1457, + "step": 9146 + }, + { + "epoch": 1.843239975821076, + "grad_norm": 0.04002009332180023, + "learning_rate": 7.471671203762822e-05, + "loss": 0.1605, + "step": 9148 + }, + { + "epoch": 1.843642957888374, + "grad_norm": 0.04789675399661064, + "learning_rate": 7.470512679463463e-05, + "loss": 0.169, + "step": 9150 + }, + { + "epoch": 1.844045939955672, + "grad_norm": 0.054053302854299545, + "learning_rate": 7.469353979662833e-05, + "loss": 0.1935, + "step": 9152 + }, + { + "epoch": 1.8444489220229698, + "grad_norm": 0.03856454789638519, + "learning_rate": 7.468195104443246e-05, + "loss": 0.1723, + "step": 9154 + }, + { + "epoch": 1.844851904090268, + "grad_norm": 0.05694045126438141, + "learning_rate": 7.467036053887027e-05, + "loss": 0.1889, + "step": 9156 + }, + { + "epoch": 1.845254886157566, + "grad_norm": 0.03868400678038597, + "learning_rate": 7.46587682807651e-05, + "loss": 0.2551, + "step": 9158 + }, + { + "epoch": 1.845657868224864, + "grad_norm": 0.044934503734111786, + "learning_rate": 7.464717427094048e-05, + "loss": 0.2043, + "step": 9160 + }, + { + "epoch": 1.846060850292162, + "grad_norm": 0.04237751290202141, + "learning_rate": 7.463557851022001e-05, + "loss": 0.219, + "step": 9162 + }, + { + "epoch": 1.84646383235946, + "grad_norm": 0.050449177622795105, + "learning_rate": 7.462398099942745e-05, + "loss": 0.1908, + "step": 9164 + }, + { + "epoch": 1.846866814426758, + "grad_norm": 0.03661469370126724, + "learning_rate": 7.461238173938667e-05, + "loss": 0.1453, + "step": 9166 + }, + { + "epoch": 1.8472697964940559, + "grad_norm": 0.04814742133021355, + "learning_rate": 7.460078073092163e-05, + "loss": 0.1919, + "step": 9168 + }, + { + "epoch": 1.847672778561354, + "grad_norm": 0.09877938777208328, + "learning_rate": 7.458917797485648e-05, + "loss": 0.2523, + "step": 9170 + }, + { + "epoch": 1.848075760628652, + "grad_norm": 0.03724796697497368, + "learning_rate": 7.457757347201545e-05, + "loss": 0.1498, + "step": 9172 + }, + { + "epoch": 1.8484787426959501, + "grad_norm": 0.04252644255757332, + "learning_rate": 7.456596722322292e-05, + "loss": 0.1498, + "step": 9174 + }, + { + "epoch": 1.848881724763248, + "grad_norm": 0.048177529126405716, + "learning_rate": 7.455435922930335e-05, + "loss": 0.1798, + "step": 9176 + }, + { + "epoch": 1.849284706830546, + "grad_norm": 0.038370076566934586, + "learning_rate": 7.454274949108136e-05, + "loss": 0.2012, + "step": 9178 + }, + { + "epoch": 1.849687688897844, + "grad_norm": 0.04380296543240547, + "learning_rate": 7.453113800938172e-05, + "loss": 0.2188, + "step": 9180 + }, + { + "epoch": 1.850090670965142, + "grad_norm": 0.05155162513256073, + "learning_rate": 7.451952478502924e-05, + "loss": 0.179, + "step": 9182 + }, + { + "epoch": 1.85049365303244, + "grad_norm": 0.04221152514219284, + "learning_rate": 7.450790981884896e-05, + "loss": 0.1924, + "step": 9184 + }, + { + "epoch": 1.850896635099738, + "grad_norm": 0.06590647250413895, + "learning_rate": 7.449629311166595e-05, + "loss": 0.2229, + "step": 9186 + }, + { + "epoch": 1.8512996171670362, + "grad_norm": 0.040665872395038605, + "learning_rate": 7.448467466430545e-05, + "loss": 0.1418, + "step": 9188 + }, + { + "epoch": 1.8517025992343341, + "grad_norm": 0.0483027920126915, + "learning_rate": 7.447305447759282e-05, + "loss": 0.2081, + "step": 9190 + }, + { + "epoch": 1.852105581301632, + "grad_norm": 0.03717666119337082, + "learning_rate": 7.446143255235355e-05, + "loss": 0.157, + "step": 9192 + }, + { + "epoch": 1.85250856336893, + "grad_norm": 0.04697210341691971, + "learning_rate": 7.444980888941322e-05, + "loss": 0.1733, + "step": 9194 + }, + { + "epoch": 1.852911545436228, + "grad_norm": 0.05617213249206543, + "learning_rate": 7.443818348959757e-05, + "loss": 0.1802, + "step": 9196 + }, + { + "epoch": 1.853314527503526, + "grad_norm": 0.057184625416994095, + "learning_rate": 7.442655635373246e-05, + "loss": 0.1908, + "step": 9198 + }, + { + "epoch": 1.8537175095708243, + "grad_norm": 0.05311274901032448, + "learning_rate": 7.441492748264384e-05, + "loss": 0.2275, + "step": 9200 + }, + { + "epoch": 1.8541204916381222, + "grad_norm": 0.05756756663322449, + "learning_rate": 7.440329687715781e-05, + "loss": 0.1991, + "step": 9202 + }, + { + "epoch": 1.8545234737054201, + "grad_norm": 0.07354568690061569, + "learning_rate": 7.439166453810061e-05, + "loss": 0.227, + "step": 9204 + }, + { + "epoch": 1.854926455772718, + "grad_norm": 0.07992871105670929, + "learning_rate": 7.438003046629857e-05, + "loss": 0.2499, + "step": 9206 + }, + { + "epoch": 1.855329437840016, + "grad_norm": 0.04908233880996704, + "learning_rate": 7.436839466257816e-05, + "loss": 0.1945, + "step": 9208 + }, + { + "epoch": 1.855732419907314, + "grad_norm": 0.04218778386712074, + "learning_rate": 7.435675712776594e-05, + "loss": 0.1944, + "step": 9210 + }, + { + "epoch": 1.8561354019746121, + "grad_norm": 0.06127457693219185, + "learning_rate": 7.434511786268866e-05, + "loss": 0.2256, + "step": 9212 + }, + { + "epoch": 1.8565383840419103, + "grad_norm": 0.05392537638545036, + "learning_rate": 7.433347686817316e-05, + "loss": 0.2313, + "step": 9214 + }, + { + "epoch": 1.8569413661092082, + "grad_norm": 0.048670317977666855, + "learning_rate": 7.432183414504635e-05, + "loss": 0.1915, + "step": 9216 + }, + { + "epoch": 1.8573443481765062, + "grad_norm": 0.04563140869140625, + "learning_rate": 7.431018969413536e-05, + "loss": 0.2058, + "step": 9218 + }, + { + "epoch": 1.8577473302438041, + "grad_norm": 0.052207816392183304, + "learning_rate": 7.429854351626737e-05, + "loss": 0.2068, + "step": 9220 + }, + { + "epoch": 1.858150312311102, + "grad_norm": 0.04309464991092682, + "learning_rate": 7.428689561226969e-05, + "loss": 0.169, + "step": 9222 + }, + { + "epoch": 1.8585532943784, + "grad_norm": 0.05821385979652405, + "learning_rate": 7.42752459829698e-05, + "loss": 0.2067, + "step": 9224 + }, + { + "epoch": 1.8589562764456982, + "grad_norm": 0.04867576062679291, + "learning_rate": 7.426359462919527e-05, + "loss": 0.186, + "step": 9226 + }, + { + "epoch": 1.8593592585129963, + "grad_norm": 0.055768147110939026, + "learning_rate": 7.425194155177377e-05, + "loss": 0.1948, + "step": 9228 + }, + { + "epoch": 1.8597622405802943, + "grad_norm": 0.055632948875427246, + "learning_rate": 7.424028675153313e-05, + "loss": 0.1917, + "step": 9230 + }, + { + "epoch": 1.8601652226475922, + "grad_norm": 0.03492635115981102, + "learning_rate": 7.422863022930128e-05, + "loss": 0.1538, + "step": 9232 + }, + { + "epoch": 1.8605682047148902, + "grad_norm": 0.1014142632484436, + "learning_rate": 7.421697198590628e-05, + "loss": 0.1698, + "step": 9234 + }, + { + "epoch": 1.860971186782188, + "grad_norm": 0.05396107956767082, + "learning_rate": 7.420531202217634e-05, + "loss": 0.2376, + "step": 9236 + }, + { + "epoch": 1.861374168849486, + "grad_norm": 0.058612339198589325, + "learning_rate": 7.419365033893972e-05, + "loss": 0.185, + "step": 9238 + }, + { + "epoch": 1.8617771509167842, + "grad_norm": 0.07215067744255066, + "learning_rate": 7.418198693702489e-05, + "loss": 0.1545, + "step": 9240 + }, + { + "epoch": 1.8621801329840824, + "grad_norm": 0.05233979597687721, + "learning_rate": 7.417032181726038e-05, + "loss": 0.1979, + "step": 9242 + }, + { + "epoch": 1.8625831150513803, + "grad_norm": 0.03794995695352554, + "learning_rate": 7.415865498047485e-05, + "loss": 0.1615, + "step": 9244 + }, + { + "epoch": 1.8629860971186782, + "grad_norm": 0.049930084496736526, + "learning_rate": 7.414698642749712e-05, + "loss": 0.1539, + "step": 9246 + }, + { + "epoch": 1.8633890791859762, + "grad_norm": 0.05968122184276581, + "learning_rate": 7.413531615915609e-05, + "loss": 0.1558, + "step": 9248 + }, + { + "epoch": 1.8637920612532741, + "grad_norm": 0.038739725947380066, + "learning_rate": 7.41236441762808e-05, + "loss": 0.1648, + "step": 9250 + }, + { + "epoch": 1.8641950433205723, + "grad_norm": 0.06096603721380234, + "learning_rate": 7.41119704797004e-05, + "loss": 0.1747, + "step": 9252 + }, + { + "epoch": 1.8645980253878702, + "grad_norm": 0.06738614290952682, + "learning_rate": 7.410029507024418e-05, + "loss": 0.2019, + "step": 9254 + }, + { + "epoch": 1.8650010074551684, + "grad_norm": 0.05740448087453842, + "learning_rate": 7.408861794874155e-05, + "loss": 0.1431, + "step": 9256 + }, + { + "epoch": 1.8654039895224663, + "grad_norm": 0.07034429907798767, + "learning_rate": 7.407693911602201e-05, + "loss": 0.18, + "step": 9258 + }, + { + "epoch": 1.8658069715897643, + "grad_norm": 0.05217040330171585, + "learning_rate": 7.406525857291523e-05, + "loss": 0.2113, + "step": 9260 + }, + { + "epoch": 1.8662099536570622, + "grad_norm": 0.05973799526691437, + "learning_rate": 7.405357632025097e-05, + "loss": 0.1966, + "step": 9262 + }, + { + "epoch": 1.8666129357243602, + "grad_norm": 0.06974881887435913, + "learning_rate": 7.40418923588591e-05, + "loss": 0.2153, + "step": 9264 + }, + { + "epoch": 1.8670159177916583, + "grad_norm": 0.08291905373334885, + "learning_rate": 7.403020668956967e-05, + "loss": 0.1821, + "step": 9266 + }, + { + "epoch": 1.8674188998589563, + "grad_norm": 0.04557829722762108, + "learning_rate": 7.401851931321278e-05, + "loss": 0.2295, + "step": 9268 + }, + { + "epoch": 1.8678218819262544, + "grad_norm": 0.04647723585367203, + "learning_rate": 7.400683023061868e-05, + "loss": 0.2267, + "step": 9270 + }, + { + "epoch": 1.8682248639935524, + "grad_norm": 0.0661553293466568, + "learning_rate": 7.399513944261776e-05, + "loss": 0.2783, + "step": 9272 + }, + { + "epoch": 1.8686278460608503, + "grad_norm": 0.03786701336503029, + "learning_rate": 7.398344695004051e-05, + "loss": 0.1326, + "step": 9274 + }, + { + "epoch": 1.8690308281281482, + "grad_norm": 0.05944419279694557, + "learning_rate": 7.397175275371754e-05, + "loss": 0.1802, + "step": 9276 + }, + { + "epoch": 1.8694338101954462, + "grad_norm": 0.042872168123722076, + "learning_rate": 7.39600568544796e-05, + "loss": 0.1665, + "step": 9278 + }, + { + "epoch": 1.8698367922627444, + "grad_norm": 0.03952530026435852, + "learning_rate": 7.394835925315753e-05, + "loss": 0.2041, + "step": 9280 + }, + { + "epoch": 1.8702397743300423, + "grad_norm": 0.045997776091098785, + "learning_rate": 7.393665995058232e-05, + "loss": 0.2029, + "step": 9282 + }, + { + "epoch": 1.8706427563973405, + "grad_norm": 0.041152164340019226, + "learning_rate": 7.392495894758508e-05, + "loss": 0.1887, + "step": 9284 + }, + { + "epoch": 1.8710457384646384, + "grad_norm": 0.057696882635354996, + "learning_rate": 7.3913256244997e-05, + "loss": 0.2322, + "step": 9286 + }, + { + "epoch": 1.8714487205319363, + "grad_norm": 0.05366590991616249, + "learning_rate": 7.390155184364944e-05, + "loss": 0.1969, + "step": 9288 + }, + { + "epoch": 1.8718517025992343, + "grad_norm": 0.05442323908209801, + "learning_rate": 7.388984574437388e-05, + "loss": 0.2094, + "step": 9290 + }, + { + "epoch": 1.8722546846665322, + "grad_norm": 0.040767259895801544, + "learning_rate": 7.387813794800187e-05, + "loss": 0.1648, + "step": 9292 + }, + { + "epoch": 1.8726576667338304, + "grad_norm": 0.04274585098028183, + "learning_rate": 7.386642845536513e-05, + "loss": 0.2038, + "step": 9294 + }, + { + "epoch": 1.8730606488011283, + "grad_norm": 0.05937502160668373, + "learning_rate": 7.385471726729549e-05, + "loss": 0.2133, + "step": 9296 + }, + { + "epoch": 1.8734636308684265, + "grad_norm": 0.04178241640329361, + "learning_rate": 7.384300438462488e-05, + "loss": 0.226, + "step": 9298 + }, + { + "epoch": 1.8738666129357244, + "grad_norm": 0.04841248691082001, + "learning_rate": 7.383128980818538e-05, + "loss": 0.1665, + "step": 9300 + }, + { + "epoch": 1.8742695950030224, + "grad_norm": 0.05001138895750046, + "learning_rate": 7.381957353880916e-05, + "loss": 0.2038, + "step": 9302 + }, + { + "epoch": 1.8746725770703203, + "grad_norm": 0.047811415046453476, + "learning_rate": 7.380785557732851e-05, + "loss": 0.1678, + "step": 9304 + }, + { + "epoch": 1.8750755591376183, + "grad_norm": 0.044050272554159164, + "learning_rate": 7.37961359245759e-05, + "loss": 0.215, + "step": 9306 + }, + { + "epoch": 1.8754785412049164, + "grad_norm": 0.05505215376615524, + "learning_rate": 7.378441458138383e-05, + "loss": 0.1941, + "step": 9308 + }, + { + "epoch": 1.8758815232722144, + "grad_norm": 0.06320565193891525, + "learning_rate": 7.3772691548585e-05, + "loss": 0.2459, + "step": 9310 + }, + { + "epoch": 1.8762845053395125, + "grad_norm": 0.04323972761631012, + "learning_rate": 7.376096682701217e-05, + "loss": 0.2206, + "step": 9312 + }, + { + "epoch": 1.8766874874068105, + "grad_norm": 0.054532576352357864, + "learning_rate": 7.374924041749826e-05, + "loss": 0.2011, + "step": 9314 + }, + { + "epoch": 1.8770904694741084, + "grad_norm": 0.08867479860782623, + "learning_rate": 7.373751232087629e-05, + "loss": 0.1648, + "step": 9316 + }, + { + "epoch": 1.8774934515414063, + "grad_norm": 0.050876423716545105, + "learning_rate": 7.372578253797942e-05, + "loss": 0.207, + "step": 9318 + }, + { + "epoch": 1.8778964336087043, + "grad_norm": 0.03266465291380882, + "learning_rate": 7.371405106964089e-05, + "loss": 0.1775, + "step": 9320 + }, + { + "epoch": 1.8782994156760024, + "grad_norm": 0.0411655530333519, + "learning_rate": 7.37023179166941e-05, + "loss": 0.2063, + "step": 9322 + }, + { + "epoch": 1.8787023977433004, + "grad_norm": 0.04222915321588516, + "learning_rate": 7.369058307997255e-05, + "loss": 0.1833, + "step": 9324 + }, + { + "epoch": 1.8791053798105986, + "grad_norm": 0.050624068826436996, + "learning_rate": 7.367884656030987e-05, + "loss": 0.1812, + "step": 9326 + }, + { + "epoch": 1.8795083618778965, + "grad_norm": 0.04637574031949043, + "learning_rate": 7.366710835853979e-05, + "loss": 0.1842, + "step": 9328 + }, + { + "epoch": 1.8799113439451944, + "grad_norm": 0.0690167099237442, + "learning_rate": 7.36553684754962e-05, + "loss": 0.1723, + "step": 9330 + }, + { + "epoch": 1.8803143260124924, + "grad_norm": 0.05005061626434326, + "learning_rate": 7.364362691201305e-05, + "loss": 0.2336, + "step": 9332 + }, + { + "epoch": 1.8807173080797903, + "grad_norm": 0.046944767236709595, + "learning_rate": 7.363188366892445e-05, + "loss": 0.2091, + "step": 9334 + }, + { + "epoch": 1.8811202901470885, + "grad_norm": 0.05391280725598335, + "learning_rate": 7.362013874706465e-05, + "loss": 0.1837, + "step": 9336 + }, + { + "epoch": 1.8815232722143864, + "grad_norm": 0.04615011438727379, + "learning_rate": 7.360839214726796e-05, + "loss": 0.215, + "step": 9338 + }, + { + "epoch": 1.8819262542816846, + "grad_norm": 0.05353038012981415, + "learning_rate": 7.359664387036884e-05, + "loss": 0.1675, + "step": 9340 + }, + { + "epoch": 1.8823292363489825, + "grad_norm": 0.0974080041050911, + "learning_rate": 7.358489391720188e-05, + "loss": 0.2209, + "step": 9342 + }, + { + "epoch": 1.8827322184162805, + "grad_norm": 0.04070080816745758, + "learning_rate": 7.357314228860177e-05, + "loss": 0.1567, + "step": 9344 + }, + { + "epoch": 1.8831352004835784, + "grad_norm": 0.05149964988231659, + "learning_rate": 7.356138898540333e-05, + "loss": 0.2078, + "step": 9346 + }, + { + "epoch": 1.8835381825508763, + "grad_norm": 0.057487696409225464, + "learning_rate": 7.354963400844151e-05, + "loss": 0.2376, + "step": 9348 + }, + { + "epoch": 1.8839411646181745, + "grad_norm": 0.04620504751801491, + "learning_rate": 7.353787735855135e-05, + "loss": 0.1923, + "step": 9350 + }, + { + "epoch": 1.8843441466854725, + "grad_norm": 0.058796484023332596, + "learning_rate": 7.352611903656802e-05, + "loss": 0.2053, + "step": 9352 + }, + { + "epoch": 1.8847471287527706, + "grad_norm": 0.04183940589427948, + "learning_rate": 7.351435904332682e-05, + "loss": 0.1519, + "step": 9354 + }, + { + "epoch": 1.8851501108200686, + "grad_norm": 0.037236157804727554, + "learning_rate": 7.350259737966317e-05, + "loss": 0.1911, + "step": 9356 + }, + { + "epoch": 1.8855530928873665, + "grad_norm": 0.05059423670172691, + "learning_rate": 7.349083404641257e-05, + "loss": 0.254, + "step": 9358 + }, + { + "epoch": 1.8859560749546644, + "grad_norm": 0.05843758583068848, + "learning_rate": 7.347906904441068e-05, + "loss": 0.1811, + "step": 9360 + }, + { + "epoch": 1.8863590570219624, + "grad_norm": 0.08169310539960861, + "learning_rate": 7.34673023744933e-05, + "loss": 0.1997, + "step": 9362 + }, + { + "epoch": 1.8867620390892605, + "grad_norm": 0.03737678378820419, + "learning_rate": 7.345553403749628e-05, + "loss": 0.1459, + "step": 9364 + }, + { + "epoch": 1.8871650211565585, + "grad_norm": 0.05280197039246559, + "learning_rate": 7.344376403425563e-05, + "loss": 0.2147, + "step": 9366 + }, + { + "epoch": 1.8875680032238566, + "grad_norm": 0.04864995554089546, + "learning_rate": 7.343199236560748e-05, + "loss": 0.2175, + "step": 9368 + }, + { + "epoch": 1.8879709852911546, + "grad_norm": 0.03643433377146721, + "learning_rate": 7.342021903238808e-05, + "loss": 0.1972, + "step": 9370 + }, + { + "epoch": 1.8883739673584525, + "grad_norm": 0.04789574071764946, + "learning_rate": 7.340844403543375e-05, + "loss": 0.2055, + "step": 9372 + }, + { + "epoch": 1.8887769494257505, + "grad_norm": 0.04603145644068718, + "learning_rate": 7.3396667375581e-05, + "loss": 0.1673, + "step": 9374 + }, + { + "epoch": 1.8891799314930484, + "grad_norm": 0.041253577917814255, + "learning_rate": 7.338488905366642e-05, + "loss": 0.2102, + "step": 9376 + }, + { + "epoch": 1.8895829135603466, + "grad_norm": 0.046039849519729614, + "learning_rate": 7.337310907052672e-05, + "loss": 0.1839, + "step": 9378 + }, + { + "epoch": 1.8899858956276445, + "grad_norm": 0.03352159634232521, + "learning_rate": 7.336132742699873e-05, + "loss": 0.2224, + "step": 9380 + }, + { + "epoch": 1.8903888776949427, + "grad_norm": 0.04696164280176163, + "learning_rate": 7.33495441239194e-05, + "loss": 0.1989, + "step": 9382 + }, + { + "epoch": 1.8907918597622406, + "grad_norm": 0.03370807319879532, + "learning_rate": 7.33377591621258e-05, + "loss": 0.1577, + "step": 9384 + }, + { + "epoch": 1.8911948418295386, + "grad_norm": 0.039031196385622025, + "learning_rate": 7.33259725424551e-05, + "loss": 0.2015, + "step": 9386 + }, + { + "epoch": 1.8915978238968365, + "grad_norm": 0.04066183418035507, + "learning_rate": 7.331418426574464e-05, + "loss": 0.1625, + "step": 9388 + }, + { + "epoch": 1.8920008059641344, + "grad_norm": 0.04581267014145851, + "learning_rate": 7.330239433283179e-05, + "loss": 0.1837, + "step": 9390 + }, + { + "epoch": 1.8924037880314326, + "grad_norm": 0.05180869251489639, + "learning_rate": 7.329060274455412e-05, + "loss": 0.1885, + "step": 9392 + }, + { + "epoch": 1.8928067700987306, + "grad_norm": 0.047756217420101166, + "learning_rate": 7.32788095017493e-05, + "loss": 0.2492, + "step": 9394 + }, + { + "epoch": 1.8932097521660287, + "grad_norm": 0.03515281155705452, + "learning_rate": 7.326701460525506e-05, + "loss": 0.1625, + "step": 9396 + }, + { + "epoch": 1.8936127342333267, + "grad_norm": 0.05632800981402397, + "learning_rate": 7.325521805590932e-05, + "loss": 0.1943, + "step": 9398 + }, + { + "epoch": 1.8940157163006246, + "grad_norm": 0.04352201148867607, + "learning_rate": 7.324341985455008e-05, + "loss": 0.2154, + "step": 9400 + }, + { + "epoch": 1.8944186983679225, + "grad_norm": 0.04481047764420509, + "learning_rate": 7.323162000201547e-05, + "loss": 0.2175, + "step": 9402 + }, + { + "epoch": 1.8948216804352205, + "grad_norm": 0.0542139895260334, + "learning_rate": 7.321981849914372e-05, + "loss": 0.1365, + "step": 9404 + }, + { + "epoch": 1.8952246625025186, + "grad_norm": 0.044655684381723404, + "learning_rate": 7.32080153467732e-05, + "loss": 0.2363, + "step": 9406 + }, + { + "epoch": 1.8956276445698168, + "grad_norm": 0.045591697096824646, + "learning_rate": 7.319621054574239e-05, + "loss": 0.1815, + "step": 9408 + }, + { + "epoch": 1.8960306266371147, + "grad_norm": 0.05051286518573761, + "learning_rate": 7.318440409688988e-05, + "loss": 0.2434, + "step": 9410 + }, + { + "epoch": 1.8964336087044127, + "grad_norm": 0.04740666598081589, + "learning_rate": 7.317259600105437e-05, + "loss": 0.2038, + "step": 9412 + }, + { + "epoch": 1.8968365907717106, + "grad_norm": 0.08768098801374435, + "learning_rate": 7.31607862590747e-05, + "loss": 0.1866, + "step": 9414 + }, + { + "epoch": 1.8972395728390086, + "grad_norm": 0.055649664252996445, + "learning_rate": 7.314897487178985e-05, + "loss": 0.1873, + "step": 9416 + }, + { + "epoch": 1.8976425549063065, + "grad_norm": 0.053928524255752563, + "learning_rate": 7.313716184003881e-05, + "loss": 0.2381, + "step": 9418 + }, + { + "epoch": 1.8980455369736047, + "grad_norm": 0.06719769537448883, + "learning_rate": 7.312534716466079e-05, + "loss": 0.1988, + "step": 9420 + }, + { + "epoch": 1.8984485190409028, + "grad_norm": 0.03923649340867996, + "learning_rate": 7.311353084649511e-05, + "loss": 0.1837, + "step": 9422 + }, + { + "epoch": 1.8988515011082008, + "grad_norm": 0.05952714383602142, + "learning_rate": 7.310171288638116e-05, + "loss": 0.1996, + "step": 9424 + }, + { + "epoch": 1.8992544831754987, + "grad_norm": 0.03760204464197159, + "learning_rate": 7.308989328515847e-05, + "loss": 0.1746, + "step": 9426 + }, + { + "epoch": 1.8996574652427967, + "grad_norm": 0.04612984135746956, + "learning_rate": 7.30780720436667e-05, + "loss": 0.2176, + "step": 9428 + }, + { + "epoch": 1.9000604473100946, + "grad_norm": 0.04813413694500923, + "learning_rate": 7.306624916274557e-05, + "loss": 0.1552, + "step": 9430 + }, + { + "epoch": 1.9004634293773925, + "grad_norm": 0.060743726789951324, + "learning_rate": 7.3054424643235e-05, + "loss": 0.2085, + "step": 9432 + }, + { + "epoch": 1.9008664114446907, + "grad_norm": 0.05094519630074501, + "learning_rate": 7.3042598485975e-05, + "loss": 0.1934, + "step": 9434 + }, + { + "epoch": 1.9012693935119889, + "grad_norm": 0.060220785439014435, + "learning_rate": 7.303077069180562e-05, + "loss": 0.2007, + "step": 9436 + }, + { + "epoch": 1.9016723755792868, + "grad_norm": 0.04000959172844887, + "learning_rate": 7.301894126156713e-05, + "loss": 0.2259, + "step": 9438 + }, + { + "epoch": 1.9020753576465848, + "grad_norm": 0.04038378223776817, + "learning_rate": 7.300711019609989e-05, + "loss": 0.1858, + "step": 9440 + }, + { + "epoch": 1.9024783397138827, + "grad_norm": 0.05266295000910759, + "learning_rate": 7.299527749624431e-05, + "loss": 0.2069, + "step": 9442 + }, + { + "epoch": 1.9028813217811806, + "grad_norm": 0.045283444225788116, + "learning_rate": 7.2983443162841e-05, + "loss": 0.2187, + "step": 9444 + }, + { + "epoch": 1.9032843038484788, + "grad_norm": 0.04584207758307457, + "learning_rate": 7.297160719673064e-05, + "loss": 0.1718, + "step": 9446 + }, + { + "epoch": 1.9036872859157767, + "grad_norm": 0.049773987382650375, + "learning_rate": 7.295976959875406e-05, + "loss": 0.1761, + "step": 9448 + }, + { + "epoch": 1.904090267983075, + "grad_norm": 0.04451719671487808, + "learning_rate": 7.294793036975214e-05, + "loss": 0.192, + "step": 9450 + }, + { + "epoch": 1.9044932500503728, + "grad_norm": 0.04054448753595352, + "learning_rate": 7.293608951056596e-05, + "loss": 0.1766, + "step": 9452 + }, + { + "epoch": 1.9048962321176708, + "grad_norm": 0.05210372805595398, + "learning_rate": 7.292424702203666e-05, + "loss": 0.2294, + "step": 9454 + }, + { + "epoch": 1.9052992141849687, + "grad_norm": 0.03780307248234749, + "learning_rate": 7.291240290500551e-05, + "loss": 0.1625, + "step": 9456 + }, + { + "epoch": 1.9057021962522667, + "grad_norm": 0.03918739780783653, + "learning_rate": 7.290055716031392e-05, + "loss": 0.1559, + "step": 9458 + }, + { + "epoch": 1.9061051783195648, + "grad_norm": 0.050934337079524994, + "learning_rate": 7.288870978880336e-05, + "loss": 0.1769, + "step": 9460 + }, + { + "epoch": 1.9065081603868628, + "grad_norm": 0.08649832010269165, + "learning_rate": 7.287686079131548e-05, + "loss": 0.2055, + "step": 9462 + }, + { + "epoch": 1.906911142454161, + "grad_norm": 0.06893979012966156, + "learning_rate": 7.286501016869197e-05, + "loss": 0.2656, + "step": 9464 + }, + { + "epoch": 1.9073141245214589, + "grad_norm": 0.035416193306446075, + "learning_rate": 7.28531579217747e-05, + "loss": 0.1738, + "step": 9466 + }, + { + "epoch": 1.9077171065887568, + "grad_norm": 0.035585805773735046, + "learning_rate": 7.284130405140565e-05, + "loss": 0.1822, + "step": 9468 + }, + { + "epoch": 1.9081200886560548, + "grad_norm": 0.06133156269788742, + "learning_rate": 7.28294485584269e-05, + "loss": 0.1803, + "step": 9470 + }, + { + "epoch": 1.9085230707233527, + "grad_norm": 0.07323663681745529, + "learning_rate": 7.281759144368062e-05, + "loss": 0.1774, + "step": 9472 + }, + { + "epoch": 1.9089260527906509, + "grad_norm": 0.07056137174367905, + "learning_rate": 7.280573270800914e-05, + "loss": 0.1799, + "step": 9474 + }, + { + "epoch": 1.9093290348579488, + "grad_norm": 0.03919363394379616, + "learning_rate": 7.279387235225488e-05, + "loss": 0.1497, + "step": 9476 + }, + { + "epoch": 1.909732016925247, + "grad_norm": 0.04808547720313072, + "learning_rate": 7.278201037726038e-05, + "loss": 0.1636, + "step": 9478 + }, + { + "epoch": 1.910134998992545, + "grad_norm": 0.06041782721877098, + "learning_rate": 7.277014678386831e-05, + "loss": 0.221, + "step": 9480 + }, + { + "epoch": 1.9105379810598428, + "grad_norm": 0.06125279888510704, + "learning_rate": 7.275828157292142e-05, + "loss": 0.1225, + "step": 9482 + }, + { + "epoch": 1.9109409631271408, + "grad_norm": 0.04989524930715561, + "learning_rate": 7.274641474526259e-05, + "loss": 0.2219, + "step": 9484 + }, + { + "epoch": 1.9113439451944387, + "grad_norm": 0.04934917762875557, + "learning_rate": 7.273454630173485e-05, + "loss": 0.2038, + "step": 9486 + }, + { + "epoch": 1.911746927261737, + "grad_norm": 0.054218970239162445, + "learning_rate": 7.27226762431813e-05, + "loss": 0.215, + "step": 9488 + }, + { + "epoch": 1.9121499093290348, + "grad_norm": 0.04823169857263565, + "learning_rate": 7.271080457044515e-05, + "loss": 0.1865, + "step": 9490 + }, + { + "epoch": 1.912552891396333, + "grad_norm": 0.043067727237939835, + "learning_rate": 7.26989312843698e-05, + "loss": 0.2181, + "step": 9492 + }, + { + "epoch": 1.912955873463631, + "grad_norm": 0.049720462411642075, + "learning_rate": 7.268705638579865e-05, + "loss": 0.2346, + "step": 9494 + }, + { + "epoch": 1.9133588555309289, + "grad_norm": 0.05683385208249092, + "learning_rate": 7.267517987557528e-05, + "loss": 0.203, + "step": 9496 + }, + { + "epoch": 1.9137618375982268, + "grad_norm": 0.044213853776454926, + "learning_rate": 7.266330175454342e-05, + "loss": 0.1637, + "step": 9498 + }, + { + "epoch": 1.9141648196655248, + "grad_norm": 0.042827364057302475, + "learning_rate": 7.265142202354684e-05, + "loss": 0.2257, + "step": 9500 + }, + { + "epoch": 1.914567801732823, + "grad_norm": 0.05701254680752754, + "learning_rate": 7.263954068342946e-05, + "loss": 0.2565, + "step": 9502 + }, + { + "epoch": 1.9149707838001209, + "grad_norm": 0.04864765703678131, + "learning_rate": 7.262765773503534e-05, + "loss": 0.191, + "step": 9504 + }, + { + "epoch": 1.915373765867419, + "grad_norm": 0.04549378156661987, + "learning_rate": 7.261577317920857e-05, + "loss": 0.147, + "step": 9506 + }, + { + "epoch": 1.915776747934717, + "grad_norm": 0.04124518856406212, + "learning_rate": 7.260388701679345e-05, + "loss": 0.1651, + "step": 9508 + }, + { + "epoch": 1.916179730002015, + "grad_norm": 0.055826831609010696, + "learning_rate": 7.259199924863437e-05, + "loss": 0.2172, + "step": 9510 + }, + { + "epoch": 1.9165827120693129, + "grad_norm": 0.05360211059451103, + "learning_rate": 7.258010987557577e-05, + "loss": 0.2298, + "step": 9512 + }, + { + "epoch": 1.9169856941366108, + "grad_norm": 0.047606151551008224, + "learning_rate": 7.256821889846228e-05, + "loss": 0.2104, + "step": 9514 + }, + { + "epoch": 1.917388676203909, + "grad_norm": 0.049462106078863144, + "learning_rate": 7.255632631813862e-05, + "loss": 0.1967, + "step": 9516 + }, + { + "epoch": 1.917791658271207, + "grad_norm": 0.04005942866206169, + "learning_rate": 7.254443213544962e-05, + "loss": 0.1788, + "step": 9518 + }, + { + "epoch": 1.918194640338505, + "grad_norm": 0.048854950815439224, + "learning_rate": 7.253253635124018e-05, + "loss": 0.1889, + "step": 9520 + }, + { + "epoch": 1.918597622405803, + "grad_norm": 0.049189258366823196, + "learning_rate": 7.252063896635543e-05, + "loss": 0.1683, + "step": 9522 + }, + { + "epoch": 1.919000604473101, + "grad_norm": 0.06358414888381958, + "learning_rate": 7.250873998164049e-05, + "loss": 0.2469, + "step": 9524 + }, + { + "epoch": 1.9194035865403989, + "grad_norm": 0.061269596219062805, + "learning_rate": 7.249683939794065e-05, + "loss": 0.1833, + "step": 9526 + }, + { + "epoch": 1.9198065686076968, + "grad_norm": 0.045860689133405685, + "learning_rate": 7.248493721610134e-05, + "loss": 0.2043, + "step": 9528 + }, + { + "epoch": 1.920209550674995, + "grad_norm": 0.04932362958788872, + "learning_rate": 7.247303343696803e-05, + "loss": 0.2568, + "step": 9530 + }, + { + "epoch": 1.920612532742293, + "grad_norm": 0.04170341044664383, + "learning_rate": 7.246112806138637e-05, + "loss": 0.2479, + "step": 9532 + }, + { + "epoch": 1.921015514809591, + "grad_norm": 0.038920141756534576, + "learning_rate": 7.244922109020209e-05, + "loss": 0.147, + "step": 9534 + }, + { + "epoch": 1.921418496876889, + "grad_norm": 0.049470383673906326, + "learning_rate": 7.243731252426105e-05, + "loss": 0.2193, + "step": 9536 + }, + { + "epoch": 1.921821478944187, + "grad_norm": 0.0644969716668129, + "learning_rate": 7.242540236440922e-05, + "loss": 0.176, + "step": 9538 + }, + { + "epoch": 1.922224461011485, + "grad_norm": 0.035150207579135895, + "learning_rate": 7.241349061149265e-05, + "loss": 0.1792, + "step": 9540 + }, + { + "epoch": 1.9226274430787829, + "grad_norm": 0.04241754859685898, + "learning_rate": 7.240157726635757e-05, + "loss": 0.2179, + "step": 9542 + }, + { + "epoch": 1.923030425146081, + "grad_norm": 0.044732749462127686, + "learning_rate": 7.238966232985027e-05, + "loss": 0.2034, + "step": 9544 + }, + { + "epoch": 1.923433407213379, + "grad_norm": 0.04975948482751846, + "learning_rate": 7.237774580281716e-05, + "loss": 0.1864, + "step": 9546 + }, + { + "epoch": 1.9238363892806771, + "grad_norm": 0.053191766142845154, + "learning_rate": 7.236582768610476e-05, + "loss": 0.2078, + "step": 9548 + }, + { + "epoch": 1.924239371347975, + "grad_norm": 0.04343879595398903, + "learning_rate": 7.235390798055975e-05, + "loss": 0.1637, + "step": 9550 + }, + { + "epoch": 1.924642353415273, + "grad_norm": 0.059335123747587204, + "learning_rate": 7.234198668702885e-05, + "loss": 0.1638, + "step": 9552 + }, + { + "epoch": 1.925045335482571, + "grad_norm": 0.052407678216695786, + "learning_rate": 7.233006380635897e-05, + "loss": 0.1892, + "step": 9554 + }, + { + "epoch": 1.925448317549869, + "grad_norm": 0.05545216426253319, + "learning_rate": 7.231813933939704e-05, + "loss": 0.1467, + "step": 9556 + }, + { + "epoch": 1.925851299617167, + "grad_norm": 0.0498746857047081, + "learning_rate": 7.23062132869902e-05, + "loss": 0.2632, + "step": 9558 + }, + { + "epoch": 1.926254281684465, + "grad_norm": 0.04190506041049957, + "learning_rate": 7.229428564998564e-05, + "loss": 0.1336, + "step": 9560 + }, + { + "epoch": 1.9266572637517632, + "grad_norm": 0.04394135996699333, + "learning_rate": 7.228235642923069e-05, + "loss": 0.2154, + "step": 9562 + }, + { + "epoch": 1.927060245819061, + "grad_norm": 0.0600002259016037, + "learning_rate": 7.227042562557276e-05, + "loss": 0.1992, + "step": 9564 + }, + { + "epoch": 1.927463227886359, + "grad_norm": 0.06488315016031265, + "learning_rate": 7.225849323985941e-05, + "loss": 0.1654, + "step": 9566 + }, + { + "epoch": 1.927866209953657, + "grad_norm": 0.035505276173353195, + "learning_rate": 7.22465592729383e-05, + "loss": 0.1619, + "step": 9568 + }, + { + "epoch": 1.928269192020955, + "grad_norm": 0.055548056960105896, + "learning_rate": 7.223462372565721e-05, + "loss": 0.1691, + "step": 9570 + }, + { + "epoch": 1.928672174088253, + "grad_norm": 0.05716124176979065, + "learning_rate": 7.2222686598864e-05, + "loss": 0.1839, + "step": 9572 + }, + { + "epoch": 1.929075156155551, + "grad_norm": 0.07215842604637146, + "learning_rate": 7.221074789340667e-05, + "loss": 0.179, + "step": 9574 + }, + { + "epoch": 1.9294781382228492, + "grad_norm": 0.04680660739541054, + "learning_rate": 7.219880761013334e-05, + "loss": 0.2074, + "step": 9576 + }, + { + "epoch": 1.9298811202901471, + "grad_norm": 0.04360821843147278, + "learning_rate": 7.21868657498922e-05, + "loss": 0.1993, + "step": 9578 + }, + { + "epoch": 1.930284102357445, + "grad_norm": 0.04149682819843292, + "learning_rate": 7.217492231353164e-05, + "loss": 0.2054, + "step": 9580 + }, + { + "epoch": 1.930687084424743, + "grad_norm": 0.03975476324558258, + "learning_rate": 7.216297730190003e-05, + "loss": 0.2075, + "step": 9582 + }, + { + "epoch": 1.931090066492041, + "grad_norm": 0.07294421643018723, + "learning_rate": 7.215103071584596e-05, + "loss": 0.2055, + "step": 9584 + }, + { + "epoch": 1.9314930485593391, + "grad_norm": 0.07211649417877197, + "learning_rate": 7.21390825562181e-05, + "loss": 0.1824, + "step": 9586 + }, + { + "epoch": 1.931896030626637, + "grad_norm": 0.04363333433866501, + "learning_rate": 7.212713282386521e-05, + "loss": 0.1955, + "step": 9588 + }, + { + "epoch": 1.9322990126939352, + "grad_norm": 0.04804873839020729, + "learning_rate": 7.21151815196362e-05, + "loss": 0.2456, + "step": 9590 + }, + { + "epoch": 1.9327019947612332, + "grad_norm": 0.04542528837919235, + "learning_rate": 7.210322864438006e-05, + "loss": 0.1742, + "step": 9592 + }, + { + "epoch": 1.933104976828531, + "grad_norm": 0.04244324937462807, + "learning_rate": 7.209127419894591e-05, + "loss": 0.1685, + "step": 9594 + }, + { + "epoch": 1.933507958895829, + "grad_norm": 0.044985342770814896, + "learning_rate": 7.207931818418297e-05, + "loss": 0.1957, + "step": 9596 + }, + { + "epoch": 1.933910940963127, + "grad_norm": 0.04435814917087555, + "learning_rate": 7.206736060094059e-05, + "loss": 0.1631, + "step": 9598 + }, + { + "epoch": 1.9343139230304252, + "grad_norm": 0.04762697592377663, + "learning_rate": 7.205540145006818e-05, + "loss": 0.2069, + "step": 9600 + }, + { + "epoch": 1.9347169050977233, + "grad_norm": 0.05355559661984444, + "learning_rate": 7.204344073241534e-05, + "loss": 0.1931, + "step": 9602 + }, + { + "epoch": 1.9351198871650213, + "grad_norm": 0.05477927625179291, + "learning_rate": 7.203147844883172e-05, + "loss": 0.2047, + "step": 9604 + }, + { + "epoch": 1.9355228692323192, + "grad_norm": 0.04428846761584282, + "learning_rate": 7.201951460016709e-05, + "loss": 0.1725, + "step": 9606 + }, + { + "epoch": 1.9359258512996171, + "grad_norm": 0.041742779314517975, + "learning_rate": 7.200754918727137e-05, + "loss": 0.166, + "step": 9608 + }, + { + "epoch": 1.936328833366915, + "grad_norm": 0.04589561000466347, + "learning_rate": 7.199558221099456e-05, + "loss": 0.169, + "step": 9610 + }, + { + "epoch": 1.936731815434213, + "grad_norm": 0.04550304263830185, + "learning_rate": 7.198361367218676e-05, + "loss": 0.1958, + "step": 9612 + }, + { + "epoch": 1.9371347975015112, + "grad_norm": 0.047180287539958954, + "learning_rate": 7.19716435716982e-05, + "loss": 0.1825, + "step": 9614 + }, + { + "epoch": 1.9375377795688093, + "grad_norm": 0.05457916855812073, + "learning_rate": 7.195967191037922e-05, + "loss": 0.2018, + "step": 9616 + }, + { + "epoch": 1.9379407616361073, + "grad_norm": 0.04042569547891617, + "learning_rate": 7.194769868908026e-05, + "loss": 0.2112, + "step": 9618 + }, + { + "epoch": 1.9383437437034052, + "grad_norm": 0.042039696127176285, + "learning_rate": 7.19357239086519e-05, + "loss": 0.173, + "step": 9620 + }, + { + "epoch": 1.9387467257707032, + "grad_norm": 0.03700724244117737, + "learning_rate": 7.192374756994477e-05, + "loss": 0.1802, + "step": 9622 + }, + { + "epoch": 1.9391497078380011, + "grad_norm": 0.06561025232076645, + "learning_rate": 7.19117696738097e-05, + "loss": 0.1846, + "step": 9624 + }, + { + "epoch": 1.939552689905299, + "grad_norm": 0.06743749976158142, + "learning_rate": 7.189979022109755e-05, + "loss": 0.1871, + "step": 9626 + }, + { + "epoch": 1.9399556719725972, + "grad_norm": 0.06319016963243484, + "learning_rate": 7.188780921265932e-05, + "loss": 0.2142, + "step": 9628 + }, + { + "epoch": 1.9403586540398954, + "grad_norm": 0.06255649775266647, + "learning_rate": 7.187582664934613e-05, + "loss": 0.1843, + "step": 9630 + }, + { + "epoch": 1.9407616361071933, + "grad_norm": 0.04725359007716179, + "learning_rate": 7.186384253200919e-05, + "loss": 0.166, + "step": 9632 + }, + { + "epoch": 1.9411646181744913, + "grad_norm": 0.08775375038385391, + "learning_rate": 7.185185686149987e-05, + "loss": 0.1707, + "step": 9634 + }, + { + "epoch": 1.9415676002417892, + "grad_norm": 0.06302014738321304, + "learning_rate": 7.183986963866955e-05, + "loss": 0.2212, + "step": 9636 + }, + { + "epoch": 1.9419705823090871, + "grad_norm": 0.05067121982574463, + "learning_rate": 7.182788086436985e-05, + "loss": 0.1931, + "step": 9638 + }, + { + "epoch": 1.942373564376385, + "grad_norm": 0.05320592224597931, + "learning_rate": 7.181589053945239e-05, + "loss": 0.1897, + "step": 9640 + }, + { + "epoch": 1.9427765464436832, + "grad_norm": 0.07704076915979385, + "learning_rate": 7.180389866476895e-05, + "loss": 0.2012, + "step": 9642 + }, + { + "epoch": 1.9431795285109814, + "grad_norm": 0.03068099170923233, + "learning_rate": 7.179190524117143e-05, + "loss": 0.1731, + "step": 9644 + }, + { + "epoch": 1.9435825105782794, + "grad_norm": 0.0580390989780426, + "learning_rate": 7.177991026951179e-05, + "loss": 0.1883, + "step": 9646 + }, + { + "epoch": 1.9439854926455773, + "grad_norm": 0.0502253882586956, + "learning_rate": 7.176791375064217e-05, + "loss": 0.1973, + "step": 9648 + }, + { + "epoch": 1.9443884747128752, + "grad_norm": 0.048907410353422165, + "learning_rate": 7.175591568541479e-05, + "loss": 0.201, + "step": 9650 + }, + { + "epoch": 1.9447914567801732, + "grad_norm": 0.056508488953113556, + "learning_rate": 7.174391607468193e-05, + "loss": 0.1606, + "step": 9652 + }, + { + "epoch": 1.9451944388474713, + "grad_norm": 0.037321534007787704, + "learning_rate": 7.173191491929605e-05, + "loss": 0.1595, + "step": 9654 + }, + { + "epoch": 1.9455974209147693, + "grad_norm": 0.04397205635905266, + "learning_rate": 7.17199122201097e-05, + "loss": 0.2088, + "step": 9656 + }, + { + "epoch": 1.9460004029820674, + "grad_norm": 0.041947152465581894, + "learning_rate": 7.17079079779755e-05, + "loss": 0.1825, + "step": 9658 + }, + { + "epoch": 1.9464033850493654, + "grad_norm": 0.053206928074359894, + "learning_rate": 7.169590219374625e-05, + "loss": 0.2055, + "step": 9660 + }, + { + "epoch": 1.9468063671166633, + "grad_norm": 0.04078378155827522, + "learning_rate": 7.16838948682748e-05, + "loss": 0.1977, + "step": 9662 + }, + { + "epoch": 1.9472093491839613, + "grad_norm": 0.08155234903097153, + "learning_rate": 7.167188600241413e-05, + "loss": 0.2538, + "step": 9664 + }, + { + "epoch": 1.9476123312512592, + "grad_norm": 0.04778615012764931, + "learning_rate": 7.165987559701735e-05, + "loss": 0.1354, + "step": 9666 + }, + { + "epoch": 1.9480153133185574, + "grad_norm": 0.05697598680853844, + "learning_rate": 7.164786365293765e-05, + "loss": 0.2056, + "step": 9668 + }, + { + "epoch": 1.9484182953858553, + "grad_norm": 0.035130467265844345, + "learning_rate": 7.163585017102833e-05, + "loss": 0.2103, + "step": 9670 + }, + { + "epoch": 1.9488212774531535, + "grad_norm": 0.06631353497505188, + "learning_rate": 7.162383515214281e-05, + "loss": 0.2048, + "step": 9672 + }, + { + "epoch": 1.9492242595204514, + "grad_norm": 0.04498956725001335, + "learning_rate": 7.161181859713463e-05, + "loss": 0.2305, + "step": 9674 + }, + { + "epoch": 1.9496272415877494, + "grad_norm": 0.04058938845992088, + "learning_rate": 7.159980050685742e-05, + "loss": 0.2126, + "step": 9676 + }, + { + "epoch": 1.9500302236550473, + "grad_norm": 0.0352054089307785, + "learning_rate": 7.158778088216494e-05, + "loss": 0.1872, + "step": 9678 + }, + { + "epoch": 1.9504332057223452, + "grad_norm": 0.03873911872506142, + "learning_rate": 7.1575759723911e-05, + "loss": 0.1815, + "step": 9680 + }, + { + "epoch": 1.9508361877896434, + "grad_norm": 0.030585993081331253, + "learning_rate": 7.156373703294961e-05, + "loss": 0.15, + "step": 9682 + }, + { + "epoch": 1.9512391698569413, + "grad_norm": 0.059601254761219025, + "learning_rate": 7.155171281013483e-05, + "loss": 0.189, + "step": 9684 + }, + { + "epoch": 1.9516421519242395, + "grad_norm": 0.04292791336774826, + "learning_rate": 7.153968705632083e-05, + "loss": 0.2102, + "step": 9686 + }, + { + "epoch": 1.9520451339915375, + "grad_norm": 0.05211935192346573, + "learning_rate": 7.152765977236191e-05, + "loss": 0.1977, + "step": 9688 + }, + { + "epoch": 1.9524481160588354, + "grad_norm": 0.03731980919837952, + "learning_rate": 7.15156309591125e-05, + "loss": 0.1749, + "step": 9690 + }, + { + "epoch": 1.9528510981261333, + "grad_norm": 0.047702062875032425, + "learning_rate": 7.150360061742702e-05, + "loss": 0.2043, + "step": 9692 + }, + { + "epoch": 1.9532540801934313, + "grad_norm": 0.048761576414108276, + "learning_rate": 7.149156874816018e-05, + "loss": 0.2322, + "step": 9694 + }, + { + "epoch": 1.9536570622607294, + "grad_norm": 0.07745224982500076, + "learning_rate": 7.147953535216666e-05, + "loss": 0.2188, + "step": 9696 + }, + { + "epoch": 1.9540600443280274, + "grad_norm": 0.06417658925056458, + "learning_rate": 7.14675004303013e-05, + "loss": 0.2092, + "step": 9698 + }, + { + "epoch": 1.9544630263953255, + "grad_norm": 0.04546617716550827, + "learning_rate": 7.145546398341903e-05, + "loss": 0.1456, + "step": 9700 + }, + { + "epoch": 1.9548660084626235, + "grad_norm": 0.07141395658254623, + "learning_rate": 7.144342601237493e-05, + "loss": 0.2023, + "step": 9702 + }, + { + "epoch": 1.9552689905299214, + "grad_norm": 0.06302723288536072, + "learning_rate": 7.143138651802412e-05, + "loss": 0.1423, + "step": 9704 + }, + { + "epoch": 1.9556719725972194, + "grad_norm": 0.050656940788030624, + "learning_rate": 7.14193455012219e-05, + "loss": 0.174, + "step": 9706 + }, + { + "epoch": 1.9560749546645173, + "grad_norm": 0.05971457064151764, + "learning_rate": 7.140730296282363e-05, + "loss": 0.194, + "step": 9708 + }, + { + "epoch": 1.9564779367318155, + "grad_norm": 0.07366830855607986, + "learning_rate": 7.139525890368479e-05, + "loss": 0.1877, + "step": 9710 + }, + { + "epoch": 1.9568809187991134, + "grad_norm": 0.04481309652328491, + "learning_rate": 7.138321332466097e-05, + "loss": 0.2125, + "step": 9712 + }, + { + "epoch": 1.9572839008664116, + "grad_norm": 0.0536804161965847, + "learning_rate": 7.137116622660788e-05, + "loss": 0.2313, + "step": 9714 + }, + { + "epoch": 1.9576868829337095, + "grad_norm": 0.05160369351506233, + "learning_rate": 7.135911761038132e-05, + "loss": 0.2034, + "step": 9716 + }, + { + "epoch": 1.9580898650010075, + "grad_norm": 0.04648306593298912, + "learning_rate": 7.13470674768372e-05, + "loss": 0.1625, + "step": 9718 + }, + { + "epoch": 1.9584928470683054, + "grad_norm": 0.06373865157365799, + "learning_rate": 7.133501582683155e-05, + "loss": 0.1981, + "step": 9720 + }, + { + "epoch": 1.9588958291356033, + "grad_norm": 0.07009351998567581, + "learning_rate": 7.132296266122049e-05, + "loss": 0.2067, + "step": 9722 + }, + { + "epoch": 1.9592988112029015, + "grad_norm": 0.06840388476848602, + "learning_rate": 7.131090798086026e-05, + "loss": 0.2469, + "step": 9724 + }, + { + "epoch": 1.9597017932701994, + "grad_norm": 0.04283274710178375, + "learning_rate": 7.129885178660722e-05, + "loss": 0.1898, + "step": 9726 + }, + { + "epoch": 1.9601047753374976, + "grad_norm": 0.04922349750995636, + "learning_rate": 7.128679407931781e-05, + "loss": 0.2174, + "step": 9728 + }, + { + "epoch": 1.9605077574047955, + "grad_norm": 0.049361422657966614, + "learning_rate": 7.127473485984859e-05, + "loss": 0.2225, + "step": 9730 + }, + { + "epoch": 1.9609107394720935, + "grad_norm": 0.05047673359513283, + "learning_rate": 7.126267412905623e-05, + "loss": 0.1927, + "step": 9732 + }, + { + "epoch": 1.9613137215393914, + "grad_norm": 0.05042770877480507, + "learning_rate": 7.125061188779751e-05, + "loss": 0.2007, + "step": 9734 + }, + { + "epoch": 1.9617167036066894, + "grad_norm": 0.05922012776136398, + "learning_rate": 7.123854813692929e-05, + "loss": 0.1867, + "step": 9736 + }, + { + "epoch": 1.9621196856739875, + "grad_norm": 0.0502203106880188, + "learning_rate": 7.122648287730859e-05, + "loss": 0.2325, + "step": 9738 + }, + { + "epoch": 1.9625226677412855, + "grad_norm": 0.04548550397157669, + "learning_rate": 7.12144161097925e-05, + "loss": 0.1761, + "step": 9740 + }, + { + "epoch": 1.9629256498085836, + "grad_norm": 0.04983745142817497, + "learning_rate": 7.12023478352382e-05, + "loss": 0.1964, + "step": 9742 + }, + { + "epoch": 1.9633286318758816, + "grad_norm": 0.06270463019609451, + "learning_rate": 7.119027805450301e-05, + "loss": 0.1915, + "step": 9744 + }, + { + "epoch": 1.9637316139431795, + "grad_norm": 0.05191759392619133, + "learning_rate": 7.117820676844437e-05, + "loss": 0.2485, + "step": 9746 + }, + { + "epoch": 1.9641345960104775, + "grad_norm": 0.061899662017822266, + "learning_rate": 7.116613397791978e-05, + "loss": 0.228, + "step": 9748 + }, + { + "epoch": 1.9645375780777754, + "grad_norm": 0.09284082800149918, + "learning_rate": 7.11540596837869e-05, + "loss": 0.2192, + "step": 9750 + }, + { + "epoch": 1.9649405601450736, + "grad_norm": 0.054707858711481094, + "learning_rate": 7.114198388690344e-05, + "loss": 0.1977, + "step": 9752 + }, + { + "epoch": 1.9653435422123715, + "grad_norm": 0.05887407064437866, + "learning_rate": 7.112990658812727e-05, + "loss": 0.2365, + "step": 9754 + }, + { + "epoch": 1.9657465242796697, + "grad_norm": 0.04575325548648834, + "learning_rate": 7.111782778831632e-05, + "loss": 0.1869, + "step": 9756 + }, + { + "epoch": 1.9661495063469676, + "grad_norm": 0.05046350136399269, + "learning_rate": 7.110574748832864e-05, + "loss": 0.1999, + "step": 9758 + }, + { + "epoch": 1.9665524884142656, + "grad_norm": 0.056472841650247574, + "learning_rate": 7.109366568902245e-05, + "loss": 0.1741, + "step": 9760 + }, + { + "epoch": 1.9669554704815635, + "grad_norm": 0.07386624068021774, + "learning_rate": 7.108158239125595e-05, + "loss": 0.2023, + "step": 9762 + }, + { + "epoch": 1.9673584525488614, + "grad_norm": 0.049812037497758865, + "learning_rate": 7.106949759588757e-05, + "loss": 0.191, + "step": 9764 + }, + { + "epoch": 1.9677614346161596, + "grad_norm": 0.05968543142080307, + "learning_rate": 7.105741130377577e-05, + "loss": 0.1909, + "step": 9766 + }, + { + "epoch": 1.9681644166834575, + "grad_norm": 0.04280983284115791, + "learning_rate": 7.104532351577914e-05, + "loss": 0.2065, + "step": 9768 + }, + { + "epoch": 1.9685673987507557, + "grad_norm": 0.03398040309548378, + "learning_rate": 7.10332342327564e-05, + "loss": 0.1459, + "step": 9770 + }, + { + "epoch": 1.9689703808180536, + "grad_norm": 0.06277387589216232, + "learning_rate": 7.102114345556632e-05, + "loss": 0.1871, + "step": 9772 + }, + { + "epoch": 1.9693733628853516, + "grad_norm": 0.04359853267669678, + "learning_rate": 7.100905118506785e-05, + "loss": 0.2372, + "step": 9774 + }, + { + "epoch": 1.9697763449526495, + "grad_norm": 0.04512301832437515, + "learning_rate": 7.099695742211996e-05, + "loss": 0.1822, + "step": 9776 + }, + { + "epoch": 1.9701793270199475, + "grad_norm": 0.04159922897815704, + "learning_rate": 7.09848621675818e-05, + "loss": 0.2415, + "step": 9778 + }, + { + "epoch": 1.9705823090872456, + "grad_norm": 0.047807469964027405, + "learning_rate": 7.097276542231259e-05, + "loss": 0.1432, + "step": 9780 + }, + { + "epoch": 1.9709852911545436, + "grad_norm": 0.06380611658096313, + "learning_rate": 7.096066718717169e-05, + "loss": 0.2216, + "step": 9782 + }, + { + "epoch": 1.9713882732218417, + "grad_norm": 0.04591721296310425, + "learning_rate": 7.09485674630185e-05, + "loss": 0.1293, + "step": 9784 + }, + { + "epoch": 1.9717912552891397, + "grad_norm": 0.056421924382448196, + "learning_rate": 7.093646625071256e-05, + "loss": 0.2394, + "step": 9786 + }, + { + "epoch": 1.9721942373564376, + "grad_norm": 0.052102234214544296, + "learning_rate": 7.092436355111356e-05, + "loss": 0.1894, + "step": 9788 + }, + { + "epoch": 1.9725972194237356, + "grad_norm": 0.04517102986574173, + "learning_rate": 7.091225936508124e-05, + "loss": 0.1867, + "step": 9790 + }, + { + "epoch": 1.9730002014910335, + "grad_norm": 0.04084205627441406, + "learning_rate": 7.090015369347544e-05, + "loss": 0.1919, + "step": 9792 + }, + { + "epoch": 1.9734031835583317, + "grad_norm": 0.04202771186828613, + "learning_rate": 7.088804653715617e-05, + "loss": 0.173, + "step": 9794 + }, + { + "epoch": 1.9738061656256296, + "grad_norm": 0.05949341878294945, + "learning_rate": 7.087593789698345e-05, + "loss": 0.1754, + "step": 9796 + }, + { + "epoch": 1.9742091476929278, + "grad_norm": 0.05214182287454605, + "learning_rate": 7.086382777381751e-05, + "loss": 0.17, + "step": 9798 + }, + { + "epoch": 1.9746121297602257, + "grad_norm": 0.058549653738737106, + "learning_rate": 7.085171616851862e-05, + "loss": 0.193, + "step": 9800 + }, + { + "epoch": 1.9750151118275237, + "grad_norm": 0.050743550062179565, + "learning_rate": 7.083960308194715e-05, + "loss": 0.2091, + "step": 9802 + }, + { + "epoch": 1.9754180938948216, + "grad_norm": 0.06692482531070709, + "learning_rate": 7.08274885149636e-05, + "loss": 0.2314, + "step": 9804 + }, + { + "epoch": 1.9758210759621195, + "grad_norm": 0.0435248427093029, + "learning_rate": 7.081537246842857e-05, + "loss": 0.2488, + "step": 9806 + }, + { + "epoch": 1.9762240580294177, + "grad_norm": 0.0509062334895134, + "learning_rate": 7.080325494320279e-05, + "loss": 0.1879, + "step": 9808 + }, + { + "epoch": 1.9766270400967159, + "grad_norm": 0.05188576877117157, + "learning_rate": 7.079113594014702e-05, + "loss": 0.2126, + "step": 9810 + }, + { + "epoch": 1.9770300221640138, + "grad_norm": 0.06566416472196579, + "learning_rate": 7.077901546012223e-05, + "loss": 0.1967, + "step": 9812 + }, + { + "epoch": 1.9774330042313117, + "grad_norm": 0.05517309531569481, + "learning_rate": 7.076689350398939e-05, + "loss": 0.145, + "step": 9814 + }, + { + "epoch": 1.9778359862986097, + "grad_norm": 0.04569048807024956, + "learning_rate": 7.075477007260966e-05, + "loss": 0.161, + "step": 9816 + }, + { + "epoch": 1.9782389683659076, + "grad_norm": 0.04703947901725769, + "learning_rate": 7.074264516684427e-05, + "loss": 0.1748, + "step": 9818 + }, + { + "epoch": 1.9786419504332056, + "grad_norm": 0.045801058411598206, + "learning_rate": 7.073051878755452e-05, + "loss": 0.1644, + "step": 9820 + }, + { + "epoch": 1.9790449325005037, + "grad_norm": 0.03103361465036869, + "learning_rate": 7.071839093560188e-05, + "loss": 0.1535, + "step": 9822 + }, + { + "epoch": 1.979447914567802, + "grad_norm": 0.04012531787157059, + "learning_rate": 7.070626161184788e-05, + "loss": 0.1788, + "step": 9824 + }, + { + "epoch": 1.9798508966350998, + "grad_norm": 0.04191760718822479, + "learning_rate": 7.069413081715416e-05, + "loss": 0.1657, + "step": 9826 + }, + { + "epoch": 1.9802538787023978, + "grad_norm": 0.06015370413661003, + "learning_rate": 7.068199855238249e-05, + "loss": 0.2029, + "step": 9828 + }, + { + "epoch": 1.9806568607696957, + "grad_norm": 0.049391429871320724, + "learning_rate": 7.066986481839471e-05, + "loss": 0.1951, + "step": 9830 + }, + { + "epoch": 1.9810598428369937, + "grad_norm": 0.05129459872841835, + "learning_rate": 7.065772961605281e-05, + "loss": 0.2027, + "step": 9832 + }, + { + "epoch": 1.9814628249042916, + "grad_norm": 0.05638004466891289, + "learning_rate": 7.064559294621882e-05, + "loss": 0.1905, + "step": 9834 + }, + { + "epoch": 1.9818658069715898, + "grad_norm": 0.049807120114564896, + "learning_rate": 7.063345480975493e-05, + "loss": 0.2021, + "step": 9836 + }, + { + "epoch": 1.982268789038888, + "grad_norm": 0.056036632508039474, + "learning_rate": 7.06213152075234e-05, + "loss": 0.2191, + "step": 9838 + }, + { + "epoch": 1.9826717711061859, + "grad_norm": 0.04058406874537468, + "learning_rate": 7.060917414038663e-05, + "loss": 0.2015, + "step": 9840 + }, + { + "epoch": 1.9830747531734838, + "grad_norm": 0.044951487332582474, + "learning_rate": 7.059703160920707e-05, + "loss": 0.1379, + "step": 9842 + }, + { + "epoch": 1.9834777352407817, + "grad_norm": 0.06562306731939316, + "learning_rate": 7.058488761484735e-05, + "loss": 0.2462, + "step": 9844 + }, + { + "epoch": 1.9838807173080797, + "grad_norm": 0.05451963469386101, + "learning_rate": 7.057274215817011e-05, + "loss": 0.217, + "step": 9846 + }, + { + "epoch": 1.9842836993753776, + "grad_norm": 0.04903053119778633, + "learning_rate": 7.056059524003818e-05, + "loss": 0.1458, + "step": 9848 + }, + { + "epoch": 1.9846866814426758, + "grad_norm": 0.05418461188673973, + "learning_rate": 7.054844686131445e-05, + "loss": 0.2123, + "step": 9850 + }, + { + "epoch": 1.985089663509974, + "grad_norm": 0.04938149452209473, + "learning_rate": 7.05362970228619e-05, + "loss": 0.2197, + "step": 9852 + }, + { + "epoch": 1.985492645577272, + "grad_norm": 0.06866676360368729, + "learning_rate": 7.052414572554367e-05, + "loss": 0.2042, + "step": 9854 + }, + { + "epoch": 1.9858956276445698, + "grad_norm": 0.04523222893476486, + "learning_rate": 7.051199297022295e-05, + "loss": 0.1729, + "step": 9856 + }, + { + "epoch": 1.9862986097118678, + "grad_norm": 0.037189189344644547, + "learning_rate": 7.049983875776305e-05, + "loss": 0.1686, + "step": 9858 + }, + { + "epoch": 1.9867015917791657, + "grad_norm": 0.05150702968239784, + "learning_rate": 7.048768308902739e-05, + "loss": 0.1988, + "step": 9860 + }, + { + "epoch": 1.9871045738464639, + "grad_norm": 0.04161156713962555, + "learning_rate": 7.047552596487947e-05, + "loss": 0.1352, + "step": 9862 + }, + { + "epoch": 1.9875075559137618, + "grad_norm": 0.03971536085009575, + "learning_rate": 7.046336738618296e-05, + "loss": 0.2015, + "step": 9864 + }, + { + "epoch": 1.98791053798106, + "grad_norm": 0.05211701616644859, + "learning_rate": 7.045120735380155e-05, + "loss": 0.1902, + "step": 9866 + }, + { + "epoch": 1.988313520048358, + "grad_norm": 0.0716228112578392, + "learning_rate": 7.043904586859906e-05, + "loss": 0.2204, + "step": 9868 + }, + { + "epoch": 1.9887165021156559, + "grad_norm": 0.0337841659784317, + "learning_rate": 7.042688293143946e-05, + "loss": 0.1608, + "step": 9870 + }, + { + "epoch": 1.9891194841829538, + "grad_norm": 0.05220310389995575, + "learning_rate": 7.041471854318675e-05, + "loss": 0.1816, + "step": 9872 + }, + { + "epoch": 1.9895224662502518, + "grad_norm": 0.053605400025844574, + "learning_rate": 7.040255270470509e-05, + "loss": 0.188, + "step": 9874 + }, + { + "epoch": 1.98992544831755, + "grad_norm": 0.04616335779428482, + "learning_rate": 7.039038541685872e-05, + "loss": 0.1861, + "step": 9876 + }, + { + "epoch": 1.9903284303848479, + "grad_norm": 0.041820913553237915, + "learning_rate": 7.037821668051196e-05, + "loss": 0.1961, + "step": 9878 + }, + { + "epoch": 1.990731412452146, + "grad_norm": 0.041626784950494766, + "learning_rate": 7.03660464965293e-05, + "loss": 0.1752, + "step": 9880 + }, + { + "epoch": 1.991134394519444, + "grad_norm": 0.03145499899983406, + "learning_rate": 7.035387486577527e-05, + "loss": 0.1503, + "step": 9882 + }, + { + "epoch": 1.991537376586742, + "grad_norm": 0.03995690122246742, + "learning_rate": 7.03417017891145e-05, + "loss": 0.2092, + "step": 9884 + }, + { + "epoch": 1.9919403586540398, + "grad_norm": 0.0410347580909729, + "learning_rate": 7.032952726741178e-05, + "loss": 0.1642, + "step": 9886 + }, + { + "epoch": 1.9923433407213378, + "grad_norm": 0.04722330719232559, + "learning_rate": 7.031735130153194e-05, + "loss": 0.171, + "step": 9888 + }, + { + "epoch": 1.992746322788636, + "grad_norm": 0.05450482293963432, + "learning_rate": 7.030517389233997e-05, + "loss": 0.1765, + "step": 9890 + }, + { + "epoch": 1.993149304855934, + "grad_norm": 0.043954089283943176, + "learning_rate": 7.029299504070091e-05, + "loss": 0.1902, + "step": 9892 + }, + { + "epoch": 1.993552286923232, + "grad_norm": 0.051854848861694336, + "learning_rate": 7.028081474747996e-05, + "loss": 0.2037, + "step": 9894 + }, + { + "epoch": 1.99395526899053, + "grad_norm": 0.05640044063329697, + "learning_rate": 7.026863301354234e-05, + "loss": 0.1626, + "step": 9896 + }, + { + "epoch": 1.994358251057828, + "grad_norm": 0.04978862777352333, + "learning_rate": 7.025644983975345e-05, + "loss": 0.2004, + "step": 9898 + }, + { + "epoch": 1.9947612331251259, + "grad_norm": 0.06613018363714218, + "learning_rate": 7.024426522697877e-05, + "loss": 0.1726, + "step": 9900 + }, + { + "epoch": 1.9951642151924238, + "grad_norm": 0.043966181576251984, + "learning_rate": 7.023207917608385e-05, + "loss": 0.1715, + "step": 9902 + }, + { + "epoch": 1.995567197259722, + "grad_norm": 0.05377311259508133, + "learning_rate": 7.021989168793439e-05, + "loss": 0.1577, + "step": 9904 + }, + { + "epoch": 1.99597017932702, + "grad_norm": 0.055230725556612015, + "learning_rate": 7.020770276339617e-05, + "loss": 0.215, + "step": 9906 + }, + { + "epoch": 1.996373161394318, + "grad_norm": 0.061077140271663666, + "learning_rate": 7.019551240333504e-05, + "loss": 0.1898, + "step": 9908 + }, + { + "epoch": 1.996776143461616, + "grad_norm": 0.06587786227464676, + "learning_rate": 7.018332060861704e-05, + "loss": 0.215, + "step": 9910 + }, + { + "epoch": 1.997179125528914, + "grad_norm": 0.04662555083632469, + "learning_rate": 7.017112738010819e-05, + "loss": 0.2062, + "step": 9912 + }, + { + "epoch": 1.997582107596212, + "grad_norm": 0.07606983929872513, + "learning_rate": 7.01589327186747e-05, + "loss": 0.2201, + "step": 9914 + }, + { + "epoch": 1.9979850896635099, + "grad_norm": 0.051272887736558914, + "learning_rate": 7.01467366251829e-05, + "loss": 0.1999, + "step": 9916 + }, + { + "epoch": 1.998388071730808, + "grad_norm": 0.059559039771556854, + "learning_rate": 7.013453910049914e-05, + "loss": 0.2025, + "step": 9918 + }, + { + "epoch": 1.998791053798106, + "grad_norm": 0.04579564183950424, + "learning_rate": 7.012234014548993e-05, + "loss": 0.1785, + "step": 9920 + }, + { + "epoch": 1.9991940358654041, + "grad_norm": 0.05059002712368965, + "learning_rate": 7.011013976102185e-05, + "loss": 0.2108, + "step": 9922 + }, + { + "epoch": 1.999597017932702, + "grad_norm": 0.041644349694252014, + "learning_rate": 7.00979379479616e-05, + "loss": 0.2111, + "step": 9924 + }, + { + "epoch": 2.0, + "grad_norm": 0.0676032155752182, + "learning_rate": 7.008573470717599e-05, + "loss": 0.2367, + "step": 9926 + }, + { + "epoch": 2.000402982067298, + "grad_norm": 0.04403241351246834, + "learning_rate": 7.00735300395319e-05, + "loss": 0.2116, + "step": 9928 + }, + { + "epoch": 2.000805964134596, + "grad_norm": 0.05903521552681923, + "learning_rate": 7.006132394589634e-05, + "loss": 0.216, + "step": 9930 + }, + { + "epoch": 2.001208946201894, + "grad_norm": 0.04107736051082611, + "learning_rate": 7.004911642713641e-05, + "loss": 0.1867, + "step": 9932 + }, + { + "epoch": 2.001611928269192, + "grad_norm": 0.039992742240428925, + "learning_rate": 7.003690748411932e-05, + "loss": 0.1964, + "step": 9934 + }, + { + "epoch": 2.00201491033649, + "grad_norm": 0.04343795403838158, + "learning_rate": 7.002469711771236e-05, + "loss": 0.1488, + "step": 9936 + }, + { + "epoch": 2.002417892403788, + "grad_norm": 0.04299810156226158, + "learning_rate": 7.001248532878293e-05, + "loss": 0.1904, + "step": 9938 + }, + { + "epoch": 2.002820874471086, + "grad_norm": 0.02997422404587269, + "learning_rate": 7.000027211819857e-05, + "loss": 0.1424, + "step": 9940 + }, + { + "epoch": 2.003223856538384, + "grad_norm": 0.04063683748245239, + "learning_rate": 6.998805748682686e-05, + "loss": 0.1956, + "step": 9942 + }, + { + "epoch": 2.003626838605682, + "grad_norm": 0.03808411955833435, + "learning_rate": 6.99758414355355e-05, + "loss": 0.1597, + "step": 9944 + }, + { + "epoch": 2.00402982067298, + "grad_norm": 0.04115324094891548, + "learning_rate": 6.996362396519232e-05, + "loss": 0.1725, + "step": 9946 + }, + { + "epoch": 2.0044328027402782, + "grad_norm": 0.054356805980205536, + "learning_rate": 6.995140507666523e-05, + "loss": 0.2477, + "step": 9948 + }, + { + "epoch": 2.004835784807576, + "grad_norm": 0.04506862163543701, + "learning_rate": 6.993918477082221e-05, + "loss": 0.1809, + "step": 9950 + }, + { + "epoch": 2.005238766874874, + "grad_norm": 0.05171862989664078, + "learning_rate": 6.99269630485314e-05, + "loss": 0.184, + "step": 9952 + }, + { + "epoch": 2.005641748942172, + "grad_norm": 0.06944061070680618, + "learning_rate": 6.9914739910661e-05, + "loss": 0.1542, + "step": 9954 + }, + { + "epoch": 2.00604473100947, + "grad_norm": 0.04488156735897064, + "learning_rate": 6.990251535807934e-05, + "loss": 0.1921, + "step": 9956 + }, + { + "epoch": 2.006447713076768, + "grad_norm": 0.031162697821855545, + "learning_rate": 6.98902893916548e-05, + "loss": 0.1407, + "step": 9958 + }, + { + "epoch": 2.006850695144066, + "grad_norm": 0.04962150752544403, + "learning_rate": 6.987806201225592e-05, + "loss": 0.1829, + "step": 9960 + }, + { + "epoch": 2.0072536772113643, + "grad_norm": 0.04459129646420479, + "learning_rate": 6.98658332207513e-05, + "loss": 0.1636, + "step": 9962 + }, + { + "epoch": 2.007656659278662, + "grad_norm": 0.04113384708762169, + "learning_rate": 6.985360301800967e-05, + "loss": 0.1955, + "step": 9964 + }, + { + "epoch": 2.00805964134596, + "grad_norm": 0.07087098807096481, + "learning_rate": 6.984137140489982e-05, + "loss": 0.1973, + "step": 9966 + }, + { + "epoch": 2.008462623413258, + "grad_norm": 0.052531544119119644, + "learning_rate": 6.982913838229068e-05, + "loss": 0.1526, + "step": 9968 + }, + { + "epoch": 2.008865605480556, + "grad_norm": 0.06382293254137039, + "learning_rate": 6.981690395105128e-05, + "loss": 0.2112, + "step": 9970 + }, + { + "epoch": 2.009268587547854, + "grad_norm": 0.05440857633948326, + "learning_rate": 6.98046681120507e-05, + "loss": 0.2131, + "step": 9972 + }, + { + "epoch": 2.009671569615152, + "grad_norm": 0.060303978621959686, + "learning_rate": 6.979243086615818e-05, + "loss": 0.2324, + "step": 9974 + }, + { + "epoch": 2.0100745516824503, + "grad_norm": 0.036824069917201996, + "learning_rate": 6.978019221424302e-05, + "loss": 0.2191, + "step": 9976 + }, + { + "epoch": 2.0104775337497482, + "grad_norm": 0.048221174627542496, + "learning_rate": 6.976795215717462e-05, + "loss": 0.1703, + "step": 9978 + }, + { + "epoch": 2.010880515817046, + "grad_norm": 0.0401717834174633, + "learning_rate": 6.975571069582253e-05, + "loss": 0.1442, + "step": 9980 + }, + { + "epoch": 2.011283497884344, + "grad_norm": 0.05119583383202553, + "learning_rate": 6.974346783105634e-05, + "loss": 0.2227, + "step": 9982 + }, + { + "epoch": 2.011686479951642, + "grad_norm": 0.06163738667964935, + "learning_rate": 6.973122356374578e-05, + "loss": 0.1731, + "step": 9984 + }, + { + "epoch": 2.01208946201894, + "grad_norm": 0.0363716222345829, + "learning_rate": 6.971897789476065e-05, + "loss": 0.1504, + "step": 9986 + }, + { + "epoch": 2.012492444086238, + "grad_norm": 0.047141000628471375, + "learning_rate": 6.970673082497085e-05, + "loss": 0.184, + "step": 9988 + }, + { + "epoch": 2.0128954261535363, + "grad_norm": 0.07161784917116165, + "learning_rate": 6.969448235524643e-05, + "loss": 0.2386, + "step": 9990 + }, + { + "epoch": 2.0132984082208343, + "grad_norm": 0.038642749190330505, + "learning_rate": 6.968223248645748e-05, + "loss": 0.2215, + "step": 9992 + }, + { + "epoch": 2.013701390288132, + "grad_norm": 0.033294469118118286, + "learning_rate": 6.966998121947419e-05, + "loss": 0.1225, + "step": 9994 + }, + { + "epoch": 2.01410437235543, + "grad_norm": 0.0679098516702652, + "learning_rate": 6.965772855516691e-05, + "loss": 0.2254, + "step": 9996 + }, + { + "epoch": 2.014507354422728, + "grad_norm": 0.05759232118725777, + "learning_rate": 6.964547449440602e-05, + "loss": 0.2548, + "step": 9998 + }, + { + "epoch": 2.014910336490026, + "grad_norm": 0.04030608758330345, + "learning_rate": 6.963321903806206e-05, + "loss": 0.132, + "step": 10000 + }, + { + "epoch": 2.015313318557324, + "grad_norm": 0.0534801110625267, + "learning_rate": 6.96209621870056e-05, + "loss": 0.1883, + "step": 10002 + }, + { + "epoch": 2.0157163006246224, + "grad_norm": 0.036523666232824326, + "learning_rate": 6.960870394210737e-05, + "loss": 0.1703, + "step": 10004 + }, + { + "epoch": 2.0161192826919203, + "grad_norm": 0.052369460463523865, + "learning_rate": 6.959644430423818e-05, + "loss": 0.1431, + "step": 10006 + }, + { + "epoch": 2.0165222647592183, + "grad_norm": 0.051367372274398804, + "learning_rate": 6.958418327426889e-05, + "loss": 0.1807, + "step": 10008 + }, + { + "epoch": 2.016925246826516, + "grad_norm": 0.04442469775676727, + "learning_rate": 6.95719208530706e-05, + "loss": 0.1667, + "step": 10010 + }, + { + "epoch": 2.017328228893814, + "grad_norm": 0.05809894576668739, + "learning_rate": 6.95596570415143e-05, + "loss": 0.1925, + "step": 10012 + }, + { + "epoch": 2.017731210961112, + "grad_norm": 0.07553379237651825, + "learning_rate": 6.954739184047127e-05, + "loss": 0.1611, + "step": 10014 + }, + { + "epoch": 2.01813419302841, + "grad_norm": 0.058313459157943726, + "learning_rate": 6.953512525081279e-05, + "loss": 0.1802, + "step": 10016 + }, + { + "epoch": 2.0185371750957084, + "grad_norm": 0.03649749606847763, + "learning_rate": 6.952285727341025e-05, + "loss": 0.1916, + "step": 10018 + }, + { + "epoch": 2.0189401571630063, + "grad_norm": 0.06322020292282104, + "learning_rate": 6.951058790913514e-05, + "loss": 0.241, + "step": 10020 + }, + { + "epoch": 2.0193431392303043, + "grad_norm": 0.06284866482019424, + "learning_rate": 6.949831715885909e-05, + "loss": 0.22, + "step": 10022 + }, + { + "epoch": 2.0197461212976022, + "grad_norm": 0.04919195920228958, + "learning_rate": 6.948604502345375e-05, + "loss": 0.1511, + "step": 10024 + }, + { + "epoch": 2.0201491033649, + "grad_norm": 0.04934461787343025, + "learning_rate": 6.947377150379092e-05, + "loss": 0.1878, + "step": 10026 + }, + { + "epoch": 2.020552085432198, + "grad_norm": 0.04316118359565735, + "learning_rate": 6.946149660074255e-05, + "loss": 0.206, + "step": 10028 + }, + { + "epoch": 2.0209550674994965, + "grad_norm": 0.047829512506723404, + "learning_rate": 6.944922031518055e-05, + "loss": 0.1894, + "step": 10030 + }, + { + "epoch": 2.0213580495667944, + "grad_norm": 0.06630343943834305, + "learning_rate": 6.943694264797707e-05, + "loss": 0.2087, + "step": 10032 + }, + { + "epoch": 2.0217610316340924, + "grad_norm": 0.05703127011656761, + "learning_rate": 6.942466360000426e-05, + "loss": 0.1625, + "step": 10034 + }, + { + "epoch": 2.0221640137013903, + "grad_norm": 0.051246266812086105, + "learning_rate": 6.94123831721344e-05, + "loss": 0.2249, + "step": 10036 + }, + { + "epoch": 2.0225669957686883, + "grad_norm": 0.06704655289649963, + "learning_rate": 6.94001013652399e-05, + "loss": 0.1971, + "step": 10038 + }, + { + "epoch": 2.022969977835986, + "grad_norm": 0.07502034306526184, + "learning_rate": 6.938781818019322e-05, + "loss": 0.2133, + "step": 10040 + }, + { + "epoch": 2.023372959903284, + "grad_norm": 0.06751389801502228, + "learning_rate": 6.937553361786693e-05, + "loss": 0.2083, + "step": 10042 + }, + { + "epoch": 2.0237759419705825, + "grad_norm": 0.056213442236185074, + "learning_rate": 6.936324767913373e-05, + "loss": 0.2121, + "step": 10044 + }, + { + "epoch": 2.0241789240378805, + "grad_norm": 0.05019683390855789, + "learning_rate": 6.935096036486639e-05, + "loss": 0.2064, + "step": 10046 + }, + { + "epoch": 2.0245819061051784, + "grad_norm": 0.05334010347723961, + "learning_rate": 6.933867167593776e-05, + "loss": 0.2139, + "step": 10048 + }, + { + "epoch": 2.0249848881724763, + "grad_norm": 0.07266365736722946, + "learning_rate": 6.932638161322082e-05, + "loss": 0.2378, + "step": 10050 + }, + { + "epoch": 2.0253878702397743, + "grad_norm": 0.05368518829345703, + "learning_rate": 6.931409017758866e-05, + "loss": 0.1434, + "step": 10052 + }, + { + "epoch": 2.0257908523070722, + "grad_norm": 0.06453645974397659, + "learning_rate": 6.93017973699144e-05, + "loss": 0.204, + "step": 10054 + }, + { + "epoch": 2.02619383437437, + "grad_norm": 0.0673045814037323, + "learning_rate": 6.928950319107134e-05, + "loss": 0.2312, + "step": 10056 + }, + { + "epoch": 2.0265968164416686, + "grad_norm": 0.0639619305729866, + "learning_rate": 6.927720764193279e-05, + "loss": 0.1998, + "step": 10058 + }, + { + "epoch": 2.0269997985089665, + "grad_norm": 0.07452709972858429, + "learning_rate": 6.926491072337226e-05, + "loss": 0.251, + "step": 10060 + }, + { + "epoch": 2.0274027805762644, + "grad_norm": 0.04336640611290932, + "learning_rate": 6.92526124362633e-05, + "loss": 0.1963, + "step": 10062 + }, + { + "epoch": 2.0278057626435624, + "grad_norm": 0.041949085891246796, + "learning_rate": 6.924031278147952e-05, + "loss": 0.2065, + "step": 10064 + }, + { + "epoch": 2.0282087447108603, + "grad_norm": 0.041135333478450775, + "learning_rate": 6.922801175989469e-05, + "loss": 0.1776, + "step": 10066 + }, + { + "epoch": 2.0286117267781583, + "grad_norm": 0.048920802772045135, + "learning_rate": 6.921570937238266e-05, + "loss": 0.209, + "step": 10068 + }, + { + "epoch": 2.029014708845456, + "grad_norm": 0.049562010914087296, + "learning_rate": 6.920340561981738e-05, + "loss": 0.1398, + "step": 10070 + }, + { + "epoch": 2.0294176909127546, + "grad_norm": 0.043229840695858, + "learning_rate": 6.919110050307286e-05, + "loss": 0.1732, + "step": 10072 + }, + { + "epoch": 2.0298206729800525, + "grad_norm": 0.04172005504369736, + "learning_rate": 6.917879402302327e-05, + "loss": 0.1695, + "step": 10074 + }, + { + "epoch": 2.0302236550473505, + "grad_norm": 0.05273672565817833, + "learning_rate": 6.91664861805428e-05, + "loss": 0.1919, + "step": 10076 + }, + { + "epoch": 2.0306266371146484, + "grad_norm": 0.04306695610284805, + "learning_rate": 6.915417697650582e-05, + "loss": 0.1703, + "step": 10078 + }, + { + "epoch": 2.0310296191819464, + "grad_norm": 0.04110530763864517, + "learning_rate": 6.914186641178672e-05, + "loss": 0.1616, + "step": 10080 + }, + { + "epoch": 2.0314326012492443, + "grad_norm": 0.05396244302392006, + "learning_rate": 6.912955448726006e-05, + "loss": 0.2022, + "step": 10082 + }, + { + "epoch": 2.0318355833165422, + "grad_norm": 0.07104260474443436, + "learning_rate": 6.911724120380045e-05, + "loss": 0.1944, + "step": 10084 + }, + { + "epoch": 2.0322385653838406, + "grad_norm": 0.0550391860306263, + "learning_rate": 6.910492656228258e-05, + "loss": 0.1653, + "step": 10086 + }, + { + "epoch": 2.0326415474511386, + "grad_norm": 0.0574769526720047, + "learning_rate": 6.90926105635813e-05, + "loss": 0.2217, + "step": 10088 + }, + { + "epoch": 2.0330445295184365, + "grad_norm": 0.043378256261348724, + "learning_rate": 6.908029320857147e-05, + "loss": 0.2421, + "step": 10090 + }, + { + "epoch": 2.0334475115857344, + "grad_norm": 0.047401294112205505, + "learning_rate": 6.906797449812817e-05, + "loss": 0.192, + "step": 10092 + }, + { + "epoch": 2.0338504936530324, + "grad_norm": 0.04810367152094841, + "learning_rate": 6.905565443312642e-05, + "loss": 0.1818, + "step": 10094 + }, + { + "epoch": 2.0342534757203303, + "grad_norm": 0.03586292639374733, + "learning_rate": 6.904333301444146e-05, + "loss": 0.1771, + "step": 10096 + }, + { + "epoch": 2.0346564577876283, + "grad_norm": 0.042285818606615067, + "learning_rate": 6.903101024294858e-05, + "loss": 0.2083, + "step": 10098 + }, + { + "epoch": 2.0350594398549267, + "grad_norm": 0.0737522765994072, + "learning_rate": 6.901868611952317e-05, + "loss": 0.238, + "step": 10100 + }, + { + "epoch": 2.0354624219222246, + "grad_norm": 0.05117824673652649, + "learning_rate": 6.900636064504071e-05, + "loss": 0.1915, + "step": 10102 + }, + { + "epoch": 2.0358654039895225, + "grad_norm": 0.07703465223312378, + "learning_rate": 6.899403382037681e-05, + "loss": 0.1974, + "step": 10104 + }, + { + "epoch": 2.0362683860568205, + "grad_norm": 0.04766885191202164, + "learning_rate": 6.898170564640709e-05, + "loss": 0.1444, + "step": 10106 + }, + { + "epoch": 2.0366713681241184, + "grad_norm": 0.07825704663991928, + "learning_rate": 6.896937612400738e-05, + "loss": 0.1678, + "step": 10108 + }, + { + "epoch": 2.0370743501914164, + "grad_norm": 0.049602579325437546, + "learning_rate": 6.895704525405351e-05, + "loss": 0.1925, + "step": 10110 + }, + { + "epoch": 2.0374773322587143, + "grad_norm": 0.060352474451065063, + "learning_rate": 6.894471303742147e-05, + "loss": 0.1548, + "step": 10112 + }, + { + "epoch": 2.0378803143260127, + "grad_norm": 0.060082901269197464, + "learning_rate": 6.893237947498732e-05, + "loss": 0.1508, + "step": 10114 + }, + { + "epoch": 2.0382832963933106, + "grad_norm": 0.0637107640504837, + "learning_rate": 6.89200445676272e-05, + "loss": 0.2168, + "step": 10116 + }, + { + "epoch": 2.0386862784606086, + "grad_norm": 0.05720100551843643, + "learning_rate": 6.890770831621738e-05, + "loss": 0.1759, + "step": 10118 + }, + { + "epoch": 2.0390892605279065, + "grad_norm": 0.048458032310009, + "learning_rate": 6.88953707216342e-05, + "loss": 0.1957, + "step": 10120 + }, + { + "epoch": 2.0394922425952045, + "grad_norm": 0.06728371232748032, + "learning_rate": 6.888303178475411e-05, + "loss": 0.176, + "step": 10122 + }, + { + "epoch": 2.0398952246625024, + "grad_norm": 0.053860168904066086, + "learning_rate": 6.887069150645362e-05, + "loss": 0.21, + "step": 10124 + }, + { + "epoch": 2.0402982067298003, + "grad_norm": 0.06876907497644424, + "learning_rate": 6.88583498876094e-05, + "loss": 0.197, + "step": 10126 + }, + { + "epoch": 2.0407011887970987, + "grad_norm": 0.05635106936097145, + "learning_rate": 6.884600692909815e-05, + "loss": 0.2058, + "step": 10128 + }, + { + "epoch": 2.0411041708643967, + "grad_norm": 0.04968864470720291, + "learning_rate": 6.88336626317967e-05, + "loss": 0.2126, + "step": 10130 + }, + { + "epoch": 2.0415071529316946, + "grad_norm": 0.06307017803192139, + "learning_rate": 6.8821316996582e-05, + "loss": 0.1865, + "step": 10132 + }, + { + "epoch": 2.0419101349989925, + "grad_norm": 0.05695520341396332, + "learning_rate": 6.880897002433104e-05, + "loss": 0.1708, + "step": 10134 + }, + { + "epoch": 2.0423131170662905, + "grad_norm": 0.056855130940675735, + "learning_rate": 6.879662171592092e-05, + "loss": 0.1786, + "step": 10136 + }, + { + "epoch": 2.0427160991335884, + "grad_norm": 0.05637526512145996, + "learning_rate": 6.878427207222887e-05, + "loss": 0.2307, + "step": 10138 + }, + { + "epoch": 2.0431190812008864, + "grad_norm": 0.07096744328737259, + "learning_rate": 6.877192109413214e-05, + "loss": 0.1843, + "step": 10140 + }, + { + "epoch": 2.0435220632681848, + "grad_norm": 0.06431825459003448, + "learning_rate": 6.875956878250819e-05, + "loss": 0.2259, + "step": 10142 + }, + { + "epoch": 2.0439250453354827, + "grad_norm": 0.055992890149354935, + "learning_rate": 6.874721513823445e-05, + "loss": 0.1992, + "step": 10144 + }, + { + "epoch": 2.0443280274027806, + "grad_norm": 0.05946403741836548, + "learning_rate": 6.873486016218854e-05, + "loss": 0.2288, + "step": 10146 + }, + { + "epoch": 2.0447310094700786, + "grad_norm": 0.07745692878961563, + "learning_rate": 6.872250385524813e-05, + "loss": 0.2044, + "step": 10148 + }, + { + "epoch": 2.0451339915373765, + "grad_norm": 0.05934702232480049, + "learning_rate": 6.871014621829099e-05, + "loss": 0.2129, + "step": 10150 + }, + { + "epoch": 2.0455369736046745, + "grad_norm": 0.06850776076316833, + "learning_rate": 6.869778725219498e-05, + "loss": 0.1735, + "step": 10152 + }, + { + "epoch": 2.0459399556719724, + "grad_norm": 0.06359641253948212, + "learning_rate": 6.868542695783806e-05, + "loss": 0.164, + "step": 10154 + }, + { + "epoch": 2.046342937739271, + "grad_norm": 0.05588820204138756, + "learning_rate": 6.867306533609829e-05, + "loss": 0.1713, + "step": 10156 + }, + { + "epoch": 2.0467459198065687, + "grad_norm": 0.053674276918172836, + "learning_rate": 6.866070238785384e-05, + "loss": 0.1908, + "step": 10158 + }, + { + "epoch": 2.0471489018738667, + "grad_norm": 0.04785943776369095, + "learning_rate": 6.864833811398292e-05, + "loss": 0.2058, + "step": 10160 + }, + { + "epoch": 2.0475518839411646, + "grad_norm": 0.05280066281557083, + "learning_rate": 6.863597251536389e-05, + "loss": 0.2018, + "step": 10162 + }, + { + "epoch": 2.0479548660084625, + "grad_norm": 0.05563594773411751, + "learning_rate": 6.862360559287517e-05, + "loss": 0.2189, + "step": 10164 + }, + { + "epoch": 2.0483578480757605, + "grad_norm": 0.0552031509578228, + "learning_rate": 6.86112373473953e-05, + "loss": 0.1998, + "step": 10166 + }, + { + "epoch": 2.0487608301430584, + "grad_norm": 0.07908007502555847, + "learning_rate": 6.85988677798029e-05, + "loss": 0.1639, + "step": 10168 + }, + { + "epoch": 2.049163812210357, + "grad_norm": 0.05633252486586571, + "learning_rate": 6.858649689097667e-05, + "loss": 0.1182, + "step": 10170 + }, + { + "epoch": 2.0495667942776548, + "grad_norm": 0.04489020258188248, + "learning_rate": 6.857412468179543e-05, + "loss": 0.1718, + "step": 10172 + }, + { + "epoch": 2.0499697763449527, + "grad_norm": 0.049923308193683624, + "learning_rate": 6.856175115313806e-05, + "loss": 0.2262, + "step": 10174 + }, + { + "epoch": 2.0503727584122506, + "grad_norm": 0.0499308817088604, + "learning_rate": 6.854937630588359e-05, + "loss": 0.1507, + "step": 10176 + }, + { + "epoch": 2.0507757404795486, + "grad_norm": 0.047174129635095596, + "learning_rate": 6.853700014091108e-05, + "loss": 0.1446, + "step": 10178 + }, + { + "epoch": 2.0511787225468465, + "grad_norm": 0.03964932635426521, + "learning_rate": 6.852462265909973e-05, + "loss": 0.1218, + "step": 10180 + }, + { + "epoch": 2.0515817046141445, + "grad_norm": 0.06714117527008057, + "learning_rate": 6.851224386132882e-05, + "loss": 0.2253, + "step": 10182 + }, + { + "epoch": 2.051984686681443, + "grad_norm": 0.05636550858616829, + "learning_rate": 6.849986374847773e-05, + "loss": 0.2178, + "step": 10184 + }, + { + "epoch": 2.052387668748741, + "grad_norm": 0.06559525430202484, + "learning_rate": 6.848748232142586e-05, + "loss": 0.2283, + "step": 10186 + }, + { + "epoch": 2.0527906508160387, + "grad_norm": 0.04787338525056839, + "learning_rate": 6.847509958105283e-05, + "loss": 0.1585, + "step": 10188 + }, + { + "epoch": 2.0531936328833367, + "grad_norm": 0.060912150889635086, + "learning_rate": 6.84627155282383e-05, + "loss": 0.1765, + "step": 10190 + }, + { + "epoch": 2.0535966149506346, + "grad_norm": 0.04154639318585396, + "learning_rate": 6.845033016386196e-05, + "loss": 0.1741, + "step": 10192 + }, + { + "epoch": 2.0539995970179326, + "grad_norm": 0.07247091829776764, + "learning_rate": 6.843794348880367e-05, + "loss": 0.2379, + "step": 10194 + }, + { + "epoch": 2.0544025790852305, + "grad_norm": 0.05615399777889252, + "learning_rate": 6.842555550394338e-05, + "loss": 0.2099, + "step": 10196 + }, + { + "epoch": 2.054805561152529, + "grad_norm": 0.056154392659664154, + "learning_rate": 6.841316621016107e-05, + "loss": 0.233, + "step": 10198 + }, + { + "epoch": 2.055208543219827, + "grad_norm": 0.04954691231250763, + "learning_rate": 6.840077560833688e-05, + "loss": 0.2172, + "step": 10200 + }, + { + "epoch": 2.0556115252871248, + "grad_norm": 0.03744814172387123, + "learning_rate": 6.838838369935104e-05, + "loss": 0.1399, + "step": 10202 + }, + { + "epoch": 2.0560145073544227, + "grad_norm": 0.06068943440914154, + "learning_rate": 6.837599048408381e-05, + "loss": 0.1913, + "step": 10204 + }, + { + "epoch": 2.0564174894217206, + "grad_norm": 0.06408362835645676, + "learning_rate": 6.836359596341563e-05, + "loss": 0.2003, + "step": 10206 + }, + { + "epoch": 2.0568204714890186, + "grad_norm": 0.054449256509542465, + "learning_rate": 6.835120013822694e-05, + "loss": 0.1994, + "step": 10208 + }, + { + "epoch": 2.0572234535563165, + "grad_norm": 0.055184461176395416, + "learning_rate": 6.833880300939835e-05, + "loss": 0.1977, + "step": 10210 + }, + { + "epoch": 2.057626435623615, + "grad_norm": 0.04949701577425003, + "learning_rate": 6.832640457781053e-05, + "loss": 0.1376, + "step": 10212 + }, + { + "epoch": 2.058029417690913, + "grad_norm": 0.07206345349550247, + "learning_rate": 6.83140048443442e-05, + "loss": 0.1795, + "step": 10214 + }, + { + "epoch": 2.058432399758211, + "grad_norm": 0.060080841183662415, + "learning_rate": 6.830160380988029e-05, + "loss": 0.2113, + "step": 10216 + }, + { + "epoch": 2.0588353818255087, + "grad_norm": 0.03272469714283943, + "learning_rate": 6.828920147529971e-05, + "loss": 0.1516, + "step": 10218 + }, + { + "epoch": 2.0592383638928067, + "grad_norm": 0.052576545625925064, + "learning_rate": 6.82767978414835e-05, + "loss": 0.2245, + "step": 10220 + }, + { + "epoch": 2.0596413459601046, + "grad_norm": 0.05587625131011009, + "learning_rate": 6.826439290931279e-05, + "loss": 0.2133, + "step": 10222 + }, + { + "epoch": 2.060044328027403, + "grad_norm": 0.033351000398397446, + "learning_rate": 6.825198667966883e-05, + "loss": 0.1667, + "step": 10224 + }, + { + "epoch": 2.060447310094701, + "grad_norm": 0.0870516449213028, + "learning_rate": 6.823957915343293e-05, + "loss": 0.1779, + "step": 10226 + }, + { + "epoch": 2.060850292161999, + "grad_norm": 0.06419102102518082, + "learning_rate": 6.822717033148649e-05, + "loss": 0.2087, + "step": 10228 + }, + { + "epoch": 2.061253274229297, + "grad_norm": 0.055633675307035446, + "learning_rate": 6.821476021471103e-05, + "loss": 0.212, + "step": 10230 + }, + { + "epoch": 2.0616562562965948, + "grad_norm": 0.04498250037431717, + "learning_rate": 6.820234880398813e-05, + "loss": 0.1942, + "step": 10232 + }, + { + "epoch": 2.0620592383638927, + "grad_norm": 0.05661292001605034, + "learning_rate": 6.818993610019947e-05, + "loss": 0.1667, + "step": 10234 + }, + { + "epoch": 2.0624622204311907, + "grad_norm": 0.06287211179733276, + "learning_rate": 6.817752210422686e-05, + "loss": 0.216, + "step": 10236 + }, + { + "epoch": 2.0628652024984886, + "grad_norm": 0.04590194299817085, + "learning_rate": 6.816510681695213e-05, + "loss": 0.1958, + "step": 10238 + }, + { + "epoch": 2.063268184565787, + "grad_norm": 0.03706109896302223, + "learning_rate": 6.815269023925726e-05, + "loss": 0.1317, + "step": 10240 + }, + { + "epoch": 2.063671166633085, + "grad_norm": 0.04809437692165375, + "learning_rate": 6.814027237202433e-05, + "loss": 0.203, + "step": 10242 + }, + { + "epoch": 2.064074148700383, + "grad_norm": 0.05192924663424492, + "learning_rate": 6.812785321613545e-05, + "loss": 0.2064, + "step": 10244 + }, + { + "epoch": 2.064477130767681, + "grad_norm": 0.050232961773872375, + "learning_rate": 6.811543277247285e-05, + "loss": 0.1612, + "step": 10246 + }, + { + "epoch": 2.0648801128349787, + "grad_norm": 0.051082853227853775, + "learning_rate": 6.810301104191891e-05, + "loss": 0.1402, + "step": 10248 + }, + { + "epoch": 2.0652830949022767, + "grad_norm": 0.05042251944541931, + "learning_rate": 6.8090588025356e-05, + "loss": 0.1719, + "step": 10250 + }, + { + "epoch": 2.065686076969575, + "grad_norm": 0.09551627933979034, + "learning_rate": 6.807816372366664e-05, + "loss": 0.2145, + "step": 10252 + }, + { + "epoch": 2.066089059036873, + "grad_norm": 0.07502955198287964, + "learning_rate": 6.806573813773346e-05, + "loss": 0.2287, + "step": 10254 + }, + { + "epoch": 2.066492041104171, + "grad_norm": 0.06268194317817688, + "learning_rate": 6.805331126843912e-05, + "loss": 0.2206, + "step": 10256 + }, + { + "epoch": 2.066895023171469, + "grad_norm": 0.05798924341797829, + "learning_rate": 6.804088311666642e-05, + "loss": 0.2267, + "step": 10258 + }, + { + "epoch": 2.067298005238767, + "grad_norm": 0.06151028349995613, + "learning_rate": 6.802845368329825e-05, + "loss": 0.1738, + "step": 10260 + }, + { + "epoch": 2.0677009873060648, + "grad_norm": 0.051942527294158936, + "learning_rate": 6.801602296921755e-05, + "loss": 0.1722, + "step": 10262 + }, + { + "epoch": 2.0681039693733627, + "grad_norm": 0.048653073608875275, + "learning_rate": 6.800359097530739e-05, + "loss": 0.1447, + "step": 10264 + }, + { + "epoch": 2.068506951440661, + "grad_norm": 0.07658076286315918, + "learning_rate": 6.799115770245093e-05, + "loss": 0.2365, + "step": 10266 + }, + { + "epoch": 2.068909933507959, + "grad_norm": 0.055393122136592865, + "learning_rate": 6.797872315153139e-05, + "loss": 0.2255, + "step": 10268 + }, + { + "epoch": 2.069312915575257, + "grad_norm": 0.07611057907342911, + "learning_rate": 6.796628732343212e-05, + "loss": 0.2032, + "step": 10270 + }, + { + "epoch": 2.069715897642555, + "grad_norm": 0.05497412756085396, + "learning_rate": 6.795385021903652e-05, + "loss": 0.201, + "step": 10272 + }, + { + "epoch": 2.070118879709853, + "grad_norm": 0.0471356138586998, + "learning_rate": 6.79414118392281e-05, + "loss": 0.2179, + "step": 10274 + }, + { + "epoch": 2.070521861777151, + "grad_norm": 0.06860160082578659, + "learning_rate": 6.792897218489051e-05, + "loss": 0.1896, + "step": 10276 + }, + { + "epoch": 2.0709248438444487, + "grad_norm": 0.04450426623225212, + "learning_rate": 6.79165312569074e-05, + "loss": 0.196, + "step": 10278 + }, + { + "epoch": 2.071327825911747, + "grad_norm": 0.06577623635530472, + "learning_rate": 6.790408905616254e-05, + "loss": 0.1959, + "step": 10280 + }, + { + "epoch": 2.071730807979045, + "grad_norm": 0.0391886830329895, + "learning_rate": 6.789164558353985e-05, + "loss": 0.1776, + "step": 10282 + }, + { + "epoch": 2.072133790046343, + "grad_norm": 0.06485151499509811, + "learning_rate": 6.787920083992326e-05, + "loss": 0.1951, + "step": 10284 + }, + { + "epoch": 2.072536772113641, + "grad_norm": 0.06771334260702133, + "learning_rate": 6.786675482619684e-05, + "loss": 0.1854, + "step": 10286 + }, + { + "epoch": 2.072939754180939, + "grad_norm": 0.04598645493388176, + "learning_rate": 6.785430754324473e-05, + "loss": 0.1892, + "step": 10288 + }, + { + "epoch": 2.073342736248237, + "grad_norm": 0.05460431054234505, + "learning_rate": 6.784185899195117e-05, + "loss": 0.1618, + "step": 10290 + }, + { + "epoch": 2.073745718315535, + "grad_norm": 0.0572720542550087, + "learning_rate": 6.782940917320048e-05, + "loss": 0.194, + "step": 10292 + }, + { + "epoch": 2.074148700382833, + "grad_norm": 0.062214624136686325, + "learning_rate": 6.781695808787708e-05, + "loss": 0.1985, + "step": 10294 + }, + { + "epoch": 2.074551682450131, + "grad_norm": 0.0574166476726532, + "learning_rate": 6.780450573686545e-05, + "loss": 0.2226, + "step": 10296 + }, + { + "epoch": 2.074954664517429, + "grad_norm": 0.05803331732749939, + "learning_rate": 6.779205212105022e-05, + "loss": 0.189, + "step": 10298 + }, + { + "epoch": 2.075357646584727, + "grad_norm": 0.050910577178001404, + "learning_rate": 6.777959724131607e-05, + "loss": 0.1377, + "step": 10300 + }, + { + "epoch": 2.075760628652025, + "grad_norm": 0.04367856681346893, + "learning_rate": 6.776714109854777e-05, + "loss": 0.1531, + "step": 10302 + }, + { + "epoch": 2.076163610719323, + "grad_norm": 0.06604331731796265, + "learning_rate": 6.775468369363015e-05, + "loss": 0.2169, + "step": 10304 + }, + { + "epoch": 2.076566592786621, + "grad_norm": 0.07328059524297714, + "learning_rate": 6.774222502744823e-05, + "loss": 0.1856, + "step": 10306 + }, + { + "epoch": 2.076969574853919, + "grad_norm": 0.06949375569820404, + "learning_rate": 6.7729765100887e-05, + "loss": 0.2027, + "step": 10308 + }, + { + "epoch": 2.077372556921217, + "grad_norm": 0.07183837890625, + "learning_rate": 6.77173039148316e-05, + "loss": 0.2086, + "step": 10310 + }, + { + "epoch": 2.077775538988515, + "grad_norm": 0.05574171990156174, + "learning_rate": 6.77048414701673e-05, + "loss": 0.1783, + "step": 10312 + }, + { + "epoch": 2.078178521055813, + "grad_norm": 0.0639929473400116, + "learning_rate": 6.769237776777934e-05, + "loss": 0.1766, + "step": 10314 + }, + { + "epoch": 2.078581503123111, + "grad_norm": 0.06870213896036148, + "learning_rate": 6.767991280855316e-05, + "loss": 0.2004, + "step": 10316 + }, + { + "epoch": 2.078984485190409, + "grad_norm": 0.04897087812423706, + "learning_rate": 6.766744659337429e-05, + "loss": 0.1685, + "step": 10318 + }, + { + "epoch": 2.079387467257707, + "grad_norm": 0.04760335013270378, + "learning_rate": 6.765497912312824e-05, + "loss": 0.1805, + "step": 10320 + }, + { + "epoch": 2.0797904493250052, + "grad_norm": 0.06363385915756226, + "learning_rate": 6.76425103987007e-05, + "loss": 0.212, + "step": 10322 + }, + { + "epoch": 2.080193431392303, + "grad_norm": 0.072906494140625, + "learning_rate": 6.763004042097745e-05, + "loss": 0.18, + "step": 10324 + }, + { + "epoch": 2.080596413459601, + "grad_norm": 0.08023212850093842, + "learning_rate": 6.761756919084432e-05, + "loss": 0.1753, + "step": 10326 + }, + { + "epoch": 2.080999395526899, + "grad_norm": 0.05875491350889206, + "learning_rate": 6.760509670918725e-05, + "loss": 0.1478, + "step": 10328 + }, + { + "epoch": 2.081402377594197, + "grad_norm": 0.06197643280029297, + "learning_rate": 6.759262297689227e-05, + "loss": 0.1899, + "step": 10330 + }, + { + "epoch": 2.081805359661495, + "grad_norm": 0.07489572465419769, + "learning_rate": 6.758014799484548e-05, + "loss": 0.2358, + "step": 10332 + }, + { + "epoch": 2.082208341728793, + "grad_norm": 0.06615934520959854, + "learning_rate": 6.75676717639331e-05, + "loss": 0.2092, + "step": 10334 + }, + { + "epoch": 2.0826113237960913, + "grad_norm": 0.05848393589258194, + "learning_rate": 6.75551942850414e-05, + "loss": 0.182, + "step": 10336 + }, + { + "epoch": 2.083014305863389, + "grad_norm": 0.054525017738342285, + "learning_rate": 6.754271555905678e-05, + "loss": 0.1952, + "step": 10338 + }, + { + "epoch": 2.083417287930687, + "grad_norm": 0.07348795980215073, + "learning_rate": 6.753023558686572e-05, + "loss": 0.2201, + "step": 10340 + }, + { + "epoch": 2.083820269997985, + "grad_norm": 0.06381559371948242, + "learning_rate": 6.751775436935474e-05, + "loss": 0.2075, + "step": 10342 + }, + { + "epoch": 2.084223252065283, + "grad_norm": 0.05462156981229782, + "learning_rate": 6.75052719074105e-05, + "loss": 0.2108, + "step": 10344 + }, + { + "epoch": 2.084626234132581, + "grad_norm": 0.05999904125928879, + "learning_rate": 6.749278820191976e-05, + "loss": 0.1819, + "step": 10346 + }, + { + "epoch": 2.085029216199879, + "grad_norm": 0.06505034118890762, + "learning_rate": 6.74803032537693e-05, + "loss": 0.1821, + "step": 10348 + }, + { + "epoch": 2.0854321982671773, + "grad_norm": 0.05717167258262634, + "learning_rate": 6.746781706384606e-05, + "loss": 0.261, + "step": 10350 + }, + { + "epoch": 2.0858351803344752, + "grad_norm": 0.06013219431042671, + "learning_rate": 6.745532963303705e-05, + "loss": 0.1888, + "step": 10352 + }, + { + "epoch": 2.086238162401773, + "grad_norm": 0.043344974517822266, + "learning_rate": 6.744284096222932e-05, + "loss": 0.1607, + "step": 10354 + }, + { + "epoch": 2.086641144469071, + "grad_norm": 0.0586077980697155, + "learning_rate": 6.743035105231006e-05, + "loss": 0.1978, + "step": 10356 + }, + { + "epoch": 2.087044126536369, + "grad_norm": 0.05550181493163109, + "learning_rate": 6.741785990416654e-05, + "loss": 0.1743, + "step": 10358 + }, + { + "epoch": 2.087447108603667, + "grad_norm": 0.049226608127355576, + "learning_rate": 6.740536751868611e-05, + "loss": 0.1788, + "step": 10360 + }, + { + "epoch": 2.087850090670965, + "grad_norm": 0.05161656439304352, + "learning_rate": 6.739287389675621e-05, + "loss": 0.1803, + "step": 10362 + }, + { + "epoch": 2.0882530727382633, + "grad_norm": 0.0661526471376419, + "learning_rate": 6.738037903926437e-05, + "loss": 0.241, + "step": 10364 + }, + { + "epoch": 2.0886560548055613, + "grad_norm": 0.0597810298204422, + "learning_rate": 6.736788294709819e-05, + "loss": 0.1723, + "step": 10366 + }, + { + "epoch": 2.089059036872859, + "grad_norm": 0.0493599995970726, + "learning_rate": 6.735538562114538e-05, + "loss": 0.1674, + "step": 10368 + }, + { + "epoch": 2.089462018940157, + "grad_norm": 0.04726816341280937, + "learning_rate": 6.734288706229373e-05, + "loss": 0.1667, + "step": 10370 + }, + { + "epoch": 2.089865001007455, + "grad_norm": 0.06035396829247475, + "learning_rate": 6.733038727143113e-05, + "loss": 0.2051, + "step": 10372 + }, + { + "epoch": 2.090267983074753, + "grad_norm": 0.04860403761267662, + "learning_rate": 6.731788624944551e-05, + "loss": 0.1549, + "step": 10374 + }, + { + "epoch": 2.090670965142051, + "grad_norm": 0.05029008165001869, + "learning_rate": 6.730538399722497e-05, + "loss": 0.1953, + "step": 10376 + }, + { + "epoch": 2.0910739472093494, + "grad_norm": 0.053353093564510345, + "learning_rate": 6.729288051565763e-05, + "loss": 0.2012, + "step": 10378 + }, + { + "epoch": 2.0914769292766473, + "grad_norm": 0.07299556583166122, + "learning_rate": 6.72803758056317e-05, + "loss": 0.1612, + "step": 10380 + }, + { + "epoch": 2.0918799113439452, + "grad_norm": 0.0511019267141819, + "learning_rate": 6.726786986803552e-05, + "loss": 0.1902, + "step": 10382 + }, + { + "epoch": 2.092282893411243, + "grad_norm": 0.06305370479822159, + "learning_rate": 6.725536270375747e-05, + "loss": 0.1789, + "step": 10384 + }, + { + "epoch": 2.092685875478541, + "grad_norm": 0.05989459529519081, + "learning_rate": 6.724285431368604e-05, + "loss": 0.2489, + "step": 10386 + }, + { + "epoch": 2.093088857545839, + "grad_norm": 0.07147608697414398, + "learning_rate": 6.723034469870983e-05, + "loss": 0.1998, + "step": 10388 + }, + { + "epoch": 2.093491839613137, + "grad_norm": 0.05758042261004448, + "learning_rate": 6.72178338597175e-05, + "loss": 0.1878, + "step": 10390 + }, + { + "epoch": 2.0938948216804354, + "grad_norm": 0.05327797308564186, + "learning_rate": 6.720532179759777e-05, + "loss": 0.2193, + "step": 10392 + }, + { + "epoch": 2.0942978037477333, + "grad_norm": 0.061337072402238846, + "learning_rate": 6.71928085132395e-05, + "loss": 0.2337, + "step": 10394 + }, + { + "epoch": 2.0947007858150313, + "grad_norm": 0.03611127659678459, + "learning_rate": 6.718029400753161e-05, + "loss": 0.1217, + "step": 10396 + }, + { + "epoch": 2.095103767882329, + "grad_norm": 0.05926065146923065, + "learning_rate": 6.71677782813631e-05, + "loss": 0.2102, + "step": 10398 + }, + { + "epoch": 2.095506749949627, + "grad_norm": 0.04807904362678528, + "learning_rate": 6.71552613356231e-05, + "loss": 0.1536, + "step": 10400 + }, + { + "epoch": 2.095909732016925, + "grad_norm": 0.06720544397830963, + "learning_rate": 6.714274317120076e-05, + "loss": 0.1666, + "step": 10402 + }, + { + "epoch": 2.096312714084223, + "grad_norm": 0.034379951655864716, + "learning_rate": 6.713022378898535e-05, + "loss": 0.121, + "step": 10404 + }, + { + "epoch": 2.0967156961515214, + "grad_norm": 0.0607600137591362, + "learning_rate": 6.711770318986624e-05, + "loss": 0.2098, + "step": 10406 + }, + { + "epoch": 2.0971186782188194, + "grad_norm": 0.051908619701862335, + "learning_rate": 6.710518137473288e-05, + "loss": 0.179, + "step": 10408 + }, + { + "epoch": 2.0975216602861173, + "grad_norm": 0.049980923533439636, + "learning_rate": 6.709265834447479e-05, + "loss": 0.1668, + "step": 10410 + }, + { + "epoch": 2.0979246423534152, + "grad_norm": 0.05012454465031624, + "learning_rate": 6.708013409998158e-05, + "loss": 0.1529, + "step": 10412 + }, + { + "epoch": 2.098327624420713, + "grad_norm": 0.12584945559501648, + "learning_rate": 6.706760864214297e-05, + "loss": 0.1905, + "step": 10414 + }, + { + "epoch": 2.098730606488011, + "grad_norm": 0.05351203680038452, + "learning_rate": 6.705508197184873e-05, + "loss": 0.182, + "step": 10416 + }, + { + "epoch": 2.0991335885553095, + "grad_norm": 0.07033678144216537, + "learning_rate": 6.704255408998873e-05, + "loss": 0.2062, + "step": 10418 + }, + { + "epoch": 2.0995365706226075, + "grad_norm": 0.044345423579216, + "learning_rate": 6.703002499745296e-05, + "loss": 0.1581, + "step": 10420 + }, + { + "epoch": 2.0999395526899054, + "grad_norm": 0.05711236223578453, + "learning_rate": 6.701749469513146e-05, + "loss": 0.1727, + "step": 10422 + }, + { + "epoch": 2.1003425347572033, + "grad_norm": 0.04664299264550209, + "learning_rate": 6.700496318391432e-05, + "loss": 0.1864, + "step": 10424 + }, + { + "epoch": 2.1007455168245013, + "grad_norm": 0.06930460780858994, + "learning_rate": 6.699243046469182e-05, + "loss": 0.2205, + "step": 10426 + }, + { + "epoch": 2.101148498891799, + "grad_norm": 0.056150782853364944, + "learning_rate": 6.697989653835423e-05, + "loss": 0.2011, + "step": 10428 + }, + { + "epoch": 2.101551480959097, + "grad_norm": 0.050900932401418686, + "learning_rate": 6.696736140579193e-05, + "loss": 0.2052, + "step": 10430 + }, + { + "epoch": 2.101954463026395, + "grad_norm": 0.060148272663354874, + "learning_rate": 6.695482506789542e-05, + "loss": 0.1641, + "step": 10432 + }, + { + "epoch": 2.1023574450936935, + "grad_norm": 0.050767235457897186, + "learning_rate": 6.694228752555526e-05, + "loss": 0.1799, + "step": 10434 + }, + { + "epoch": 2.1027604271609914, + "grad_norm": 0.07405157387256622, + "learning_rate": 6.692974877966208e-05, + "loss": 0.2153, + "step": 10436 + }, + { + "epoch": 2.1031634092282894, + "grad_norm": 0.05881273001432419, + "learning_rate": 6.691720883110662e-05, + "loss": 0.177, + "step": 10438 + }, + { + "epoch": 2.1035663912955873, + "grad_norm": 0.07139863073825836, + "learning_rate": 6.69046676807797e-05, + "loss": 0.2424, + "step": 10440 + }, + { + "epoch": 2.1039693733628853, + "grad_norm": 0.05423678830265999, + "learning_rate": 6.689212532957224e-05, + "loss": 0.1913, + "step": 10442 + }, + { + "epoch": 2.104372355430183, + "grad_norm": 0.054789528250694275, + "learning_rate": 6.687958177837518e-05, + "loss": 0.1351, + "step": 10444 + }, + { + "epoch": 2.1047753374974816, + "grad_norm": 0.06818938255310059, + "learning_rate": 6.686703702807965e-05, + "loss": 0.2081, + "step": 10446 + }, + { + "epoch": 2.1051783195647795, + "grad_norm": 0.09108281880617142, + "learning_rate": 6.685449107957678e-05, + "loss": 0.1892, + "step": 10448 + }, + { + "epoch": 2.1055813016320775, + "grad_norm": 0.04925645515322685, + "learning_rate": 6.684194393375781e-05, + "loss": 0.1537, + "step": 10450 + }, + { + "epoch": 2.1059842836993754, + "grad_norm": 0.05035284161567688, + "learning_rate": 6.682939559151409e-05, + "loss": 0.1828, + "step": 10452 + }, + { + "epoch": 2.1063872657666733, + "grad_norm": 0.04909108206629753, + "learning_rate": 6.681684605373702e-05, + "loss": 0.1615, + "step": 10454 + }, + { + "epoch": 2.1067902478339713, + "grad_norm": 0.0470598042011261, + "learning_rate": 6.680429532131809e-05, + "loss": 0.1839, + "step": 10456 + }, + { + "epoch": 2.1071932299012692, + "grad_norm": 0.04207504913210869, + "learning_rate": 6.679174339514891e-05, + "loss": 0.145, + "step": 10458 + }, + { + "epoch": 2.1075962119685676, + "grad_norm": 0.04711586982011795, + "learning_rate": 6.677919027612112e-05, + "loss": 0.1561, + "step": 10460 + }, + { + "epoch": 2.1079991940358656, + "grad_norm": 0.13398267328739166, + "learning_rate": 6.676663596512649e-05, + "loss": 0.1932, + "step": 10462 + }, + { + "epoch": 2.1084021761031635, + "grad_norm": 0.060606952756643295, + "learning_rate": 6.675408046305686e-05, + "loss": 0.1956, + "step": 10464 + }, + { + "epoch": 2.1088051581704614, + "grad_norm": 0.09983116388320923, + "learning_rate": 6.674152377080414e-05, + "loss": 0.2291, + "step": 10466 + }, + { + "epoch": 2.1092081402377594, + "grad_norm": 0.0655071958899498, + "learning_rate": 6.672896588926035e-05, + "loss": 0.2101, + "step": 10468 + }, + { + "epoch": 2.1096111223050573, + "grad_norm": 0.0752595067024231, + "learning_rate": 6.671640681931759e-05, + "loss": 0.2017, + "step": 10470 + }, + { + "epoch": 2.1100141043723553, + "grad_norm": 0.061998266726732254, + "learning_rate": 6.670384656186801e-05, + "loss": 0.1757, + "step": 10472 + }, + { + "epoch": 2.1104170864396536, + "grad_norm": 0.04654568433761597, + "learning_rate": 6.669128511780388e-05, + "loss": 0.1429, + "step": 10474 + }, + { + "epoch": 2.1108200685069516, + "grad_norm": 0.07311231642961502, + "learning_rate": 6.667872248801756e-05, + "loss": 0.1845, + "step": 10476 + }, + { + "epoch": 2.1112230505742495, + "grad_norm": 0.059998735785484314, + "learning_rate": 6.666615867340146e-05, + "loss": 0.1876, + "step": 10478 + }, + { + "epoch": 2.1116260326415475, + "grad_norm": 0.046271517872810364, + "learning_rate": 6.665359367484812e-05, + "loss": 0.1738, + "step": 10480 + }, + { + "epoch": 2.1120290147088454, + "grad_norm": 0.09801926463842392, + "learning_rate": 6.66410274932501e-05, + "loss": 0.2305, + "step": 10482 + }, + { + "epoch": 2.1124319967761434, + "grad_norm": 0.06510908156633377, + "learning_rate": 6.66284601295001e-05, + "loss": 0.1822, + "step": 10484 + }, + { + "epoch": 2.1128349788434413, + "grad_norm": 0.07298275083303452, + "learning_rate": 6.661589158449089e-05, + "loss": 0.2025, + "step": 10486 + }, + { + "epoch": 2.1132379609107397, + "grad_norm": 0.06558576226234436, + "learning_rate": 6.660332185911531e-05, + "loss": 0.2144, + "step": 10488 + }, + { + "epoch": 2.1136409429780376, + "grad_norm": 0.08517705649137497, + "learning_rate": 6.65907509542663e-05, + "loss": 0.1633, + "step": 10490 + }, + { + "epoch": 2.1140439250453356, + "grad_norm": 0.06583531200885773, + "learning_rate": 6.657817887083688e-05, + "loss": 0.1851, + "step": 10492 + }, + { + "epoch": 2.1144469071126335, + "grad_norm": 0.051145102828741074, + "learning_rate": 6.656560560972014e-05, + "loss": 0.1252, + "step": 10494 + }, + { + "epoch": 2.1148498891799314, + "grad_norm": 0.05931749939918518, + "learning_rate": 6.655303117180927e-05, + "loss": 0.1682, + "step": 10496 + }, + { + "epoch": 2.1152528712472294, + "grad_norm": 0.057506263256073, + "learning_rate": 6.654045555799754e-05, + "loss": 0.2283, + "step": 10498 + }, + { + "epoch": 2.1156558533145273, + "grad_norm": 0.0503305122256279, + "learning_rate": 6.652787876917831e-05, + "loss": 0.1649, + "step": 10500 + }, + { + "epoch": 2.1160588353818257, + "grad_norm": 0.05089758709073067, + "learning_rate": 6.6515300806245e-05, + "loss": 0.201, + "step": 10502 + }, + { + "epoch": 2.1164618174491237, + "grad_norm": 0.0781029537320137, + "learning_rate": 6.650272167009113e-05, + "loss": 0.2012, + "step": 10504 + }, + { + "epoch": 2.1168647995164216, + "grad_norm": 0.050134409219026566, + "learning_rate": 6.64901413616103e-05, + "loss": 0.1465, + "step": 10506 + }, + { + "epoch": 2.1172677815837195, + "grad_norm": 0.06083032488822937, + "learning_rate": 6.64775598816962e-05, + "loss": 0.1987, + "step": 10508 + }, + { + "epoch": 2.1176707636510175, + "grad_norm": 0.2716810703277588, + "learning_rate": 6.646497723124262e-05, + "loss": 0.2102, + "step": 10510 + }, + { + "epoch": 2.1180737457183154, + "grad_norm": 0.0632266253232956, + "learning_rate": 6.645239341114335e-05, + "loss": 0.1823, + "step": 10512 + }, + { + "epoch": 2.1184767277856134, + "grad_norm": 0.0860210657119751, + "learning_rate": 6.64398084222924e-05, + "loss": 0.2009, + "step": 10514 + }, + { + "epoch": 2.1188797098529117, + "grad_norm": 0.07961370795965195, + "learning_rate": 6.642722226558374e-05, + "loss": 0.1576, + "step": 10516 + }, + { + "epoch": 2.1192826919202097, + "grad_norm": 0.054681435227394104, + "learning_rate": 6.641463494191146e-05, + "loss": 0.2364, + "step": 10518 + }, + { + "epoch": 2.1196856739875076, + "grad_norm": 0.052845437079668045, + "learning_rate": 6.640204645216978e-05, + "loss": 0.1589, + "step": 10520 + }, + { + "epoch": 2.1200886560548056, + "grad_norm": 0.5749832987785339, + "learning_rate": 6.638945679725295e-05, + "loss": 0.1696, + "step": 10522 + }, + { + "epoch": 2.1204916381221035, + "grad_norm": 0.06033914163708687, + "learning_rate": 6.637686597805533e-05, + "loss": 0.2144, + "step": 10524 + }, + { + "epoch": 2.1208946201894014, + "grad_norm": 0.053227026015520096, + "learning_rate": 6.636427399547133e-05, + "loss": 0.1954, + "step": 10526 + }, + { + "epoch": 2.1212976022566994, + "grad_norm": 0.04543086886405945, + "learning_rate": 6.635168085039549e-05, + "loss": 0.1772, + "step": 10528 + }, + { + "epoch": 2.1217005843239978, + "grad_norm": 0.05598034709692001, + "learning_rate": 6.633908654372239e-05, + "loss": 0.2335, + "step": 10530 + }, + { + "epoch": 2.1221035663912957, + "grad_norm": 0.061585236340761185, + "learning_rate": 6.63264910763467e-05, + "loss": 0.2383, + "step": 10532 + }, + { + "epoch": 2.1225065484585937, + "grad_norm": 0.0683068037033081, + "learning_rate": 6.63138944491632e-05, + "loss": 0.1791, + "step": 10534 + }, + { + "epoch": 2.1229095305258916, + "grad_norm": 0.053205978125333786, + "learning_rate": 6.630129666306674e-05, + "loss": 0.1677, + "step": 10536 + }, + { + "epoch": 2.1233125125931895, + "grad_norm": 0.06105554848909378, + "learning_rate": 6.628869771895223e-05, + "loss": 0.2059, + "step": 10538 + }, + { + "epoch": 2.1237154946604875, + "grad_norm": 0.053694818168878555, + "learning_rate": 6.627609761771467e-05, + "loss": 0.1753, + "step": 10540 + }, + { + "epoch": 2.1241184767277854, + "grad_norm": 0.04918495565652847, + "learning_rate": 6.626349636024918e-05, + "loss": 0.1985, + "step": 10542 + }, + { + "epoch": 2.124521458795084, + "grad_norm": 0.04894405975937843, + "learning_rate": 6.625089394745092e-05, + "loss": 0.1918, + "step": 10544 + }, + { + "epoch": 2.1249244408623817, + "grad_norm": 0.04774921387434006, + "learning_rate": 6.623829038021512e-05, + "loss": 0.1785, + "step": 10546 + }, + { + "epoch": 2.1253274229296797, + "grad_norm": 0.06385761499404907, + "learning_rate": 6.622568565943717e-05, + "loss": 0.1786, + "step": 10548 + }, + { + "epoch": 2.1257304049969776, + "grad_norm": 0.05672699585556984, + "learning_rate": 6.621307978601246e-05, + "loss": 0.174, + "step": 10550 + }, + { + "epoch": 2.1261333870642756, + "grad_norm": 0.04841500148177147, + "learning_rate": 6.620047276083646e-05, + "loss": 0.1782, + "step": 10552 + }, + { + "epoch": 2.1265363691315735, + "grad_norm": 0.050164107233285904, + "learning_rate": 6.61878645848048e-05, + "loss": 0.1399, + "step": 10554 + }, + { + "epoch": 2.1269393511988715, + "grad_norm": 0.05736628547310829, + "learning_rate": 6.617525525881315e-05, + "loss": 0.2191, + "step": 10556 + }, + { + "epoch": 2.12734233326617, + "grad_norm": 0.055756233632564545, + "learning_rate": 6.61626447837572e-05, + "loss": 0.1473, + "step": 10558 + }, + { + "epoch": 2.127745315333468, + "grad_norm": 0.06790580600500107, + "learning_rate": 6.615003316053283e-05, + "loss": 0.2168, + "step": 10560 + }, + { + "epoch": 2.1281482974007657, + "grad_norm": 0.05667269602417946, + "learning_rate": 6.613742039003594e-05, + "loss": 0.1785, + "step": 10562 + }, + { + "epoch": 2.1285512794680637, + "grad_norm": 0.043748367577791214, + "learning_rate": 6.612480647316251e-05, + "loss": 0.1818, + "step": 10564 + }, + { + "epoch": 2.1289542615353616, + "grad_norm": 0.05138718709349632, + "learning_rate": 6.61121914108086e-05, + "loss": 0.1582, + "step": 10566 + }, + { + "epoch": 2.1293572436026595, + "grad_norm": 0.043073199689388275, + "learning_rate": 6.609957520387039e-05, + "loss": 0.1383, + "step": 10568 + }, + { + "epoch": 2.1297602256699575, + "grad_norm": 0.07397390902042389, + "learning_rate": 6.60869578532441e-05, + "loss": 0.1539, + "step": 10570 + }, + { + "epoch": 2.130163207737256, + "grad_norm": 0.07981264591217041, + "learning_rate": 6.607433935982607e-05, + "loss": 0.2419, + "step": 10572 + }, + { + "epoch": 2.130566189804554, + "grad_norm": 0.041248392313718796, + "learning_rate": 6.606171972451266e-05, + "loss": 0.1984, + "step": 10574 + }, + { + "epoch": 2.1309691718718518, + "grad_norm": 0.055371470749378204, + "learning_rate": 6.604909894820037e-05, + "loss": 0.1927, + "step": 10576 + }, + { + "epoch": 2.1313721539391497, + "grad_norm": 0.08325458317995071, + "learning_rate": 6.603647703178577e-05, + "loss": 0.2336, + "step": 10578 + }, + { + "epoch": 2.1317751360064476, + "grad_norm": 0.053513310849666595, + "learning_rate": 6.602385397616547e-05, + "loss": 0.1879, + "step": 10580 + }, + { + "epoch": 2.1321781180737456, + "grad_norm": 0.05937230959534645, + "learning_rate": 6.601122978223622e-05, + "loss": 0.2393, + "step": 10582 + }, + { + "epoch": 2.1325811001410435, + "grad_norm": 0.05260048806667328, + "learning_rate": 6.599860445089481e-05, + "loss": 0.1531, + "step": 10584 + }, + { + "epoch": 2.132984082208342, + "grad_norm": 0.06879477202892303, + "learning_rate": 6.598597798303813e-05, + "loss": 0.1872, + "step": 10586 + }, + { + "epoch": 2.13338706427564, + "grad_norm": 0.11585239320993423, + "learning_rate": 6.597335037956313e-05, + "loss": 0.2035, + "step": 10588 + }, + { + "epoch": 2.133790046342938, + "grad_norm": 0.04980898275971413, + "learning_rate": 6.596072164136689e-05, + "loss": 0.17, + "step": 10590 + }, + { + "epoch": 2.1341930284102357, + "grad_norm": 0.05175916105508804, + "learning_rate": 6.594809176934649e-05, + "loss": 0.2088, + "step": 10592 + }, + { + "epoch": 2.1345960104775337, + "grad_norm": 0.0495474636554718, + "learning_rate": 6.593546076439915e-05, + "loss": 0.1725, + "step": 10594 + }, + { + "epoch": 2.1349989925448316, + "grad_norm": 0.07358019053936005, + "learning_rate": 6.592282862742217e-05, + "loss": 0.218, + "step": 10596 + }, + { + "epoch": 2.1354019746121295, + "grad_norm": 0.05674292892217636, + "learning_rate": 6.591019535931291e-05, + "loss": 0.2143, + "step": 10598 + }, + { + "epoch": 2.135804956679428, + "grad_norm": 0.054214123636484146, + "learning_rate": 6.589756096096881e-05, + "loss": 0.1864, + "step": 10600 + }, + { + "epoch": 2.136207938746726, + "grad_norm": 0.06335791945457458, + "learning_rate": 6.588492543328741e-05, + "loss": 0.2023, + "step": 10602 + }, + { + "epoch": 2.136610920814024, + "grad_norm": 0.05495510995388031, + "learning_rate": 6.587228877716632e-05, + "loss": 0.1576, + "step": 10604 + }, + { + "epoch": 2.1370139028813218, + "grad_norm": 0.05250907689332962, + "learning_rate": 6.58596509935032e-05, + "loss": 0.1167, + "step": 10606 + }, + { + "epoch": 2.1374168849486197, + "grad_norm": 0.05653761699795723, + "learning_rate": 6.584701208319586e-05, + "loss": 0.1657, + "step": 10608 + }, + { + "epoch": 2.1378198670159176, + "grad_norm": 0.046690188348293304, + "learning_rate": 6.58343720471421e-05, + "loss": 0.2224, + "step": 10610 + }, + { + "epoch": 2.138222849083216, + "grad_norm": 0.06347212195396423, + "learning_rate": 6.582173088623988e-05, + "loss": 0.2408, + "step": 10612 + }, + { + "epoch": 2.138625831150514, + "grad_norm": 0.08230816572904587, + "learning_rate": 6.580908860138722e-05, + "loss": 0.2525, + "step": 10614 + }, + { + "epoch": 2.139028813217812, + "grad_norm": 0.079873226583004, + "learning_rate": 6.579644519348217e-05, + "loss": 0.1941, + "step": 10616 + }, + { + "epoch": 2.13943179528511, + "grad_norm": 0.0678006038069725, + "learning_rate": 6.578380066342291e-05, + "loss": 0.1839, + "step": 10618 + }, + { + "epoch": 2.139834777352408, + "grad_norm": 0.055864691734313965, + "learning_rate": 6.577115501210771e-05, + "loss": 0.2021, + "step": 10620 + }, + { + "epoch": 2.1402377594197057, + "grad_norm": 0.08479974418878555, + "learning_rate": 6.575850824043488e-05, + "loss": 0.233, + "step": 10622 + }, + { + "epoch": 2.1406407414870037, + "grad_norm": 0.08520139008760452, + "learning_rate": 6.574586034930282e-05, + "loss": 0.1911, + "step": 10624 + }, + { + "epoch": 2.1410437235543016, + "grad_norm": 0.06425566226243973, + "learning_rate": 6.573321133961003e-05, + "loss": 0.1932, + "step": 10626 + }, + { + "epoch": 2.1414467056216, + "grad_norm": 0.04391736537218094, + "learning_rate": 6.572056121225505e-05, + "loss": 0.1764, + "step": 10628 + }, + { + "epoch": 2.141849687688898, + "grad_norm": 0.052960388362407684, + "learning_rate": 6.570790996813655e-05, + "loss": 0.1735, + "step": 10630 + }, + { + "epoch": 2.142252669756196, + "grad_norm": 0.06271100789308548, + "learning_rate": 6.569525760815326e-05, + "loss": 0.1523, + "step": 10632 + }, + { + "epoch": 2.142655651823494, + "grad_norm": 0.061068955808877945, + "learning_rate": 6.568260413320397e-05, + "loss": 0.184, + "step": 10634 + }, + { + "epoch": 2.1430586338907918, + "grad_norm": 0.04499977454543114, + "learning_rate": 6.566994954418755e-05, + "loss": 0.1898, + "step": 10636 + }, + { + "epoch": 2.1434616159580897, + "grad_norm": 0.06739667057991028, + "learning_rate": 6.565729384200297e-05, + "loss": 0.203, + "step": 10638 + }, + { + "epoch": 2.143864598025388, + "grad_norm": 0.041490063071250916, + "learning_rate": 6.564463702754929e-05, + "loss": 0.1897, + "step": 10640 + }, + { + "epoch": 2.144267580092686, + "grad_norm": 0.04645445942878723, + "learning_rate": 6.563197910172562e-05, + "loss": 0.1341, + "step": 10642 + }, + { + "epoch": 2.144670562159984, + "grad_norm": 0.043481625616550446, + "learning_rate": 6.561932006543115e-05, + "loss": 0.1423, + "step": 10644 + }, + { + "epoch": 2.145073544227282, + "grad_norm": 0.06526371836662292, + "learning_rate": 6.560665991956514e-05, + "loss": 0.1584, + "step": 10646 + }, + { + "epoch": 2.14547652629458, + "grad_norm": 0.0430811382830143, + "learning_rate": 6.5593998665027e-05, + "loss": 0.1757, + "step": 10648 + }, + { + "epoch": 2.145879508361878, + "grad_norm": 0.08176197856664658, + "learning_rate": 6.558133630271611e-05, + "loss": 0.1898, + "step": 10650 + }, + { + "epoch": 2.1462824904291757, + "grad_norm": 0.05684948340058327, + "learning_rate": 6.5568672833532e-05, + "loss": 0.1899, + "step": 10652 + }, + { + "epoch": 2.1466854724964737, + "grad_norm": 0.04240806773304939, + "learning_rate": 6.555600825837431e-05, + "loss": 0.1811, + "step": 10654 + }, + { + "epoch": 2.147088454563772, + "grad_norm": 0.07894350588321686, + "learning_rate": 6.554334257814264e-05, + "loss": 0.204, + "step": 10656 + }, + { + "epoch": 2.14749143663107, + "grad_norm": 0.06273147463798523, + "learning_rate": 6.553067579373677e-05, + "loss": 0.2152, + "step": 10658 + }, + { + "epoch": 2.147894418698368, + "grad_norm": 0.06141069903969765, + "learning_rate": 6.551800790605655e-05, + "loss": 0.1701, + "step": 10660 + }, + { + "epoch": 2.148297400765666, + "grad_norm": 0.04876910522580147, + "learning_rate": 6.550533891600186e-05, + "loss": 0.1384, + "step": 10662 + }, + { + "epoch": 2.148700382832964, + "grad_norm": 0.0704379603266716, + "learning_rate": 6.549266882447268e-05, + "loss": 0.1589, + "step": 10664 + }, + { + "epoch": 2.1491033649002618, + "grad_norm": 0.05774037539958954, + "learning_rate": 6.547999763236909e-05, + "loss": 0.1674, + "step": 10666 + }, + { + "epoch": 2.14950634696756, + "grad_norm": 0.05031192675232887, + "learning_rate": 6.546732534059122e-05, + "loss": 0.2287, + "step": 10668 + }, + { + "epoch": 2.149909329034858, + "grad_norm": 0.06176503375172615, + "learning_rate": 6.54546519500393e-05, + "loss": 0.1826, + "step": 10670 + }, + { + "epoch": 2.150312311102156, + "grad_norm": 0.05960913375020027, + "learning_rate": 6.544197746161363e-05, + "loss": 0.2152, + "step": 10672 + }, + { + "epoch": 2.150715293169454, + "grad_norm": 0.07033538818359375, + "learning_rate": 6.542930187621455e-05, + "loss": 0.1944, + "step": 10674 + }, + { + "epoch": 2.151118275236752, + "grad_norm": 0.05268462374806404, + "learning_rate": 6.541662519474256e-05, + "loss": 0.2345, + "step": 10676 + }, + { + "epoch": 2.15152125730405, + "grad_norm": 0.05274904519319534, + "learning_rate": 6.540394741809818e-05, + "loss": 0.1759, + "step": 10678 + }, + { + "epoch": 2.151924239371348, + "grad_norm": 0.06389094144105911, + "learning_rate": 6.539126854718198e-05, + "loss": 0.1585, + "step": 10680 + }, + { + "epoch": 2.152327221438646, + "grad_norm": 0.07119549810886383, + "learning_rate": 6.53785885828947e-05, + "loss": 0.2174, + "step": 10682 + }, + { + "epoch": 2.152730203505944, + "grad_norm": 0.0506424643099308, + "learning_rate": 6.536590752613708e-05, + "loss": 0.1525, + "step": 10684 + }, + { + "epoch": 2.153133185573242, + "grad_norm": 0.06556964665651321, + "learning_rate": 6.535322537780997e-05, + "loss": 0.1803, + "step": 10686 + }, + { + "epoch": 2.15353616764054, + "grad_norm": 0.0708077996969223, + "learning_rate": 6.534054213881426e-05, + "loss": 0.2223, + "step": 10688 + }, + { + "epoch": 2.153939149707838, + "grad_norm": 0.05754520371556282, + "learning_rate": 6.5327857810051e-05, + "loss": 0.2328, + "step": 10690 + }, + { + "epoch": 2.154342131775136, + "grad_norm": 0.0400908887386322, + "learning_rate": 6.531517239242121e-05, + "loss": 0.1488, + "step": 10692 + }, + { + "epoch": 2.154745113842434, + "grad_norm": 0.0965040922164917, + "learning_rate": 6.530248588682607e-05, + "loss": 0.1879, + "step": 10694 + }, + { + "epoch": 2.155148095909732, + "grad_norm": 0.05838022381067276, + "learning_rate": 6.528979829416682e-05, + "loss": 0.2306, + "step": 10696 + }, + { + "epoch": 2.15555107797703, + "grad_norm": 0.05278877168893814, + "learning_rate": 6.527710961534473e-05, + "loss": 0.2185, + "step": 10698 + }, + { + "epoch": 2.155954060044328, + "grad_norm": 0.0592544861137867, + "learning_rate": 6.526441985126121e-05, + "loss": 0.18, + "step": 10700 + }, + { + "epoch": 2.156357042111626, + "grad_norm": 0.17698846757411957, + "learning_rate": 6.525172900281774e-05, + "loss": 0.1923, + "step": 10702 + }, + { + "epoch": 2.156760024178924, + "grad_norm": 0.05410192534327507, + "learning_rate": 6.523903707091581e-05, + "loss": 0.2426, + "step": 10704 + }, + { + "epoch": 2.157163006246222, + "grad_norm": 0.049191661179065704, + "learning_rate": 6.522634405645705e-05, + "loss": 0.16, + "step": 10706 + }, + { + "epoch": 2.15756598831352, + "grad_norm": 0.049240659922361374, + "learning_rate": 6.521364996034318e-05, + "loss": 0.1309, + "step": 10708 + }, + { + "epoch": 2.1579689703808183, + "grad_norm": 0.04705396294593811, + "learning_rate": 6.520095478347594e-05, + "loss": 0.1748, + "step": 10710 + }, + { + "epoch": 2.158371952448116, + "grad_norm": 0.0652155801653862, + "learning_rate": 6.518825852675719e-05, + "loss": 0.206, + "step": 10712 + }, + { + "epoch": 2.158774934515414, + "grad_norm": 0.06381936371326447, + "learning_rate": 6.517556119108882e-05, + "loss": 0.2326, + "step": 10714 + }, + { + "epoch": 2.159177916582712, + "grad_norm": 0.05069877207279205, + "learning_rate": 6.516286277737287e-05, + "loss": 0.2082, + "step": 10716 + }, + { + "epoch": 2.15958089865001, + "grad_norm": 0.05980228632688522, + "learning_rate": 6.51501632865114e-05, + "loss": 0.2605, + "step": 10718 + }, + { + "epoch": 2.159983880717308, + "grad_norm": 0.054766424000263214, + "learning_rate": 6.513746271940656e-05, + "loss": 0.1951, + "step": 10720 + }, + { + "epoch": 2.160386862784606, + "grad_norm": 0.07653187215328217, + "learning_rate": 6.512476107696058e-05, + "loss": 0.2033, + "step": 10722 + }, + { + "epoch": 2.1607898448519043, + "grad_norm": 0.06018948182463646, + "learning_rate": 6.511205836007575e-05, + "loss": 0.186, + "step": 10724 + }, + { + "epoch": 2.1611928269192022, + "grad_norm": 0.06138040870428085, + "learning_rate": 6.509935456965446e-05, + "loss": 0.1764, + "step": 10726 + }, + { + "epoch": 2.1615958089865, + "grad_norm": 0.055142004042863846, + "learning_rate": 6.508664970659917e-05, + "loss": 0.179, + "step": 10728 + }, + { + "epoch": 2.161998791053798, + "grad_norm": 0.07405485212802887, + "learning_rate": 6.507394377181243e-05, + "loss": 0.2281, + "step": 10730 + }, + { + "epoch": 2.162401773121096, + "grad_norm": 0.06127721071243286, + "learning_rate": 6.506123676619682e-05, + "loss": 0.2391, + "step": 10732 + }, + { + "epoch": 2.162804755188394, + "grad_norm": 0.178990438580513, + "learning_rate": 6.504852869065503e-05, + "loss": 0.1965, + "step": 10734 + }, + { + "epoch": 2.163207737255692, + "grad_norm": 0.05538182333111763, + "learning_rate": 6.503581954608984e-05, + "loss": 0.1992, + "step": 10736 + }, + { + "epoch": 2.1636107193229903, + "grad_norm": 0.05879183113574982, + "learning_rate": 6.502310933340407e-05, + "loss": 0.1626, + "step": 10738 + }, + { + "epoch": 2.1640137013902883, + "grad_norm": 0.06185007095336914, + "learning_rate": 6.501039805350063e-05, + "loss": 0.1972, + "step": 10740 + }, + { + "epoch": 2.164416683457586, + "grad_norm": 0.06894201785326004, + "learning_rate": 6.499768570728254e-05, + "loss": 0.2017, + "step": 10742 + }, + { + "epoch": 2.164819665524884, + "grad_norm": 0.07132090628147125, + "learning_rate": 6.498497229565283e-05, + "loss": 0.2323, + "step": 10744 + }, + { + "epoch": 2.165222647592182, + "grad_norm": 0.06672022491693497, + "learning_rate": 6.497225781951465e-05, + "loss": 0.2459, + "step": 10746 + }, + { + "epoch": 2.16562562965948, + "grad_norm": 0.04566428065299988, + "learning_rate": 6.495954227977123e-05, + "loss": 0.1642, + "step": 10748 + }, + { + "epoch": 2.166028611726778, + "grad_norm": 0.077194444835186, + "learning_rate": 6.494682567732584e-05, + "loss": 0.2101, + "step": 10750 + }, + { + "epoch": 2.1664315937940763, + "grad_norm": 0.07681397348642349, + "learning_rate": 6.493410801308185e-05, + "loss": 0.1838, + "step": 10752 + }, + { + "epoch": 2.1668345758613743, + "grad_norm": 0.08021048456430435, + "learning_rate": 6.492138928794274e-05, + "loss": 0.2141, + "step": 10754 + }, + { + "epoch": 2.1672375579286722, + "grad_norm": 0.0956893116235733, + "learning_rate": 6.490866950281196e-05, + "loss": 0.1837, + "step": 10756 + }, + { + "epoch": 2.16764053999597, + "grad_norm": 0.06688991189002991, + "learning_rate": 6.489594865859316e-05, + "loss": 0.1923, + "step": 10758 + }, + { + "epoch": 2.168043522063268, + "grad_norm": 0.06143457069993019, + "learning_rate": 6.488322675619e-05, + "loss": 0.1785, + "step": 10760 + }, + { + "epoch": 2.168446504130566, + "grad_norm": 0.06079430878162384, + "learning_rate": 6.487050379650622e-05, + "loss": 0.1553, + "step": 10762 + }, + { + "epoch": 2.168849486197864, + "grad_norm": 0.03899361938238144, + "learning_rate": 6.48577797804456e-05, + "loss": 0.1434, + "step": 10764 + }, + { + "epoch": 2.1692524682651624, + "grad_norm": 0.05817665159702301, + "learning_rate": 6.484505470891209e-05, + "loss": 0.1804, + "step": 10766 + }, + { + "epoch": 2.1696554503324603, + "grad_norm": 0.06685777008533478, + "learning_rate": 6.483232858280962e-05, + "loss": 0.195, + "step": 10768 + }, + { + "epoch": 2.1700584323997583, + "grad_norm": 0.05597671493887901, + "learning_rate": 6.481960140304225e-05, + "loss": 0.1987, + "step": 10770 + }, + { + "epoch": 2.170461414467056, + "grad_norm": 0.059224747121334076, + "learning_rate": 6.48068731705141e-05, + "loss": 0.1825, + "step": 10772 + }, + { + "epoch": 2.170864396534354, + "grad_norm": 0.07822896540164948, + "learning_rate": 6.479414388612936e-05, + "loss": 0.1742, + "step": 10774 + }, + { + "epoch": 2.171267378601652, + "grad_norm": 0.070325568318367, + "learning_rate": 6.47814135507923e-05, + "loss": 0.2235, + "step": 10776 + }, + { + "epoch": 2.17167036066895, + "grad_norm": 0.0596688948571682, + "learning_rate": 6.476868216540728e-05, + "loss": 0.1769, + "step": 10778 + }, + { + "epoch": 2.1720733427362484, + "grad_norm": 0.04775102809071541, + "learning_rate": 6.475594973087866e-05, + "loss": 0.1415, + "step": 10780 + }, + { + "epoch": 2.1724763248035464, + "grad_norm": 0.06646701693534851, + "learning_rate": 6.474321624811101e-05, + "loss": 0.1886, + "step": 10782 + }, + { + "epoch": 2.1728793068708443, + "grad_norm": 0.05909574404358864, + "learning_rate": 6.473048171800882e-05, + "loss": 0.2096, + "step": 10784 + }, + { + "epoch": 2.1732822889381422, + "grad_norm": 0.10706569254398346, + "learning_rate": 6.471774614147678e-05, + "loss": 0.214, + "step": 10786 + }, + { + "epoch": 2.17368527100544, + "grad_norm": 0.06290362775325775, + "learning_rate": 6.47050095194196e-05, + "loss": 0.2276, + "step": 10788 + }, + { + "epoch": 2.174088253072738, + "grad_norm": 0.06873378157615662, + "learning_rate": 6.469227185274204e-05, + "loss": 0.1938, + "step": 10790 + }, + { + "epoch": 2.174491235140036, + "grad_norm": 0.14236712455749512, + "learning_rate": 6.4679533142349e-05, + "loss": 0.1875, + "step": 10792 + }, + { + "epoch": 2.1748942172073344, + "grad_norm": 0.05300990492105484, + "learning_rate": 6.466679338914542e-05, + "loss": 0.1466, + "step": 10794 + }, + { + "epoch": 2.1752971992746324, + "grad_norm": 0.06521982699632645, + "learning_rate": 6.465405259403626e-05, + "loss": 0.2241, + "step": 10796 + }, + { + "epoch": 2.1757001813419303, + "grad_norm": 0.08661910891532898, + "learning_rate": 6.464131075792665e-05, + "loss": 0.2403, + "step": 10798 + }, + { + "epoch": 2.1761031634092283, + "grad_norm": 0.05931783467531204, + "learning_rate": 6.462856788172175e-05, + "loss": 0.1893, + "step": 10800 + }, + { + "epoch": 2.176506145476526, + "grad_norm": 0.07565109431743622, + "learning_rate": 6.461582396632677e-05, + "loss": 0.1655, + "step": 10802 + }, + { + "epoch": 2.176909127543824, + "grad_norm": 0.08917911350727081, + "learning_rate": 6.460307901264704e-05, + "loss": 0.167, + "step": 10804 + }, + { + "epoch": 2.1773121096111225, + "grad_norm": 0.053038131445646286, + "learning_rate": 6.459033302158793e-05, + "loss": 0.1885, + "step": 10806 + }, + { + "epoch": 2.1777150916784205, + "grad_norm": 0.062328778207302094, + "learning_rate": 6.457758599405489e-05, + "loss": 0.1445, + "step": 10808 + }, + { + "epoch": 2.1781180737457184, + "grad_norm": 0.06661716103553772, + "learning_rate": 6.456483793095345e-05, + "loss": 0.1779, + "step": 10810 + }, + { + "epoch": 2.1785210558130164, + "grad_norm": 0.07117008417844772, + "learning_rate": 6.455208883318923e-05, + "loss": 0.2424, + "step": 10812 + }, + { + "epoch": 2.1789240378803143, + "grad_norm": 0.05308781564235687, + "learning_rate": 6.453933870166788e-05, + "loss": 0.2022, + "step": 10814 + }, + { + "epoch": 2.1793270199476122, + "grad_norm": 0.07809494435787201, + "learning_rate": 6.452658753729517e-05, + "loss": 0.2611, + "step": 10816 + }, + { + "epoch": 2.17973000201491, + "grad_norm": 0.06897277384996414, + "learning_rate": 6.451383534097692e-05, + "loss": 0.1904, + "step": 10818 + }, + { + "epoch": 2.180132984082208, + "grad_norm": 0.050235819071531296, + "learning_rate": 6.450108211361899e-05, + "loss": 0.1894, + "step": 10820 + }, + { + "epoch": 2.1805359661495065, + "grad_norm": 0.053104523569345474, + "learning_rate": 6.448832785612739e-05, + "loss": 0.1917, + "step": 10822 + }, + { + "epoch": 2.1809389482168045, + "grad_norm": 0.049236420542001724, + "learning_rate": 6.447557256940817e-05, + "loss": 0.169, + "step": 10824 + }, + { + "epoch": 2.1813419302841024, + "grad_norm": 0.10533499717712402, + "learning_rate": 6.446281625436741e-05, + "loss": 0.2435, + "step": 10826 + }, + { + "epoch": 2.1817449123514003, + "grad_norm": 0.05212007090449333, + "learning_rate": 6.44500589119113e-05, + "loss": 0.1979, + "step": 10828 + }, + { + "epoch": 2.1821478944186983, + "grad_norm": 0.05394808202981949, + "learning_rate": 6.443730054294614e-05, + "loss": 0.202, + "step": 10830 + }, + { + "epoch": 2.182550876485996, + "grad_norm": 0.05483834818005562, + "learning_rate": 6.442454114837823e-05, + "loss": 0.1981, + "step": 10832 + }, + { + "epoch": 2.1829538585532946, + "grad_norm": 0.05570824816823006, + "learning_rate": 6.441178072911398e-05, + "loss": 0.1896, + "step": 10834 + }, + { + "epoch": 2.1833568406205925, + "grad_norm": 0.07239922881126404, + "learning_rate": 6.439901928605988e-05, + "loss": 0.2285, + "step": 10836 + }, + { + "epoch": 2.1837598226878905, + "grad_norm": 0.04814567789435387, + "learning_rate": 6.438625682012248e-05, + "loss": 0.1692, + "step": 10838 + }, + { + "epoch": 2.1841628047551884, + "grad_norm": 0.07324769347906113, + "learning_rate": 6.437349333220838e-05, + "loss": 0.2088, + "step": 10840 + }, + { + "epoch": 2.1845657868224864, + "grad_norm": 0.04402107000350952, + "learning_rate": 6.436072882322432e-05, + "loss": 0.1789, + "step": 10842 + }, + { + "epoch": 2.1849687688897843, + "grad_norm": 2.5281331539154053, + "learning_rate": 6.434796329407705e-05, + "loss": 0.1853, + "step": 10844 + }, + { + "epoch": 2.1853717509570822, + "grad_norm": 0.04652038589119911, + "learning_rate": 6.433519674567342e-05, + "loss": 0.1921, + "step": 10846 + }, + { + "epoch": 2.18577473302438, + "grad_norm": 0.04080631211400032, + "learning_rate": 6.432242917892033e-05, + "loss": 0.1401, + "step": 10848 + }, + { + "epoch": 2.1861777150916786, + "grad_norm": 0.062017377465963364, + "learning_rate": 6.430966059472478e-05, + "loss": 0.1588, + "step": 10850 + }, + { + "epoch": 2.1865806971589765, + "grad_norm": 0.034065987914800644, + "learning_rate": 6.429689099399383e-05, + "loss": 0.133, + "step": 10852 + }, + { + "epoch": 2.1869836792262745, + "grad_norm": 0.044415101408958435, + "learning_rate": 6.428412037763459e-05, + "loss": 0.1488, + "step": 10854 + }, + { + "epoch": 2.1873866612935724, + "grad_norm": 0.07276732474565506, + "learning_rate": 6.42713487465543e-05, + "loss": 0.2159, + "step": 10856 + }, + { + "epoch": 2.1877896433608703, + "grad_norm": 0.040792450308799744, + "learning_rate": 6.425857610166021e-05, + "loss": 0.1691, + "step": 10858 + }, + { + "epoch": 2.1881926254281683, + "grad_norm": 0.0323706679046154, + "learning_rate": 6.42458024438597e-05, + "loss": 0.1333, + "step": 10860 + }, + { + "epoch": 2.1885956074954667, + "grad_norm": 0.053765907883644104, + "learning_rate": 6.423302777406013e-05, + "loss": 0.2161, + "step": 10862 + }, + { + "epoch": 2.1889985895627646, + "grad_norm": 0.06381344050168991, + "learning_rate": 6.422025209316906e-05, + "loss": 0.2325, + "step": 10864 + }, + { + "epoch": 2.1894015716300625, + "grad_norm": 0.046741727739572525, + "learning_rate": 6.4207475402094e-05, + "loss": 0.1963, + "step": 10866 + }, + { + "epoch": 2.1898045536973605, + "grad_norm": 0.06143912300467491, + "learning_rate": 6.419469770174263e-05, + "loss": 0.1893, + "step": 10868 + }, + { + "epoch": 2.1902075357646584, + "grad_norm": 0.06888391822576523, + "learning_rate": 6.418191899302263e-05, + "loss": 0.16, + "step": 10870 + }, + { + "epoch": 2.1906105178319564, + "grad_norm": 0.0596514530479908, + "learning_rate": 6.416913927684177e-05, + "loss": 0.2057, + "step": 10872 + }, + { + "epoch": 2.1910134998992543, + "grad_norm": 0.0417722687125206, + "learning_rate": 6.415635855410793e-05, + "loss": 0.1721, + "step": 10874 + }, + { + "epoch": 2.1914164819665523, + "grad_norm": 0.06044435501098633, + "learning_rate": 6.414357682572903e-05, + "loss": 0.2167, + "step": 10876 + }, + { + "epoch": 2.1918194640338506, + "grad_norm": 0.05764295905828476, + "learning_rate": 6.413079409261302e-05, + "loss": 0.1648, + "step": 10878 + }, + { + "epoch": 2.1922224461011486, + "grad_norm": 0.0555766336619854, + "learning_rate": 6.411801035566801e-05, + "loss": 0.1291, + "step": 10880 + }, + { + "epoch": 2.1926254281684465, + "grad_norm": 0.06001344323158264, + "learning_rate": 6.410522561580213e-05, + "loss": 0.1942, + "step": 10882 + }, + { + "epoch": 2.1930284102357445, + "grad_norm": 0.05169009789824486, + "learning_rate": 6.409243987392358e-05, + "loss": 0.1904, + "step": 10884 + }, + { + "epoch": 2.1934313923030424, + "grad_norm": 0.0572587326169014, + "learning_rate": 6.407965313094063e-05, + "loss": 0.1922, + "step": 10886 + }, + { + "epoch": 2.1938343743703403, + "grad_norm": 0.05167214199900627, + "learning_rate": 6.406686538776166e-05, + "loss": 0.1879, + "step": 10888 + }, + { + "epoch": 2.1942373564376387, + "grad_norm": 0.056762244552373886, + "learning_rate": 6.405407664529503e-05, + "loss": 0.1969, + "step": 10890 + }, + { + "epoch": 2.1946403385049367, + "grad_norm": 0.056261587888002396, + "learning_rate": 6.40412869044493e-05, + "loss": 0.233, + "step": 10892 + }, + { + "epoch": 2.1950433205722346, + "grad_norm": 0.0477316789329052, + "learning_rate": 6.4028496166133e-05, + "loss": 0.1761, + "step": 10894 + }, + { + "epoch": 2.1954463026395326, + "grad_norm": 0.06606003642082214, + "learning_rate": 6.401570443125477e-05, + "loss": 0.205, + "step": 10896 + }, + { + "epoch": 2.1958492847068305, + "grad_norm": 0.050838105380535126, + "learning_rate": 6.40029117007233e-05, + "loss": 0.1704, + "step": 10898 + }, + { + "epoch": 2.1962522667741284, + "grad_norm": 0.04778251424431801, + "learning_rate": 6.399011797544739e-05, + "loss": 0.1734, + "step": 10900 + }, + { + "epoch": 2.1966552488414264, + "grad_norm": 0.06802946329116821, + "learning_rate": 6.397732325633587e-05, + "loss": 0.2171, + "step": 10902 + }, + { + "epoch": 2.1970582309087248, + "grad_norm": 0.07286917418241501, + "learning_rate": 6.396452754429766e-05, + "loss": 0.1792, + "step": 10904 + }, + { + "epoch": 2.1974612129760227, + "grad_norm": 0.055850863456726074, + "learning_rate": 6.395173084024174e-05, + "loss": 0.1678, + "step": 10906 + }, + { + "epoch": 2.1978641950433206, + "grad_norm": 0.0752958357334137, + "learning_rate": 6.393893314507717e-05, + "loss": 0.2086, + "step": 10908 + }, + { + "epoch": 2.1982671771106186, + "grad_norm": 0.06332895159721375, + "learning_rate": 6.39261344597131e-05, + "loss": 0.2271, + "step": 10910 + }, + { + "epoch": 2.1986701591779165, + "grad_norm": 0.06559202075004578, + "learning_rate": 6.39133347850587e-05, + "loss": 0.2126, + "step": 10912 + }, + { + "epoch": 2.1990731412452145, + "grad_norm": 0.08061282336711884, + "learning_rate": 6.390053412202324e-05, + "loss": 0.2651, + "step": 10914 + }, + { + "epoch": 2.1994761233125124, + "grad_norm": 0.06229247525334358, + "learning_rate": 6.388773247151606e-05, + "loss": 0.2288, + "step": 10916 + }, + { + "epoch": 2.199879105379811, + "grad_norm": 0.0519050732254982, + "learning_rate": 6.38749298344466e-05, + "loss": 0.1691, + "step": 10918 + }, + { + "epoch": 2.2002820874471087, + "grad_norm": 0.059164535254240036, + "learning_rate": 6.38621262117243e-05, + "loss": 0.1838, + "step": 10920 + }, + { + "epoch": 2.2006850695144067, + "grad_norm": 0.054849907755851746, + "learning_rate": 6.384932160425873e-05, + "loss": 0.1956, + "step": 10922 + }, + { + "epoch": 2.2010880515817046, + "grad_norm": 0.05935594439506531, + "learning_rate": 6.38365160129595e-05, + "loss": 0.2014, + "step": 10924 + }, + { + "epoch": 2.2014910336490026, + "grad_norm": 0.06581877917051315, + "learning_rate": 6.382370943873629e-05, + "loss": 0.1512, + "step": 10926 + }, + { + "epoch": 2.2018940157163005, + "grad_norm": 0.03840584307909012, + "learning_rate": 6.381090188249889e-05, + "loss": 0.157, + "step": 10928 + }, + { + "epoch": 2.2022969977835984, + "grad_norm": 0.05577847361564636, + "learning_rate": 6.37980933451571e-05, + "loss": 0.1547, + "step": 10930 + }, + { + "epoch": 2.202699979850897, + "grad_norm": 0.07472618669271469, + "learning_rate": 6.378528382762082e-05, + "loss": 0.1814, + "step": 10932 + }, + { + "epoch": 2.2031029619181948, + "grad_norm": 0.049337152391672134, + "learning_rate": 6.377247333080002e-05, + "loss": 0.1926, + "step": 10934 + }, + { + "epoch": 2.2035059439854927, + "grad_norm": 0.06512777507305145, + "learning_rate": 6.375966185560473e-05, + "loss": 0.1905, + "step": 10936 + }, + { + "epoch": 2.2039089260527907, + "grad_norm": 0.07381617277860641, + "learning_rate": 6.374684940294508e-05, + "loss": 0.1729, + "step": 10938 + }, + { + "epoch": 2.2043119081200886, + "grad_norm": 0.0456584207713604, + "learning_rate": 6.373403597373125e-05, + "loss": 0.1729, + "step": 10940 + }, + { + "epoch": 2.2047148901873865, + "grad_norm": 0.0703442320227623, + "learning_rate": 6.372122156887345e-05, + "loss": 0.1971, + "step": 10942 + }, + { + "epoch": 2.2051178722546845, + "grad_norm": 0.06072754040360451, + "learning_rate": 6.370840618928202e-05, + "loss": 0.2005, + "step": 10944 + }, + { + "epoch": 2.205520854321983, + "grad_norm": 0.06688905507326126, + "learning_rate": 6.369558983586733e-05, + "loss": 0.1741, + "step": 10946 + }, + { + "epoch": 2.205923836389281, + "grad_norm": 0.07325995713472366, + "learning_rate": 6.368277250953985e-05, + "loss": 0.1826, + "step": 10948 + }, + { + "epoch": 2.2063268184565787, + "grad_norm": 0.06107252091169357, + "learning_rate": 6.366995421121009e-05, + "loss": 0.1818, + "step": 10950 + }, + { + "epoch": 2.2067298005238767, + "grad_norm": 0.07642047852277756, + "learning_rate": 6.365713494178865e-05, + "loss": 0.1636, + "step": 10952 + }, + { + "epoch": 2.2071327825911746, + "grad_norm": 0.05578227341175079, + "learning_rate": 6.364431470218617e-05, + "loss": 0.1681, + "step": 10954 + }, + { + "epoch": 2.2075357646584726, + "grad_norm": 0.054211586713790894, + "learning_rate": 6.363149349331341e-05, + "loss": 0.1663, + "step": 10956 + }, + { + "epoch": 2.2079387467257705, + "grad_norm": 0.054358530789613724, + "learning_rate": 6.361867131608115e-05, + "loss": 0.1941, + "step": 10958 + }, + { + "epoch": 2.208341728793069, + "grad_norm": 0.0320722721517086, + "learning_rate": 6.360584817140025e-05, + "loss": 0.114, + "step": 10960 + }, + { + "epoch": 2.208744710860367, + "grad_norm": 0.06492000073194504, + "learning_rate": 6.359302406018166e-05, + "loss": 0.2156, + "step": 10962 + }, + { + "epoch": 2.2091476929276648, + "grad_norm": 0.06293896585702896, + "learning_rate": 6.358019898333638e-05, + "loss": 0.1998, + "step": 10964 + }, + { + "epoch": 2.2095506749949627, + "grad_norm": 0.06004882603883743, + "learning_rate": 6.356737294177547e-05, + "loss": 0.2076, + "step": 10966 + }, + { + "epoch": 2.2099536570622607, + "grad_norm": 0.05569273978471756, + "learning_rate": 6.35545459364101e-05, + "loss": 0.208, + "step": 10968 + }, + { + "epoch": 2.2103566391295586, + "grad_norm": 0.061380334198474884, + "learning_rate": 6.354171796815146e-05, + "loss": 0.2243, + "step": 10970 + }, + { + "epoch": 2.2107596211968565, + "grad_norm": 0.05998251214623451, + "learning_rate": 6.352888903791083e-05, + "loss": 0.1921, + "step": 10972 + }, + { + "epoch": 2.211162603264155, + "grad_norm": 0.059841521084308624, + "learning_rate": 6.351605914659957e-05, + "loss": 0.1814, + "step": 10974 + }, + { + "epoch": 2.211565585331453, + "grad_norm": 0.05141017213463783, + "learning_rate": 6.350322829512908e-05, + "loss": 0.1651, + "step": 10976 + }, + { + "epoch": 2.211968567398751, + "grad_norm": 0.051106277853250504, + "learning_rate": 6.349039648441084e-05, + "loss": 0.1748, + "step": 10978 + }, + { + "epoch": 2.2123715494660487, + "grad_norm": 0.046505145728588104, + "learning_rate": 6.347756371535642e-05, + "loss": 0.1569, + "step": 10980 + }, + { + "epoch": 2.2127745315333467, + "grad_norm": 0.05843675136566162, + "learning_rate": 6.346472998887741e-05, + "loss": 0.1889, + "step": 10982 + }, + { + "epoch": 2.2131775136006446, + "grad_norm": 0.05025404691696167, + "learning_rate": 6.345189530588553e-05, + "loss": 0.1844, + "step": 10984 + }, + { + "epoch": 2.2135804956679426, + "grad_norm": 0.06326805055141449, + "learning_rate": 6.343905966729251e-05, + "loss": 0.2044, + "step": 10986 + }, + { + "epoch": 2.213983477735241, + "grad_norm": 0.05964503809809685, + "learning_rate": 6.342622307401019e-05, + "loss": 0.2166, + "step": 10988 + }, + { + "epoch": 2.214386459802539, + "grad_norm": 0.04110246151685715, + "learning_rate": 6.341338552695045e-05, + "loss": 0.15, + "step": 10990 + }, + { + "epoch": 2.214789441869837, + "grad_norm": 0.07898419350385666, + "learning_rate": 6.340054702702528e-05, + "loss": 0.1853, + "step": 10992 + }, + { + "epoch": 2.215192423937135, + "grad_norm": 0.057005874812603, + "learning_rate": 6.338770757514664e-05, + "loss": 0.1539, + "step": 10994 + }, + { + "epoch": 2.2155954060044327, + "grad_norm": 0.05199525132775307, + "learning_rate": 6.337486717222668e-05, + "loss": 0.241, + "step": 10996 + }, + { + "epoch": 2.2159983880717307, + "grad_norm": 0.062137044966220856, + "learning_rate": 6.336202581917756e-05, + "loss": 0.1732, + "step": 10998 + }, + { + "epoch": 2.216401370139029, + "grad_norm": 0.0612826943397522, + "learning_rate": 6.334918351691149e-05, + "loss": 0.1975, + "step": 11000 + }, + { + "epoch": 2.216804352206327, + "grad_norm": 0.07609107345342636, + "learning_rate": 6.333634026634074e-05, + "loss": 0.1966, + "step": 11002 + }, + { + "epoch": 2.217207334273625, + "grad_norm": 0.10069447755813599, + "learning_rate": 6.332349606837774e-05, + "loss": 0.1884, + "step": 11004 + }, + { + "epoch": 2.217610316340923, + "grad_norm": 0.054409392178058624, + "learning_rate": 6.331065092393487e-05, + "loss": 0.1815, + "step": 11006 + }, + { + "epoch": 2.218013298408221, + "grad_norm": 0.059293653815984726, + "learning_rate": 6.329780483392466e-05, + "loss": 0.1286, + "step": 11008 + }, + { + "epoch": 2.2184162804755188, + "grad_norm": 0.039616234600543976, + "learning_rate": 6.328495779925966e-05, + "loss": 0.1335, + "step": 11010 + }, + { + "epoch": 2.2188192625428167, + "grad_norm": 0.06314606219530106, + "learning_rate": 6.32721098208525e-05, + "loss": 0.1814, + "step": 11012 + }, + { + "epoch": 2.2192222446101146, + "grad_norm": 0.04756368324160576, + "learning_rate": 6.325926089961589e-05, + "loss": 0.164, + "step": 11014 + }, + { + "epoch": 2.219625226677413, + "grad_norm": 0.06843490153551102, + "learning_rate": 6.324641103646258e-05, + "loss": 0.2759, + "step": 11016 + }, + { + "epoch": 2.220028208744711, + "grad_norm": 0.04619716852903366, + "learning_rate": 6.323356023230541e-05, + "loss": 0.2057, + "step": 11018 + }, + { + "epoch": 2.220431190812009, + "grad_norm": 0.05599410831928253, + "learning_rate": 6.32207084880573e-05, + "loss": 0.141, + "step": 11020 + }, + { + "epoch": 2.220834172879307, + "grad_norm": 0.056852128356695175, + "learning_rate": 6.32078558046312e-05, + "loss": 0.1811, + "step": 11022 + }, + { + "epoch": 2.221237154946605, + "grad_norm": 0.07424285262823105, + "learning_rate": 6.319500218294013e-05, + "loss": 0.1862, + "step": 11024 + }, + { + "epoch": 2.2216401370139027, + "grad_norm": 0.05813653767108917, + "learning_rate": 6.318214762389723e-05, + "loss": 0.1829, + "step": 11026 + }, + { + "epoch": 2.222043119081201, + "grad_norm": 0.08757123351097107, + "learning_rate": 6.316929212841563e-05, + "loss": 0.1998, + "step": 11028 + }, + { + "epoch": 2.222446101148499, + "grad_norm": 0.06450676918029785, + "learning_rate": 6.315643569740857e-05, + "loss": 0.2525, + "step": 11030 + }, + { + "epoch": 2.222849083215797, + "grad_norm": 0.04911727085709572, + "learning_rate": 6.314357833178939e-05, + "loss": 0.2204, + "step": 11032 + }, + { + "epoch": 2.223252065283095, + "grad_norm": 0.056063055992126465, + "learning_rate": 6.31307200324714e-05, + "loss": 0.2079, + "step": 11034 + }, + { + "epoch": 2.223655047350393, + "grad_norm": 0.047666437923908234, + "learning_rate": 6.311786080036806e-05, + "loss": 0.2186, + "step": 11036 + }, + { + "epoch": 2.224058029417691, + "grad_norm": 0.05272279307246208, + "learning_rate": 6.310500063639289e-05, + "loss": 0.1373, + "step": 11038 + }, + { + "epoch": 2.2244610114849888, + "grad_norm": 0.12969458103179932, + "learning_rate": 6.30921395414594e-05, + "loss": 0.2269, + "step": 11040 + }, + { + "epoch": 2.2248639935522867, + "grad_norm": 0.0443669892847538, + "learning_rate": 6.307927751648127e-05, + "loss": 0.1648, + "step": 11042 + }, + { + "epoch": 2.225266975619585, + "grad_norm": 0.07922135293483734, + "learning_rate": 6.306641456237219e-05, + "loss": 0.1777, + "step": 11044 + }, + { + "epoch": 2.225669957686883, + "grad_norm": 0.04765113815665245, + "learning_rate": 6.305355068004591e-05, + "loss": 0.1786, + "step": 11046 + }, + { + "epoch": 2.226072939754181, + "grad_norm": 0.08355677127838135, + "learning_rate": 6.304068587041625e-05, + "loss": 0.2534, + "step": 11048 + }, + { + "epoch": 2.226475921821479, + "grad_norm": 0.06936746835708618, + "learning_rate": 6.302782013439715e-05, + "loss": 0.1977, + "step": 11050 + }, + { + "epoch": 2.226878903888777, + "grad_norm": 0.07767514139413834, + "learning_rate": 6.301495347290252e-05, + "loss": 0.2054, + "step": 11052 + }, + { + "epoch": 2.227281885956075, + "grad_norm": 0.03796292096376419, + "learning_rate": 6.300208588684641e-05, + "loss": 0.1683, + "step": 11054 + }, + { + "epoch": 2.227684868023373, + "grad_norm": 0.07153323292732239, + "learning_rate": 6.298921737714294e-05, + "loss": 0.1856, + "step": 11056 + }, + { + "epoch": 2.228087850090671, + "grad_norm": 0.051289331167936325, + "learning_rate": 6.297634794470621e-05, + "loss": 0.1724, + "step": 11058 + }, + { + "epoch": 2.228490832157969, + "grad_norm": 0.04763934388756752, + "learning_rate": 6.296347759045049e-05, + "loss": 0.1625, + "step": 11060 + }, + { + "epoch": 2.228893814225267, + "grad_norm": 0.07084707170724869, + "learning_rate": 6.295060631529006e-05, + "loss": 0.1415, + "step": 11062 + }, + { + "epoch": 2.229296796292565, + "grad_norm": 0.08750694990158081, + "learning_rate": 6.293773412013926e-05, + "loss": 0.2066, + "step": 11064 + }, + { + "epoch": 2.229699778359863, + "grad_norm": 0.058792464435100555, + "learning_rate": 6.29248610059125e-05, + "loss": 0.2467, + "step": 11066 + }, + { + "epoch": 2.230102760427161, + "grad_norm": 0.07559051364660263, + "learning_rate": 6.291198697352432e-05, + "loss": 0.2182, + "step": 11068 + }, + { + "epoch": 2.2305057424944588, + "grad_norm": 0.047350767999887466, + "learning_rate": 6.289911202388921e-05, + "loss": 0.1571, + "step": 11070 + }, + { + "epoch": 2.230908724561757, + "grad_norm": 0.05143246799707413, + "learning_rate": 6.288623615792183e-05, + "loss": 0.1774, + "step": 11072 + }, + { + "epoch": 2.231311706629055, + "grad_norm": 0.037440188229084015, + "learning_rate": 6.287335937653682e-05, + "loss": 0.141, + "step": 11074 + }, + { + "epoch": 2.231714688696353, + "grad_norm": 0.06669832766056061, + "learning_rate": 6.286048168064896e-05, + "loss": 0.2078, + "step": 11076 + }, + { + "epoch": 2.232117670763651, + "grad_norm": 0.06164056062698364, + "learning_rate": 6.284760307117304e-05, + "loss": 0.1865, + "step": 11078 + }, + { + "epoch": 2.232520652830949, + "grad_norm": 0.056018486618995667, + "learning_rate": 6.283472354902396e-05, + "loss": 0.1628, + "step": 11080 + }, + { + "epoch": 2.232923634898247, + "grad_norm": 0.0484955869615078, + "learning_rate": 6.282184311511664e-05, + "loss": 0.1794, + "step": 11082 + }, + { + "epoch": 2.2333266169655452, + "grad_norm": 0.058489102870225906, + "learning_rate": 6.280896177036608e-05, + "loss": 0.2127, + "step": 11084 + }, + { + "epoch": 2.233729599032843, + "grad_norm": 0.051653195172548294, + "learning_rate": 6.279607951568737e-05, + "loss": 0.1569, + "step": 11086 + }, + { + "epoch": 2.234132581100141, + "grad_norm": 0.06204577907919884, + "learning_rate": 6.278319635199561e-05, + "loss": 0.2496, + "step": 11088 + }, + { + "epoch": 2.234535563167439, + "grad_norm": 0.04957421123981476, + "learning_rate": 6.277031228020607e-05, + "loss": 0.1664, + "step": 11090 + }, + { + "epoch": 2.234938545234737, + "grad_norm": 0.07499062269926071, + "learning_rate": 6.275742730123394e-05, + "loss": 0.2182, + "step": 11092 + }, + { + "epoch": 2.235341527302035, + "grad_norm": 0.03950365260243416, + "learning_rate": 6.274454141599458e-05, + "loss": 0.1521, + "step": 11094 + }, + { + "epoch": 2.235744509369333, + "grad_norm": 0.05986891686916351, + "learning_rate": 6.27316546254034e-05, + "loss": 0.2178, + "step": 11096 + }, + { + "epoch": 2.2361474914366313, + "grad_norm": 0.052209652960300446, + "learning_rate": 6.27187669303758e-05, + "loss": 0.1873, + "step": 11098 + }, + { + "epoch": 2.236550473503929, + "grad_norm": 0.05092112347483635, + "learning_rate": 6.270587833182736e-05, + "loss": 0.2084, + "step": 11100 + }, + { + "epoch": 2.236953455571227, + "grad_norm": 0.06727428734302521, + "learning_rate": 6.269298883067365e-05, + "loss": 0.1824, + "step": 11102 + }, + { + "epoch": 2.237356437638525, + "grad_norm": 0.055155668407678604, + "learning_rate": 6.26800984278303e-05, + "loss": 0.2107, + "step": 11104 + }, + { + "epoch": 2.237759419705823, + "grad_norm": 0.05623969808220863, + "learning_rate": 6.266720712421303e-05, + "loss": 0.1845, + "step": 11106 + }, + { + "epoch": 2.238162401773121, + "grad_norm": 0.04958875849843025, + "learning_rate": 6.265431492073765e-05, + "loss": 0.1954, + "step": 11108 + }, + { + "epoch": 2.238565383840419, + "grad_norm": 0.06290893256664276, + "learning_rate": 6.264142181831995e-05, + "loss": 0.2162, + "step": 11110 + }, + { + "epoch": 2.2389683659077173, + "grad_norm": 0.04571421071887016, + "learning_rate": 6.262852781787587e-05, + "loss": 0.1878, + "step": 11112 + }, + { + "epoch": 2.2393713479750152, + "grad_norm": 0.06730411946773529, + "learning_rate": 6.261563292032137e-05, + "loss": 0.1651, + "step": 11114 + }, + { + "epoch": 2.239774330042313, + "grad_norm": 0.045927029103040695, + "learning_rate": 6.26027371265725e-05, + "loss": 0.2032, + "step": 11116 + }, + { + "epoch": 2.240177312109611, + "grad_norm": 0.03853674978017807, + "learning_rate": 6.258984043754532e-05, + "loss": 0.1757, + "step": 11118 + }, + { + "epoch": 2.240580294176909, + "grad_norm": 0.10684002935886383, + "learning_rate": 6.257694285415602e-05, + "loss": 0.2282, + "step": 11120 + }, + { + "epoch": 2.240983276244207, + "grad_norm": 0.06447183340787888, + "learning_rate": 6.25640443773208e-05, + "loss": 0.2127, + "step": 11122 + }, + { + "epoch": 2.241386258311505, + "grad_norm": 0.04314524680376053, + "learning_rate": 6.255114500795595e-05, + "loss": 0.1964, + "step": 11124 + }, + { + "epoch": 2.2417892403788033, + "grad_norm": 0.0753975510597229, + "learning_rate": 6.253824474697787e-05, + "loss": 0.1722, + "step": 11126 + }, + { + "epoch": 2.2421922224461013, + "grad_norm": 0.041436564177274704, + "learning_rate": 6.252534359530291e-05, + "loss": 0.1355, + "step": 11128 + }, + { + "epoch": 2.242595204513399, + "grad_norm": 0.07188113033771515, + "learning_rate": 6.251244155384758e-05, + "loss": 0.1887, + "step": 11130 + }, + { + "epoch": 2.242998186580697, + "grad_norm": 0.07126448303461075, + "learning_rate": 6.249953862352841e-05, + "loss": 0.2059, + "step": 11132 + }, + { + "epoch": 2.243401168647995, + "grad_norm": 0.05256457254290581, + "learning_rate": 6.2486634805262e-05, + "loss": 0.2111, + "step": 11134 + }, + { + "epoch": 2.243804150715293, + "grad_norm": 0.07690654695034027, + "learning_rate": 6.247373009996502e-05, + "loss": 0.228, + "step": 11136 + }, + { + "epoch": 2.244207132782591, + "grad_norm": 0.04150415584445, + "learning_rate": 6.246082450855423e-05, + "loss": 0.1777, + "step": 11138 + }, + { + "epoch": 2.2446101148498894, + "grad_norm": 0.04561259597539902, + "learning_rate": 6.244791803194637e-05, + "loss": 0.184, + "step": 11140 + }, + { + "epoch": 2.2450130969171873, + "grad_norm": 0.04242684692144394, + "learning_rate": 6.243501067105832e-05, + "loss": 0.1289, + "step": 11142 + }, + { + "epoch": 2.2454160789844853, + "grad_norm": 0.06444307416677475, + "learning_rate": 6.242210242680702e-05, + "loss": 0.1787, + "step": 11144 + }, + { + "epoch": 2.245819061051783, + "grad_norm": 0.05252141132950783, + "learning_rate": 6.24091933001094e-05, + "loss": 0.1534, + "step": 11146 + }, + { + "epoch": 2.246222043119081, + "grad_norm": 0.046815138310194016, + "learning_rate": 6.239628329188256e-05, + "loss": 0.1876, + "step": 11148 + }, + { + "epoch": 2.246625025186379, + "grad_norm": 0.04507233574986458, + "learning_rate": 6.238337240304357e-05, + "loss": 0.2583, + "step": 11150 + }, + { + "epoch": 2.247028007253677, + "grad_norm": 0.07129781693220139, + "learning_rate": 6.23704606345096e-05, + "loss": 0.2266, + "step": 11152 + }, + { + "epoch": 2.2474309893209754, + "grad_norm": 0.06225457042455673, + "learning_rate": 6.235754798719791e-05, + "loss": 0.2047, + "step": 11154 + }, + { + "epoch": 2.2478339713882733, + "grad_norm": 0.04900765046477318, + "learning_rate": 6.234463446202575e-05, + "loss": 0.2016, + "step": 11156 + }, + { + "epoch": 2.2482369534555713, + "grad_norm": 0.06152089685201645, + "learning_rate": 6.233172005991051e-05, + "loss": 0.1688, + "step": 11158 + }, + { + "epoch": 2.2486399355228692, + "grad_norm": 0.04987475648522377, + "learning_rate": 6.231880478176961e-05, + "loss": 0.1885, + "step": 11160 + }, + { + "epoch": 2.249042917590167, + "grad_norm": 0.07317940890789032, + "learning_rate": 6.23058886285205e-05, + "loss": 0.2245, + "step": 11162 + }, + { + "epoch": 2.249445899657465, + "grad_norm": 0.08709168434143066, + "learning_rate": 6.229297160108075e-05, + "loss": 0.1796, + "step": 11164 + }, + { + "epoch": 2.249848881724763, + "grad_norm": 0.05007123947143555, + "learning_rate": 6.228005370036797e-05, + "loss": 0.1922, + "step": 11166 + }, + { + "epoch": 2.2502518637920614, + "grad_norm": 0.06081795319914818, + "learning_rate": 6.22671349272998e-05, + "loss": 0.1997, + "step": 11168 + }, + { + "epoch": 2.2506548458593594, + "grad_norm": 0.05448725447058678, + "learning_rate": 6.225421528279398e-05, + "loss": 0.1809, + "step": 11170 + }, + { + "epoch": 2.2510578279266573, + "grad_norm": 0.0509071871638298, + "learning_rate": 6.224129476776832e-05, + "loss": 0.1956, + "step": 11172 + }, + { + "epoch": 2.2514608099939553, + "grad_norm": 0.04707943648099899, + "learning_rate": 6.222837338314065e-05, + "loss": 0.1996, + "step": 11174 + }, + { + "epoch": 2.251863792061253, + "grad_norm": 0.056294072419404984, + "learning_rate": 6.221545112982887e-05, + "loss": 0.166, + "step": 11176 + }, + { + "epoch": 2.252266774128551, + "grad_norm": 0.07561463862657547, + "learning_rate": 6.220252800875102e-05, + "loss": 0.1987, + "step": 11178 + }, + { + "epoch": 2.252669756195849, + "grad_norm": 0.05711280182003975, + "learning_rate": 6.218960402082505e-05, + "loss": 0.2061, + "step": 11180 + }, + { + "epoch": 2.2530727382631475, + "grad_norm": 0.05807644873857498, + "learning_rate": 6.217667916696913e-05, + "loss": 0.2455, + "step": 11182 + }, + { + "epoch": 2.2534757203304454, + "grad_norm": 0.04157806560397148, + "learning_rate": 6.21637534481014e-05, + "loss": 0.1702, + "step": 11184 + }, + { + "epoch": 2.2538787023977433, + "grad_norm": 0.05336381494998932, + "learning_rate": 6.215082686514007e-05, + "loss": 0.1849, + "step": 11186 + }, + { + "epoch": 2.2542816844650413, + "grad_norm": 0.051333993673324585, + "learning_rate": 6.213789941900342e-05, + "loss": 0.1958, + "step": 11188 + }, + { + "epoch": 2.2546846665323392, + "grad_norm": 0.08715116232633591, + "learning_rate": 6.212497111060983e-05, + "loss": 0.2143, + "step": 11190 + }, + { + "epoch": 2.255087648599637, + "grad_norm": 0.03855302929878235, + "learning_rate": 6.211204194087767e-05, + "loss": 0.1481, + "step": 11192 + }, + { + "epoch": 2.2554906306669356, + "grad_norm": 0.04594408720731735, + "learning_rate": 6.209911191072541e-05, + "loss": 0.1732, + "step": 11194 + }, + { + "epoch": 2.2558936127342335, + "grad_norm": 0.04794025048613548, + "learning_rate": 6.208618102107161e-05, + "loss": 0.1607, + "step": 11196 + }, + { + "epoch": 2.2562965948015314, + "grad_norm": 0.04965617507696152, + "learning_rate": 6.207324927283484e-05, + "loss": 0.1256, + "step": 11198 + }, + { + "epoch": 2.2566995768688294, + "grad_norm": 0.05873771011829376, + "learning_rate": 6.206031666693372e-05, + "loss": 0.1695, + "step": 11200 + }, + { + "epoch": 2.2571025589361273, + "grad_norm": 0.06736169755458832, + "learning_rate": 6.204738320428704e-05, + "loss": 0.2153, + "step": 11202 + }, + { + "epoch": 2.2575055410034253, + "grad_norm": 0.061268240213394165, + "learning_rate": 6.203444888581348e-05, + "loss": 0.2018, + "step": 11204 + }, + { + "epoch": 2.257908523070723, + "grad_norm": 0.05184895917773247, + "learning_rate": 6.202151371243194e-05, + "loss": 0.2235, + "step": 11206 + }, + { + "epoch": 2.258311505138021, + "grad_norm": 0.04206407815217972, + "learning_rate": 6.200857768506129e-05, + "loss": 0.2232, + "step": 11208 + }, + { + "epoch": 2.2587144872053195, + "grad_norm": 0.042239706963300705, + "learning_rate": 6.199564080462049e-05, + "loss": 0.1615, + "step": 11210 + }, + { + "epoch": 2.2591174692726175, + "grad_norm": 0.04908422753214836, + "learning_rate": 6.198270307202852e-05, + "loss": 0.1922, + "step": 11212 + }, + { + "epoch": 2.2595204513399154, + "grad_norm": 0.05737739056348801, + "learning_rate": 6.196976448820453e-05, + "loss": 0.1763, + "step": 11214 + }, + { + "epoch": 2.2599234334072134, + "grad_norm": 0.044318120926618576, + "learning_rate": 6.195682505406759e-05, + "loss": 0.1983, + "step": 11216 + }, + { + "epoch": 2.2603264154745113, + "grad_norm": 0.052237652242183685, + "learning_rate": 6.194388477053693e-05, + "loss": 0.1974, + "step": 11218 + }, + { + "epoch": 2.2607293975418092, + "grad_norm": 0.04132530465722084, + "learning_rate": 6.193094363853179e-05, + "loss": 0.1648, + "step": 11220 + }, + { + "epoch": 2.2611323796091076, + "grad_norm": 0.04706701636314392, + "learning_rate": 6.191800165897149e-05, + "loss": 0.2216, + "step": 11222 + }, + { + "epoch": 2.2615353616764056, + "grad_norm": 0.04157636687159538, + "learning_rate": 6.190505883277541e-05, + "loss": 0.1875, + "step": 11224 + }, + { + "epoch": 2.2619383437437035, + "grad_norm": 0.049499645829200745, + "learning_rate": 6.1892115160863e-05, + "loss": 0.2033, + "step": 11226 + }, + { + "epoch": 2.2623413258110014, + "grad_norm": 0.07546674460172653, + "learning_rate": 6.187917064415375e-05, + "loss": 0.1983, + "step": 11228 + }, + { + "epoch": 2.2627443078782994, + "grad_norm": 0.0587388314306736, + "learning_rate": 6.186622528356723e-05, + "loss": 0.1246, + "step": 11230 + }, + { + "epoch": 2.2631472899455973, + "grad_norm": 0.038730837404727936, + "learning_rate": 6.185327908002301e-05, + "loss": 0.143, + "step": 11232 + }, + { + "epoch": 2.2635502720128953, + "grad_norm": 0.04652130603790283, + "learning_rate": 6.184033203444081e-05, + "loss": 0.1961, + "step": 11234 + }, + { + "epoch": 2.263953254080193, + "grad_norm": 0.06499212980270386, + "learning_rate": 6.182738414774038e-05, + "loss": 0.2313, + "step": 11236 + }, + { + "epoch": 2.2643562361474916, + "grad_norm": 0.05006347596645355, + "learning_rate": 6.181443542084146e-05, + "loss": 0.2217, + "step": 11238 + }, + { + "epoch": 2.2647592182147895, + "grad_norm": 0.05510425940155983, + "learning_rate": 6.180148585466397e-05, + "loss": 0.1508, + "step": 11240 + }, + { + "epoch": 2.2651622002820875, + "grad_norm": 0.06103931739926338, + "learning_rate": 6.17885354501278e-05, + "loss": 0.1751, + "step": 11242 + }, + { + "epoch": 2.2655651823493854, + "grad_norm": 0.07695218920707703, + "learning_rate": 6.177558420815291e-05, + "loss": 0.2253, + "step": 11244 + }, + { + "epoch": 2.2659681644166834, + "grad_norm": 0.05471651628613472, + "learning_rate": 6.176263212965935e-05, + "loss": 0.1386, + "step": 11246 + }, + { + "epoch": 2.2663711464839813, + "grad_norm": 0.04918958246707916, + "learning_rate": 6.174967921556722e-05, + "loss": 0.168, + "step": 11248 + }, + { + "epoch": 2.2667741285512797, + "grad_norm": 0.06307138502597809, + "learning_rate": 6.173672546679667e-05, + "loss": 0.1953, + "step": 11250 + }, + { + "epoch": 2.2671771106185776, + "grad_norm": 0.05643860995769501, + "learning_rate": 6.172377088426791e-05, + "loss": 0.2656, + "step": 11252 + }, + { + "epoch": 2.2675800926858756, + "grad_norm": 0.07580362260341644, + "learning_rate": 6.171081546890122e-05, + "loss": 0.195, + "step": 11254 + }, + { + "epoch": 2.2679830747531735, + "grad_norm": 0.060459479689598083, + "learning_rate": 6.169785922161691e-05, + "loss": 0.1836, + "step": 11256 + }, + { + "epoch": 2.2683860568204715, + "grad_norm": 0.07413048297166824, + "learning_rate": 6.16849021433354e-05, + "loss": 0.2098, + "step": 11258 + }, + { + "epoch": 2.2687890388877694, + "grad_norm": 0.06606211513280869, + "learning_rate": 6.167194423497715e-05, + "loss": 0.2159, + "step": 11260 + }, + { + "epoch": 2.2691920209550673, + "grad_norm": 0.05219024419784546, + "learning_rate": 6.165898549746263e-05, + "loss": 0.1579, + "step": 11262 + }, + { + "epoch": 2.2695950030223653, + "grad_norm": 0.05993516743183136, + "learning_rate": 6.164602593171242e-05, + "loss": 0.2181, + "step": 11264 + }, + { + "epoch": 2.2699979850896637, + "grad_norm": 0.06719011068344116, + "learning_rate": 6.163306553864717e-05, + "loss": 0.2209, + "step": 11266 + }, + { + "epoch": 2.2704009671569616, + "grad_norm": 0.0497952364385128, + "learning_rate": 6.162010431918753e-05, + "loss": 0.1758, + "step": 11268 + }, + { + "epoch": 2.2708039492242595, + "grad_norm": 0.06263022869825363, + "learning_rate": 6.160714227425428e-05, + "loss": 0.1895, + "step": 11270 + }, + { + "epoch": 2.2712069312915575, + "grad_norm": 0.07029848545789719, + "learning_rate": 6.159417940476819e-05, + "loss": 0.2157, + "step": 11272 + }, + { + "epoch": 2.2716099133588554, + "grad_norm": 0.06518268585205078, + "learning_rate": 6.158121571165014e-05, + "loss": 0.2074, + "step": 11274 + }, + { + "epoch": 2.2720128954261534, + "grad_norm": 0.06944593787193298, + "learning_rate": 6.156825119582105e-05, + "loss": 0.1697, + "step": 11276 + }, + { + "epoch": 2.2724158774934518, + "grad_norm": 0.054373059421777725, + "learning_rate": 6.15552858582019e-05, + "loss": 0.1786, + "step": 11278 + }, + { + "epoch": 2.2728188595607497, + "grad_norm": 0.04644101485610008, + "learning_rate": 6.154231969971373e-05, + "loss": 0.1821, + "step": 11280 + }, + { + "epoch": 2.2732218416280476, + "grad_norm": 0.055845990777015686, + "learning_rate": 6.152935272127761e-05, + "loss": 0.2359, + "step": 11282 + }, + { + "epoch": 2.2736248236953456, + "grad_norm": 0.0610295832157135, + "learning_rate": 6.151638492381473e-05, + "loss": 0.1932, + "step": 11284 + }, + { + "epoch": 2.2740278057626435, + "grad_norm": 0.05903216451406479, + "learning_rate": 6.150341630824627e-05, + "loss": 0.2192, + "step": 11286 + }, + { + "epoch": 2.2744307878299415, + "grad_norm": 0.07944760471582413, + "learning_rate": 6.149044687549351e-05, + "loss": 0.2314, + "step": 11288 + }, + { + "epoch": 2.2748337698972394, + "grad_norm": 0.04613855481147766, + "learning_rate": 6.147747662647777e-05, + "loss": 0.213, + "step": 11290 + }, + { + "epoch": 2.2752367519645373, + "grad_norm": 0.05385150760412216, + "learning_rate": 6.146450556212045e-05, + "loss": 0.1715, + "step": 11292 + }, + { + "epoch": 2.2756397340318357, + "grad_norm": 0.057033102959394455, + "learning_rate": 6.145153368334302e-05, + "loss": 0.194, + "step": 11294 + }, + { + "epoch": 2.2760427160991337, + "grad_norm": 0.041800301522016525, + "learning_rate": 6.143856099106692e-05, + "loss": 0.2274, + "step": 11296 + }, + { + "epoch": 2.2764456981664316, + "grad_norm": 0.0549192801117897, + "learning_rate": 6.142558748621376e-05, + "loss": 0.203, + "step": 11298 + }, + { + "epoch": 2.2768486802337295, + "grad_norm": 0.05256705358624458, + "learning_rate": 6.141261316970513e-05, + "loss": 0.1815, + "step": 11300 + }, + { + "epoch": 2.2772516623010275, + "grad_norm": 0.06109241768717766, + "learning_rate": 6.139963804246271e-05, + "loss": 0.1698, + "step": 11302 + }, + { + "epoch": 2.2776546443683254, + "grad_norm": 0.060304321348667145, + "learning_rate": 6.138666210540822e-05, + "loss": 0.1746, + "step": 11304 + }, + { + "epoch": 2.278057626435624, + "grad_norm": 0.05690411105751991, + "learning_rate": 6.13736853594635e-05, + "loss": 0.1728, + "step": 11306 + }, + { + "epoch": 2.2784606085029218, + "grad_norm": 0.12367209792137146, + "learning_rate": 6.136070780555033e-05, + "loss": 0.1808, + "step": 11308 + }, + { + "epoch": 2.2788635905702197, + "grad_norm": 0.04680660739541054, + "learning_rate": 6.134772944459066e-05, + "loss": 0.2192, + "step": 11310 + }, + { + "epoch": 2.2792665726375176, + "grad_norm": 0.0765429437160492, + "learning_rate": 6.133475027750644e-05, + "loss": 0.2293, + "step": 11312 + }, + { + "epoch": 2.2796695547048156, + "grad_norm": 0.06817856431007385, + "learning_rate": 6.132177030521967e-05, + "loss": 0.1625, + "step": 11314 + }, + { + "epoch": 2.2800725367721135, + "grad_norm": 0.0743875801563263, + "learning_rate": 6.130878952865246e-05, + "loss": 0.2173, + "step": 11316 + }, + { + "epoch": 2.2804755188394115, + "grad_norm": 0.08354144543409348, + "learning_rate": 6.129580794872694e-05, + "loss": 0.1676, + "step": 11318 + }, + { + "epoch": 2.2808785009067094, + "grad_norm": 0.054814115166664124, + "learning_rate": 6.128282556636527e-05, + "loss": 0.183, + "step": 11320 + }, + { + "epoch": 2.281281482974008, + "grad_norm": 0.0482783205807209, + "learning_rate": 6.126984238248972e-05, + "loss": 0.1679, + "step": 11322 + }, + { + "epoch": 2.2816844650413057, + "grad_norm": 0.039012711495161057, + "learning_rate": 6.125685839802258e-05, + "loss": 0.1527, + "step": 11324 + }, + { + "epoch": 2.2820874471086037, + "grad_norm": 0.06443888694047928, + "learning_rate": 6.124387361388624e-05, + "loss": 0.2135, + "step": 11326 + }, + { + "epoch": 2.2824904291759016, + "grad_norm": 0.05576111376285553, + "learning_rate": 6.12308880310031e-05, + "loss": 0.2194, + "step": 11328 + }, + { + "epoch": 2.2828934112431996, + "grad_norm": 0.0726291760802269, + "learning_rate": 6.121790165029561e-05, + "loss": 0.187, + "step": 11330 + }, + { + "epoch": 2.2832963933104975, + "grad_norm": 0.04712097719311714, + "learning_rate": 6.120491447268634e-05, + "loss": 0.1997, + "step": 11332 + }, + { + "epoch": 2.283699375377796, + "grad_norm": 0.04324129596352577, + "learning_rate": 6.119192649909788e-05, + "loss": 0.168, + "step": 11334 + }, + { + "epoch": 2.284102357445094, + "grad_norm": 0.07594967633485794, + "learning_rate": 6.117893773045286e-05, + "loss": 0.2147, + "step": 11336 + }, + { + "epoch": 2.2845053395123918, + "grad_norm": 0.05412382259964943, + "learning_rate": 6.116594816767396e-05, + "loss": 0.1807, + "step": 11338 + }, + { + "epoch": 2.2849083215796897, + "grad_norm": 0.04991452768445015, + "learning_rate": 6.115295781168398e-05, + "loss": 0.1619, + "step": 11340 + }, + { + "epoch": 2.2853113036469876, + "grad_norm": 0.05802258849143982, + "learning_rate": 6.11399666634057e-05, + "loss": 0.1286, + "step": 11342 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.05308988317847252, + "learning_rate": 6.112697472376201e-05, + "loss": 0.209, + "step": 11344 + }, + { + "epoch": 2.2861172677815835, + "grad_norm": 0.0739789754152298, + "learning_rate": 6.111398199367584e-05, + "loss": 0.161, + "step": 11346 + }, + { + "epoch": 2.286520249848882, + "grad_norm": 0.04071381315588951, + "learning_rate": 6.110098847407014e-05, + "loss": 0.1882, + "step": 11348 + }, + { + "epoch": 2.28692323191618, + "grad_norm": 0.08201833069324493, + "learning_rate": 6.108799416586799e-05, + "loss": 0.2381, + "step": 11350 + }, + { + "epoch": 2.287326213983478, + "grad_norm": 0.05134955421090126, + "learning_rate": 6.107499906999247e-05, + "loss": 0.1813, + "step": 11352 + }, + { + "epoch": 2.2877291960507757, + "grad_norm": 0.07029873132705688, + "learning_rate": 6.106200318736672e-05, + "loss": 0.2262, + "step": 11354 + }, + { + "epoch": 2.2881321781180737, + "grad_norm": 0.057129088789224625, + "learning_rate": 6.104900651891394e-05, + "loss": 0.1861, + "step": 11356 + }, + { + "epoch": 2.2885351601853716, + "grad_norm": 0.05836978182196617, + "learning_rate": 6.103600906555744e-05, + "loss": 0.2227, + "step": 11358 + }, + { + "epoch": 2.28893814225267, + "grad_norm": 0.06855162978172302, + "learning_rate": 6.1023010828220483e-05, + "loss": 0.2345, + "step": 11360 + }, + { + "epoch": 2.289341124319968, + "grad_norm": 0.06893496960401535, + "learning_rate": 6.101001180782646e-05, + "loss": 0.1785, + "step": 11362 + }, + { + "epoch": 2.289744106387266, + "grad_norm": 0.05776822194457054, + "learning_rate": 6.0997012005298826e-05, + "loss": 0.1627, + "step": 11364 + }, + { + "epoch": 2.290147088454564, + "grad_norm": 0.06768766045570374, + "learning_rate": 6.098401142156104e-05, + "loss": 0.2323, + "step": 11366 + }, + { + "epoch": 2.2905500705218618, + "grad_norm": 0.04803290218114853, + "learning_rate": 6.0971010057536634e-05, + "loss": 0.16, + "step": 11368 + }, + { + "epoch": 2.2909530525891597, + "grad_norm": 0.044886112213134766, + "learning_rate": 6.095800791414924e-05, + "loss": 0.1568, + "step": 11370 + }, + { + "epoch": 2.2913560346564577, + "grad_norm": 0.05759613215923309, + "learning_rate": 6.0945004992322473e-05, + "loss": 0.1812, + "step": 11372 + }, + { + "epoch": 2.2917590167237556, + "grad_norm": 0.07767092436552048, + "learning_rate": 6.0932001292980065e-05, + "loss": 0.1981, + "step": 11374 + }, + { + "epoch": 2.292161998791054, + "grad_norm": 0.06554151326417923, + "learning_rate": 6.091899681704577e-05, + "loss": 0.1989, + "step": 11376 + }, + { + "epoch": 2.292564980858352, + "grad_norm": 0.06701290607452393, + "learning_rate": 6.09059915654434e-05, + "loss": 0.1745, + "step": 11378 + }, + { + "epoch": 2.29296796292565, + "grad_norm": 0.07902407646179199, + "learning_rate": 6.089298553909684e-05, + "loss": 0.28, + "step": 11380 + }, + { + "epoch": 2.293370944992948, + "grad_norm": 0.07576099038124084, + "learning_rate": 6.087997873892999e-05, + "loss": 0.1846, + "step": 11382 + }, + { + "epoch": 2.2937739270602457, + "grad_norm": 0.06893164664506912, + "learning_rate": 6.086697116586685e-05, + "loss": 0.2181, + "step": 11384 + }, + { + "epoch": 2.2941769091275437, + "grad_norm": 0.08184853941202164, + "learning_rate": 6.085396282083147e-05, + "loss": 0.2172, + "step": 11386 + }, + { + "epoch": 2.294579891194842, + "grad_norm": 0.059146005660295486, + "learning_rate": 6.084095370474791e-05, + "loss": 0.1481, + "step": 11388 + }, + { + "epoch": 2.29498287326214, + "grad_norm": 0.060949668288230896, + "learning_rate": 6.0827943818540357e-05, + "loss": 0.2308, + "step": 11390 + }, + { + "epoch": 2.295385855329438, + "grad_norm": 0.04430218040943146, + "learning_rate": 6.081493316313299e-05, + "loss": 0.1673, + "step": 11392 + }, + { + "epoch": 2.295788837396736, + "grad_norm": 0.055645573884248734, + "learning_rate": 6.080192173945006e-05, + "loss": 0.1759, + "step": 11394 + }, + { + "epoch": 2.296191819464034, + "grad_norm": 0.08632339537143707, + "learning_rate": 6.078890954841589e-05, + "loss": 0.2101, + "step": 11396 + }, + { + "epoch": 2.2965948015313318, + "grad_norm": 0.1685890555381775, + "learning_rate": 6.077589659095484e-05, + "loss": 0.1724, + "step": 11398 + }, + { + "epoch": 2.2969977835986297, + "grad_norm": 0.06395189464092255, + "learning_rate": 6.0762882867991325e-05, + "loss": 0.1763, + "step": 11400 + }, + { + "epoch": 2.2974007656659277, + "grad_norm": 0.05895381420850754, + "learning_rate": 6.074986838044983e-05, + "loss": 0.1772, + "step": 11402 + }, + { + "epoch": 2.297803747733226, + "grad_norm": 0.07261353731155396, + "learning_rate": 6.073685312925488e-05, + "loss": 0.1446, + "step": 11404 + }, + { + "epoch": 2.298206729800524, + "grad_norm": 0.0492069236934185, + "learning_rate": 6.072383711533104e-05, + "loss": 0.1637, + "step": 11406 + }, + { + "epoch": 2.298609711867822, + "grad_norm": 0.07343262434005737, + "learning_rate": 6.0710820339602955e-05, + "loss": 0.1681, + "step": 11408 + }, + { + "epoch": 2.29901269393512, + "grad_norm": 0.05374588817358017, + "learning_rate": 6.069780280299535e-05, + "loss": 0.1881, + "step": 11410 + }, + { + "epoch": 2.299415676002418, + "grad_norm": 0.06401274353265762, + "learning_rate": 6.068478450643294e-05, + "loss": 0.2119, + "step": 11412 + }, + { + "epoch": 2.2998186580697157, + "grad_norm": 0.0491844043135643, + "learning_rate": 6.06717654508405e-05, + "loss": 0.169, + "step": 11414 + }, + { + "epoch": 2.300221640137014, + "grad_norm": 0.060520414263010025, + "learning_rate": 6.065874563714293e-05, + "loss": 0.1738, + "step": 11416 + }, + { + "epoch": 2.300624622204312, + "grad_norm": 0.0507986918091774, + "learning_rate": 6.064572506626511e-05, + "loss": 0.1753, + "step": 11418 + }, + { + "epoch": 2.30102760427161, + "grad_norm": 0.045630816370248795, + "learning_rate": 6.0632703739132e-05, + "loss": 0.1549, + "step": 11420 + }, + { + "epoch": 2.301430586338908, + "grad_norm": 0.04640620946884155, + "learning_rate": 6.061968165666865e-05, + "loss": 0.2175, + "step": 11422 + }, + { + "epoch": 2.301833568406206, + "grad_norm": 0.06813772767782211, + "learning_rate": 6.060665881980007e-05, + "loss": 0.1738, + "step": 11424 + }, + { + "epoch": 2.302236550473504, + "grad_norm": 0.049537815153598785, + "learning_rate": 6.0593635229451404e-05, + "loss": 0.1846, + "step": 11426 + }, + { + "epoch": 2.302639532540802, + "grad_norm": 0.048957351595163345, + "learning_rate": 6.058061088654786e-05, + "loss": 0.1928, + "step": 11428 + }, + { + "epoch": 2.3030425146080997, + "grad_norm": 0.06044716387987137, + "learning_rate": 6.0567585792014625e-05, + "loss": 0.1538, + "step": 11430 + }, + { + "epoch": 2.303445496675398, + "grad_norm": 0.07165748625993729, + "learning_rate": 6.055455994677699e-05, + "loss": 0.2081, + "step": 11432 + }, + { + "epoch": 2.303848478742696, + "grad_norm": 0.057387061417102814, + "learning_rate": 6.0541533351760315e-05, + "loss": 0.2362, + "step": 11434 + }, + { + "epoch": 2.304251460809994, + "grad_norm": 0.051506511867046356, + "learning_rate": 6.0528506007889954e-05, + "loss": 0.1875, + "step": 11436 + }, + { + "epoch": 2.304654442877292, + "grad_norm": 0.053086865693330765, + "learning_rate": 6.0515477916091365e-05, + "loss": 0.1542, + "step": 11438 + }, + { + "epoch": 2.30505742494459, + "grad_norm": 0.05012943223118782, + "learning_rate": 6.050244907729005e-05, + "loss": 0.1647, + "step": 11440 + }, + { + "epoch": 2.305460407011888, + "grad_norm": 0.0971345528960228, + "learning_rate": 6.0489419492411534e-05, + "loss": 0.1882, + "step": 11442 + }, + { + "epoch": 2.305863389079186, + "grad_norm": 0.06509329378604889, + "learning_rate": 6.047638916238144e-05, + "loss": 0.2157, + "step": 11444 + }, + { + "epoch": 2.306266371146484, + "grad_norm": 0.09002260118722916, + "learning_rate": 6.046335808812543e-05, + "loss": 0.157, + "step": 11446 + }, + { + "epoch": 2.306669353213782, + "grad_norm": 0.03628845512866974, + "learning_rate": 6.045032627056918e-05, + "loss": 0.1655, + "step": 11448 + }, + { + "epoch": 2.30707233528108, + "grad_norm": 0.0490727573633194, + "learning_rate": 6.043729371063846e-05, + "loss": 0.2058, + "step": 11450 + }, + { + "epoch": 2.307475317348378, + "grad_norm": 0.03740858659148216, + "learning_rate": 6.04242604092591e-05, + "loss": 0.1525, + "step": 11452 + }, + { + "epoch": 2.307878299415676, + "grad_norm": 0.06834115087985992, + "learning_rate": 6.041122636735694e-05, + "loss": 0.224, + "step": 11454 + }, + { + "epoch": 2.308281281482974, + "grad_norm": 0.05891553685069084, + "learning_rate": 6.039819158585792e-05, + "loss": 0.2065, + "step": 11456 + }, + { + "epoch": 2.308684263550272, + "grad_norm": 0.04757963865995407, + "learning_rate": 6.0385156065687987e-05, + "loss": 0.1603, + "step": 11458 + }, + { + "epoch": 2.30908724561757, + "grad_norm": 0.03673839196562767, + "learning_rate": 6.037211980777318e-05, + "loss": 0.1729, + "step": 11460 + }, + { + "epoch": 2.309490227684868, + "grad_norm": 0.06139663606882095, + "learning_rate": 6.035908281303958e-05, + "loss": 0.1852, + "step": 11462 + }, + { + "epoch": 2.309893209752166, + "grad_norm": 0.034579966217279434, + "learning_rate": 6.0346045082413295e-05, + "loss": 0.1267, + "step": 11464 + }, + { + "epoch": 2.310296191819464, + "grad_norm": 0.0544976070523262, + "learning_rate": 6.033300661682051e-05, + "loss": 0.2015, + "step": 11466 + }, + { + "epoch": 2.310699173886762, + "grad_norm": 0.05644477531313896, + "learning_rate": 6.031996741718747e-05, + "loss": 0.156, + "step": 11468 + }, + { + "epoch": 2.31110215595406, + "grad_norm": 0.06838435679674149, + "learning_rate": 6.0306927484440434e-05, + "loss": 0.1519, + "step": 11470 + }, + { + "epoch": 2.3115051380213583, + "grad_norm": 0.04617998003959656, + "learning_rate": 6.029388681950576e-05, + "loss": 0.2111, + "step": 11472 + }, + { + "epoch": 2.311908120088656, + "grad_norm": 0.05172273889183998, + "learning_rate": 6.028084542330984e-05, + "loss": 0.2078, + "step": 11474 + }, + { + "epoch": 2.312311102155954, + "grad_norm": 0.07891248166561127, + "learning_rate": 6.026780329677909e-05, + "loss": 0.2416, + "step": 11476 + }, + { + "epoch": 2.312714084223252, + "grad_norm": 0.07931914180517197, + "learning_rate": 6.025476044084002e-05, + "loss": 0.2088, + "step": 11478 + }, + { + "epoch": 2.31311706629055, + "grad_norm": 0.06278335303068161, + "learning_rate": 6.024171685641917e-05, + "loss": 0.1975, + "step": 11480 + }, + { + "epoch": 2.313520048357848, + "grad_norm": 0.059438444674015045, + "learning_rate": 6.022867254444313e-05, + "loss": 0.1536, + "step": 11482 + }, + { + "epoch": 2.313923030425146, + "grad_norm": 0.08473483473062515, + "learning_rate": 6.021562750583854e-05, + "loss": 0.2057, + "step": 11484 + }, + { + "epoch": 2.314326012492444, + "grad_norm": 0.056876592338085175, + "learning_rate": 6.020258174153213e-05, + "loss": 0.2491, + "step": 11486 + }, + { + "epoch": 2.3147289945597422, + "grad_norm": 0.05166206881403923, + "learning_rate": 6.0189535252450614e-05, + "loss": 0.2241, + "step": 11488 + }, + { + "epoch": 2.31513197662704, + "grad_norm": 0.17782577872276306, + "learning_rate": 6.017648803952082e-05, + "loss": 0.2012, + "step": 11490 + }, + { + "epoch": 2.315534958694338, + "grad_norm": 0.06368952244520187, + "learning_rate": 6.0163440103669586e-05, + "loss": 0.246, + "step": 11492 + }, + { + "epoch": 2.315937940761636, + "grad_norm": 0.07152625173330307, + "learning_rate": 6.015039144582382e-05, + "loss": 0.1911, + "step": 11494 + }, + { + "epoch": 2.316340922828934, + "grad_norm": 0.061066433787345886, + "learning_rate": 6.0137342066910486e-05, + "loss": 0.1865, + "step": 11496 + }, + { + "epoch": 2.316743904896232, + "grad_norm": 0.05115800350904465, + "learning_rate": 6.0124291967856585e-05, + "loss": 0.1442, + "step": 11498 + }, + { + "epoch": 2.3171468869635303, + "grad_norm": 0.036835819482803345, + "learning_rate": 6.0111241149589156e-05, + "loss": 0.1624, + "step": 11500 + }, + { + "epoch": 2.3175498690308283, + "grad_norm": 0.03765489533543587, + "learning_rate": 6.0098189613035335e-05, + "loss": 0.1554, + "step": 11502 + }, + { + "epoch": 2.317952851098126, + "grad_norm": 0.0580357164144516, + "learning_rate": 6.008513735912229e-05, + "loss": 0.2215, + "step": 11504 + }, + { + "epoch": 2.318355833165424, + "grad_norm": 0.03914367035031319, + "learning_rate": 6.007208438877719e-05, + "loss": 0.1123, + "step": 11506 + }, + { + "epoch": 2.318758815232722, + "grad_norm": 0.05752947926521301, + "learning_rate": 6.005903070292733e-05, + "loss": 0.199, + "step": 11508 + }, + { + "epoch": 2.31916179730002, + "grad_norm": 0.06375788152217865, + "learning_rate": 6.004597630250003e-05, + "loss": 0.1791, + "step": 11510 + }, + { + "epoch": 2.319564779367318, + "grad_norm": 0.04775358736515045, + "learning_rate": 6.003292118842263e-05, + "loss": 0.1816, + "step": 11512 + }, + { + "epoch": 2.319967761434616, + "grad_norm": 0.07813245058059692, + "learning_rate": 6.001986536162255e-05, + "loss": 0.2239, + "step": 11514 + }, + { + "epoch": 2.3203707435019143, + "grad_norm": 0.07530572265386581, + "learning_rate": 6.000680882302727e-05, + "loss": 0.2423, + "step": 11516 + }, + { + "epoch": 2.3207737255692122, + "grad_norm": 0.05607306584715843, + "learning_rate": 5.999375157356428e-05, + "loss": 0.1857, + "step": 11518 + }, + { + "epoch": 2.32117670763651, + "grad_norm": 0.0654715746641159, + "learning_rate": 5.9980693614161175e-05, + "loss": 0.2488, + "step": 11520 + }, + { + "epoch": 2.321579689703808, + "grad_norm": 0.06305355578660965, + "learning_rate": 5.9967634945745555e-05, + "loss": 0.2018, + "step": 11522 + }, + { + "epoch": 2.321982671771106, + "grad_norm": 0.030810408294200897, + "learning_rate": 5.9954575569245086e-05, + "loss": 0.159, + "step": 11524 + }, + { + "epoch": 2.322385653838404, + "grad_norm": 0.07300093024969101, + "learning_rate": 5.9941515485587485e-05, + "loss": 0.2026, + "step": 11526 + }, + { + "epoch": 2.3227886359057024, + "grad_norm": 0.05360522121191025, + "learning_rate": 5.992845469570053e-05, + "loss": 0.2091, + "step": 11528 + }, + { + "epoch": 2.3231916179730003, + "grad_norm": 0.07325176149606705, + "learning_rate": 5.9915393200512024e-05, + "loss": 0.1802, + "step": 11530 + }, + { + "epoch": 2.3235946000402983, + "grad_norm": 0.06028595194220543, + "learning_rate": 5.990233100094985e-05, + "loss": 0.2081, + "step": 11532 + }, + { + "epoch": 2.323997582107596, + "grad_norm": 0.08523773401975632, + "learning_rate": 5.9889268097941907e-05, + "loss": 0.2146, + "step": 11534 + }, + { + "epoch": 2.324400564174894, + "grad_norm": 0.0636720284819603, + "learning_rate": 5.9876204492416185e-05, + "loss": 0.2062, + "step": 11536 + }, + { + "epoch": 2.324803546242192, + "grad_norm": 0.0519365556538105, + "learning_rate": 5.986314018530069e-05, + "loss": 0.1801, + "step": 11538 + }, + { + "epoch": 2.32520652830949, + "grad_norm": 0.059190113097429276, + "learning_rate": 5.985007517752349e-05, + "loss": 0.2202, + "step": 11540 + }, + { + "epoch": 2.3256095103767884, + "grad_norm": 0.05759743973612785, + "learning_rate": 5.9837009470012695e-05, + "loss": 0.1759, + "step": 11542 + }, + { + "epoch": 2.3260124924440864, + "grad_norm": 0.0786181166768074, + "learning_rate": 5.9823943063696484e-05, + "loss": 0.2165, + "step": 11544 + }, + { + "epoch": 2.3264154745113843, + "grad_norm": 0.07660158723592758, + "learning_rate": 5.9810875959503065e-05, + "loss": 0.1728, + "step": 11546 + }, + { + "epoch": 2.3268184565786822, + "grad_norm": 0.05472486466169357, + "learning_rate": 5.979780815836071e-05, + "loss": 0.2405, + "step": 11548 + }, + { + "epoch": 2.32722143864598, + "grad_norm": 0.05465126782655716, + "learning_rate": 5.9784739661197744e-05, + "loss": 0.185, + "step": 11550 + }, + { + "epoch": 2.327624420713278, + "grad_norm": 0.07266535609960556, + "learning_rate": 5.977167046894251e-05, + "loss": 0.2024, + "step": 11552 + }, + { + "epoch": 2.328027402780576, + "grad_norm": 0.05688047781586647, + "learning_rate": 5.975860058252343e-05, + "loss": 0.2392, + "step": 11554 + }, + { + "epoch": 2.3284303848478745, + "grad_norm": 0.05334145203232765, + "learning_rate": 5.9745530002868976e-05, + "loss": 0.2258, + "step": 11556 + }, + { + "epoch": 2.3288333669151724, + "grad_norm": 0.056741535663604736, + "learning_rate": 5.973245873090766e-05, + "loss": 0.1913, + "step": 11558 + }, + { + "epoch": 2.3292363489824703, + "grad_norm": 0.06352947652339935, + "learning_rate": 5.971938676756803e-05, + "loss": 0.164, + "step": 11560 + }, + { + "epoch": 2.3296393310497683, + "grad_norm": 0.054152458906173706, + "learning_rate": 5.970631411377872e-05, + "loss": 0.1804, + "step": 11562 + }, + { + "epoch": 2.330042313117066, + "grad_norm": 0.07217392325401306, + "learning_rate": 5.969324077046836e-05, + "loss": 0.2099, + "step": 11564 + }, + { + "epoch": 2.330445295184364, + "grad_norm": 0.05785232037305832, + "learning_rate": 5.968016673856569e-05, + "loss": 0.2028, + "step": 11566 + }, + { + "epoch": 2.330848277251662, + "grad_norm": 0.03307553008198738, + "learning_rate": 5.966709201899947e-05, + "loss": 0.1449, + "step": 11568 + }, + { + "epoch": 2.3312512593189605, + "grad_norm": 0.04345938190817833, + "learning_rate": 5.965401661269847e-05, + "loss": 0.1264, + "step": 11570 + }, + { + "epoch": 2.3316542413862584, + "grad_norm": 0.05565191060304642, + "learning_rate": 5.964094052059158e-05, + "loss": 0.1819, + "step": 11572 + }, + { + "epoch": 2.3320572234535564, + "grad_norm": 0.05930451303720474, + "learning_rate": 5.9627863743607694e-05, + "loss": 0.2198, + "step": 11574 + }, + { + "epoch": 2.3324602055208543, + "grad_norm": 0.051885005086660385, + "learning_rate": 5.961478628267576e-05, + "loss": 0.1606, + "step": 11576 + }, + { + "epoch": 2.3328631875881523, + "grad_norm": 0.07921456545591354, + "learning_rate": 5.960170813872479e-05, + "loss": 0.1337, + "step": 11578 + }, + { + "epoch": 2.33326616965545, + "grad_norm": 0.06802048534154892, + "learning_rate": 5.958862931268383e-05, + "loss": 0.2004, + "step": 11580 + }, + { + "epoch": 2.3336691517227486, + "grad_norm": 0.05401553958654404, + "learning_rate": 5.9575549805481976e-05, + "loss": 0.1612, + "step": 11582 + }, + { + "epoch": 2.3340721337900465, + "grad_norm": 0.05115654692053795, + "learning_rate": 5.956246961804838e-05, + "loss": 0.1779, + "step": 11584 + }, + { + "epoch": 2.3344751158573445, + "grad_norm": 0.05319082364439964, + "learning_rate": 5.954938875131224e-05, + "loss": 0.1613, + "step": 11586 + }, + { + "epoch": 2.3348780979246424, + "grad_norm": 0.058764997869729996, + "learning_rate": 5.953630720620278e-05, + "loss": 0.1842, + "step": 11588 + }, + { + "epoch": 2.3352810799919403, + "grad_norm": 0.05345417559146881, + "learning_rate": 5.952322498364933e-05, + "loss": 0.2541, + "step": 11590 + }, + { + "epoch": 2.3356840620592383, + "grad_norm": 0.07091548293828964, + "learning_rate": 5.951014208458118e-05, + "loss": 0.1976, + "step": 11592 + }, + { + "epoch": 2.3360870441265362, + "grad_norm": 0.048810895532369614, + "learning_rate": 5.949705850992775e-05, + "loss": 0.175, + "step": 11594 + }, + { + "epoch": 2.336490026193834, + "grad_norm": 0.042119260877370834, + "learning_rate": 5.948397426061849e-05, + "loss": 0.1994, + "step": 11596 + }, + { + "epoch": 2.3368930082611326, + "grad_norm": 0.050638552755117416, + "learning_rate": 5.947088933758286e-05, + "loss": 0.2191, + "step": 11598 + }, + { + "epoch": 2.3372959903284305, + "grad_norm": 0.05852693319320679, + "learning_rate": 5.9457803741750384e-05, + "loss": 0.2201, + "step": 11600 + }, + { + "epoch": 2.3376989723957284, + "grad_norm": 0.06796692311763763, + "learning_rate": 5.944471747405067e-05, + "loss": 0.2088, + "step": 11602 + }, + { + "epoch": 2.3381019544630264, + "grad_norm": 0.051638487726449966, + "learning_rate": 5.943163053541333e-05, + "loss": 0.181, + "step": 11604 + }, + { + "epoch": 2.3385049365303243, + "grad_norm": 0.05429547280073166, + "learning_rate": 5.941854292676803e-05, + "loss": 0.2498, + "step": 11606 + }, + { + "epoch": 2.3389079185976223, + "grad_norm": 0.041048940271139145, + "learning_rate": 5.9405454649044525e-05, + "loss": 0.1911, + "step": 11608 + }, + { + "epoch": 2.3393109006649206, + "grad_norm": 0.057707563042640686, + "learning_rate": 5.9392365703172534e-05, + "loss": 0.2572, + "step": 11610 + }, + { + "epoch": 2.3397138827322186, + "grad_norm": 0.037866026163101196, + "learning_rate": 5.9379276090081924e-05, + "loss": 0.1573, + "step": 11612 + }, + { + "epoch": 2.3401168647995165, + "grad_norm": 0.04118697717785835, + "learning_rate": 5.9366185810702545e-05, + "loss": 0.1889, + "step": 11614 + }, + { + "epoch": 2.3405198468668145, + "grad_norm": 0.06807113438844681, + "learning_rate": 5.9353094865964286e-05, + "loss": 0.2459, + "step": 11616 + }, + { + "epoch": 2.3409228289341124, + "grad_norm": 0.06002763658761978, + "learning_rate": 5.934000325679714e-05, + "loss": 0.1925, + "step": 11618 + }, + { + "epoch": 2.3413258110014104, + "grad_norm": 0.05739912763237953, + "learning_rate": 5.93269109841311e-05, + "loss": 0.2382, + "step": 11620 + }, + { + "epoch": 2.3417287930687083, + "grad_norm": 0.061665866523981094, + "learning_rate": 5.931381804889621e-05, + "loss": 0.1745, + "step": 11622 + }, + { + "epoch": 2.3421317751360062, + "grad_norm": 0.04462193325161934, + "learning_rate": 5.930072445202258e-05, + "loss": 0.1698, + "step": 11624 + }, + { + "epoch": 2.3425347572033046, + "grad_norm": 0.03827643766999245, + "learning_rate": 5.928763019444037e-05, + "loss": 0.1476, + "step": 11626 + }, + { + "epoch": 2.3429377392706026, + "grad_norm": 0.047175608575344086, + "learning_rate": 5.9274535277079756e-05, + "loss": 0.2098, + "step": 11628 + }, + { + "epoch": 2.3433407213379005, + "grad_norm": 0.042314786463975906, + "learning_rate": 5.926143970087099e-05, + "loss": 0.1944, + "step": 11630 + }, + { + "epoch": 2.3437437034051984, + "grad_norm": 0.03941023349761963, + "learning_rate": 5.924834346674437e-05, + "loss": 0.1135, + "step": 11632 + }, + { + "epoch": 2.3441466854724964, + "grad_norm": 0.09028346836566925, + "learning_rate": 5.923524657563021e-05, + "loss": 0.2228, + "step": 11634 + }, + { + "epoch": 2.3445496675397943, + "grad_norm": 0.06249230355024338, + "learning_rate": 5.922214902845891e-05, + "loss": 0.1605, + "step": 11636 + }, + { + "epoch": 2.3449526496070927, + "grad_norm": 0.051382407546043396, + "learning_rate": 5.920905082616088e-05, + "loss": 0.15, + "step": 11638 + }, + { + "epoch": 2.3453556316743907, + "grad_norm": 0.05022916570305824, + "learning_rate": 5.919595196966662e-05, + "loss": 0.1367, + "step": 11640 + }, + { + "epoch": 2.3457586137416886, + "grad_norm": 0.07349605113267899, + "learning_rate": 5.918285245990662e-05, + "loss": 0.244, + "step": 11642 + }, + { + "epoch": 2.3461615958089865, + "grad_norm": 0.05159524455666542, + "learning_rate": 5.9169752297811484e-05, + "loss": 0.1832, + "step": 11644 + }, + { + "epoch": 2.3465645778762845, + "grad_norm": 0.050891030579805374, + "learning_rate": 5.915665148431181e-05, + "loss": 0.2035, + "step": 11646 + }, + { + "epoch": 2.3469675599435824, + "grad_norm": 0.06695982813835144, + "learning_rate": 5.914355002033825e-05, + "loss": 0.2231, + "step": 11648 + }, + { + "epoch": 2.3473705420108804, + "grad_norm": 0.06656850874423981, + "learning_rate": 5.913044790682153e-05, + "loss": 0.1957, + "step": 11650 + }, + { + "epoch": 2.3477735240781783, + "grad_norm": 0.07416415959596634, + "learning_rate": 5.9117345144692384e-05, + "loss": 0.2301, + "step": 11652 + }, + { + "epoch": 2.3481765061454767, + "grad_norm": 0.05558010935783386, + "learning_rate": 5.9104241734881626e-05, + "loss": 0.215, + "step": 11654 + }, + { + "epoch": 2.3485794882127746, + "grad_norm": 0.04689667001366615, + "learning_rate": 5.9091137678320087e-05, + "loss": 0.2222, + "step": 11656 + }, + { + "epoch": 2.3489824702800726, + "grad_norm": 0.05192426219582558, + "learning_rate": 5.907803297593867e-05, + "loss": 0.1966, + "step": 11658 + }, + { + "epoch": 2.3493854523473705, + "grad_norm": 0.04759860783815384, + "learning_rate": 5.906492762866831e-05, + "loss": 0.191, + "step": 11660 + }, + { + "epoch": 2.3497884344146684, + "grad_norm": 0.071357861161232, + "learning_rate": 5.9051821637439984e-05, + "loss": 0.1829, + "step": 11662 + }, + { + "epoch": 2.3501914164819664, + "grad_norm": 0.06483819335699081, + "learning_rate": 5.903871500318473e-05, + "loss": 0.2303, + "step": 11664 + }, + { + "epoch": 2.3505943985492648, + "grad_norm": 0.04689923673868179, + "learning_rate": 5.902560772683362e-05, + "loss": 0.1671, + "step": 11666 + }, + { + "epoch": 2.3509973806165627, + "grad_norm": 0.07678768038749695, + "learning_rate": 5.901249980931777e-05, + "loss": 0.202, + "step": 11668 + }, + { + "epoch": 2.3514003626838607, + "grad_norm": 0.052997030317783356, + "learning_rate": 5.8999391251568336e-05, + "loss": 0.2071, + "step": 11670 + }, + { + "epoch": 2.3518033447511586, + "grad_norm": 0.061645928770303726, + "learning_rate": 5.898628205451655e-05, + "loss": 0.1533, + "step": 11672 + }, + { + "epoch": 2.3522063268184565, + "grad_norm": 0.047521840780973434, + "learning_rate": 5.897317221909367e-05, + "loss": 0.1957, + "step": 11674 + }, + { + "epoch": 2.3526093088857545, + "grad_norm": 0.06109415739774704, + "learning_rate": 5.896006174623094e-05, + "loss": 0.1608, + "step": 11676 + }, + { + "epoch": 2.3530122909530524, + "grad_norm": 0.062129903584718704, + "learning_rate": 5.89469506368598e-05, + "loss": 0.1885, + "step": 11678 + }, + { + "epoch": 2.3534152730203504, + "grad_norm": 0.06269177794456482, + "learning_rate": 5.893383889191158e-05, + "loss": 0.2023, + "step": 11680 + }, + { + "epoch": 2.3538182550876487, + "grad_norm": 0.04957219213247299, + "learning_rate": 5.892072651231774e-05, + "loss": 0.1842, + "step": 11682 + }, + { + "epoch": 2.3542212371549467, + "grad_norm": 0.04616183042526245, + "learning_rate": 5.890761349900974e-05, + "loss": 0.2155, + "step": 11684 + }, + { + "epoch": 2.3546242192222446, + "grad_norm": 0.042560383677482605, + "learning_rate": 5.889449985291913e-05, + "loss": 0.2049, + "step": 11686 + }, + { + "epoch": 2.3550272012895426, + "grad_norm": 0.048294831067323685, + "learning_rate": 5.8881385574977485e-05, + "loss": 0.2134, + "step": 11688 + }, + { + "epoch": 2.3554301833568405, + "grad_norm": 0.08637408167123795, + "learning_rate": 5.88682706661164e-05, + "loss": 0.1966, + "step": 11690 + }, + { + "epoch": 2.3558331654241385, + "grad_norm": 0.035582225769758224, + "learning_rate": 5.885515512726755e-05, + "loss": 0.1546, + "step": 11692 + }, + { + "epoch": 2.356236147491437, + "grad_norm": 0.0642366036772728, + "learning_rate": 5.8842038959362656e-05, + "loss": 0.2135, + "step": 11694 + }, + { + "epoch": 2.356639129558735, + "grad_norm": 0.04640725627541542, + "learning_rate": 5.882892216333343e-05, + "loss": 0.1381, + "step": 11696 + }, + { + "epoch": 2.3570421116260327, + "grad_norm": 0.046785078942775726, + "learning_rate": 5.881580474011171e-05, + "loss": 0.2211, + "step": 11698 + }, + { + "epoch": 2.3574450936933307, + "grad_norm": 0.045676589012145996, + "learning_rate": 5.880268669062933e-05, + "loss": 0.1763, + "step": 11700 + }, + { + "epoch": 2.3578480757606286, + "grad_norm": 0.05108589679002762, + "learning_rate": 5.878956801581814e-05, + "loss": 0.1987, + "step": 11702 + }, + { + "epoch": 2.3582510578279265, + "grad_norm": 0.06899011135101318, + "learning_rate": 5.8776448716610114e-05, + "loss": 0.1831, + "step": 11704 + }, + { + "epoch": 2.3586540398952245, + "grad_norm": 0.05484525114297867, + "learning_rate": 5.87633287939372e-05, + "loss": 0.217, + "step": 11706 + }, + { + "epoch": 2.3590570219625224, + "grad_norm": 0.11611830443143845, + "learning_rate": 5.875020824873142e-05, + "loss": 0.1576, + "step": 11708 + }, + { + "epoch": 2.359460004029821, + "grad_norm": 0.06362280249595642, + "learning_rate": 5.8737087081924845e-05, + "loss": 0.1907, + "step": 11710 + }, + { + "epoch": 2.3598629860971188, + "grad_norm": 0.05416010320186615, + "learning_rate": 5.872396529444958e-05, + "loss": 0.1945, + "step": 11712 + }, + { + "epoch": 2.3602659681644167, + "grad_norm": 0.06739898771047592, + "learning_rate": 5.871084288723776e-05, + "loss": 0.2166, + "step": 11714 + }, + { + "epoch": 2.3606689502317146, + "grad_norm": 0.05630958080291748, + "learning_rate": 5.86977198612216e-05, + "loss": 0.1847, + "step": 11716 + }, + { + "epoch": 2.3610719322990126, + "grad_norm": 0.061415087431669235, + "learning_rate": 5.8684596217333346e-05, + "loss": 0.2264, + "step": 11718 + }, + { + "epoch": 2.3614749143663105, + "grad_norm": 0.05699741095304489, + "learning_rate": 5.867147195650524e-05, + "loss": 0.1604, + "step": 11720 + }, + { + "epoch": 2.361877896433609, + "grad_norm": 0.05686968192458153, + "learning_rate": 5.865834707966964e-05, + "loss": 0.1843, + "step": 11722 + }, + { + "epoch": 2.362280878500907, + "grad_norm": 0.053936101496219635, + "learning_rate": 5.864522158775892e-05, + "loss": 0.2176, + "step": 11724 + }, + { + "epoch": 2.362683860568205, + "grad_norm": 0.04711095988750458, + "learning_rate": 5.8632095481705486e-05, + "loss": 0.1868, + "step": 11726 + }, + { + "epoch": 2.3630868426355027, + "grad_norm": 0.05970948934555054, + "learning_rate": 5.861896876244178e-05, + "loss": 0.1718, + "step": 11728 + }, + { + "epoch": 2.3634898247028007, + "grad_norm": 0.04749145731329918, + "learning_rate": 5.860584143090033e-05, + "loss": 0.1736, + "step": 11730 + }, + { + "epoch": 2.3638928067700986, + "grad_norm": 0.042695820331573486, + "learning_rate": 5.859271348801366e-05, + "loss": 0.1696, + "step": 11732 + }, + { + "epoch": 2.3642957888373966, + "grad_norm": 0.046593207865953445, + "learning_rate": 5.857958493471437e-05, + "loss": 0.1662, + "step": 11734 + }, + { + "epoch": 2.3646987709046945, + "grad_norm": 0.0542333722114563, + "learning_rate": 5.8566455771935094e-05, + "loss": 0.1947, + "step": 11736 + }, + { + "epoch": 2.365101752971993, + "grad_norm": 0.04733563959598541, + "learning_rate": 5.8553326000608487e-05, + "loss": 0.197, + "step": 11738 + }, + { + "epoch": 2.365504735039291, + "grad_norm": 0.047249022871255875, + "learning_rate": 5.854019562166728e-05, + "loss": 0.1504, + "step": 11740 + }, + { + "epoch": 2.3659077171065888, + "grad_norm": 0.05582365393638611, + "learning_rate": 5.852706463604425e-05, + "loss": 0.1979, + "step": 11742 + }, + { + "epoch": 2.3663106991738867, + "grad_norm": 0.052134282886981964, + "learning_rate": 5.8513933044672164e-05, + "loss": 0.2014, + "step": 11744 + }, + { + "epoch": 2.3667136812411846, + "grad_norm": 0.038980018347501755, + "learning_rate": 5.8500800848483895e-05, + "loss": 0.149, + "step": 11746 + }, + { + "epoch": 2.3671166633084826, + "grad_norm": 0.055582620203495026, + "learning_rate": 5.848766804841235e-05, + "loss": 0.1674, + "step": 11748 + }, + { + "epoch": 2.367519645375781, + "grad_norm": 0.0617508664727211, + "learning_rate": 5.847453464539041e-05, + "loss": 0.2204, + "step": 11750 + }, + { + "epoch": 2.367922627443079, + "grad_norm": 0.08900310844182968, + "learning_rate": 5.84614006403511e-05, + "loss": 0.1942, + "step": 11752 + }, + { + "epoch": 2.368325609510377, + "grad_norm": 0.06451424211263657, + "learning_rate": 5.844826603422743e-05, + "loss": 0.2078, + "step": 11754 + }, + { + "epoch": 2.368728591577675, + "grad_norm": 0.045638687908649445, + "learning_rate": 5.8435130827952433e-05, + "loss": 0.2024, + "step": 11756 + }, + { + "epoch": 2.3691315736449727, + "grad_norm": 0.04450851306319237, + "learning_rate": 5.8421995022459245e-05, + "loss": 0.1851, + "step": 11758 + }, + { + "epoch": 2.3695345557122707, + "grad_norm": 0.05700334906578064, + "learning_rate": 5.8408858618680984e-05, + "loss": 0.1776, + "step": 11760 + }, + { + "epoch": 2.3699375377795686, + "grad_norm": 0.060567259788513184, + "learning_rate": 5.839572161755087e-05, + "loss": 0.1426, + "step": 11762 + }, + { + "epoch": 2.370340519846867, + "grad_norm": 0.07149244099855423, + "learning_rate": 5.8382584020002116e-05, + "loss": 0.2277, + "step": 11764 + }, + { + "epoch": 2.370743501914165, + "grad_norm": 0.06128925085067749, + "learning_rate": 5.8369445826968e-05, + "loss": 0.1954, + "step": 11766 + }, + { + "epoch": 2.371146483981463, + "grad_norm": 0.05819423869252205, + "learning_rate": 5.8356307039381816e-05, + "loss": 0.2056, + "step": 11768 + }, + { + "epoch": 2.371549466048761, + "grad_norm": 0.048833053559064865, + "learning_rate": 5.834316765817698e-05, + "loss": 0.2106, + "step": 11770 + }, + { + "epoch": 2.3719524481160588, + "grad_norm": 0.0524032786488533, + "learning_rate": 5.833002768428683e-05, + "loss": 0.1701, + "step": 11772 + }, + { + "epoch": 2.3723554301833567, + "grad_norm": 0.05456862598657608, + "learning_rate": 5.8316887118644835e-05, + "loss": 0.2064, + "step": 11774 + }, + { + "epoch": 2.372758412250655, + "grad_norm": 0.062148381024599075, + "learning_rate": 5.83037459621845e-05, + "loss": 0.2047, + "step": 11776 + }, + { + "epoch": 2.373161394317953, + "grad_norm": 0.06895274668931961, + "learning_rate": 5.8290604215839314e-05, + "loss": 0.188, + "step": 11778 + }, + { + "epoch": 2.373564376385251, + "grad_norm": 0.060913410037755966, + "learning_rate": 5.8277461880542864e-05, + "loss": 0.2027, + "step": 11780 + }, + { + "epoch": 2.373967358452549, + "grad_norm": 0.06738603115081787, + "learning_rate": 5.826431895722877e-05, + "loss": 0.1767, + "step": 11782 + }, + { + "epoch": 2.374370340519847, + "grad_norm": 0.07685627043247223, + "learning_rate": 5.8251175446830677e-05, + "loss": 0.2092, + "step": 11784 + }, + { + "epoch": 2.374773322587145, + "grad_norm": 0.10875460505485535, + "learning_rate": 5.823803135028226e-05, + "loss": 0.2403, + "step": 11786 + }, + { + "epoch": 2.3751763046544427, + "grad_norm": 0.049758244305849075, + "learning_rate": 5.8224886668517285e-05, + "loss": 0.2044, + "step": 11788 + }, + { + "epoch": 2.3755792867217407, + "grad_norm": 0.03598930686712265, + "learning_rate": 5.8211741402469496e-05, + "loss": 0.1232, + "step": 11790 + }, + { + "epoch": 2.375982268789039, + "grad_norm": 0.060612574219703674, + "learning_rate": 5.8198595553072746e-05, + "loss": 0.1988, + "step": 11792 + }, + { + "epoch": 2.376385250856337, + "grad_norm": 0.047952961176633835, + "learning_rate": 5.818544912126089e-05, + "loss": 0.1905, + "step": 11794 + }, + { + "epoch": 2.376788232923635, + "grad_norm": 0.05651276931166649, + "learning_rate": 5.8172302107967804e-05, + "loss": 0.2082, + "step": 11796 + }, + { + "epoch": 2.377191214990933, + "grad_norm": 0.05601438134908676, + "learning_rate": 5.8159154514127435e-05, + "loss": 0.2004, + "step": 11798 + }, + { + "epoch": 2.377594197058231, + "grad_norm": 0.058857470750808716, + "learning_rate": 5.81460063406738e-05, + "loss": 0.1537, + "step": 11800 + }, + { + "epoch": 2.3779971791255288, + "grad_norm": 0.05285648629069328, + "learning_rate": 5.813285758854089e-05, + "loss": 0.1686, + "step": 11802 + }, + { + "epoch": 2.378400161192827, + "grad_norm": 0.03570474684238434, + "learning_rate": 5.811970825866279e-05, + "loss": 0.1495, + "step": 11804 + }, + { + "epoch": 2.378803143260125, + "grad_norm": 0.05308526009321213, + "learning_rate": 5.8106558351973606e-05, + "loss": 0.1985, + "step": 11806 + }, + { + "epoch": 2.379206125327423, + "grad_norm": 0.04530463367700577, + "learning_rate": 5.8093407869407466e-05, + "loss": 0.1672, + "step": 11808 + }, + { + "epoch": 2.379609107394721, + "grad_norm": 0.05302588269114494, + "learning_rate": 5.808025681189857e-05, + "loss": 0.1831, + "step": 11810 + }, + { + "epoch": 2.380012089462019, + "grad_norm": 0.07352914661169052, + "learning_rate": 5.8067105180381174e-05, + "loss": 0.2249, + "step": 11812 + }, + { + "epoch": 2.380415071529317, + "grad_norm": 0.045757245272397995, + "learning_rate": 5.8053952975789516e-05, + "loss": 0.1529, + "step": 11814 + }, + { + "epoch": 2.380818053596615, + "grad_norm": 0.06004302576184273, + "learning_rate": 5.804080019905792e-05, + "loss": 0.2177, + "step": 11816 + }, + { + "epoch": 2.3812210356639127, + "grad_norm": 0.06710556149482727, + "learning_rate": 5.802764685112074e-05, + "loss": 0.2256, + "step": 11818 + }, + { + "epoch": 2.381624017731211, + "grad_norm": 0.057796910405159, + "learning_rate": 5.8014492932912354e-05, + "loss": 0.2327, + "step": 11820 + }, + { + "epoch": 2.382026999798509, + "grad_norm": 0.04096395522356033, + "learning_rate": 5.800133844536723e-05, + "loss": 0.213, + "step": 11822 + }, + { + "epoch": 2.382429981865807, + "grad_norm": 0.04763369262218475, + "learning_rate": 5.79881833894198e-05, + "loss": 0.2442, + "step": 11824 + }, + { + "epoch": 2.382832963933105, + "grad_norm": 0.04647963494062424, + "learning_rate": 5.797502776600461e-05, + "loss": 0.1827, + "step": 11826 + }, + { + "epoch": 2.383235946000403, + "grad_norm": 0.05235551670193672, + "learning_rate": 5.796187157605619e-05, + "loss": 0.1772, + "step": 11828 + }, + { + "epoch": 2.383638928067701, + "grad_norm": 0.039118025451898575, + "learning_rate": 5.7948714820509155e-05, + "loss": 0.19, + "step": 11830 + }, + { + "epoch": 2.384041910134999, + "grad_norm": 0.0516664981842041, + "learning_rate": 5.7935557500298124e-05, + "loss": 0.1388, + "step": 11832 + }, + { + "epoch": 2.384444892202297, + "grad_norm": 0.049341436475515366, + "learning_rate": 5.792239961635779e-05, + "loss": 0.1862, + "step": 11834 + }, + { + "epoch": 2.384847874269595, + "grad_norm": 0.06240430846810341, + "learning_rate": 5.7909241169622844e-05, + "loss": 0.2013, + "step": 11836 + }, + { + "epoch": 2.385250856336893, + "grad_norm": 0.0566558800637722, + "learning_rate": 5.789608216102805e-05, + "loss": 0.1829, + "step": 11838 + }, + { + "epoch": 2.385653838404191, + "grad_norm": 0.05318170040845871, + "learning_rate": 5.788292259150823e-05, + "loss": 0.1576, + "step": 11840 + }, + { + "epoch": 2.386056820471489, + "grad_norm": 0.04496678337454796, + "learning_rate": 5.786976246199818e-05, + "loss": 0.1916, + "step": 11842 + }, + { + "epoch": 2.386459802538787, + "grad_norm": 0.05971567705273628, + "learning_rate": 5.78566017734328e-05, + "loss": 0.2041, + "step": 11844 + }, + { + "epoch": 2.386862784606085, + "grad_norm": 0.05461606755852699, + "learning_rate": 5.7843440526746986e-05, + "loss": 0.2, + "step": 11846 + }, + { + "epoch": 2.387265766673383, + "grad_norm": 0.046182163059711456, + "learning_rate": 5.78302787228757e-05, + "loss": 0.1676, + "step": 11848 + }, + { + "epoch": 2.387668748740681, + "grad_norm": 0.04418657720088959, + "learning_rate": 5.781711636275393e-05, + "loss": 0.1601, + "step": 11850 + }, + { + "epoch": 2.388071730807979, + "grad_norm": 0.05287107080221176, + "learning_rate": 5.780395344731674e-05, + "loss": 0.2134, + "step": 11852 + }, + { + "epoch": 2.388474712875277, + "grad_norm": 0.049565572291612625, + "learning_rate": 5.779078997749916e-05, + "loss": 0.2243, + "step": 11854 + }, + { + "epoch": 2.388877694942575, + "grad_norm": 0.06569905579090118, + "learning_rate": 5.777762595423631e-05, + "loss": 0.2142, + "step": 11856 + }, + { + "epoch": 2.389280677009873, + "grad_norm": 0.049043066799640656, + "learning_rate": 5.776446137846337e-05, + "loss": 0.166, + "step": 11858 + }, + { + "epoch": 2.3896836590771713, + "grad_norm": 0.05483182892203331, + "learning_rate": 5.775129625111551e-05, + "loss": 0.2148, + "step": 11860 + }, + { + "epoch": 2.3900866411444692, + "grad_norm": 0.11051026731729507, + "learning_rate": 5.773813057312795e-05, + "loss": 0.2218, + "step": 11862 + }, + { + "epoch": 2.390489623211767, + "grad_norm": 0.08356950432062149, + "learning_rate": 5.7724964345435976e-05, + "loss": 0.2183, + "step": 11864 + }, + { + "epoch": 2.390892605279065, + "grad_norm": 0.05710029602050781, + "learning_rate": 5.771179756897488e-05, + "loss": 0.2078, + "step": 11866 + }, + { + "epoch": 2.391295587346363, + "grad_norm": 0.052062198519706726, + "learning_rate": 5.769863024468002e-05, + "loss": 0.1888, + "step": 11868 + }, + { + "epoch": 2.391698569413661, + "grad_norm": 0.0480102002620697, + "learning_rate": 5.7685462373486796e-05, + "loss": 0.2023, + "step": 11870 + }, + { + "epoch": 2.392101551480959, + "grad_norm": 0.05226249620318413, + "learning_rate": 5.7672293956330603e-05, + "loss": 0.193, + "step": 11872 + }, + { + "epoch": 2.392504533548257, + "grad_norm": 0.054428581148386, + "learning_rate": 5.765912499414691e-05, + "loss": 0.1692, + "step": 11874 + }, + { + "epoch": 2.3929075156155553, + "grad_norm": 0.06447558850049973, + "learning_rate": 5.764595548787124e-05, + "loss": 0.225, + "step": 11876 + }, + { + "epoch": 2.393310497682853, + "grad_norm": 0.04479379206895828, + "learning_rate": 5.763278543843912e-05, + "loss": 0.2317, + "step": 11878 + }, + { + "epoch": 2.393713479750151, + "grad_norm": 0.057004962116479874, + "learning_rate": 5.761961484678612e-05, + "loss": 0.2318, + "step": 11880 + }, + { + "epoch": 2.394116461817449, + "grad_norm": 0.058323029428720474, + "learning_rate": 5.760644371384788e-05, + "loss": 0.1903, + "step": 11882 + }, + { + "epoch": 2.394519443884747, + "grad_norm": 0.06854918599128723, + "learning_rate": 5.759327204056003e-05, + "loss": 0.237, + "step": 11884 + }, + { + "epoch": 2.394922425952045, + "grad_norm": 0.06595735251903534, + "learning_rate": 5.758009982785829e-05, + "loss": 0.1988, + "step": 11886 + }, + { + "epoch": 2.3953254080193433, + "grad_norm": 0.06002083048224449, + "learning_rate": 5.756692707667837e-05, + "loss": 0.1646, + "step": 11888 + }, + { + "epoch": 2.3957283900866413, + "grad_norm": 0.04946382716298103, + "learning_rate": 5.755375378795604e-05, + "loss": 0.2111, + "step": 11890 + }, + { + "epoch": 2.3961313721539392, + "grad_norm": 0.06664959341287613, + "learning_rate": 5.754057996262715e-05, + "loss": 0.1618, + "step": 11892 + }, + { + "epoch": 2.396534354221237, + "grad_norm": 0.06098358705639839, + "learning_rate": 5.752740560162751e-05, + "loss": 0.2055, + "step": 11894 + }, + { + "epoch": 2.396937336288535, + "grad_norm": 0.08736960589885712, + "learning_rate": 5.7514230705893e-05, + "loss": 0.252, + "step": 11896 + }, + { + "epoch": 2.397340318355833, + "grad_norm": 0.05731016770005226, + "learning_rate": 5.750105527635957e-05, + "loss": 0.1637, + "step": 11898 + }, + { + "epoch": 2.397743300423131, + "grad_norm": 0.06296495348215103, + "learning_rate": 5.748787931396317e-05, + "loss": 0.2017, + "step": 11900 + }, + { + "epoch": 2.398146282490429, + "grad_norm": 0.04558323696255684, + "learning_rate": 5.747470281963979e-05, + "loss": 0.1786, + "step": 11902 + }, + { + "epoch": 2.3985492645577273, + "grad_norm": 0.05776946246623993, + "learning_rate": 5.746152579432549e-05, + "loss": 0.1729, + "step": 11904 + }, + { + "epoch": 2.3989522466250253, + "grad_norm": 0.0603502131998539, + "learning_rate": 5.744834823895632e-05, + "loss": 0.1867, + "step": 11906 + }, + { + "epoch": 2.399355228692323, + "grad_norm": 0.06661641597747803, + "learning_rate": 5.74351701544684e-05, + "loss": 0.1869, + "step": 11908 + }, + { + "epoch": 2.399758210759621, + "grad_norm": 0.058900561183691025, + "learning_rate": 5.742199154179789e-05, + "loss": 0.166, + "step": 11910 + }, + { + "epoch": 2.400161192826919, + "grad_norm": 0.059498026967048645, + "learning_rate": 5.740881240188097e-05, + "loss": 0.1944, + "step": 11912 + }, + { + "epoch": 2.400564174894217, + "grad_norm": 0.04069728031754494, + "learning_rate": 5.739563273565386e-05, + "loss": 0.1524, + "step": 11914 + }, + { + "epoch": 2.4009671569615154, + "grad_norm": 0.057217568159103394, + "learning_rate": 5.7382452544052844e-05, + "loss": 0.1966, + "step": 11916 + }, + { + "epoch": 2.4013701390288134, + "grad_norm": 0.08511929214000702, + "learning_rate": 5.736927182801419e-05, + "loss": 0.228, + "step": 11918 + }, + { + "epoch": 2.4017731210961113, + "grad_norm": 0.06262311339378357, + "learning_rate": 5.7356090588474254e-05, + "loss": 0.231, + "step": 11920 + }, + { + "epoch": 2.4021761031634092, + "grad_norm": 0.04965263605117798, + "learning_rate": 5.7342908826369414e-05, + "loss": 0.1729, + "step": 11922 + }, + { + "epoch": 2.402579085230707, + "grad_norm": 0.057901497930288315, + "learning_rate": 5.7329726542636064e-05, + "loss": 0.1913, + "step": 11924 + }, + { + "epoch": 2.402982067298005, + "grad_norm": 0.06473572552204132, + "learning_rate": 5.731654373821066e-05, + "loss": 0.2132, + "step": 11926 + }, + { + "epoch": 2.403385049365303, + "grad_norm": 0.04945135489106178, + "learning_rate": 5.7303360414029706e-05, + "loss": 0.1491, + "step": 11928 + }, + { + "epoch": 2.403788031432601, + "grad_norm": 0.05536174029111862, + "learning_rate": 5.72901765710297e-05, + "loss": 0.1903, + "step": 11930 + }, + { + "epoch": 2.4041910134998994, + "grad_norm": 0.061889227479696274, + "learning_rate": 5.727699221014719e-05, + "loss": 0.2038, + "step": 11932 + }, + { + "epoch": 2.4045939955671973, + "grad_norm": 0.04592079669237137, + "learning_rate": 5.726380733231882e-05, + "loss": 0.1755, + "step": 11934 + }, + { + "epoch": 2.4049969776344953, + "grad_norm": 0.054935865104198456, + "learning_rate": 5.725062193848119e-05, + "loss": 0.2201, + "step": 11936 + }, + { + "epoch": 2.405399959701793, + "grad_norm": 0.049928389489650726, + "learning_rate": 5.723743602957096e-05, + "loss": 0.1442, + "step": 11938 + }, + { + "epoch": 2.405802941769091, + "grad_norm": 0.05707401782274246, + "learning_rate": 5.722424960652486e-05, + "loss": 0.1478, + "step": 11940 + }, + { + "epoch": 2.406205923836389, + "grad_norm": 0.08371180295944214, + "learning_rate": 5.7211062670279615e-05, + "loss": 0.1762, + "step": 11942 + }, + { + "epoch": 2.4066089059036875, + "grad_norm": 0.060301292687654495, + "learning_rate": 5.7197875221772004e-05, + "loss": 0.1517, + "step": 11944 + }, + { + "epoch": 2.4070118879709854, + "grad_norm": 0.05720106512308121, + "learning_rate": 5.718468726193886e-05, + "loss": 0.156, + "step": 11946 + }, + { + "epoch": 2.4074148700382834, + "grad_norm": 0.0571570098400116, + "learning_rate": 5.7171498791717014e-05, + "loss": 0.1617, + "step": 11948 + }, + { + "epoch": 2.4078178521055813, + "grad_norm": 0.03223549202084541, + "learning_rate": 5.7158309812043374e-05, + "loss": 0.1351, + "step": 11950 + }, + { + "epoch": 2.4082208341728792, + "grad_norm": 0.06487289816141129, + "learning_rate": 5.714512032385485e-05, + "loss": 0.1887, + "step": 11952 + }, + { + "epoch": 2.408623816240177, + "grad_norm": 0.6603782773017883, + "learning_rate": 5.71319303280884e-05, + "loss": 0.2058, + "step": 11954 + }, + { + "epoch": 2.409026798307475, + "grad_norm": 0.04995949938893318, + "learning_rate": 5.7118739825681035e-05, + "loss": 0.1826, + "step": 11956 + }, + { + "epoch": 2.4094297803747735, + "grad_norm": 0.05377180874347687, + "learning_rate": 5.710554881756976e-05, + "loss": 0.1987, + "step": 11958 + }, + { + "epoch": 2.4098327624420715, + "grad_norm": 0.048319052904844284, + "learning_rate": 5.709235730469168e-05, + "loss": 0.1767, + "step": 11960 + }, + { + "epoch": 2.4102357445093694, + "grad_norm": 0.05564803257584572, + "learning_rate": 5.707916528798387e-05, + "loss": 0.1956, + "step": 11962 + }, + { + "epoch": 2.4106387265766673, + "grad_norm": 0.061358992010354996, + "learning_rate": 5.706597276838348e-05, + "loss": 0.2277, + "step": 11964 + }, + { + "epoch": 2.4110417086439653, + "grad_norm": 0.04426693171262741, + "learning_rate": 5.7052779746827675e-05, + "loss": 0.1647, + "step": 11966 + }, + { + "epoch": 2.411444690711263, + "grad_norm": 0.0576111376285553, + "learning_rate": 5.7039586224253704e-05, + "loss": 0.179, + "step": 11968 + }, + { + "epoch": 2.4118476727785616, + "grad_norm": 0.04460928216576576, + "learning_rate": 5.7026392201598766e-05, + "loss": 0.1597, + "step": 11970 + }, + { + "epoch": 2.4122506548458595, + "grad_norm": 0.09354057163000107, + "learning_rate": 5.701319767980016e-05, + "loss": 0.168, + "step": 11972 + }, + { + "epoch": 2.4126536369131575, + "grad_norm": 0.06683609634637833, + "learning_rate": 5.700000265979522e-05, + "loss": 0.2157, + "step": 11974 + }, + { + "epoch": 2.4130566189804554, + "grad_norm": 0.06345374882221222, + "learning_rate": 5.698680714252127e-05, + "loss": 0.1864, + "step": 11976 + }, + { + "epoch": 2.4134596010477534, + "grad_norm": 0.041688140481710434, + "learning_rate": 5.6973611128915714e-05, + "loss": 0.1656, + "step": 11978 + }, + { + "epoch": 2.4138625831150513, + "grad_norm": 0.09406961500644684, + "learning_rate": 5.696041461991599e-05, + "loss": 0.1683, + "step": 11980 + }, + { + "epoch": 2.4142655651823492, + "grad_norm": 0.07213851064443588, + "learning_rate": 5.6947217616459536e-05, + "loss": 0.181, + "step": 11982 + }, + { + "epoch": 2.414668547249647, + "grad_norm": 0.05010436475276947, + "learning_rate": 5.693402011948385e-05, + "loss": 0.167, + "step": 11984 + }, + { + "epoch": 2.4150715293169456, + "grad_norm": 0.06185581907629967, + "learning_rate": 5.692082212992648e-05, + "loss": 0.1942, + "step": 11986 + }, + { + "epoch": 2.4154745113842435, + "grad_norm": 0.04987217113375664, + "learning_rate": 5.6907623648724963e-05, + "loss": 0.1844, + "step": 11988 + }, + { + "epoch": 2.4158774934515415, + "grad_norm": 0.0516745001077652, + "learning_rate": 5.689442467681691e-05, + "loss": 0.2261, + "step": 11990 + }, + { + "epoch": 2.4162804755188394, + "grad_norm": 0.07256913185119629, + "learning_rate": 5.6881225215139947e-05, + "loss": 0.1793, + "step": 11992 + }, + { + "epoch": 2.4166834575861373, + "grad_norm": 0.040182922035455704, + "learning_rate": 5.6868025264631755e-05, + "loss": 0.1632, + "step": 11994 + }, + { + "epoch": 2.4170864396534353, + "grad_norm": 0.09456195682287216, + "learning_rate": 5.6854824826230024e-05, + "loss": 0.2299, + "step": 11996 + }, + { + "epoch": 2.4174894217207337, + "grad_norm": 0.057920172810554504, + "learning_rate": 5.684162390087252e-05, + "loss": 0.1902, + "step": 11998 + }, + { + "epoch": 2.4178924037880316, + "grad_norm": 0.10728804767131805, + "learning_rate": 5.682842248949698e-05, + "loss": 0.2152, + "step": 12000 + }, + { + "epoch": 2.4182953858553295, + "grad_norm": 0.04487679898738861, + "learning_rate": 5.681522059304123e-05, + "loss": 0.1691, + "step": 12002 + }, + { + "epoch": 2.4186983679226275, + "grad_norm": 0.05849164351820946, + "learning_rate": 5.6802018212443105e-05, + "loss": 0.1846, + "step": 12004 + }, + { + "epoch": 2.4191013499899254, + "grad_norm": 0.06814565509557724, + "learning_rate": 5.678881534864049e-05, + "loss": 0.2089, + "step": 12006 + }, + { + "epoch": 2.4195043320572234, + "grad_norm": 0.06691406667232513, + "learning_rate": 5.6775612002571286e-05, + "loss": 0.1781, + "step": 12008 + }, + { + "epoch": 2.4199073141245213, + "grad_norm": 0.060444775968790054, + "learning_rate": 5.676240817517344e-05, + "loss": 0.1633, + "step": 12010 + }, + { + "epoch": 2.4203102961918193, + "grad_norm": 0.06389014422893524, + "learning_rate": 5.674920386738494e-05, + "loss": 0.1707, + "step": 12012 + }, + { + "epoch": 2.4207132782591176, + "grad_norm": 0.060476355254650116, + "learning_rate": 5.673599908014379e-05, + "loss": 0.2348, + "step": 12014 + }, + { + "epoch": 2.4211162603264156, + "grad_norm": 0.04274730756878853, + "learning_rate": 5.672279381438803e-05, + "loss": 0.1393, + "step": 12016 + }, + { + "epoch": 2.4215192423937135, + "grad_norm": 0.06446705758571625, + "learning_rate": 5.6709588071055755e-05, + "loss": 0.2038, + "step": 12018 + }, + { + "epoch": 2.4219222244610115, + "grad_norm": 0.06751962006092072, + "learning_rate": 5.669638185108507e-05, + "loss": 0.2483, + "step": 12020 + }, + { + "epoch": 2.4223252065283094, + "grad_norm": 0.05737346410751343, + "learning_rate": 5.668317515541414e-05, + "loss": 0.2671, + "step": 12022 + }, + { + "epoch": 2.4227281885956073, + "grad_norm": 0.06973695755004883, + "learning_rate": 5.666996798498112e-05, + "loss": 0.1662, + "step": 12024 + }, + { + "epoch": 2.4231311706629057, + "grad_norm": 0.06069466099143028, + "learning_rate": 5.665676034072425e-05, + "loss": 0.1833, + "step": 12026 + }, + { + "epoch": 2.4235341527302037, + "grad_norm": 0.04661950841546059, + "learning_rate": 5.664355222358176e-05, + "loss": 0.1636, + "step": 12028 + }, + { + "epoch": 2.4239371347975016, + "grad_norm": 0.042542293667793274, + "learning_rate": 5.6630343634491954e-05, + "loss": 0.1768, + "step": 12030 + }, + { + "epoch": 2.4243401168647996, + "grad_norm": 0.04667328670620918, + "learning_rate": 5.661713457439314e-05, + "loss": 0.1743, + "step": 12032 + }, + { + "epoch": 2.4247430989320975, + "grad_norm": 0.05008386820554733, + "learning_rate": 5.660392504422366e-05, + "loss": 0.1898, + "step": 12034 + }, + { + "epoch": 2.4251460809993954, + "grad_norm": 0.07631053030490875, + "learning_rate": 5.659071504492192e-05, + "loss": 0.2055, + "step": 12036 + }, + { + "epoch": 2.4255490630666934, + "grad_norm": 0.044088345021009445, + "learning_rate": 5.657750457742632e-05, + "loss": 0.1641, + "step": 12038 + }, + { + "epoch": 2.4259520451339913, + "grad_norm": 0.0574885718524456, + "learning_rate": 5.65642936426753e-05, + "loss": 0.1901, + "step": 12040 + }, + { + "epoch": 2.4263550272012897, + "grad_norm": 0.04501950368285179, + "learning_rate": 5.6551082241607365e-05, + "loss": 0.1745, + "step": 12042 + }, + { + "epoch": 2.4267580092685876, + "grad_norm": 0.06143771857023239, + "learning_rate": 5.653787037516104e-05, + "loss": 0.2342, + "step": 12044 + }, + { + "epoch": 2.4271609913358856, + "grad_norm": 0.054222654551267624, + "learning_rate": 5.6524658044274835e-05, + "loss": 0.2231, + "step": 12046 + }, + { + "epoch": 2.4275639734031835, + "grad_norm": 0.045573096722364426, + "learning_rate": 5.6511445249887376e-05, + "loss": 0.1726, + "step": 12048 + }, + { + "epoch": 2.4279669554704815, + "grad_norm": 0.05551764741539955, + "learning_rate": 5.649823199293726e-05, + "loss": 0.177, + "step": 12050 + }, + { + "epoch": 2.4283699375377794, + "grad_norm": 0.05046489089727402, + "learning_rate": 5.648501827436312e-05, + "loss": 0.1782, + "step": 12052 + }, + { + "epoch": 2.428772919605078, + "grad_norm": 0.05633862689137459, + "learning_rate": 5.647180409510366e-05, + "loss": 0.1871, + "step": 12054 + }, + { + "epoch": 2.4291759016723757, + "grad_norm": 0.048323508352041245, + "learning_rate": 5.645858945609759e-05, + "loss": 0.2305, + "step": 12056 + }, + { + "epoch": 2.4295788837396737, + "grad_norm": 0.042128656059503555, + "learning_rate": 5.6445374358283656e-05, + "loss": 0.178, + "step": 12058 + }, + { + "epoch": 2.4299818658069716, + "grad_norm": 0.05420457571744919, + "learning_rate": 5.643215880260062e-05, + "loss": 0.1622, + "step": 12060 + }, + { + "epoch": 2.4303848478742696, + "grad_norm": 0.060230378061532974, + "learning_rate": 5.641894278998733e-05, + "loss": 0.1957, + "step": 12062 + }, + { + "epoch": 2.4307878299415675, + "grad_norm": 0.06890761107206345, + "learning_rate": 5.640572632138259e-05, + "loss": 0.1913, + "step": 12064 + }, + { + "epoch": 2.4311908120088654, + "grad_norm": 0.06876115500926971, + "learning_rate": 5.6392509397725314e-05, + "loss": 0.2407, + "step": 12066 + }, + { + "epoch": 2.4315937940761634, + "grad_norm": 0.05286121740937233, + "learning_rate": 5.637929201995439e-05, + "loss": 0.1624, + "step": 12068 + }, + { + "epoch": 2.4319967761434618, + "grad_norm": 0.050193753093481064, + "learning_rate": 5.636607418900875e-05, + "loss": 0.1676, + "step": 12070 + }, + { + "epoch": 2.4323997582107597, + "grad_norm": 0.04849841073155403, + "learning_rate": 5.6352855905827406e-05, + "loss": 0.2098, + "step": 12072 + }, + { + "epoch": 2.4328027402780577, + "grad_norm": 0.05145376920700073, + "learning_rate": 5.633963717134931e-05, + "loss": 0.2039, + "step": 12074 + }, + { + "epoch": 2.4332057223453556, + "grad_norm": 0.04287027567625046, + "learning_rate": 5.632641798651355e-05, + "loss": 0.2008, + "step": 12076 + }, + { + "epoch": 2.4336087044126535, + "grad_norm": 0.05822911486029625, + "learning_rate": 5.6313198352259166e-05, + "loss": 0.1829, + "step": 12078 + }, + { + "epoch": 2.4340116864799515, + "grad_norm": 0.046748463064432144, + "learning_rate": 5.629997826952527e-05, + "loss": 0.163, + "step": 12080 + }, + { + "epoch": 2.43441466854725, + "grad_norm": 0.049855321645736694, + "learning_rate": 5.6286757739250987e-05, + "loss": 0.2024, + "step": 12082 + }, + { + "epoch": 2.434817650614548, + "grad_norm": 0.052091654390096664, + "learning_rate": 5.627353676237549e-05, + "loss": 0.1661, + "step": 12084 + }, + { + "epoch": 2.4352206326818457, + "grad_norm": 0.056773409247398376, + "learning_rate": 5.6260315339837975e-05, + "loss": 0.1858, + "step": 12086 + }, + { + "epoch": 2.4356236147491437, + "grad_norm": 0.06805513799190521, + "learning_rate": 5.624709347257767e-05, + "loss": 0.1779, + "step": 12088 + }, + { + "epoch": 2.4360265968164416, + "grad_norm": 0.04931602627038956, + "learning_rate": 5.623387116153385e-05, + "loss": 0.187, + "step": 12090 + }, + { + "epoch": 2.4364295788837396, + "grad_norm": 0.057481031864881516, + "learning_rate": 5.622064840764577e-05, + "loss": 0.1431, + "step": 12092 + }, + { + "epoch": 2.4368325609510375, + "grad_norm": 0.06208740547299385, + "learning_rate": 5.620742521185278e-05, + "loss": 0.1878, + "step": 12094 + }, + { + "epoch": 2.4372355430183354, + "grad_norm": 0.07705783098936081, + "learning_rate": 5.619420157509424e-05, + "loss": 0.2383, + "step": 12096 + }, + { + "epoch": 2.437638525085634, + "grad_norm": 0.03305123373866081, + "learning_rate": 5.618097749830952e-05, + "loss": 0.1221, + "step": 12098 + }, + { + "epoch": 2.4380415071529318, + "grad_norm": 0.047065626829862595, + "learning_rate": 5.616775298243804e-05, + "loss": 0.177, + "step": 12100 + }, + { + "epoch": 2.4384444892202297, + "grad_norm": 0.05077784135937691, + "learning_rate": 5.615452802841926e-05, + "loss": 0.1877, + "step": 12102 + }, + { + "epoch": 2.4388474712875277, + "grad_norm": 0.04318971186876297, + "learning_rate": 5.6141302637192647e-05, + "loss": 0.1949, + "step": 12104 + }, + { + "epoch": 2.4392504533548256, + "grad_norm": 0.056446243077516556, + "learning_rate": 5.612807680969772e-05, + "loss": 0.2079, + "step": 12106 + }, + { + "epoch": 2.4396534354221235, + "grad_norm": 0.036065537482500076, + "learning_rate": 5.611485054687402e-05, + "loss": 0.1886, + "step": 12108 + }, + { + "epoch": 2.440056417489422, + "grad_norm": 0.0681459903717041, + "learning_rate": 5.61016238496611e-05, + "loss": 0.2035, + "step": 12110 + }, + { + "epoch": 2.44045939955672, + "grad_norm": 0.04323028773069382, + "learning_rate": 5.608839671899859e-05, + "loss": 0.165, + "step": 12112 + }, + { + "epoch": 2.440862381624018, + "grad_norm": 0.07516587525606155, + "learning_rate": 5.607516915582613e-05, + "loss": 0.1722, + "step": 12114 + }, + { + "epoch": 2.4412653636913157, + "grad_norm": 0.04655295982956886, + "learning_rate": 5.6061941161083344e-05, + "loss": 0.1984, + "step": 12116 + }, + { + "epoch": 2.4416683457586137, + "grad_norm": 0.045383911579847336, + "learning_rate": 5.6048712735709965e-05, + "loss": 0.1924, + "step": 12118 + }, + { + "epoch": 2.4420713278259116, + "grad_norm": 0.04858412221074104, + "learning_rate": 5.60354838806457e-05, + "loss": 0.1514, + "step": 12120 + }, + { + "epoch": 2.4424743098932096, + "grad_norm": 0.049353402107954025, + "learning_rate": 5.602225459683031e-05, + "loss": 0.1158, + "step": 12122 + }, + { + "epoch": 2.4428772919605075, + "grad_norm": 0.05863470956683159, + "learning_rate": 5.60090248852036e-05, + "loss": 0.1913, + "step": 12124 + }, + { + "epoch": 2.443280274027806, + "grad_norm": 0.04578150436282158, + "learning_rate": 5.5995794746705364e-05, + "loss": 0.2106, + "step": 12126 + }, + { + "epoch": 2.443683256095104, + "grad_norm": 0.04799255356192589, + "learning_rate": 5.5982564182275456e-05, + "loss": 0.2112, + "step": 12128 + }, + { + "epoch": 2.444086238162402, + "grad_norm": 0.049805283546447754, + "learning_rate": 5.596933319285376e-05, + "loss": 0.1931, + "step": 12130 + }, + { + "epoch": 2.4444892202296997, + "grad_norm": 0.043248314410448074, + "learning_rate": 5.5956101779380176e-05, + "loss": 0.1918, + "step": 12132 + }, + { + "epoch": 2.4448922022969977, + "grad_norm": 0.05324958264827728, + "learning_rate": 5.594286994279464e-05, + "loss": 0.2067, + "step": 12134 + }, + { + "epoch": 2.4452951843642956, + "grad_norm": 0.04655977338552475, + "learning_rate": 5.592963768403715e-05, + "loss": 0.2068, + "step": 12136 + }, + { + "epoch": 2.445698166431594, + "grad_norm": 0.070985808968544, + "learning_rate": 5.591640500404766e-05, + "loss": 0.1831, + "step": 12138 + }, + { + "epoch": 2.446101148498892, + "grad_norm": 0.04994530230760574, + "learning_rate": 5.590317190376623e-05, + "loss": 0.1686, + "step": 12140 + }, + { + "epoch": 2.44650413056619, + "grad_norm": 0.05718931555747986, + "learning_rate": 5.588993838413291e-05, + "loss": 0.1469, + "step": 12142 + }, + { + "epoch": 2.446907112633488, + "grad_norm": 0.11910577863454819, + "learning_rate": 5.587670444608778e-05, + "loss": 0.16, + "step": 12144 + }, + { + "epoch": 2.4473100947007858, + "grad_norm": 0.05469052866101265, + "learning_rate": 5.5863470090570966e-05, + "loss": 0.172, + "step": 12146 + }, + { + "epoch": 2.4477130767680837, + "grad_norm": 0.06286270171403885, + "learning_rate": 5.5850235318522625e-05, + "loss": 0.2013, + "step": 12148 + }, + { + "epoch": 2.4481160588353816, + "grad_norm": 0.048199381679296494, + "learning_rate": 5.583700013088291e-05, + "loss": 0.1626, + "step": 12150 + }, + { + "epoch": 2.44851904090268, + "grad_norm": 0.035780344158411026, + "learning_rate": 5.5823764528592036e-05, + "loss": 0.1876, + "step": 12152 + }, + { + "epoch": 2.448922022969978, + "grad_norm": 0.0799671933054924, + "learning_rate": 5.581052851259026e-05, + "loss": 0.2456, + "step": 12154 + }, + { + "epoch": 2.449325005037276, + "grad_norm": 0.06183718889951706, + "learning_rate": 5.579729208381782e-05, + "loss": 0.2205, + "step": 12156 + }, + { + "epoch": 2.449727987104574, + "grad_norm": 0.05557161569595337, + "learning_rate": 5.5784055243215025e-05, + "loss": 0.1684, + "step": 12158 + }, + { + "epoch": 2.450130969171872, + "grad_norm": 0.07418825477361679, + "learning_rate": 5.5770817991722205e-05, + "loss": 0.174, + "step": 12160 + }, + { + "epoch": 2.4505339512391697, + "grad_norm": 0.0453825443983078, + "learning_rate": 5.575758033027969e-05, + "loss": 0.1369, + "step": 12162 + }, + { + "epoch": 2.4509369333064677, + "grad_norm": 0.05918452516198158, + "learning_rate": 5.5744342259827874e-05, + "loss": 0.2572, + "step": 12164 + }, + { + "epoch": 2.451339915373766, + "grad_norm": 0.04277510941028595, + "learning_rate": 5.573110378130719e-05, + "loss": 0.2074, + "step": 12166 + }, + { + "epoch": 2.451742897441064, + "grad_norm": 0.09278752654790878, + "learning_rate": 5.5717864895658045e-05, + "loss": 0.1807, + "step": 12168 + }, + { + "epoch": 2.452145879508362, + "grad_norm": 0.06282159686088562, + "learning_rate": 5.5704625603820925e-05, + "loss": 0.1981, + "step": 12170 + }, + { + "epoch": 2.45254886157566, + "grad_norm": 0.04509962350130081, + "learning_rate": 5.569138590673633e-05, + "loss": 0.1393, + "step": 12172 + }, + { + "epoch": 2.452951843642958, + "grad_norm": 0.052625782787799835, + "learning_rate": 5.567814580534477e-05, + "loss": 0.1651, + "step": 12174 + }, + { + "epoch": 2.4533548257102558, + "grad_norm": 0.05318623036146164, + "learning_rate": 5.566490530058681e-05, + "loss": 0.2167, + "step": 12176 + }, + { + "epoch": 2.4537578077775537, + "grad_norm": 0.048767492175102234, + "learning_rate": 5.565166439340306e-05, + "loss": 0.137, + "step": 12178 + }, + { + "epoch": 2.454160789844852, + "grad_norm": 0.0497177429497242, + "learning_rate": 5.5638423084734095e-05, + "loss": 0.1915, + "step": 12180 + }, + { + "epoch": 2.45456377191215, + "grad_norm": 0.05952637642621994, + "learning_rate": 5.562518137552056e-05, + "loss": 0.1969, + "step": 12182 + }, + { + "epoch": 2.454966753979448, + "grad_norm": 0.06582552939653397, + "learning_rate": 5.561193926670316e-05, + "loss": 0.168, + "step": 12184 + }, + { + "epoch": 2.455369736046746, + "grad_norm": 0.03022056818008423, + "learning_rate": 5.5598696759222555e-05, + "loss": 0.1544, + "step": 12186 + }, + { + "epoch": 2.455772718114044, + "grad_norm": 0.09558003395795822, + "learning_rate": 5.5585453854019495e-05, + "loss": 0.1949, + "step": 12188 + }, + { + "epoch": 2.456175700181342, + "grad_norm": 0.06677708774805069, + "learning_rate": 5.557221055203472e-05, + "loss": 0.2189, + "step": 12190 + }, + { + "epoch": 2.45657868224864, + "grad_norm": 0.058833569288253784, + "learning_rate": 5.555896685420902e-05, + "loss": 0.1721, + "step": 12192 + }, + { + "epoch": 2.456981664315938, + "grad_norm": 0.04797518998384476, + "learning_rate": 5.554572276148321e-05, + "loss": 0.2185, + "step": 12194 + }, + { + "epoch": 2.457384646383236, + "grad_norm": 0.054031334817409515, + "learning_rate": 5.553247827479812e-05, + "loss": 0.2034, + "step": 12196 + }, + { + "epoch": 2.457787628450534, + "grad_norm": 0.045484550297260284, + "learning_rate": 5.5519233395094614e-05, + "loss": 0.1817, + "step": 12198 + }, + { + "epoch": 2.458190610517832, + "grad_norm": 0.2897370755672455, + "learning_rate": 5.55059881233136e-05, + "loss": 0.1539, + "step": 12200 + }, + { + "epoch": 2.45859359258513, + "grad_norm": 0.05194070562720299, + "learning_rate": 5.5492742460395996e-05, + "loss": 0.1916, + "step": 12202 + }, + { + "epoch": 2.458996574652428, + "grad_norm": 0.04825571924448013, + "learning_rate": 5.547949640728275e-05, + "loss": 0.1901, + "step": 12204 + }, + { + "epoch": 2.4593995567197258, + "grad_norm": 0.04531543329358101, + "learning_rate": 5.546624996491485e-05, + "loss": 0.1805, + "step": 12206 + }, + { + "epoch": 2.459802538787024, + "grad_norm": 0.0695631206035614, + "learning_rate": 5.545300313423328e-05, + "loss": 0.1507, + "step": 12208 + }, + { + "epoch": 2.460205520854322, + "grad_norm": 0.15136954188346863, + "learning_rate": 5.5439755916179094e-05, + "loss": 0.2416, + "step": 12210 + }, + { + "epoch": 2.46060850292162, + "grad_norm": 0.04831309616565704, + "learning_rate": 5.5426508311693356e-05, + "loss": 0.1837, + "step": 12212 + }, + { + "epoch": 2.461011484988918, + "grad_norm": 0.0721033364534378, + "learning_rate": 5.5413260321717144e-05, + "loss": 0.2002, + "step": 12214 + }, + { + "epoch": 2.461414467056216, + "grad_norm": 0.048263076692819595, + "learning_rate": 5.5400011947191566e-05, + "loss": 0.1699, + "step": 12216 + }, + { + "epoch": 2.461817449123514, + "grad_norm": 0.06317691504955292, + "learning_rate": 5.538676318905779e-05, + "loss": 0.1929, + "step": 12218 + }, + { + "epoch": 2.4622204311908122, + "grad_norm": 0.06637461483478546, + "learning_rate": 5.537351404825696e-05, + "loss": 0.1725, + "step": 12220 + }, + { + "epoch": 2.46262341325811, + "grad_norm": 0.07199779897928238, + "learning_rate": 5.536026452573028e-05, + "loss": 0.2299, + "step": 12222 + }, + { + "epoch": 2.463026395325408, + "grad_norm": 0.05035098269581795, + "learning_rate": 5.534701462241899e-05, + "loss": 0.1801, + "step": 12224 + }, + { + "epoch": 2.463429377392706, + "grad_norm": 0.0727388858795166, + "learning_rate": 5.533376433926434e-05, + "loss": 0.2092, + "step": 12226 + }, + { + "epoch": 2.463832359460004, + "grad_norm": 0.046067435294389725, + "learning_rate": 5.532051367720759e-05, + "loss": 0.1696, + "step": 12228 + }, + { + "epoch": 2.464235341527302, + "grad_norm": 0.07674822956323624, + "learning_rate": 5.530726263719006e-05, + "loss": 0.1275, + "step": 12230 + }, + { + "epoch": 2.4646383235946, + "grad_norm": 0.04457063227891922, + "learning_rate": 5.529401122015307e-05, + "loss": 0.1404, + "step": 12232 + }, + { + "epoch": 2.465041305661898, + "grad_norm": 0.08931057155132294, + "learning_rate": 5.5280759427038e-05, + "loss": 0.2401, + "step": 12234 + }, + { + "epoch": 2.465444287729196, + "grad_norm": 0.042824145406484604, + "learning_rate": 5.5267507258786236e-05, + "loss": 0.2042, + "step": 12236 + }, + { + "epoch": 2.465847269796494, + "grad_norm": 0.058367300778627396, + "learning_rate": 5.525425471633916e-05, + "loss": 0.1814, + "step": 12238 + }, + { + "epoch": 2.466250251863792, + "grad_norm": 0.07647201418876648, + "learning_rate": 5.524100180063825e-05, + "loss": 0.2099, + "step": 12240 + }, + { + "epoch": 2.46665323393109, + "grad_norm": 0.03983471542596817, + "learning_rate": 5.522774851262494e-05, + "loss": 0.2135, + "step": 12242 + }, + { + "epoch": 2.467056215998388, + "grad_norm": 0.05607502534985542, + "learning_rate": 5.521449485324074e-05, + "loss": 0.1735, + "step": 12244 + }, + { + "epoch": 2.467459198065686, + "grad_norm": 0.042698707431554794, + "learning_rate": 5.520124082342717e-05, + "loss": 0.1612, + "step": 12246 + }, + { + "epoch": 2.4678621801329843, + "grad_norm": 0.04548298195004463, + "learning_rate": 5.518798642412577e-05, + "loss": 0.1962, + "step": 12248 + }, + { + "epoch": 2.4682651622002822, + "grad_norm": 0.05207012593746185, + "learning_rate": 5.51747316562781e-05, + "loss": 0.1973, + "step": 12250 + }, + { + "epoch": 2.46866814426758, + "grad_norm": 0.06228821724653244, + "learning_rate": 5.5161476520825785e-05, + "loss": 0.2063, + "step": 12252 + }, + { + "epoch": 2.469071126334878, + "grad_norm": 0.049454256892204285, + "learning_rate": 5.514822101871042e-05, + "loss": 0.2042, + "step": 12254 + }, + { + "epoch": 2.469474108402176, + "grad_norm": 0.06387735903263092, + "learning_rate": 5.5134965150873675e-05, + "loss": 0.1925, + "step": 12256 + }, + { + "epoch": 2.469877090469474, + "grad_norm": 0.0502970926463604, + "learning_rate": 5.512170891825722e-05, + "loss": 0.223, + "step": 12258 + }, + { + "epoch": 2.470280072536772, + "grad_norm": 0.05233887583017349, + "learning_rate": 5.510845232180275e-05, + "loss": 0.1877, + "step": 12260 + }, + { + "epoch": 2.47068305460407, + "grad_norm": 0.053190380334854126, + "learning_rate": 5.509519536245199e-05, + "loss": 0.21, + "step": 12262 + }, + { + "epoch": 2.4710860366713683, + "grad_norm": 0.05719498172402382, + "learning_rate": 5.508193804114671e-05, + "loss": 0.1768, + "step": 12264 + }, + { + "epoch": 2.471489018738666, + "grad_norm": 0.04697343707084656, + "learning_rate": 5.506868035882867e-05, + "loss": 0.1915, + "step": 12266 + }, + { + "epoch": 2.471892000805964, + "grad_norm": 0.06328902393579483, + "learning_rate": 5.5055422316439686e-05, + "loss": 0.2007, + "step": 12268 + }, + { + "epoch": 2.472294982873262, + "grad_norm": 0.05199269577860832, + "learning_rate": 5.504216391492159e-05, + "loss": 0.2097, + "step": 12270 + }, + { + "epoch": 2.47269796494056, + "grad_norm": 0.06375780701637268, + "learning_rate": 5.502890515521624e-05, + "loss": 0.2188, + "step": 12272 + }, + { + "epoch": 2.473100947007858, + "grad_norm": 0.047609761357307434, + "learning_rate": 5.501564603826549e-05, + "loss": 0.1402, + "step": 12274 + }, + { + "epoch": 2.4735039290751564, + "grad_norm": 0.06725708395242691, + "learning_rate": 5.500238656501129e-05, + "loss": 0.2134, + "step": 12276 + }, + { + "epoch": 2.4739069111424543, + "grad_norm": 0.04552186653017998, + "learning_rate": 5.4989126736395526e-05, + "loss": 0.1678, + "step": 12278 + }, + { + "epoch": 2.4743098932097523, + "grad_norm": 0.050900984555482864, + "learning_rate": 5.497586655336019e-05, + "loss": 0.2105, + "step": 12280 + }, + { + "epoch": 2.47471287527705, + "grad_norm": 0.05463655665516853, + "learning_rate": 5.496260601684725e-05, + "loss": 0.2081, + "step": 12282 + }, + { + "epoch": 2.475115857344348, + "grad_norm": 0.04248126596212387, + "learning_rate": 5.4949345127798714e-05, + "loss": 0.1653, + "step": 12284 + }, + { + "epoch": 2.475518839411646, + "grad_norm": 0.05193443223834038, + "learning_rate": 5.493608388715661e-05, + "loss": 0.1912, + "step": 12286 + }, + { + "epoch": 2.475921821478944, + "grad_norm": 0.053680527955293655, + "learning_rate": 5.492282229586302e-05, + "loss": 0.1993, + "step": 12288 + }, + { + "epoch": 2.476324803546242, + "grad_norm": 0.04288513585925102, + "learning_rate": 5.490956035485999e-05, + "loss": 0.1566, + "step": 12290 + }, + { + "epoch": 2.4767277856135403, + "grad_norm": 0.07353232055902481, + "learning_rate": 5.489629806508964e-05, + "loss": 0.2277, + "step": 12292 + }, + { + "epoch": 2.4771307676808383, + "grad_norm": 0.06713627278804779, + "learning_rate": 5.4883035427494125e-05, + "loss": 0.2156, + "step": 12294 + }, + { + "epoch": 2.4775337497481362, + "grad_norm": 0.07119248807430267, + "learning_rate": 5.486977244301556e-05, + "loss": 0.1849, + "step": 12296 + }, + { + "epoch": 2.477936731815434, + "grad_norm": 0.05009063705801964, + "learning_rate": 5.485650911259617e-05, + "loss": 0.228, + "step": 12298 + }, + { + "epoch": 2.478339713882732, + "grad_norm": 0.03596751391887665, + "learning_rate": 5.484324543717814e-05, + "loss": 0.1456, + "step": 12300 + }, + { + "epoch": 2.47874269595003, + "grad_norm": 0.07743962854146957, + "learning_rate": 5.482998141770368e-05, + "loss": 0.2071, + "step": 12302 + }, + { + "epoch": 2.4791456780173284, + "grad_norm": 0.058847635984420776, + "learning_rate": 5.4816717055115065e-05, + "loss": 0.1557, + "step": 12304 + }, + { + "epoch": 2.4795486600846264, + "grad_norm": 0.053091805428266525, + "learning_rate": 5.480345235035459e-05, + "loss": 0.1997, + "step": 12306 + }, + { + "epoch": 2.4799516421519243, + "grad_norm": 0.04882095754146576, + "learning_rate": 5.479018730436454e-05, + "loss": 0.1748, + "step": 12308 + }, + { + "epoch": 2.4803546242192223, + "grad_norm": 0.05197981372475624, + "learning_rate": 5.477692191808723e-05, + "loss": 0.1806, + "step": 12310 + }, + { + "epoch": 2.48075760628652, + "grad_norm": 0.04549933597445488, + "learning_rate": 5.476365619246504e-05, + "loss": 0.1784, + "step": 12312 + }, + { + "epoch": 2.481160588353818, + "grad_norm": 0.07017272710800171, + "learning_rate": 5.475039012844033e-05, + "loss": 0.1789, + "step": 12314 + }, + { + "epoch": 2.481563570421116, + "grad_norm": 0.09665434062480927, + "learning_rate": 5.4737123726955494e-05, + "loss": 0.1972, + "step": 12316 + }, + { + "epoch": 2.481966552488414, + "grad_norm": 0.053050488233566284, + "learning_rate": 5.4723856988952985e-05, + "loss": 0.1895, + "step": 12318 + }, + { + "epoch": 2.4823695345557124, + "grad_norm": 0.05171143636107445, + "learning_rate": 5.471058991537521e-05, + "loss": 0.1766, + "step": 12320 + }, + { + "epoch": 2.4827725166230104, + "grad_norm": 0.04598110914230347, + "learning_rate": 5.469732250716466e-05, + "loss": 0.17, + "step": 12322 + }, + { + "epoch": 2.4831754986903083, + "grad_norm": 0.030317850410938263, + "learning_rate": 5.468405476526385e-05, + "loss": 0.1592, + "step": 12324 + }, + { + "epoch": 2.4835784807576062, + "grad_norm": 0.053349483758211136, + "learning_rate": 5.467078669061526e-05, + "loss": 0.1976, + "step": 12326 + }, + { + "epoch": 2.483981462824904, + "grad_norm": 0.059870265424251556, + "learning_rate": 5.465751828416147e-05, + "loss": 0.1714, + "step": 12328 + }, + { + "epoch": 2.484384444892202, + "grad_norm": 0.04339151084423065, + "learning_rate": 5.4644249546845015e-05, + "loss": 0.231, + "step": 12330 + }, + { + "epoch": 2.4847874269595005, + "grad_norm": 0.06685718894004822, + "learning_rate": 5.4630980479608504e-05, + "loss": 0.2283, + "step": 12332 + }, + { + "epoch": 2.4851904090267984, + "grad_norm": 0.056755565106868744, + "learning_rate": 5.461771108339456e-05, + "loss": 0.1643, + "step": 12334 + }, + { + "epoch": 2.4855933910940964, + "grad_norm": 0.05903014913201332, + "learning_rate": 5.46044413591458e-05, + "loss": 0.2265, + "step": 12336 + }, + { + "epoch": 2.4859963731613943, + "grad_norm": 0.0677526444196701, + "learning_rate": 5.459117130780487e-05, + "loss": 0.2401, + "step": 12338 + }, + { + "epoch": 2.4863993552286923, + "grad_norm": 0.04722040891647339, + "learning_rate": 5.45779009303145e-05, + "loss": 0.1772, + "step": 12340 + }, + { + "epoch": 2.48680233729599, + "grad_norm": 0.058663852512836456, + "learning_rate": 5.4564630227617355e-05, + "loss": 0.2054, + "step": 12342 + }, + { + "epoch": 2.487205319363288, + "grad_norm": 0.08635495603084564, + "learning_rate": 5.455135920065617e-05, + "loss": 0.1855, + "step": 12344 + }, + { + "epoch": 2.4876083014305865, + "grad_norm": 0.060355834662914276, + "learning_rate": 5.453808785037372e-05, + "loss": 0.2347, + "step": 12346 + }, + { + "epoch": 2.4880112834978845, + "grad_norm": 0.06020810455083847, + "learning_rate": 5.452481617771276e-05, + "loss": 0.228, + "step": 12348 + }, + { + "epoch": 2.4884142655651824, + "grad_norm": 0.04620720073580742, + "learning_rate": 5.451154418361609e-05, + "loss": 0.1407, + "step": 12350 + }, + { + "epoch": 2.4888172476324804, + "grad_norm": 0.07045585662126541, + "learning_rate": 5.449827186902655e-05, + "loss": 0.2038, + "step": 12352 + }, + { + "epoch": 2.4892202296997783, + "grad_norm": 0.041793834418058395, + "learning_rate": 5.448499923488697e-05, + "loss": 0.1546, + "step": 12354 + }, + { + "epoch": 2.4896232117670762, + "grad_norm": 0.05781014636158943, + "learning_rate": 5.4471726282140203e-05, + "loss": 0.2068, + "step": 12356 + }, + { + "epoch": 2.490026193834374, + "grad_norm": 0.08505477011203766, + "learning_rate": 5.445845301172917e-05, + "loss": 0.2346, + "step": 12358 + }, + { + "epoch": 2.4904291759016726, + "grad_norm": 0.06190848723053932, + "learning_rate": 5.4445179424596747e-05, + "loss": 0.1849, + "step": 12360 + }, + { + "epoch": 2.4908321579689705, + "grad_norm": 0.03134537488222122, + "learning_rate": 5.443190552168589e-05, + "loss": 0.1761, + "step": 12362 + }, + { + "epoch": 2.4912351400362684, + "grad_norm": 0.04573149234056473, + "learning_rate": 5.441863130393957e-05, + "loss": 0.1969, + "step": 12364 + }, + { + "epoch": 2.4916381221035664, + "grad_norm": 0.05835878103971481, + "learning_rate": 5.4405356772300733e-05, + "loss": 0.2151, + "step": 12366 + }, + { + "epoch": 2.4920411041708643, + "grad_norm": 0.06177271157503128, + "learning_rate": 5.4392081927712394e-05, + "loss": 0.2706, + "step": 12368 + }, + { + "epoch": 2.4924440862381623, + "grad_norm": 0.051789190620183945, + "learning_rate": 5.43788067711176e-05, + "loss": 0.1595, + "step": 12370 + }, + { + "epoch": 2.49284706830546, + "grad_norm": 0.050115808844566345, + "learning_rate": 5.436553130345935e-05, + "loss": 0.1806, + "step": 12372 + }, + { + "epoch": 2.4932500503727586, + "grad_norm": 0.051849812269210815, + "learning_rate": 5.435225552568075e-05, + "loss": 0.1914, + "step": 12374 + }, + { + "epoch": 2.4936530324400565, + "grad_norm": 0.06172528490424156, + "learning_rate": 5.433897943872488e-05, + "loss": 0.18, + "step": 12376 + }, + { + "epoch": 2.4940560145073545, + "grad_norm": 0.04672456160187721, + "learning_rate": 5.432570304353484e-05, + "loss": 0.1496, + "step": 12378 + }, + { + "epoch": 2.4944589965746524, + "grad_norm": 0.07356838136911392, + "learning_rate": 5.431242634105378e-05, + "loss": 0.194, + "step": 12380 + }, + { + "epoch": 2.4948619786419504, + "grad_norm": 0.08135899156332016, + "learning_rate": 5.429914933222485e-05, + "loss": 0.1797, + "step": 12382 + }, + { + "epoch": 2.4952649607092483, + "grad_norm": 0.04457440972328186, + "learning_rate": 5.428587201799122e-05, + "loss": 0.1633, + "step": 12384 + }, + { + "epoch": 2.4956679427765467, + "grad_norm": 0.05523247644305229, + "learning_rate": 5.4272594399296105e-05, + "loss": 0.236, + "step": 12386 + }, + { + "epoch": 2.4960709248438446, + "grad_norm": 0.03238510340452194, + "learning_rate": 5.425931647708272e-05, + "loss": 0.171, + "step": 12388 + }, + { + "epoch": 2.4964739069111426, + "grad_norm": 0.13140137493610382, + "learning_rate": 5.42460382522943e-05, + "loss": 0.196, + "step": 12390 + }, + { + "epoch": 2.4968768889784405, + "grad_norm": 0.06779821962118149, + "learning_rate": 5.423275972587411e-05, + "loss": 0.1828, + "step": 12392 + }, + { + "epoch": 2.4972798710457385, + "grad_norm": 0.04135895520448685, + "learning_rate": 5.421948089876544e-05, + "loss": 0.1577, + "step": 12394 + }, + { + "epoch": 2.4976828531130364, + "grad_norm": 0.06245134025812149, + "learning_rate": 5.420620177191159e-05, + "loss": 0.1924, + "step": 12396 + }, + { + "epoch": 2.4980858351803343, + "grad_norm": 0.04454468935728073, + "learning_rate": 5.4192922346255916e-05, + "loss": 0.1985, + "step": 12398 + }, + { + "epoch": 2.4984888172476323, + "grad_norm": 0.07690700143575668, + "learning_rate": 5.417964262274171e-05, + "loss": 0.1572, + "step": 12400 + }, + { + "epoch": 2.4988917993149307, + "grad_norm": 0.04643810912966728, + "learning_rate": 5.4166362602312396e-05, + "loss": 0.1698, + "step": 12402 + }, + { + "epoch": 2.4992947813822286, + "grad_norm": 0.05322658643126488, + "learning_rate": 5.415308228591135e-05, + "loss": 0.2312, + "step": 12404 + }, + { + "epoch": 2.4996977634495265, + "grad_norm": 0.04215513542294502, + "learning_rate": 5.413980167448197e-05, + "loss": 0.1892, + "step": 12406 + }, + { + "epoch": 2.5001007455168245, + "grad_norm": 0.06809663027524948, + "learning_rate": 5.412652076896769e-05, + "loss": 0.2176, + "step": 12408 + }, + { + "epoch": 2.5005037275841224, + "grad_norm": 0.061162110418081284, + "learning_rate": 5.4113239570312e-05, + "loss": 0.162, + "step": 12410 + }, + { + "epoch": 2.5009067096514204, + "grad_norm": 0.04683419689536095, + "learning_rate": 5.409995807945834e-05, + "loss": 0.1931, + "step": 12412 + }, + { + "epoch": 2.5013096917187188, + "grad_norm": 0.054280806332826614, + "learning_rate": 5.4086676297350204e-05, + "loss": 0.2159, + "step": 12414 + }, + { + "epoch": 2.5017126737860167, + "grad_norm": 0.05468326061964035, + "learning_rate": 5.407339422493113e-05, + "loss": 0.2176, + "step": 12416 + }, + { + "epoch": 2.5021156558533146, + "grad_norm": 0.06952318549156189, + "learning_rate": 5.4060111863144636e-05, + "loss": 0.2025, + "step": 12418 + }, + { + "epoch": 2.5025186379206126, + "grad_norm": 0.06900619715452194, + "learning_rate": 5.404682921293429e-05, + "loss": 0.1671, + "step": 12420 + }, + { + "epoch": 2.5029216199879105, + "grad_norm": 0.037348657846450806, + "learning_rate": 5.403354627524367e-05, + "loss": 0.1616, + "step": 12422 + }, + { + "epoch": 2.5033246020552085, + "grad_norm": 0.07734784483909607, + "learning_rate": 5.4020263051016375e-05, + "loss": 0.2028, + "step": 12424 + }, + { + "epoch": 2.5037275841225064, + "grad_norm": 0.050108522176742554, + "learning_rate": 5.4006979541196024e-05, + "loss": 0.1732, + "step": 12426 + }, + { + "epoch": 2.5041305661898043, + "grad_norm": 0.05922889709472656, + "learning_rate": 5.399369574672626e-05, + "loss": 0.1576, + "step": 12428 + }, + { + "epoch": 2.5045335482571023, + "grad_norm": 0.06523427367210388, + "learning_rate": 5.3980411668550724e-05, + "loss": 0.1894, + "step": 12430 + }, + { + "epoch": 2.5049365303244007, + "grad_norm": 0.05559534579515457, + "learning_rate": 5.396712730761311e-05, + "loss": 0.1832, + "step": 12432 + }, + { + "epoch": 2.5053395123916986, + "grad_norm": 0.06921125203371048, + "learning_rate": 5.395384266485713e-05, + "loss": 0.1934, + "step": 12434 + }, + { + "epoch": 2.5057424944589965, + "grad_norm": 0.04383534938097, + "learning_rate": 5.394055774122648e-05, + "loss": 0.1495, + "step": 12436 + }, + { + "epoch": 2.5061454765262945, + "grad_norm": 0.044495001435279846, + "learning_rate": 5.392727253766491e-05, + "loss": 0.1741, + "step": 12438 + }, + { + "epoch": 2.5065484585935924, + "grad_norm": 0.05062146857380867, + "learning_rate": 5.391398705511619e-05, + "loss": 0.1648, + "step": 12440 + }, + { + "epoch": 2.506951440660891, + "grad_norm": 0.05182384327054024, + "learning_rate": 5.390070129452407e-05, + "loss": 0.1508, + "step": 12442 + }, + { + "epoch": 2.5073544227281888, + "grad_norm": 0.06176676228642464, + "learning_rate": 5.388741525683237e-05, + "loss": 0.1624, + "step": 12444 + }, + { + "epoch": 2.5077574047954867, + "grad_norm": 0.052249081432819366, + "learning_rate": 5.387412894298494e-05, + "loss": 0.1813, + "step": 12446 + }, + { + "epoch": 2.5081603868627846, + "grad_norm": 0.05974605679512024, + "learning_rate": 5.386084235392555e-05, + "loss": 0.1921, + "step": 12448 + }, + { + "epoch": 2.5085633689300826, + "grad_norm": 0.06526529043912888, + "learning_rate": 5.38475554905981e-05, + "loss": 0.2365, + "step": 12450 + }, + { + "epoch": 2.5089663509973805, + "grad_norm": 0.058665867894887924, + "learning_rate": 5.383426835394646e-05, + "loss": 0.1953, + "step": 12452 + }, + { + "epoch": 2.5093693330646785, + "grad_norm": 0.05564659833908081, + "learning_rate": 5.3820980944914534e-05, + "loss": 0.2344, + "step": 12454 + }, + { + "epoch": 2.5097723151319764, + "grad_norm": 0.06119886040687561, + "learning_rate": 5.380769326444624e-05, + "loss": 0.2102, + "step": 12456 + }, + { + "epoch": 2.510175297199275, + "grad_norm": 0.07365282624959946, + "learning_rate": 5.37944053134855e-05, + "loss": 0.1746, + "step": 12458 + }, + { + "epoch": 2.5105782792665727, + "grad_norm": 0.043750863522291183, + "learning_rate": 5.3781117092976264e-05, + "loss": 0.1796, + "step": 12460 + }, + { + "epoch": 2.5109812613338707, + "grad_norm": 0.05926395207643509, + "learning_rate": 5.3767828603862535e-05, + "loss": 0.2018, + "step": 12462 + }, + { + "epoch": 2.5113842434011686, + "grad_norm": 0.07026753574609756, + "learning_rate": 5.3754539847088284e-05, + "loss": 0.1837, + "step": 12464 + }, + { + "epoch": 2.5117872254684666, + "grad_norm": 0.03925260528922081, + "learning_rate": 5.3741250823597514e-05, + "loss": 0.1593, + "step": 12466 + }, + { + "epoch": 2.5121902075357645, + "grad_norm": 0.06190716475248337, + "learning_rate": 5.372796153433428e-05, + "loss": 0.1479, + "step": 12468 + }, + { + "epoch": 2.512593189603063, + "grad_norm": 0.051117755472660065, + "learning_rate": 5.371467198024262e-05, + "loss": 0.1664, + "step": 12470 + }, + { + "epoch": 2.512996171670361, + "grad_norm": 0.057007379829883575, + "learning_rate": 5.370138216226659e-05, + "loss": 0.1731, + "step": 12472 + }, + { + "epoch": 2.5133991537376588, + "grad_norm": 0.07143572717905045, + "learning_rate": 5.368809208135031e-05, + "loss": 0.1865, + "step": 12474 + }, + { + "epoch": 2.5138021358049567, + "grad_norm": 0.09572847932577133, + "learning_rate": 5.3674801738437854e-05, + "loss": 0.1801, + "step": 12476 + }, + { + "epoch": 2.5142051178722546, + "grad_norm": 0.05938468873500824, + "learning_rate": 5.366151113447336e-05, + "loss": 0.1873, + "step": 12478 + }, + { + "epoch": 2.5146080999395526, + "grad_norm": 0.07035186886787415, + "learning_rate": 5.3648220270400985e-05, + "loss": 0.2036, + "step": 12480 + }, + { + "epoch": 2.5150110820068505, + "grad_norm": 0.05732342600822449, + "learning_rate": 5.3634929147164856e-05, + "loss": 0.1857, + "step": 12482 + }, + { + "epoch": 2.5154140640741485, + "grad_norm": 0.055962275713682175, + "learning_rate": 5.362163776570919e-05, + "loss": 0.1447, + "step": 12484 + }, + { + "epoch": 2.515817046141447, + "grad_norm": 0.04367542266845703, + "learning_rate": 5.360834612697816e-05, + "loss": 0.1856, + "step": 12486 + }, + { + "epoch": 2.516220028208745, + "grad_norm": 0.06652933359146118, + "learning_rate": 5.3595054231916e-05, + "loss": 0.2319, + "step": 12488 + }, + { + "epoch": 2.5166230102760427, + "grad_norm": 0.05221908167004585, + "learning_rate": 5.3581762081466936e-05, + "loss": 0.1884, + "step": 12490 + }, + { + "epoch": 2.5170259923433407, + "grad_norm": 0.04966789111495018, + "learning_rate": 5.3568469676575206e-05, + "loss": 0.2137, + "step": 12492 + }, + { + "epoch": 2.5174289744106386, + "grad_norm": 0.056733645498752594, + "learning_rate": 5.355517701818511e-05, + "loss": 0.2197, + "step": 12494 + }, + { + "epoch": 2.517831956477937, + "grad_norm": 0.054679885506629944, + "learning_rate": 5.354188410724092e-05, + "loss": 0.1723, + "step": 12496 + }, + { + "epoch": 2.518234938545235, + "grad_norm": 0.04557587578892708, + "learning_rate": 5.352859094468695e-05, + "loss": 0.1544, + "step": 12498 + }, + { + "epoch": 2.518637920612533, + "grad_norm": 0.055995721369981766, + "learning_rate": 5.351529753146752e-05, + "loss": 0.1585, + "step": 12500 + }, + { + "epoch": 2.519040902679831, + "grad_norm": 0.06158788874745369, + "learning_rate": 5.350200386852698e-05, + "loss": 0.1699, + "step": 12502 + }, + { + "epoch": 2.5194438847471288, + "grad_norm": 0.07631693035364151, + "learning_rate": 5.348870995680969e-05, + "loss": 0.2663, + "step": 12504 + }, + { + "epoch": 2.5198468668144267, + "grad_norm": 0.05153597518801689, + "learning_rate": 5.347541579726001e-05, + "loss": 0.1706, + "step": 12506 + }, + { + "epoch": 2.5202498488817247, + "grad_norm": 0.050337210297584534, + "learning_rate": 5.346212139082236e-05, + "loss": 0.1744, + "step": 12508 + }, + { + "epoch": 2.5206528309490226, + "grad_norm": 0.051275044679641724, + "learning_rate": 5.3448826738441135e-05, + "loss": 0.1443, + "step": 12510 + }, + { + "epoch": 2.5210558130163205, + "grad_norm": 0.0649779736995697, + "learning_rate": 5.343553184106078e-05, + "loss": 0.2232, + "step": 12512 + }, + { + "epoch": 2.521458795083619, + "grad_norm": 0.0551101416349411, + "learning_rate": 5.342223669962575e-05, + "loss": 0.2095, + "step": 12514 + }, + { + "epoch": 2.521861777150917, + "grad_norm": 0.06402526795864105, + "learning_rate": 5.3408941315080476e-05, + "loss": 0.1813, + "step": 12516 + }, + { + "epoch": 2.522264759218215, + "grad_norm": 0.06047520786523819, + "learning_rate": 5.3395645688369464e-05, + "loss": 0.1938, + "step": 12518 + }, + { + "epoch": 2.5226677412855127, + "grad_norm": 0.05238273739814758, + "learning_rate": 5.338234982043723e-05, + "loss": 0.2008, + "step": 12520 + }, + { + "epoch": 2.5230707233528107, + "grad_norm": 0.05175113305449486, + "learning_rate": 5.3369053712228265e-05, + "loss": 0.1408, + "step": 12522 + }, + { + "epoch": 2.523473705420109, + "grad_norm": 0.04698505997657776, + "learning_rate": 5.335575736468711e-05, + "loss": 0.1922, + "step": 12524 + }, + { + "epoch": 2.523876687487407, + "grad_norm": 0.05037987604737282, + "learning_rate": 5.334246077875833e-05, + "loss": 0.1506, + "step": 12526 + }, + { + "epoch": 2.524279669554705, + "grad_norm": 0.04577264562249184, + "learning_rate": 5.332916395538646e-05, + "loss": 0.1513, + "step": 12528 + }, + { + "epoch": 2.524682651622003, + "grad_norm": 0.0762718990445137, + "learning_rate": 5.331586689551612e-05, + "loss": 0.1935, + "step": 12530 + }, + { + "epoch": 2.525085633689301, + "grad_norm": 0.05379846692085266, + "learning_rate": 5.33025696000919e-05, + "loss": 0.2161, + "step": 12532 + }, + { + "epoch": 2.5254886157565988, + "grad_norm": 0.05114980787038803, + "learning_rate": 5.3289272070058415e-05, + "loss": 0.1496, + "step": 12534 + }, + { + "epoch": 2.5258915978238967, + "grad_norm": 0.04115746542811394, + "learning_rate": 5.3275974306360296e-05, + "loss": 0.1536, + "step": 12536 + }, + { + "epoch": 2.5262945798911947, + "grad_norm": 0.055138975381851196, + "learning_rate": 5.326267630994222e-05, + "loss": 0.2359, + "step": 12538 + }, + { + "epoch": 2.5266975619584926, + "grad_norm": 0.05190563201904297, + "learning_rate": 5.3249378081748815e-05, + "loss": 0.1924, + "step": 12540 + }, + { + "epoch": 2.527100544025791, + "grad_norm": 0.055855296552181244, + "learning_rate": 5.32360796227248e-05, + "loss": 0.1893, + "step": 12542 + }, + { + "epoch": 2.527503526093089, + "grad_norm": 0.0799691379070282, + "learning_rate": 5.322278093381486e-05, + "loss": 0.1675, + "step": 12544 + }, + { + "epoch": 2.527906508160387, + "grad_norm": 0.05231550708413124, + "learning_rate": 5.320948201596372e-05, + "loss": 0.2105, + "step": 12546 + }, + { + "epoch": 2.528309490227685, + "grad_norm": 0.06030051410198212, + "learning_rate": 5.319618287011611e-05, + "loss": 0.1889, + "step": 12548 + }, + { + "epoch": 2.5287124722949827, + "grad_norm": 0.04509327560663223, + "learning_rate": 5.3182883497216785e-05, + "loss": 0.1286, + "step": 12550 + }, + { + "epoch": 2.529115454362281, + "grad_norm": 0.04975868761539459, + "learning_rate": 5.3169583898210495e-05, + "loss": 0.1533, + "step": 12552 + }, + { + "epoch": 2.529518436429579, + "grad_norm": 0.04566744342446327, + "learning_rate": 5.315628407404203e-05, + "loss": 0.1536, + "step": 12554 + }, + { + "epoch": 2.529921418496877, + "grad_norm": 0.05292901396751404, + "learning_rate": 5.314298402565621e-05, + "loss": 0.2023, + "step": 12556 + }, + { + "epoch": 2.530324400564175, + "grad_norm": 0.05594682693481445, + "learning_rate": 5.312968375399782e-05, + "loss": 0.1589, + "step": 12558 + }, + { + "epoch": 2.530727382631473, + "grad_norm": 0.051560178399086, + "learning_rate": 5.311638326001172e-05, + "loss": 0.1737, + "step": 12560 + }, + { + "epoch": 2.531130364698771, + "grad_norm": 0.050082527101039886, + "learning_rate": 5.31030825446427e-05, + "loss": 0.216, + "step": 12562 + }, + { + "epoch": 2.531533346766069, + "grad_norm": 0.0678807944059372, + "learning_rate": 5.3089781608835684e-05, + "loss": 0.2065, + "step": 12564 + }, + { + "epoch": 2.5319363288333667, + "grad_norm": 0.04848482087254524, + "learning_rate": 5.307648045353553e-05, + "loss": 0.1619, + "step": 12566 + }, + { + "epoch": 2.5323393109006647, + "grad_norm": 0.06144823879003525, + "learning_rate": 5.306317907968711e-05, + "loss": 0.179, + "step": 12568 + }, + { + "epoch": 2.532742292967963, + "grad_norm": 0.07615198940038681, + "learning_rate": 5.3049877488235346e-05, + "loss": 0.202, + "step": 12570 + }, + { + "epoch": 2.533145275035261, + "grad_norm": 0.058797042816877365, + "learning_rate": 5.303657568012518e-05, + "loss": 0.1801, + "step": 12572 + }, + { + "epoch": 2.533548257102559, + "grad_norm": 0.0644260123372078, + "learning_rate": 5.302327365630151e-05, + "loss": 0.1665, + "step": 12574 + }, + { + "epoch": 2.533951239169857, + "grad_norm": 0.055937882512807846, + "learning_rate": 5.300997141770933e-05, + "loss": 0.1723, + "step": 12576 + }, + { + "epoch": 2.534354221237155, + "grad_norm": 0.061812832951545715, + "learning_rate": 5.299666896529359e-05, + "loss": 0.2301, + "step": 12578 + }, + { + "epoch": 2.534757203304453, + "grad_norm": 0.07181499153375626, + "learning_rate": 5.298336629999928e-05, + "loss": 0.1966, + "step": 12580 + }, + { + "epoch": 2.535160185371751, + "grad_norm": 0.05782296136021614, + "learning_rate": 5.29700634227714e-05, + "loss": 0.1752, + "step": 12582 + }, + { + "epoch": 2.535563167439049, + "grad_norm": 0.06424509733915329, + "learning_rate": 5.2956760334554966e-05, + "loss": 0.1884, + "step": 12584 + }, + { + "epoch": 2.535966149506347, + "grad_norm": 0.0681728720664978, + "learning_rate": 5.2943457036295e-05, + "loss": 0.1922, + "step": 12586 + }, + { + "epoch": 2.536369131573645, + "grad_norm": 0.06878721714019775, + "learning_rate": 5.2930153528936556e-05, + "loss": 0.2045, + "step": 12588 + }, + { + "epoch": 2.536772113640943, + "grad_norm": 0.06963707506656647, + "learning_rate": 5.2916849813424694e-05, + "loss": 0.192, + "step": 12590 + }, + { + "epoch": 2.537175095708241, + "grad_norm": 0.05961614102125168, + "learning_rate": 5.2903545890704484e-05, + "loss": 0.1639, + "step": 12592 + }, + { + "epoch": 2.537578077775539, + "grad_norm": 0.06052708998322487, + "learning_rate": 5.289024176172102e-05, + "loss": 0.2408, + "step": 12594 + }, + { + "epoch": 2.5379810598428367, + "grad_norm": 0.05931401625275612, + "learning_rate": 5.28769374274194e-05, + "loss": 0.2079, + "step": 12596 + }, + { + "epoch": 2.538384041910135, + "grad_norm": 0.06881610304117203, + "learning_rate": 5.2863632888744753e-05, + "loss": 0.1564, + "step": 12598 + }, + { + "epoch": 2.538787023977433, + "grad_norm": 0.09120440483093262, + "learning_rate": 5.2850328146642194e-05, + "loss": 0.2046, + "step": 12600 + }, + { + "epoch": 2.539190006044731, + "grad_norm": 0.06185607239603996, + "learning_rate": 5.283702320205689e-05, + "loss": 0.1858, + "step": 12602 + }, + { + "epoch": 2.539592988112029, + "grad_norm": 0.06449907273054123, + "learning_rate": 5.282371805593399e-05, + "loss": 0.1725, + "step": 12604 + }, + { + "epoch": 2.539995970179327, + "grad_norm": 0.07941526174545288, + "learning_rate": 5.281041270921867e-05, + "loss": 0.1761, + "step": 12606 + }, + { + "epoch": 2.5403989522466253, + "grad_norm": 0.05380752682685852, + "learning_rate": 5.2797107162856154e-05, + "loss": 0.2393, + "step": 12608 + }, + { + "epoch": 2.540801934313923, + "grad_norm": 0.06477542966604233, + "learning_rate": 5.278380141779159e-05, + "loss": 0.2467, + "step": 12610 + }, + { + "epoch": 2.541204916381221, + "grad_norm": 0.07349900901317596, + "learning_rate": 5.277049547497023e-05, + "loss": 0.2821, + "step": 12612 + }, + { + "epoch": 2.541607898448519, + "grad_norm": 0.05324852094054222, + "learning_rate": 5.275718933533731e-05, + "loss": 0.2045, + "step": 12614 + }, + { + "epoch": 2.542010880515817, + "grad_norm": 0.052172720432281494, + "learning_rate": 5.274388299983807e-05, + "loss": 0.1792, + "step": 12616 + }, + { + "epoch": 2.542413862583115, + "grad_norm": 0.03972654789686203, + "learning_rate": 5.273057646941776e-05, + "loss": 0.183, + "step": 12618 + }, + { + "epoch": 2.542816844650413, + "grad_norm": 0.056424275040626526, + "learning_rate": 5.271726974502167e-05, + "loss": 0.2112, + "step": 12620 + }, + { + "epoch": 2.543219826717711, + "grad_norm": 0.0493265837430954, + "learning_rate": 5.270396282759508e-05, + "loss": 0.1641, + "step": 12622 + }, + { + "epoch": 2.543622808785009, + "grad_norm": 0.04471006616950035, + "learning_rate": 5.269065571808329e-05, + "loss": 0.1254, + "step": 12624 + }, + { + "epoch": 2.544025790852307, + "grad_norm": 0.05025114119052887, + "learning_rate": 5.2677348417431636e-05, + "loss": 0.1767, + "step": 12626 + }, + { + "epoch": 2.544428772919605, + "grad_norm": 0.07107166945934296, + "learning_rate": 5.266404092658542e-05, + "loss": 0.1732, + "step": 12628 + }, + { + "epoch": 2.544831754986903, + "grad_norm": 0.07669036090373993, + "learning_rate": 5.2650733246490014e-05, + "loss": 0.1624, + "step": 12630 + }, + { + "epoch": 2.545234737054201, + "grad_norm": 0.05052323639392853, + "learning_rate": 5.263742537809074e-05, + "loss": 0.1813, + "step": 12632 + }, + { + "epoch": 2.545637719121499, + "grad_norm": 0.04952344670891762, + "learning_rate": 5.262411732233299e-05, + "loss": 0.1793, + "step": 12634 + }, + { + "epoch": 2.5460407011887973, + "grad_norm": 0.06734666973352432, + "learning_rate": 5.261080908016215e-05, + "loss": 0.2026, + "step": 12636 + }, + { + "epoch": 2.5464436832560953, + "grad_norm": 0.056185994297266006, + "learning_rate": 5.2597500652523594e-05, + "loss": 0.1895, + "step": 12638 + }, + { + "epoch": 2.546846665323393, + "grad_norm": 0.04555206000804901, + "learning_rate": 5.258419204036275e-05, + "loss": 0.1778, + "step": 12640 + }, + { + "epoch": 2.547249647390691, + "grad_norm": 0.0584423765540123, + "learning_rate": 5.257088324462505e-05, + "loss": 0.1854, + "step": 12642 + }, + { + "epoch": 2.547652629457989, + "grad_norm": 0.05547767132520676, + "learning_rate": 5.255757426625589e-05, + "loss": 0.2358, + "step": 12644 + }, + { + "epoch": 2.548055611525287, + "grad_norm": 0.0526818111538887, + "learning_rate": 5.254426510620076e-05, + "loss": 0.1936, + "step": 12646 + }, + { + "epoch": 2.548458593592585, + "grad_norm": 0.059681832790374756, + "learning_rate": 5.253095576540511e-05, + "loss": 0.204, + "step": 12648 + }, + { + "epoch": 2.548861575659883, + "grad_norm": 0.11047457903623581, + "learning_rate": 5.25176462448144e-05, + "loss": 0.1929, + "step": 12650 + }, + { + "epoch": 2.5492645577271813, + "grad_norm": 0.047334007918834686, + "learning_rate": 5.250433654537413e-05, + "loss": 0.1852, + "step": 12652 + }, + { + "epoch": 2.5496675397944792, + "grad_norm": 0.061922837048769, + "learning_rate": 5.249102666802981e-05, + "loss": 0.1463, + "step": 12654 + }, + { + "epoch": 2.550070521861777, + "grad_norm": 0.07106788456439972, + "learning_rate": 5.247771661372692e-05, + "loss": 0.1574, + "step": 12656 + }, + { + "epoch": 2.550473503929075, + "grad_norm": 0.041897907853126526, + "learning_rate": 5.2464406383411004e-05, + "loss": 0.1597, + "step": 12658 + }, + { + "epoch": 2.550876485996373, + "grad_norm": 0.036984678357839584, + "learning_rate": 5.245109597802762e-05, + "loss": 0.167, + "step": 12660 + }, + { + "epoch": 2.551279468063671, + "grad_norm": 0.049853935837745667, + "learning_rate": 5.243778539852228e-05, + "loss": 0.2422, + "step": 12662 + }, + { + "epoch": 2.5516824501309694, + "grad_norm": 0.054746098816394806, + "learning_rate": 5.2424474645840574e-05, + "loss": 0.1764, + "step": 12664 + }, + { + "epoch": 2.5520854321982673, + "grad_norm": 0.06157633662223816, + "learning_rate": 5.241116372092806e-05, + "loss": 0.1968, + "step": 12666 + }, + { + "epoch": 2.5524884142655653, + "grad_norm": 0.06506279110908508, + "learning_rate": 5.2397852624730327e-05, + "loss": 0.1789, + "step": 12668 + }, + { + "epoch": 2.552891396332863, + "grad_norm": 0.03678274154663086, + "learning_rate": 5.2384541358192986e-05, + "loss": 0.134, + "step": 12670 + }, + { + "epoch": 2.553294378400161, + "grad_norm": 0.06827437877655029, + "learning_rate": 5.237122992226165e-05, + "loss": 0.2136, + "step": 12672 + }, + { + "epoch": 2.553697360467459, + "grad_norm": 0.058229926973581314, + "learning_rate": 5.2357918317881915e-05, + "loss": 0.2051, + "step": 12674 + }, + { + "epoch": 2.554100342534757, + "grad_norm": 0.06786002963781357, + "learning_rate": 5.2344606545999433e-05, + "loss": 0.1732, + "step": 12676 + }, + { + "epoch": 2.554503324602055, + "grad_norm": 0.06592409312725067, + "learning_rate": 5.233129460755987e-05, + "loss": 0.183, + "step": 12678 + }, + { + "epoch": 2.5549063066693534, + "grad_norm": 0.04685007780790329, + "learning_rate": 5.2317982503508856e-05, + "loss": 0.1344, + "step": 12680 + }, + { + "epoch": 2.5553092887366513, + "grad_norm": 0.09164592623710632, + "learning_rate": 5.230467023479206e-05, + "loss": 0.1841, + "step": 12682 + }, + { + "epoch": 2.5557122708039492, + "grad_norm": 0.07550480961799622, + "learning_rate": 5.22913578023552e-05, + "loss": 0.2059, + "step": 12684 + }, + { + "epoch": 2.556115252871247, + "grad_norm": 0.054588042199611664, + "learning_rate": 5.227804520714392e-05, + "loss": 0.1812, + "step": 12686 + }, + { + "epoch": 2.556518234938545, + "grad_norm": 0.2929278314113617, + "learning_rate": 5.226473245010397e-05, + "loss": 0.2214, + "step": 12688 + }, + { + "epoch": 2.5569212170058435, + "grad_norm": 0.04900304600596428, + "learning_rate": 5.2251419532181054e-05, + "loss": 0.1693, + "step": 12690 + }, + { + "epoch": 2.5573241990731415, + "grad_norm": 0.06676102429628372, + "learning_rate": 5.223810645432088e-05, + "loss": 0.1692, + "step": 12692 + }, + { + "epoch": 2.5577271811404394, + "grad_norm": 0.06755790114402771, + "learning_rate": 5.2224793217469213e-05, + "loss": 0.1997, + "step": 12694 + }, + { + "epoch": 2.5581301632077373, + "grad_norm": 0.4008621871471405, + "learning_rate": 5.221147982257178e-05, + "loss": 0.2239, + "step": 12696 + }, + { + "epoch": 2.5585331452750353, + "grad_norm": 0.05779675021767616, + "learning_rate": 5.2198166270574366e-05, + "loss": 0.2308, + "step": 12698 + }, + { + "epoch": 2.5589361273423332, + "grad_norm": 0.060293830931186676, + "learning_rate": 5.2184852562422746e-05, + "loss": 0.2085, + "step": 12700 + }, + { + "epoch": 2.559339109409631, + "grad_norm": 0.05748559907078743, + "learning_rate": 5.2171538699062686e-05, + "loss": 0.2141, + "step": 12702 + }, + { + "epoch": 2.559742091476929, + "grad_norm": 0.04755993187427521, + "learning_rate": 5.215822468143998e-05, + "loss": 0.151, + "step": 12704 + }, + { + "epoch": 2.560145073544227, + "grad_norm": 0.06931561976671219, + "learning_rate": 5.214491051050045e-05, + "loss": 0.2315, + "step": 12706 + }, + { + "epoch": 2.5605480556115254, + "grad_norm": 0.07574167847633362, + "learning_rate": 5.2131596187189914e-05, + "loss": 0.2175, + "step": 12708 + }, + { + "epoch": 2.5609510376788234, + "grad_norm": 0.057953450828790665, + "learning_rate": 5.2118281712454184e-05, + "loss": 0.2294, + "step": 12710 + }, + { + "epoch": 2.5613540197461213, + "grad_norm": 0.05987294390797615, + "learning_rate": 5.210496708723912e-05, + "loss": 0.169, + "step": 12712 + }, + { + "epoch": 2.5617570018134193, + "grad_norm": 0.05373973026871681, + "learning_rate": 5.2091652312490556e-05, + "loss": 0.1728, + "step": 12714 + }, + { + "epoch": 2.562159983880717, + "grad_norm": 0.057644158601760864, + "learning_rate": 5.207833738915435e-05, + "loss": 0.2014, + "step": 12716 + }, + { + "epoch": 2.5625629659480156, + "grad_norm": 0.06298521906137466, + "learning_rate": 5.206502231817639e-05, + "loss": 0.1506, + "step": 12718 + }, + { + "epoch": 2.5629659480153135, + "grad_norm": 0.06016525253653526, + "learning_rate": 5.2051707100502534e-05, + "loss": 0.2087, + "step": 12720 + }, + { + "epoch": 2.5633689300826115, + "grad_norm": 0.08092235773801804, + "learning_rate": 5.2038391737078694e-05, + "loss": 0.2032, + "step": 12722 + }, + { + "epoch": 2.5637719121499094, + "grad_norm": 0.053698521107435226, + "learning_rate": 5.202507622885078e-05, + "loss": 0.178, + "step": 12724 + }, + { + "epoch": 2.5641748942172073, + "grad_norm": 0.06155625730752945, + "learning_rate": 5.201176057676467e-05, + "loss": 0.189, + "step": 12726 + }, + { + "epoch": 2.5645778762845053, + "grad_norm": 0.05709967389702797, + "learning_rate": 5.199844478176631e-05, + "loss": 0.2007, + "step": 12728 + }, + { + "epoch": 2.5649808583518032, + "grad_norm": 0.05761351063847542, + "learning_rate": 5.1985128844801633e-05, + "loss": 0.1464, + "step": 12730 + }, + { + "epoch": 2.565383840419101, + "grad_norm": 0.05454608052968979, + "learning_rate": 5.197181276681657e-05, + "loss": 0.2254, + "step": 12732 + }, + { + "epoch": 2.565786822486399, + "grad_norm": 0.23275476694107056, + "learning_rate": 5.195849654875709e-05, + "loss": 0.2148, + "step": 12734 + }, + { + "epoch": 2.5661898045536975, + "grad_norm": 0.09468139708042145, + "learning_rate": 5.194518019156914e-05, + "loss": 0.1706, + "step": 12736 + }, + { + "epoch": 2.5665927866209954, + "grad_norm": 0.05750922113656998, + "learning_rate": 5.19318636961987e-05, + "loss": 0.1874, + "step": 12738 + }, + { + "epoch": 2.5669957686882934, + "grad_norm": 0.05291645601391792, + "learning_rate": 5.191854706359175e-05, + "loss": 0.2202, + "step": 12740 + }, + { + "epoch": 2.5673987507555913, + "grad_norm": 0.04654770344495773, + "learning_rate": 5.190523029469431e-05, + "loss": 0.2129, + "step": 12742 + }, + { + "epoch": 2.5678017328228893, + "grad_norm": 0.0438658781349659, + "learning_rate": 5.189191339045233e-05, + "loss": 0.1831, + "step": 12744 + }, + { + "epoch": 2.5682047148901876, + "grad_norm": 0.056785158812999725, + "learning_rate": 5.1878596351811845e-05, + "loss": 0.2295, + "step": 12746 + }, + { + "epoch": 2.5686076969574856, + "grad_norm": 0.03449772298336029, + "learning_rate": 5.1865279179718906e-05, + "loss": 0.1447, + "step": 12748 + }, + { + "epoch": 2.5690106790247835, + "grad_norm": 0.06026627495884895, + "learning_rate": 5.1851961875119493e-05, + "loss": 0.1828, + "step": 12750 + }, + { + "epoch": 2.5694136610920815, + "grad_norm": 0.07659012079238892, + "learning_rate": 5.183864443895967e-05, + "loss": 0.2014, + "step": 12752 + }, + { + "epoch": 2.5698166431593794, + "grad_norm": 0.05824153125286102, + "learning_rate": 5.182532687218551e-05, + "loss": 0.2268, + "step": 12754 + }, + { + "epoch": 2.5702196252266774, + "grad_norm": 0.05987565591931343, + "learning_rate": 5.181200917574303e-05, + "loss": 0.207, + "step": 12756 + }, + { + "epoch": 2.5706226072939753, + "grad_norm": 0.06632793694734573, + "learning_rate": 5.179869135057831e-05, + "loss": 0.151, + "step": 12758 + }, + { + "epoch": 2.5710255893612732, + "grad_norm": 0.05087270215153694, + "learning_rate": 5.178537339763745e-05, + "loss": 0.2149, + "step": 12760 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.07185879349708557, + "learning_rate": 5.177205531786651e-05, + "loss": 0.2331, + "step": 12762 + }, + { + "epoch": 2.5718315534958696, + "grad_norm": 0.060018185526132584, + "learning_rate": 5.175873711221161e-05, + "loss": 0.1866, + "step": 12764 + }, + { + "epoch": 2.5722345355631675, + "grad_norm": 0.08213285356760025, + "learning_rate": 5.174541878161881e-05, + "loss": 0.2149, + "step": 12766 + }, + { + "epoch": 2.5726375176304654, + "grad_norm": 0.04798312857747078, + "learning_rate": 5.173210032703427e-05, + "loss": 0.1771, + "step": 12768 + }, + { + "epoch": 2.5730404996977634, + "grad_norm": 0.07627954334020615, + "learning_rate": 5.171878174940409e-05, + "loss": 0.2149, + "step": 12770 + }, + { + "epoch": 2.5734434817650613, + "grad_norm": 0.07262519747018814, + "learning_rate": 5.1705463049674397e-05, + "loss": 0.2236, + "step": 12772 + }, + { + "epoch": 2.5738464638323597, + "grad_norm": 0.05804283544421196, + "learning_rate": 5.169214422879134e-05, + "loss": 0.2591, + "step": 12774 + }, + { + "epoch": 2.5742494458996577, + "grad_norm": 0.05936234071850777, + "learning_rate": 5.167882528770107e-05, + "loss": 0.1525, + "step": 12776 + }, + { + "epoch": 2.5746524279669556, + "grad_norm": 0.07271191477775574, + "learning_rate": 5.1665506227349726e-05, + "loss": 0.192, + "step": 12778 + }, + { + "epoch": 2.5750554100342535, + "grad_norm": 0.07205134630203247, + "learning_rate": 5.165218704868349e-05, + "loss": 0.182, + "step": 12780 + }, + { + "epoch": 2.5754583921015515, + "grad_norm": 0.055812589824199677, + "learning_rate": 5.1638867752648534e-05, + "loss": 0.2042, + "step": 12782 + }, + { + "epoch": 2.5758613741688494, + "grad_norm": 0.05865110456943512, + "learning_rate": 5.1625548340191024e-05, + "loss": 0.1843, + "step": 12784 + }, + { + "epoch": 2.5762643562361474, + "grad_norm": 0.04999213665723801, + "learning_rate": 5.161222881225716e-05, + "loss": 0.1712, + "step": 12786 + }, + { + "epoch": 2.5766673383034453, + "grad_norm": 0.05770622566342354, + "learning_rate": 5.1598909169793144e-05, + "loss": 0.2321, + "step": 12788 + }, + { + "epoch": 2.5770703203707432, + "grad_norm": 0.04536011442542076, + "learning_rate": 5.1585589413745176e-05, + "loss": 0.1553, + "step": 12790 + }, + { + "epoch": 2.5774733024380416, + "grad_norm": 0.07230181246995926, + "learning_rate": 5.157226954505946e-05, + "loss": 0.2216, + "step": 12792 + }, + { + "epoch": 2.5778762845053396, + "grad_norm": 0.06430327147245407, + "learning_rate": 5.1558949564682245e-05, + "loss": 0.2242, + "step": 12794 + }, + { + "epoch": 2.5782792665726375, + "grad_norm": 0.05737382546067238, + "learning_rate": 5.1545629473559745e-05, + "loss": 0.1762, + "step": 12796 + }, + { + "epoch": 2.5786822486399354, + "grad_norm": 0.04715484380722046, + "learning_rate": 5.1532309272638194e-05, + "loss": 0.1554, + "step": 12798 + }, + { + "epoch": 2.5790852307072334, + "grad_norm": 0.05664588510990143, + "learning_rate": 5.151898896286385e-05, + "loss": 0.1658, + "step": 12800 + }, + { + "epoch": 2.5794882127745318, + "grad_norm": 0.05977484956383705, + "learning_rate": 5.150566854518294e-05, + "loss": 0.1833, + "step": 12802 + }, + { + "epoch": 2.5798911948418297, + "grad_norm": 0.04098358750343323, + "learning_rate": 5.149234802054176e-05, + "loss": 0.2023, + "step": 12804 + }, + { + "epoch": 2.5802941769091277, + "grad_norm": 0.04994696006178856, + "learning_rate": 5.147902738988657e-05, + "loss": 0.1946, + "step": 12806 + }, + { + "epoch": 2.5806971589764256, + "grad_norm": 0.07139338552951813, + "learning_rate": 5.146570665416363e-05, + "loss": 0.2112, + "step": 12808 + }, + { + "epoch": 2.5811001410437235, + "grad_norm": 0.058642320334911346, + "learning_rate": 5.145238581431923e-05, + "loss": 0.21, + "step": 12810 + }, + { + "epoch": 2.5815031231110215, + "grad_norm": 0.06522581726312637, + "learning_rate": 5.143906487129967e-05, + "loss": 0.1602, + "step": 12812 + }, + { + "epoch": 2.5819061051783194, + "grad_norm": 0.05621035769581795, + "learning_rate": 5.1425743826051245e-05, + "loss": 0.2493, + "step": 12814 + }, + { + "epoch": 2.5823090872456174, + "grad_norm": 0.04596579074859619, + "learning_rate": 5.1412422679520245e-05, + "loss": 0.205, + "step": 12816 + }, + { + "epoch": 2.5827120693129153, + "grad_norm": 0.04868212342262268, + "learning_rate": 5.139910143265302e-05, + "loss": 0.2341, + "step": 12818 + }, + { + "epoch": 2.5831150513802137, + "grad_norm": 0.046460337936878204, + "learning_rate": 5.1385780086395853e-05, + "loss": 0.1865, + "step": 12820 + }, + { + "epoch": 2.5835180334475116, + "grad_norm": 0.04652968794107437, + "learning_rate": 5.137245864169507e-05, + "loss": 0.1769, + "step": 12822 + }, + { + "epoch": 2.5839210155148096, + "grad_norm": 0.05539759248495102, + "learning_rate": 5.135913709949706e-05, + "loss": 0.1864, + "step": 12824 + }, + { + "epoch": 2.5843239975821075, + "grad_norm": 0.07780510932207108, + "learning_rate": 5.134581546074809e-05, + "loss": 0.1687, + "step": 12826 + }, + { + "epoch": 2.5847269796494055, + "grad_norm": 0.052189409732818604, + "learning_rate": 5.133249372639455e-05, + "loss": 0.2018, + "step": 12828 + }, + { + "epoch": 2.585129961716704, + "grad_norm": 0.0710270032286644, + "learning_rate": 5.131917189738279e-05, + "loss": 0.2054, + "step": 12830 + }, + { + "epoch": 2.585532943784002, + "grad_norm": 0.04049243405461311, + "learning_rate": 5.130584997465917e-05, + "loss": 0.2103, + "step": 12832 + }, + { + "epoch": 2.5859359258512997, + "grad_norm": 0.08410030603408813, + "learning_rate": 5.129252795917006e-05, + "loss": 0.1936, + "step": 12834 + }, + { + "epoch": 2.5863389079185977, + "grad_norm": 0.07183802127838135, + "learning_rate": 5.127920585186181e-05, + "loss": 0.2514, + "step": 12836 + }, + { + "epoch": 2.5867418899858956, + "grad_norm": 0.046077702194452286, + "learning_rate": 5.1265883653680825e-05, + "loss": 0.1637, + "step": 12838 + }, + { + "epoch": 2.5871448720531935, + "grad_norm": 0.06531906127929688, + "learning_rate": 5.1252561365573516e-05, + "loss": 0.2507, + "step": 12840 + }, + { + "epoch": 2.5875478541204915, + "grad_norm": 0.03278656303882599, + "learning_rate": 5.123923898848623e-05, + "loss": 0.1597, + "step": 12842 + }, + { + "epoch": 2.5879508361877894, + "grad_norm": 0.06488583981990814, + "learning_rate": 5.122591652336539e-05, + "loss": 0.1736, + "step": 12844 + }, + { + "epoch": 2.588353818255088, + "grad_norm": 0.04900403320789337, + "learning_rate": 5.1212593971157405e-05, + "loss": 0.244, + "step": 12846 + }, + { + "epoch": 2.5887568003223858, + "grad_norm": 0.04153195396065712, + "learning_rate": 5.119927133280869e-05, + "loss": 0.1476, + "step": 12848 + }, + { + "epoch": 2.5891597823896837, + "grad_norm": 0.057202406227588654, + "learning_rate": 5.118594860926564e-05, + "loss": 0.1796, + "step": 12850 + }, + { + "epoch": 2.5895627644569816, + "grad_norm": 0.06250036507844925, + "learning_rate": 5.117262580147472e-05, + "loss": 0.1753, + "step": 12852 + }, + { + "epoch": 2.5899657465242796, + "grad_norm": 0.057463813573122025, + "learning_rate": 5.115930291038232e-05, + "loss": 0.186, + "step": 12854 + }, + { + "epoch": 2.5903687285915775, + "grad_norm": 0.049552544951438904, + "learning_rate": 5.114597993693491e-05, + "loss": 0.1732, + "step": 12856 + }, + { + "epoch": 2.590771710658876, + "grad_norm": 0.06389881670475006, + "learning_rate": 5.11326568820789e-05, + "loss": 0.1787, + "step": 12858 + }, + { + "epoch": 2.591174692726174, + "grad_norm": 0.0628291442990303, + "learning_rate": 5.111933374676077e-05, + "loss": 0.2285, + "step": 12860 + }, + { + "epoch": 2.591577674793472, + "grad_norm": 0.0501752570271492, + "learning_rate": 5.110601053192696e-05, + "loss": 0.1721, + "step": 12862 + }, + { + "epoch": 2.5919806568607697, + "grad_norm": 0.05748743563890457, + "learning_rate": 5.1092687238523926e-05, + "loss": 0.2718, + "step": 12864 + }, + { + "epoch": 2.5923836389280677, + "grad_norm": 0.06448312848806381, + "learning_rate": 5.1079363867498134e-05, + "loss": 0.2096, + "step": 12866 + }, + { + "epoch": 2.5927866209953656, + "grad_norm": 0.04302519932389259, + "learning_rate": 5.1066040419796066e-05, + "loss": 0.2141, + "step": 12868 + }, + { + "epoch": 2.5931896030626636, + "grad_norm": 0.03750751540064812, + "learning_rate": 5.10527168963642e-05, + "loss": 0.1829, + "step": 12870 + }, + { + "epoch": 2.5935925851299615, + "grad_norm": 0.04912559688091278, + "learning_rate": 5.103939329814898e-05, + "loss": 0.1977, + "step": 12872 + }, + { + "epoch": 2.59399556719726, + "grad_norm": 0.05096503719687462, + "learning_rate": 5.1026069626096964e-05, + "loss": 0.1397, + "step": 12874 + }, + { + "epoch": 2.594398549264558, + "grad_norm": 0.059429194778203964, + "learning_rate": 5.101274588115457e-05, + "loss": 0.1858, + "step": 12876 + }, + { + "epoch": 2.5948015313318558, + "grad_norm": 0.05182720348238945, + "learning_rate": 5.099942206426833e-05, + "loss": 0.2315, + "step": 12878 + }, + { + "epoch": 2.5952045133991537, + "grad_norm": 0.06778381764888763, + "learning_rate": 5.098609817638477e-05, + "loss": 0.1854, + "step": 12880 + }, + { + "epoch": 2.5956074954664516, + "grad_norm": 0.04259805753827095, + "learning_rate": 5.097277421845035e-05, + "loss": 0.1813, + "step": 12882 + }, + { + "epoch": 2.5960104775337496, + "grad_norm": 0.06854765862226486, + "learning_rate": 5.0959450191411606e-05, + "loss": 0.2141, + "step": 12884 + }, + { + "epoch": 2.596413459601048, + "grad_norm": 0.05548940226435661, + "learning_rate": 5.094612609621506e-05, + "loss": 0.1481, + "step": 12886 + }, + { + "epoch": 2.596816441668346, + "grad_norm": 0.06769641488790512, + "learning_rate": 5.093280193380723e-05, + "loss": 0.1951, + "step": 12888 + }, + { + "epoch": 2.597219423735644, + "grad_norm": 0.05677637830376625, + "learning_rate": 5.0919477705134644e-05, + "loss": 0.1935, + "step": 12890 + }, + { + "epoch": 2.597622405802942, + "grad_norm": 0.0598987378180027, + "learning_rate": 5.090615341114383e-05, + "loss": 0.2259, + "step": 12892 + }, + { + "epoch": 2.5980253878702397, + "grad_norm": 0.05017285421490669, + "learning_rate": 5.0892829052781334e-05, + "loss": 0.2007, + "step": 12894 + }, + { + "epoch": 2.5984283699375377, + "grad_norm": 0.04577360674738884, + "learning_rate": 5.087950463099367e-05, + "loss": 0.2345, + "step": 12896 + }, + { + "epoch": 2.5988313520048356, + "grad_norm": 0.04841199517250061, + "learning_rate": 5.086618014672743e-05, + "loss": 0.2259, + "step": 12898 + }, + { + "epoch": 2.5992343340721336, + "grad_norm": 0.061152152717113495, + "learning_rate": 5.0852855600929116e-05, + "loss": 0.2082, + "step": 12900 + }, + { + "epoch": 2.599637316139432, + "grad_norm": 0.052234455943107605, + "learning_rate": 5.0839530994545316e-05, + "loss": 0.1641, + "step": 12902 + }, + { + "epoch": 2.60004029820673, + "grad_norm": 0.07535497099161148, + "learning_rate": 5.082620632852258e-05, + "loss": 0.2003, + "step": 12904 + }, + { + "epoch": 2.600443280274028, + "grad_norm": 0.05371668562293053, + "learning_rate": 5.081288160380745e-05, + "loss": 0.1998, + "step": 12906 + }, + { + "epoch": 2.6008462623413258, + "grad_norm": 0.04646865651011467, + "learning_rate": 5.079955682134652e-05, + "loss": 0.1759, + "step": 12908 + }, + { + "epoch": 2.6012492444086237, + "grad_norm": 0.05468188226222992, + "learning_rate": 5.0786231982086364e-05, + "loss": 0.147, + "step": 12910 + }, + { + "epoch": 2.601652226475922, + "grad_norm": 0.06912878155708313, + "learning_rate": 5.077290708697353e-05, + "loss": 0.197, + "step": 12912 + }, + { + "epoch": 2.60205520854322, + "grad_norm": 0.05467282235622406, + "learning_rate": 5.075958213695461e-05, + "loss": 0.1566, + "step": 12914 + }, + { + "epoch": 2.602458190610518, + "grad_norm": 0.042261648923158646, + "learning_rate": 5.0746257132976205e-05, + "loss": 0.1476, + "step": 12916 + }, + { + "epoch": 2.602861172677816, + "grad_norm": 0.05570102855563164, + "learning_rate": 5.073293207598487e-05, + "loss": 0.1765, + "step": 12918 + }, + { + "epoch": 2.603264154745114, + "grad_norm": 0.05489011108875275, + "learning_rate": 5.0719606966927226e-05, + "loss": 0.171, + "step": 12920 + }, + { + "epoch": 2.603667136812412, + "grad_norm": 0.052448570728302, + "learning_rate": 5.070628180674986e-05, + "loss": 0.1551, + "step": 12922 + }, + { + "epoch": 2.6040701188797097, + "grad_norm": 0.24931637942790985, + "learning_rate": 5.0692956596399344e-05, + "loss": 0.2118, + "step": 12924 + }, + { + "epoch": 2.6044731009470077, + "grad_norm": 0.052810054272413254, + "learning_rate": 5.067963133682232e-05, + "loss": 0.2092, + "step": 12926 + }, + { + "epoch": 2.6048760830143056, + "grad_norm": 0.051145680248737335, + "learning_rate": 5.066630602896536e-05, + "loss": 0.2083, + "step": 12928 + }, + { + "epoch": 2.605279065081604, + "grad_norm": 0.08187511563301086, + "learning_rate": 5.0652980673775085e-05, + "loss": 0.1843, + "step": 12930 + }, + { + "epoch": 2.605682047148902, + "grad_norm": 0.08010423928499222, + "learning_rate": 5.0639655272198116e-05, + "loss": 0.1689, + "step": 12932 + }, + { + "epoch": 2.6060850292162, + "grad_norm": 0.05526207014918327, + "learning_rate": 5.062632982518105e-05, + "loss": 0.17, + "step": 12934 + }, + { + "epoch": 2.606488011283498, + "grad_norm": 0.05141100659966469, + "learning_rate": 5.061300433367051e-05, + "loss": 0.2417, + "step": 12936 + }, + { + "epoch": 2.6068909933507958, + "grad_norm": 0.05525074899196625, + "learning_rate": 5.059967879861314e-05, + "loss": 0.2469, + "step": 12938 + }, + { + "epoch": 2.607293975418094, + "grad_norm": 0.051896847784519196, + "learning_rate": 5.058635322095553e-05, + "loss": 0.1749, + "step": 12940 + }, + { + "epoch": 2.607696957485392, + "grad_norm": 0.04624801501631737, + "learning_rate": 5.057302760164433e-05, + "loss": 0.2052, + "step": 12942 + }, + { + "epoch": 2.60809993955269, + "grad_norm": 0.08334264159202576, + "learning_rate": 5.055970194162618e-05, + "loss": 0.244, + "step": 12944 + }, + { + "epoch": 2.608502921619988, + "grad_norm": 0.0596526637673378, + "learning_rate": 5.054637624184768e-05, + "loss": 0.1921, + "step": 12946 + }, + { + "epoch": 2.608905903687286, + "grad_norm": 0.07964113354682922, + "learning_rate": 5.053305050325549e-05, + "loss": 0.2158, + "step": 12948 + }, + { + "epoch": 2.609308885754584, + "grad_norm": 0.06931203603744507, + "learning_rate": 5.051972472679626e-05, + "loss": 0.1674, + "step": 12950 + }, + { + "epoch": 2.609711867821882, + "grad_norm": 0.06616196036338806, + "learning_rate": 5.0506398913416596e-05, + "loss": 0.2039, + "step": 12952 + }, + { + "epoch": 2.6101148498891797, + "grad_norm": 0.04973366856575012, + "learning_rate": 5.049307306406317e-05, + "loss": 0.2103, + "step": 12954 + }, + { + "epoch": 2.6105178319564777, + "grad_norm": 0.07316756248474121, + "learning_rate": 5.047974717968262e-05, + "loss": 0.2184, + "step": 12956 + }, + { + "epoch": 2.610920814023776, + "grad_norm": 0.06390600651502609, + "learning_rate": 5.0466421261221606e-05, + "loss": 0.2159, + "step": 12958 + }, + { + "epoch": 2.611323796091074, + "grad_norm": 0.07867567986249924, + "learning_rate": 5.045309530962676e-05, + "loss": 0.2068, + "step": 12960 + }, + { + "epoch": 2.611726778158372, + "grad_norm": 0.06732230633497238, + "learning_rate": 5.0439769325844765e-05, + "loss": 0.1552, + "step": 12962 + }, + { + "epoch": 2.61212976022567, + "grad_norm": 0.05458653345704079, + "learning_rate": 5.042644331082225e-05, + "loss": 0.2168, + "step": 12964 + }, + { + "epoch": 2.612532742292968, + "grad_norm": 0.05227232724428177, + "learning_rate": 5.041311726550587e-05, + "loss": 0.1737, + "step": 12966 + }, + { + "epoch": 2.612935724360266, + "grad_norm": 0.041917361319065094, + "learning_rate": 5.0399791190842324e-05, + "loss": 0.1745, + "step": 12968 + }, + { + "epoch": 2.613338706427564, + "grad_norm": 0.048856884241104126, + "learning_rate": 5.0386465087778235e-05, + "loss": 0.1913, + "step": 12970 + }, + { + "epoch": 2.613741688494862, + "grad_norm": 0.060772497206926346, + "learning_rate": 5.037313895726029e-05, + "loss": 0.1481, + "step": 12972 + }, + { + "epoch": 2.61414467056216, + "grad_norm": 0.06059327349066734, + "learning_rate": 5.035981280023516e-05, + "loss": 0.1709, + "step": 12974 + }, + { + "epoch": 2.614547652629458, + "grad_norm": 0.08554743230342865, + "learning_rate": 5.034648661764949e-05, + "loss": 0.212, + "step": 12976 + }, + { + "epoch": 2.614950634696756, + "grad_norm": 0.05296426638960838, + "learning_rate": 5.0333160410449966e-05, + "loss": 0.1944, + "step": 12978 + }, + { + "epoch": 2.615353616764054, + "grad_norm": 0.05745255574584007, + "learning_rate": 5.031983417958327e-05, + "loss": 0.16, + "step": 12980 + }, + { + "epoch": 2.615756598831352, + "grad_norm": 0.05538434907793999, + "learning_rate": 5.030650792599605e-05, + "loss": 0.2114, + "step": 12982 + }, + { + "epoch": 2.6161595808986498, + "grad_norm": 0.04889480769634247, + "learning_rate": 5.0293181650635e-05, + "loss": 0.2085, + "step": 12984 + }, + { + "epoch": 2.616562562965948, + "grad_norm": 0.04710153490304947, + "learning_rate": 5.0279855354446815e-05, + "loss": 0.2168, + "step": 12986 + }, + { + "epoch": 2.616965545033246, + "grad_norm": 0.061837129294872284, + "learning_rate": 5.026652903837813e-05, + "loss": 0.1725, + "step": 12988 + }, + { + "epoch": 2.617368527100544, + "grad_norm": 0.06950433552265167, + "learning_rate": 5.025320270337566e-05, + "loss": 0.2282, + "step": 12990 + }, + { + "epoch": 2.617771509167842, + "grad_norm": 0.07739868760108948, + "learning_rate": 5.0239876350386076e-05, + "loss": 0.229, + "step": 12992 + }, + { + "epoch": 2.61817449123514, + "grad_norm": 0.04732915386557579, + "learning_rate": 5.022654998035604e-05, + "loss": 0.144, + "step": 12994 + }, + { + "epoch": 2.6185774733024383, + "grad_norm": 0.05778880417346954, + "learning_rate": 5.021322359423228e-05, + "loss": 0.2039, + "step": 12996 + }, + { + "epoch": 2.6189804553697362, + "grad_norm": 0.05006580054759979, + "learning_rate": 5.019989719296144e-05, + "loss": 0.1509, + "step": 12998 + }, + { + "epoch": 2.619383437437034, + "grad_norm": 0.04661744460463524, + "learning_rate": 5.018657077749024e-05, + "loss": 0.1915, + "step": 13000 + }, + { + "epoch": 2.619786419504332, + "grad_norm": 0.07048065215349197, + "learning_rate": 5.0173244348765345e-05, + "loss": 0.1915, + "step": 13002 + }, + { + "epoch": 2.62018940157163, + "grad_norm": 0.05740850046277046, + "learning_rate": 5.0159917907733436e-05, + "loss": 0.2505, + "step": 13004 + }, + { + "epoch": 2.620592383638928, + "grad_norm": 0.06299738585948944, + "learning_rate": 5.0146591455341217e-05, + "loss": 0.2106, + "step": 13006 + }, + { + "epoch": 2.620995365706226, + "grad_norm": 0.04687739536166191, + "learning_rate": 5.013326499253539e-05, + "loss": 0.1793, + "step": 13008 + }, + { + "epoch": 2.621398347773524, + "grad_norm": 0.05674981698393822, + "learning_rate": 5.0119938520262624e-05, + "loss": 0.1942, + "step": 13010 + }, + { + "epoch": 2.621801329840822, + "grad_norm": 0.04868143051862717, + "learning_rate": 5.010661203946961e-05, + "loss": 0.1878, + "step": 13012 + }, + { + "epoch": 2.62220431190812, + "grad_norm": 0.037791360169649124, + "learning_rate": 5.0093285551103064e-05, + "loss": 0.1803, + "step": 13014 + }, + { + "epoch": 2.622607293975418, + "grad_norm": 0.05020664632320404, + "learning_rate": 5.007995905610964e-05, + "loss": 0.2101, + "step": 13016 + }, + { + "epoch": 2.623010276042716, + "grad_norm": 0.044175636023283005, + "learning_rate": 5.006663255543607e-05, + "loss": 0.1813, + "step": 13018 + }, + { + "epoch": 2.623413258110014, + "grad_norm": 0.05268344283103943, + "learning_rate": 5.0053306050029026e-05, + "loss": 0.244, + "step": 13020 + }, + { + "epoch": 2.623816240177312, + "grad_norm": 0.05456307902932167, + "learning_rate": 5.003997954083519e-05, + "loss": 0.142, + "step": 13022 + }, + { + "epoch": 2.6242192222446104, + "grad_norm": 0.05344918742775917, + "learning_rate": 5.002665302880129e-05, + "loss": 0.1684, + "step": 13024 + }, + { + "epoch": 2.6246222043119083, + "grad_norm": 0.07235932350158691, + "learning_rate": 5.0013326514874e-05, + "loss": 0.2067, + "step": 13026 + }, + { + "epoch": 2.6250251863792062, + "grad_norm": 0.051819127053022385, + "learning_rate": 5e-05, + "loss": 0.1963, + "step": 13028 + }, + { + "epoch": 2.625428168446504, + "grad_norm": 0.057188838720321655, + "learning_rate": 4.998667348512601e-05, + "loss": 0.1831, + "step": 13030 + }, + { + "epoch": 2.625831150513802, + "grad_norm": 0.05618830397725105, + "learning_rate": 4.9973346971198725e-05, + "loss": 0.2096, + "step": 13032 + }, + { + "epoch": 2.6262341325811, + "grad_norm": 0.060924481600522995, + "learning_rate": 4.99600204591648e-05, + "loss": 0.2214, + "step": 13034 + }, + { + "epoch": 2.626637114648398, + "grad_norm": 0.05707908049225807, + "learning_rate": 4.994669394997099e-05, + "loss": 0.2045, + "step": 13036 + }, + { + "epoch": 2.627040096715696, + "grad_norm": 0.057565122842788696, + "learning_rate": 4.9933367444563936e-05, + "loss": 0.2445, + "step": 13038 + }, + { + "epoch": 2.627443078782994, + "grad_norm": 0.046373993158340454, + "learning_rate": 4.9920040943890364e-05, + "loss": 0.1656, + "step": 13040 + }, + { + "epoch": 2.6278460608502923, + "grad_norm": 0.04796910285949707, + "learning_rate": 4.9906714448896955e-05, + "loss": 0.1453, + "step": 13042 + }, + { + "epoch": 2.62824904291759, + "grad_norm": 0.048241354525089264, + "learning_rate": 4.9893387960530406e-05, + "loss": 0.1605, + "step": 13044 + }, + { + "epoch": 2.628652024984888, + "grad_norm": 0.06200138479471207, + "learning_rate": 4.9880061479737374e-05, + "loss": 0.1776, + "step": 13046 + }, + { + "epoch": 2.629055007052186, + "grad_norm": 0.058670494705438614, + "learning_rate": 4.9866735007464614e-05, + "loss": 0.2085, + "step": 13048 + }, + { + "epoch": 2.629457989119484, + "grad_norm": 0.051949888467788696, + "learning_rate": 4.985340854465878e-05, + "loss": 0.2077, + "step": 13050 + }, + { + "epoch": 2.6298609711867824, + "grad_norm": 0.054738037288188934, + "learning_rate": 4.984008209226657e-05, + "loss": 0.1633, + "step": 13052 + }, + { + "epoch": 2.6302639532540804, + "grad_norm": 0.058610428124666214, + "learning_rate": 4.982675565123467e-05, + "loss": 0.2085, + "step": 13054 + }, + { + "epoch": 2.6306669353213783, + "grad_norm": 0.0664537250995636, + "learning_rate": 4.981342922250978e-05, + "loss": 0.1794, + "step": 13056 + }, + { + "epoch": 2.6310699173886762, + "grad_norm": 0.05168154463171959, + "learning_rate": 4.980010280703855e-05, + "loss": 0.1925, + "step": 13058 + }, + { + "epoch": 2.631472899455974, + "grad_norm": 0.07177409529685974, + "learning_rate": 4.978677640576773e-05, + "loss": 0.2115, + "step": 13060 + }, + { + "epoch": 2.631875881523272, + "grad_norm": 0.05283867195248604, + "learning_rate": 4.977345001964395e-05, + "loss": 0.1536, + "step": 13062 + }, + { + "epoch": 2.63227886359057, + "grad_norm": 0.08195855468511581, + "learning_rate": 4.9760123649613936e-05, + "loss": 0.208, + "step": 13064 + }, + { + "epoch": 2.632681845657868, + "grad_norm": 0.08787176758050919, + "learning_rate": 4.9746797296624346e-05, + "loss": 0.1907, + "step": 13066 + }, + { + "epoch": 2.6330848277251664, + "grad_norm": 0.06067819148302078, + "learning_rate": 4.973347096162188e-05, + "loss": 0.2148, + "step": 13068 + }, + { + "epoch": 2.6334878097924643, + "grad_norm": 0.04483300447463989, + "learning_rate": 4.972014464555319e-05, + "loss": 0.1782, + "step": 13070 + }, + { + "epoch": 2.6338907918597623, + "grad_norm": 0.057843420654535294, + "learning_rate": 4.9706818349365006e-05, + "loss": 0.2338, + "step": 13072 + }, + { + "epoch": 2.63429377392706, + "grad_norm": 0.05431623384356499, + "learning_rate": 4.969349207400395e-05, + "loss": 0.1761, + "step": 13074 + }, + { + "epoch": 2.634696755994358, + "grad_norm": 0.045999687165021896, + "learning_rate": 4.968016582041674e-05, + "loss": 0.1846, + "step": 13076 + }, + { + "epoch": 2.635099738061656, + "grad_norm": 0.05476114898920059, + "learning_rate": 4.966683958955003e-05, + "loss": 0.2219, + "step": 13078 + }, + { + "epoch": 2.6355027201289545, + "grad_norm": 0.06299296766519547, + "learning_rate": 4.965351338235053e-05, + "loss": 0.1971, + "step": 13080 + }, + { + "epoch": 2.6359057021962524, + "grad_norm": 0.06087028235197067, + "learning_rate": 4.9640187199764844e-05, + "loss": 0.1991, + "step": 13082 + }, + { + "epoch": 2.6363086842635504, + "grad_norm": 0.06715063750743866, + "learning_rate": 4.962686104273972e-05, + "loss": 0.2434, + "step": 13084 + }, + { + "epoch": 2.6367116663308483, + "grad_norm": 0.05608596280217171, + "learning_rate": 4.9613534912221756e-05, + "loss": 0.1879, + "step": 13086 + }, + { + "epoch": 2.6371146483981462, + "grad_norm": 0.06677883863449097, + "learning_rate": 4.960020880915769e-05, + "loss": 0.1768, + "step": 13088 + }, + { + "epoch": 2.637517630465444, + "grad_norm": 0.04541515186429024, + "learning_rate": 4.9586882734494126e-05, + "loss": 0.1903, + "step": 13090 + }, + { + "epoch": 2.637920612532742, + "grad_norm": 0.06370842456817627, + "learning_rate": 4.957355668917777e-05, + "loss": 0.1449, + "step": 13092 + }, + { + "epoch": 2.63832359460004, + "grad_norm": 0.04287172481417656, + "learning_rate": 4.956023067415525e-05, + "loss": 0.1806, + "step": 13094 + }, + { + "epoch": 2.6387265766673385, + "grad_norm": 0.04955010116100311, + "learning_rate": 4.954690469037325e-05, + "loss": 0.2014, + "step": 13096 + }, + { + "epoch": 2.6391295587346364, + "grad_norm": 0.047106534242630005, + "learning_rate": 4.95335787387784e-05, + "loss": 0.1747, + "step": 13098 + }, + { + "epoch": 2.6395325408019343, + "grad_norm": 0.06430371850728989, + "learning_rate": 4.952025282031739e-05, + "loss": 0.1857, + "step": 13100 + }, + { + "epoch": 2.6399355228692323, + "grad_norm": 0.0573105588555336, + "learning_rate": 4.950692693593683e-05, + "loss": 0.1549, + "step": 13102 + }, + { + "epoch": 2.64033850493653, + "grad_norm": 0.048013072460889816, + "learning_rate": 4.9493601086583416e-05, + "loss": 0.1909, + "step": 13104 + }, + { + "epoch": 2.6407414870038286, + "grad_norm": 0.05720860883593559, + "learning_rate": 4.9480275273203755e-05, + "loss": 0.2411, + "step": 13106 + }, + { + "epoch": 2.6411444690711265, + "grad_norm": 0.04672304913401604, + "learning_rate": 4.946694949674452e-05, + "loss": 0.2303, + "step": 13108 + }, + { + "epoch": 2.6415474511384245, + "grad_norm": 0.05690852925181389, + "learning_rate": 4.9453623758152316e-05, + "loss": 0.1842, + "step": 13110 + }, + { + "epoch": 2.6419504332057224, + "grad_norm": 0.056028977036476135, + "learning_rate": 4.9440298058373834e-05, + "loss": 0.1571, + "step": 13112 + }, + { + "epoch": 2.6423534152730204, + "grad_norm": 0.05338013917207718, + "learning_rate": 4.942697239835567e-05, + "loss": 0.1442, + "step": 13114 + }, + { + "epoch": 2.6427563973403183, + "grad_norm": 0.06082042679190636, + "learning_rate": 4.9413646779044475e-05, + "loss": 0.1667, + "step": 13116 + }, + { + "epoch": 2.6431593794076162, + "grad_norm": 0.06253007054328918, + "learning_rate": 4.9400321201386873e-05, + "loss": 0.2382, + "step": 13118 + }, + { + "epoch": 2.643562361474914, + "grad_norm": 0.04853769391775131, + "learning_rate": 4.93869956663295e-05, + "loss": 0.1721, + "step": 13120 + }, + { + "epoch": 2.643965343542212, + "grad_norm": 0.09063606709241867, + "learning_rate": 4.9373670174818956e-05, + "loss": 0.2299, + "step": 13122 + }, + { + "epoch": 2.6443683256095105, + "grad_norm": 0.07562512904405594, + "learning_rate": 4.93603447278019e-05, + "loss": 0.2321, + "step": 13124 + }, + { + "epoch": 2.6447713076768085, + "grad_norm": 0.05161966755986214, + "learning_rate": 4.934701932622492e-05, + "loss": 0.1779, + "step": 13126 + }, + { + "epoch": 2.6451742897441064, + "grad_norm": 0.05404038354754448, + "learning_rate": 4.933369397103465e-05, + "loss": 0.2406, + "step": 13128 + }, + { + "epoch": 2.6455772718114043, + "grad_norm": 0.0837070494890213, + "learning_rate": 4.932036866317769e-05, + "loss": 0.2105, + "step": 13130 + }, + { + "epoch": 2.6459802538787023, + "grad_norm": 0.040656138211488724, + "learning_rate": 4.930704340360066e-05, + "loss": 0.1896, + "step": 13132 + }, + { + "epoch": 2.6463832359460007, + "grad_norm": 0.05056383088231087, + "learning_rate": 4.929371819325014e-05, + "loss": 0.1915, + "step": 13134 + }, + { + "epoch": 2.6467862180132986, + "grad_norm": 0.07564432919025421, + "learning_rate": 4.9280393033072785e-05, + "loss": 0.1927, + "step": 13136 + }, + { + "epoch": 2.6471892000805965, + "grad_norm": 0.0784987360239029, + "learning_rate": 4.926706792401512e-05, + "loss": 0.2176, + "step": 13138 + }, + { + "epoch": 2.6475921821478945, + "grad_norm": 0.05027944594621658, + "learning_rate": 4.9253742867023806e-05, + "loss": 0.2728, + "step": 13140 + }, + { + "epoch": 2.6479951642151924, + "grad_norm": 0.045518044382333755, + "learning_rate": 4.9240417863045384e-05, + "loss": 0.1721, + "step": 13142 + }, + { + "epoch": 2.6483981462824904, + "grad_norm": 0.049447476863861084, + "learning_rate": 4.922709291302648e-05, + "loss": 0.1765, + "step": 13144 + }, + { + "epoch": 2.6488011283497883, + "grad_norm": 0.06533786654472351, + "learning_rate": 4.9213768017913634e-05, + "loss": 0.1861, + "step": 13146 + }, + { + "epoch": 2.6492041104170863, + "grad_norm": 0.06059259548783302, + "learning_rate": 4.920044317865349e-05, + "loss": 0.242, + "step": 13148 + }, + { + "epoch": 2.649607092484384, + "grad_norm": 0.05759195238351822, + "learning_rate": 4.918711839619255e-05, + "loss": 0.1757, + "step": 13150 + }, + { + "epoch": 2.6500100745516826, + "grad_norm": 0.05047111213207245, + "learning_rate": 4.9173793671477435e-05, + "loss": 0.1754, + "step": 13152 + }, + { + "epoch": 2.6504130566189805, + "grad_norm": 0.06632532179355621, + "learning_rate": 4.916046900545469e-05, + "loss": 0.2063, + "step": 13154 + }, + { + "epoch": 2.6508160386862785, + "grad_norm": 0.052938882261514664, + "learning_rate": 4.9147144399070896e-05, + "loss": 0.2113, + "step": 13156 + }, + { + "epoch": 2.6512190207535764, + "grad_norm": 0.04302411153912544, + "learning_rate": 4.9133819853272584e-05, + "loss": 0.1403, + "step": 13158 + }, + { + "epoch": 2.6516220028208743, + "grad_norm": 0.05520898476243019, + "learning_rate": 4.912049536900634e-05, + "loss": 0.1963, + "step": 13160 + }, + { + "epoch": 2.6520249848881727, + "grad_norm": 0.0664575919508934, + "learning_rate": 4.910717094721867e-05, + "loss": 0.215, + "step": 13162 + }, + { + "epoch": 2.6524279669554707, + "grad_norm": 0.06888501346111298, + "learning_rate": 4.909384658885617e-05, + "loss": 0.2019, + "step": 13164 + }, + { + "epoch": 2.6528309490227686, + "grad_norm": 0.04566109552979469, + "learning_rate": 4.9080522294865354e-05, + "loss": 0.1602, + "step": 13166 + }, + { + "epoch": 2.6532339310900666, + "grad_norm": 0.05641119182109833, + "learning_rate": 4.906719806619278e-05, + "loss": 0.2274, + "step": 13168 + }, + { + "epoch": 2.6536369131573645, + "grad_norm": 0.0688968226313591, + "learning_rate": 4.905387390378494e-05, + "loss": 0.1854, + "step": 13170 + }, + { + "epoch": 2.6540398952246624, + "grad_norm": 0.05073093995451927, + "learning_rate": 4.9040549808588405e-05, + "loss": 0.2027, + "step": 13172 + }, + { + "epoch": 2.6544428772919604, + "grad_norm": 0.0423697866499424, + "learning_rate": 4.902722578154965e-05, + "loss": 0.2197, + "step": 13174 + }, + { + "epoch": 2.6548458593592583, + "grad_norm": 0.057178203016519547, + "learning_rate": 4.901390182361524e-05, + "loss": 0.1881, + "step": 13176 + }, + { + "epoch": 2.6552488414265563, + "grad_norm": 0.04987271502614021, + "learning_rate": 4.900057793573166e-05, + "loss": 0.2196, + "step": 13178 + }, + { + "epoch": 2.6556518234938546, + "grad_norm": 0.056287556886672974, + "learning_rate": 4.8987254118845436e-05, + "loss": 0.1917, + "step": 13180 + }, + { + "epoch": 2.6560548055611526, + "grad_norm": 0.05609240382909775, + "learning_rate": 4.8973930373903054e-05, + "loss": 0.1993, + "step": 13182 + }, + { + "epoch": 2.6564577876284505, + "grad_norm": 0.0506737045943737, + "learning_rate": 4.896060670185102e-05, + "loss": 0.2618, + "step": 13184 + }, + { + "epoch": 2.6568607696957485, + "grad_norm": 0.04480299726128578, + "learning_rate": 4.894728310363581e-05, + "loss": 0.1861, + "step": 13186 + }, + { + "epoch": 2.6572637517630464, + "grad_norm": 0.05772459879517555, + "learning_rate": 4.893395958020394e-05, + "loss": 0.194, + "step": 13188 + }, + { + "epoch": 2.657666733830345, + "grad_norm": 0.05927930027246475, + "learning_rate": 4.8920636132501864e-05, + "loss": 0.1799, + "step": 13190 + }, + { + "epoch": 2.6580697158976427, + "grad_norm": 0.06080922484397888, + "learning_rate": 4.8907312761476085e-05, + "loss": 0.1891, + "step": 13192 + }, + { + "epoch": 2.6584726979649407, + "grad_norm": 0.036109864711761475, + "learning_rate": 4.889398946807305e-05, + "loss": 0.1901, + "step": 13194 + }, + { + "epoch": 2.6588756800322386, + "grad_norm": 0.06764542311429977, + "learning_rate": 4.8880666253239244e-05, + "loss": 0.2052, + "step": 13196 + }, + { + "epoch": 2.6592786620995366, + "grad_norm": 0.05238844081759453, + "learning_rate": 4.886734311792109e-05, + "loss": 0.2018, + "step": 13198 + }, + { + "epoch": 2.6596816441668345, + "grad_norm": 0.0657360702753067, + "learning_rate": 4.8854020063065104e-05, + "loss": 0.1742, + "step": 13200 + }, + { + "epoch": 2.6600846262341324, + "grad_norm": 0.05971973389387131, + "learning_rate": 4.8840697089617685e-05, + "loss": 0.2342, + "step": 13202 + }, + { + "epoch": 2.6604876083014304, + "grad_norm": 0.052394554018974304, + "learning_rate": 4.8827374198525293e-05, + "loss": 0.241, + "step": 13204 + }, + { + "epoch": 2.6608905903687283, + "grad_norm": 0.060340166091918945, + "learning_rate": 4.8814051390734364e-05, + "loss": 0.222, + "step": 13206 + }, + { + "epoch": 2.6612935724360267, + "grad_norm": 0.042948994785547256, + "learning_rate": 4.8800728667191324e-05, + "loss": 0.161, + "step": 13208 + }, + { + "epoch": 2.6616965545033247, + "grad_norm": 0.04476882889866829, + "learning_rate": 4.878740602884259e-05, + "loss": 0.2258, + "step": 13210 + }, + { + "epoch": 2.6620995365706226, + "grad_norm": 0.05171223357319832, + "learning_rate": 4.8774083476634626e-05, + "loss": 0.1949, + "step": 13212 + }, + { + "epoch": 2.6625025186379205, + "grad_norm": 0.07086855173110962, + "learning_rate": 4.8760761011513776e-05, + "loss": 0.2226, + "step": 13214 + }, + { + "epoch": 2.6629055007052185, + "grad_norm": 0.05836058035492897, + "learning_rate": 4.87474386344265e-05, + "loss": 0.1843, + "step": 13216 + }, + { + "epoch": 2.663308482772517, + "grad_norm": 0.042663928121328354, + "learning_rate": 4.873411634631917e-05, + "loss": 0.1727, + "step": 13218 + }, + { + "epoch": 2.663711464839815, + "grad_norm": 0.04874979704618454, + "learning_rate": 4.87207941481382e-05, + "loss": 0.1892, + "step": 13220 + }, + { + "epoch": 2.6641144469071127, + "grad_norm": 0.04593397676944733, + "learning_rate": 4.8707472040829954e-05, + "loss": 0.2116, + "step": 13222 + }, + { + "epoch": 2.6645174289744107, + "grad_norm": 0.04077121242880821, + "learning_rate": 4.8694150025340856e-05, + "loss": 0.1798, + "step": 13224 + }, + { + "epoch": 2.6649204110417086, + "grad_norm": 0.050686176866292953, + "learning_rate": 4.8680828102617215e-05, + "loss": 0.1842, + "step": 13226 + }, + { + "epoch": 2.6653233931090066, + "grad_norm": 0.06952465325593948, + "learning_rate": 4.866750627360546e-05, + "loss": 0.1789, + "step": 13228 + }, + { + "epoch": 2.6657263751763045, + "grad_norm": 0.07306458055973053, + "learning_rate": 4.865418453925192e-05, + "loss": 0.1833, + "step": 13230 + }, + { + "epoch": 2.6661293572436024, + "grad_norm": 0.06378468871116638, + "learning_rate": 4.864086290050297e-05, + "loss": 0.1782, + "step": 13232 + }, + { + "epoch": 2.6665323393109004, + "grad_norm": 0.06567275524139404, + "learning_rate": 4.862754135830493e-05, + "loss": 0.1871, + "step": 13234 + }, + { + "epoch": 2.6669353213781988, + "grad_norm": 0.055104974657297134, + "learning_rate": 4.861421991360418e-05, + "loss": 0.1788, + "step": 13236 + }, + { + "epoch": 2.6673383034454967, + "grad_norm": 0.053641512989997864, + "learning_rate": 4.860089856734699e-05, + "loss": 0.187, + "step": 13238 + }, + { + "epoch": 2.6677412855127947, + "grad_norm": 0.045359183102846146, + "learning_rate": 4.858757732047976e-05, + "loss": 0.1966, + "step": 13240 + }, + { + "epoch": 2.6681442675800926, + "grad_norm": 0.04768161475658417, + "learning_rate": 4.8574256173948766e-05, + "loss": 0.2288, + "step": 13242 + }, + { + "epoch": 2.6685472496473905, + "grad_norm": 0.07475445419549942, + "learning_rate": 4.856093512870035e-05, + "loss": 0.2402, + "step": 13244 + }, + { + "epoch": 2.668950231714689, + "grad_norm": 0.0542769655585289, + "learning_rate": 4.854761418568078e-05, + "loss": 0.2363, + "step": 13246 + }, + { + "epoch": 2.669353213781987, + "grad_norm": 0.03904513269662857, + "learning_rate": 4.85342933458364e-05, + "loss": 0.1482, + "step": 13248 + }, + { + "epoch": 2.669756195849285, + "grad_norm": 0.061630938202142715, + "learning_rate": 4.852097261011344e-05, + "loss": 0.2125, + "step": 13250 + }, + { + "epoch": 2.6701591779165827, + "grad_norm": 0.05297626927495003, + "learning_rate": 4.850765197945825e-05, + "loss": 0.223, + "step": 13252 + }, + { + "epoch": 2.6705621599838807, + "grad_norm": 0.042190030217170715, + "learning_rate": 4.8494331454817064e-05, + "loss": 0.1734, + "step": 13254 + }, + { + "epoch": 2.6709651420511786, + "grad_norm": 0.041881803423166275, + "learning_rate": 4.8481011037136176e-05, + "loss": 0.1577, + "step": 13256 + }, + { + "epoch": 2.6713681241184766, + "grad_norm": 0.047158777713775635, + "learning_rate": 4.8467690727361825e-05, + "loss": 0.1604, + "step": 13258 + }, + { + "epoch": 2.6717711061857745, + "grad_norm": 0.06598570197820663, + "learning_rate": 4.845437052644029e-05, + "loss": 0.2118, + "step": 13260 + }, + { + "epoch": 2.672174088253073, + "grad_norm": 0.05773276835680008, + "learning_rate": 4.8441050435317766e-05, + "loss": 0.2249, + "step": 13262 + }, + { + "epoch": 2.672577070320371, + "grad_norm": 0.0460265651345253, + "learning_rate": 4.842773045494055e-05, + "loss": 0.1632, + "step": 13264 + }, + { + "epoch": 2.672980052387669, + "grad_norm": 0.053771812468767166, + "learning_rate": 4.841441058625484e-05, + "loss": 0.2166, + "step": 13266 + }, + { + "epoch": 2.6733830344549667, + "grad_norm": 0.04382085055112839, + "learning_rate": 4.840109083020688e-05, + "loss": 0.2236, + "step": 13268 + }, + { + "epoch": 2.6737860165222647, + "grad_norm": 0.050635140389204025, + "learning_rate": 4.838777118774286e-05, + "loss": 0.1704, + "step": 13270 + }, + { + "epoch": 2.6741889985895626, + "grad_norm": 0.051803749054670334, + "learning_rate": 4.837445165980901e-05, + "loss": 0.2351, + "step": 13272 + }, + { + "epoch": 2.674591980656861, + "grad_norm": 0.07338876277208328, + "learning_rate": 4.8361132247351484e-05, + "loss": 0.2134, + "step": 13274 + }, + { + "epoch": 2.674994962724159, + "grad_norm": 0.0721621885895729, + "learning_rate": 4.834781295131654e-05, + "loss": 0.1942, + "step": 13276 + }, + { + "epoch": 2.675397944791457, + "grad_norm": 0.06050800532102585, + "learning_rate": 4.833449377265028e-05, + "loss": 0.1653, + "step": 13278 + }, + { + "epoch": 2.675800926858755, + "grad_norm": 0.062214065343141556, + "learning_rate": 4.832117471229895e-05, + "loss": 0.2049, + "step": 13280 + }, + { + "epoch": 2.6762039089260528, + "grad_norm": 0.045399442315101624, + "learning_rate": 4.8307855771208674e-05, + "loss": 0.1932, + "step": 13282 + }, + { + "epoch": 2.6766068909933507, + "grad_norm": 0.04470831900835037, + "learning_rate": 4.829453695032562e-05, + "loss": 0.1875, + "step": 13284 + }, + { + "epoch": 2.6770098730606486, + "grad_norm": 0.050232332199811935, + "learning_rate": 4.8281218250595914e-05, + "loss": 0.1804, + "step": 13286 + }, + { + "epoch": 2.6774128551279466, + "grad_norm": 0.05278032645583153, + "learning_rate": 4.8267899672965755e-05, + "loss": 0.1992, + "step": 13288 + }, + { + "epoch": 2.677815837195245, + "grad_norm": 0.054559800773859024, + "learning_rate": 4.825458121838119e-05, + "loss": 0.2348, + "step": 13290 + }, + { + "epoch": 2.678218819262543, + "grad_norm": 0.07798026502132416, + "learning_rate": 4.8241262887788416e-05, + "loss": 0.1449, + "step": 13292 + }, + { + "epoch": 2.678621801329841, + "grad_norm": 0.046620819717645645, + "learning_rate": 4.8227944682133495e-05, + "loss": 0.1847, + "step": 13294 + }, + { + "epoch": 2.679024783397139, + "grad_norm": 0.07669510692358017, + "learning_rate": 4.821462660236257e-05, + "loss": 0.2126, + "step": 13296 + }, + { + "epoch": 2.6794277654644367, + "grad_norm": 0.05963238328695297, + "learning_rate": 4.8201308649421696e-05, + "loss": 0.2398, + "step": 13298 + }, + { + "epoch": 2.679830747531735, + "grad_norm": 0.05043806880712509, + "learning_rate": 4.8187990824256996e-05, + "loss": 0.2067, + "step": 13300 + }, + { + "epoch": 2.680233729599033, + "grad_norm": 0.06212414801120758, + "learning_rate": 4.8174673127814505e-05, + "loss": 0.1356, + "step": 13302 + }, + { + "epoch": 2.680636711666331, + "grad_norm": 0.07311484962701797, + "learning_rate": 4.8161355561040336e-05, + "loss": 0.2394, + "step": 13304 + }, + { + "epoch": 2.681039693733629, + "grad_norm": 0.07517794519662857, + "learning_rate": 4.814803812488052e-05, + "loss": 0.1514, + "step": 13306 + }, + { + "epoch": 2.681442675800927, + "grad_norm": 0.056023143231868744, + "learning_rate": 4.813472082028112e-05, + "loss": 0.177, + "step": 13308 + }, + { + "epoch": 2.681845657868225, + "grad_norm": 0.05077926069498062, + "learning_rate": 4.812140364818816e-05, + "loss": 0.1821, + "step": 13310 + }, + { + "epoch": 2.6822486399355228, + "grad_norm": 0.08374262601137161, + "learning_rate": 4.81080866095477e-05, + "loss": 0.1925, + "step": 13312 + }, + { + "epoch": 2.6826516220028207, + "grad_norm": 0.059895146638154984, + "learning_rate": 4.809476970530571e-05, + "loss": 0.1605, + "step": 13314 + }, + { + "epoch": 2.6830546040701186, + "grad_norm": 0.06866048276424408, + "learning_rate": 4.808145293640826e-05, + "loss": 0.2282, + "step": 13316 + }, + { + "epoch": 2.683457586137417, + "grad_norm": 0.0920555368065834, + "learning_rate": 4.806813630380131e-05, + "loss": 0.2012, + "step": 13318 + }, + { + "epoch": 2.683860568204715, + "grad_norm": 0.059407614171504974, + "learning_rate": 4.8054819808430876e-05, + "loss": 0.1991, + "step": 13320 + }, + { + "epoch": 2.684263550272013, + "grad_norm": 0.06879781931638718, + "learning_rate": 4.804150345124293e-05, + "loss": 0.2295, + "step": 13322 + }, + { + "epoch": 2.684666532339311, + "grad_norm": 0.05245072394609451, + "learning_rate": 4.8028187233183454e-05, + "loss": 0.2259, + "step": 13324 + }, + { + "epoch": 2.685069514406609, + "grad_norm": 0.06847406923770905, + "learning_rate": 4.8014871155198385e-05, + "loss": 0.1739, + "step": 13326 + }, + { + "epoch": 2.685472496473907, + "grad_norm": 0.06828334182500839, + "learning_rate": 4.8001555218233704e-05, + "loss": 0.1833, + "step": 13328 + }, + { + "epoch": 2.685875478541205, + "grad_norm": 0.06589218974113464, + "learning_rate": 4.798823942323534e-05, + "loss": 0.2163, + "step": 13330 + }, + { + "epoch": 2.686278460608503, + "grad_norm": 0.05553770810365677, + "learning_rate": 4.797492377114925e-05, + "loss": 0.2191, + "step": 13332 + }, + { + "epoch": 2.686681442675801, + "grad_norm": 0.04765209183096886, + "learning_rate": 4.796160826292132e-05, + "loss": 0.1729, + "step": 13334 + }, + { + "epoch": 2.687084424743099, + "grad_norm": 0.04598912596702576, + "learning_rate": 4.7948292899497485e-05, + "loss": 0.2046, + "step": 13336 + }, + { + "epoch": 2.687487406810397, + "grad_norm": 0.058516427874565125, + "learning_rate": 4.793497768182362e-05, + "loss": 0.1781, + "step": 13338 + }, + { + "epoch": 2.687890388877695, + "grad_norm": 0.0651201382279396, + "learning_rate": 4.792166261084567e-05, + "loss": 0.1859, + "step": 13340 + }, + { + "epoch": 2.6882933709449928, + "grad_norm": 0.06374579668045044, + "learning_rate": 4.7908347687509456e-05, + "loss": 0.1804, + "step": 13342 + }, + { + "epoch": 2.6886963530122907, + "grad_norm": 0.08643309772014618, + "learning_rate": 4.7895032912760904e-05, + "loss": 0.1678, + "step": 13344 + }, + { + "epoch": 2.689099335079589, + "grad_norm": 0.08268879354000092, + "learning_rate": 4.788171828754583e-05, + "loss": 0.1917, + "step": 13346 + }, + { + "epoch": 2.689502317146887, + "grad_norm": 0.07940013706684113, + "learning_rate": 4.786840381281011e-05, + "loss": 0.2438, + "step": 13348 + }, + { + "epoch": 2.689905299214185, + "grad_norm": 0.06752052903175354, + "learning_rate": 4.785508948949955e-05, + "loss": 0.169, + "step": 13350 + }, + { + "epoch": 2.690308281281483, + "grad_norm": 0.0565398707985878, + "learning_rate": 4.784177531856004e-05, + "loss": 0.1829, + "step": 13352 + }, + { + "epoch": 2.690711263348781, + "grad_norm": 0.04072846844792366, + "learning_rate": 4.782846130093733e-05, + "loss": 0.1687, + "step": 13354 + }, + { + "epoch": 2.6911142454160792, + "grad_norm": 0.05021306127309799, + "learning_rate": 4.781514743757727e-05, + "loss": 0.2042, + "step": 13356 + }, + { + "epoch": 2.691517227483377, + "grad_norm": 0.057188209146261215, + "learning_rate": 4.7801833729425645e-05, + "loss": 0.1799, + "step": 13358 + }, + { + "epoch": 2.691920209550675, + "grad_norm": 0.06615625321865082, + "learning_rate": 4.7788520177428235e-05, + "loss": 0.1755, + "step": 13360 + }, + { + "epoch": 2.692323191617973, + "grad_norm": 0.040171921253204346, + "learning_rate": 4.77752067825308e-05, + "loss": 0.1796, + "step": 13362 + }, + { + "epoch": 2.692726173685271, + "grad_norm": 0.07169929146766663, + "learning_rate": 4.7761893545679145e-05, + "loss": 0.2366, + "step": 13364 + }, + { + "epoch": 2.693129155752569, + "grad_norm": 0.0498359277844429, + "learning_rate": 4.774858046781896e-05, + "loss": 0.1666, + "step": 13366 + }, + { + "epoch": 2.693532137819867, + "grad_norm": 0.05385211855173111, + "learning_rate": 4.773526754989604e-05, + "loss": 0.1815, + "step": 13368 + }, + { + "epoch": 2.693935119887165, + "grad_norm": 0.04354149475693703, + "learning_rate": 4.7721954792856085e-05, + "loss": 0.2145, + "step": 13370 + }, + { + "epoch": 2.6943381019544628, + "grad_norm": 0.05808882415294647, + "learning_rate": 4.7708642197644826e-05, + "loss": 0.2205, + "step": 13372 + }, + { + "epoch": 2.694741084021761, + "grad_norm": 0.04748508334159851, + "learning_rate": 4.769532976520795e-05, + "loss": 0.1858, + "step": 13374 + }, + { + "epoch": 2.695144066089059, + "grad_norm": 0.06552096456289291, + "learning_rate": 4.768201749649117e-05, + "loss": 0.1915, + "step": 13376 + }, + { + "epoch": 2.695547048156357, + "grad_norm": 0.08570721745491028, + "learning_rate": 4.766870539244014e-05, + "loss": 0.244, + "step": 13378 + }, + { + "epoch": 2.695950030223655, + "grad_norm": 0.05731480196118355, + "learning_rate": 4.765539345400057e-05, + "loss": 0.1671, + "step": 13380 + }, + { + "epoch": 2.696353012290953, + "grad_norm": 0.05735393241047859, + "learning_rate": 4.7642081682118096e-05, + "loss": 0.2473, + "step": 13382 + }, + { + "epoch": 2.6967559943582513, + "grad_norm": 0.07119689881801605, + "learning_rate": 4.762877007773838e-05, + "loss": 0.1961, + "step": 13384 + }, + { + "epoch": 2.6971589764255492, + "grad_norm": 0.03842030093073845, + "learning_rate": 4.7615458641807025e-05, + "loss": 0.1277, + "step": 13386 + }, + { + "epoch": 2.697561958492847, + "grad_norm": 0.04085619002580643, + "learning_rate": 4.760214737526969e-05, + "loss": 0.1637, + "step": 13388 + }, + { + "epoch": 2.697964940560145, + "grad_norm": 0.05109267681837082, + "learning_rate": 4.7588836279071944e-05, + "loss": 0.1744, + "step": 13390 + }, + { + "epoch": 2.698367922627443, + "grad_norm": 0.07984402775764465, + "learning_rate": 4.7575525354159445e-05, + "loss": 0.2293, + "step": 13392 + }, + { + "epoch": 2.698770904694741, + "grad_norm": 0.07411880046129227, + "learning_rate": 4.7562214601477725e-05, + "loss": 0.2167, + "step": 13394 + }, + { + "epoch": 2.699173886762039, + "grad_norm": 0.04365954548120499, + "learning_rate": 4.75489040219724e-05, + "loss": 0.194, + "step": 13396 + }, + { + "epoch": 2.699576868829337, + "grad_norm": 0.05702051520347595, + "learning_rate": 4.7535593616589e-05, + "loss": 0.1744, + "step": 13398 + }, + { + "epoch": 2.699979850896635, + "grad_norm": 0.05207115411758423, + "learning_rate": 4.75222833862731e-05, + "loss": 0.1674, + "step": 13400 + }, + { + "epoch": 2.700382832963933, + "grad_norm": 0.04992508888244629, + "learning_rate": 4.750897333197021e-05, + "loss": 0.226, + "step": 13402 + }, + { + "epoch": 2.700785815031231, + "grad_norm": 0.0452972836792469, + "learning_rate": 4.7495663454625885e-05, + "loss": 0.2254, + "step": 13404 + }, + { + "epoch": 2.701188797098529, + "grad_norm": 0.0995267704129219, + "learning_rate": 4.748235375518561e-05, + "loss": 0.179, + "step": 13406 + }, + { + "epoch": 2.701591779165827, + "grad_norm": 0.07648054510354996, + "learning_rate": 4.746904423459491e-05, + "loss": 0.1982, + "step": 13408 + }, + { + "epoch": 2.701994761233125, + "grad_norm": 0.05254209414124489, + "learning_rate": 4.7455734893799256e-05, + "loss": 0.136, + "step": 13410 + }, + { + "epoch": 2.7023977433004234, + "grad_norm": 0.0762658417224884, + "learning_rate": 4.744242573374413e-05, + "loss": 0.1972, + "step": 13412 + }, + { + "epoch": 2.7028007253677213, + "grad_norm": 0.06384989619255066, + "learning_rate": 4.742911675537497e-05, + "loss": 0.2099, + "step": 13414 + }, + { + "epoch": 2.7032037074350193, + "grad_norm": 0.04679650813341141, + "learning_rate": 4.741580795963726e-05, + "loss": 0.153, + "step": 13416 + }, + { + "epoch": 2.703606689502317, + "grad_norm": 0.05645020306110382, + "learning_rate": 4.740249934747642e-05, + "loss": 0.217, + "step": 13418 + }, + { + "epoch": 2.704009671569615, + "grad_norm": 0.07073520869016647, + "learning_rate": 4.7389190919837865e-05, + "loss": 0.2049, + "step": 13420 + }, + { + "epoch": 2.704412653636913, + "grad_norm": 0.04735686630010605, + "learning_rate": 4.737588267766703e-05, + "loss": 0.1856, + "step": 13422 + }, + { + "epoch": 2.704815635704211, + "grad_norm": 0.05677908658981323, + "learning_rate": 4.7362574621909264e-05, + "loss": 0.2155, + "step": 13424 + }, + { + "epoch": 2.705218617771509, + "grad_norm": 0.05154525861144066, + "learning_rate": 4.734926675351e-05, + "loss": 0.1814, + "step": 13426 + }, + { + "epoch": 2.705621599838807, + "grad_norm": 0.05443556606769562, + "learning_rate": 4.733595907341458e-05, + "loss": 0.2147, + "step": 13428 + }, + { + "epoch": 2.7060245819061053, + "grad_norm": 0.05580776929855347, + "learning_rate": 4.732265158256837e-05, + "loss": 0.1586, + "step": 13430 + }, + { + "epoch": 2.7064275639734032, + "grad_norm": 0.05419588088989258, + "learning_rate": 4.730934428191671e-05, + "loss": 0.1915, + "step": 13432 + }, + { + "epoch": 2.706830546040701, + "grad_norm": 0.04755214601755142, + "learning_rate": 4.7296037172404934e-05, + "loss": 0.1544, + "step": 13434 + }, + { + "epoch": 2.707233528107999, + "grad_norm": 0.06594018638134003, + "learning_rate": 4.728273025497833e-05, + "loss": 0.18, + "step": 13436 + }, + { + "epoch": 2.707636510175297, + "grad_norm": 0.05939716100692749, + "learning_rate": 4.726942353058226e-05, + "loss": 0.2284, + "step": 13438 + }, + { + "epoch": 2.7080394922425954, + "grad_norm": 0.05221749469637871, + "learning_rate": 4.7256117000161935e-05, + "loss": 0.2107, + "step": 13440 + }, + { + "epoch": 2.7084424743098934, + "grad_norm": 0.06709478050470352, + "learning_rate": 4.72428106646627e-05, + "loss": 0.1697, + "step": 13442 + }, + { + "epoch": 2.7088454563771913, + "grad_norm": 0.09320972859859467, + "learning_rate": 4.722950452502977e-05, + "loss": 0.2327, + "step": 13444 + }, + { + "epoch": 2.7092484384444893, + "grad_norm": 0.04267571121454239, + "learning_rate": 4.721619858220842e-05, + "loss": 0.1675, + "step": 13446 + }, + { + "epoch": 2.709651420511787, + "grad_norm": 0.06907591968774796, + "learning_rate": 4.720289283714385e-05, + "loss": 0.174, + "step": 13448 + }, + { + "epoch": 2.710054402579085, + "grad_norm": 0.06685389578342438, + "learning_rate": 4.718958729078133e-05, + "loss": 0.2013, + "step": 13450 + }, + { + "epoch": 2.710457384646383, + "grad_norm": 0.053654421120882034, + "learning_rate": 4.717628194406601e-05, + "loss": 0.2289, + "step": 13452 + }, + { + "epoch": 2.710860366713681, + "grad_norm": 0.0344913974404335, + "learning_rate": 4.716297679794312e-05, + "loss": 0.1451, + "step": 13454 + }, + { + "epoch": 2.7112633487809794, + "grad_norm": 0.051283374428749084, + "learning_rate": 4.7149671853357804e-05, + "loss": 0.173, + "step": 13456 + }, + { + "epoch": 2.7116663308482774, + "grad_norm": 0.06237954646348953, + "learning_rate": 4.7136367111255265e-05, + "loss": 0.2347, + "step": 13458 + }, + { + "epoch": 2.7120693129155753, + "grad_norm": 0.08349774777889252, + "learning_rate": 4.7123062572580603e-05, + "loss": 0.1609, + "step": 13460 + }, + { + "epoch": 2.7124722949828732, + "grad_norm": 0.05388723313808441, + "learning_rate": 4.7109758238278993e-05, + "loss": 0.1892, + "step": 13462 + }, + { + "epoch": 2.712875277050171, + "grad_norm": 0.05169912055134773, + "learning_rate": 4.709645410929552e-05, + "loss": 0.1759, + "step": 13464 + }, + { + "epoch": 2.713278259117469, + "grad_norm": 0.05110298469662666, + "learning_rate": 4.708315018657532e-05, + "loss": 0.1833, + "step": 13466 + }, + { + "epoch": 2.7136812411847675, + "grad_norm": 0.07065416872501373, + "learning_rate": 4.706984647106345e-05, + "loss": 0.2534, + "step": 13468 + }, + { + "epoch": 2.7140842232520654, + "grad_norm": 0.05724635720252991, + "learning_rate": 4.7056542963705014e-05, + "loss": 0.203, + "step": 13470 + }, + { + "epoch": 2.7144872053193634, + "grad_norm": 0.0664157047867775, + "learning_rate": 4.704323966544505e-05, + "loss": 0.1914, + "step": 13472 + }, + { + "epoch": 2.7148901873866613, + "grad_norm": 0.0474373884499073, + "learning_rate": 4.702993657722862e-05, + "loss": 0.1917, + "step": 13474 + }, + { + "epoch": 2.7152931694539593, + "grad_norm": 0.0590548999607563, + "learning_rate": 4.701663370000072e-05, + "loss": 0.2429, + "step": 13476 + }, + { + "epoch": 2.715696151521257, + "grad_norm": 0.06265545636415482, + "learning_rate": 4.700333103470642e-05, + "loss": 0.1909, + "step": 13478 + }, + { + "epoch": 2.716099133588555, + "grad_norm": 0.06928554177284241, + "learning_rate": 4.699002858229067e-05, + "loss": 0.1811, + "step": 13480 + }, + { + "epoch": 2.716502115655853, + "grad_norm": 0.06219932809472084, + "learning_rate": 4.6976726343698504e-05, + "loss": 0.1795, + "step": 13482 + }, + { + "epoch": 2.7169050977231515, + "grad_norm": 0.0587947741150856, + "learning_rate": 4.696342431987484e-05, + "loss": 0.1673, + "step": 13484 + }, + { + "epoch": 2.7173080797904494, + "grad_norm": 0.07239782065153122, + "learning_rate": 4.6950122511764665e-05, + "loss": 0.2172, + "step": 13486 + }, + { + "epoch": 2.7177110618577474, + "grad_norm": 0.06954152882099152, + "learning_rate": 4.6936820920312894e-05, + "loss": 0.1926, + "step": 13488 + }, + { + "epoch": 2.7181140439250453, + "grad_norm": 0.07562306523323059, + "learning_rate": 4.692351954646448e-05, + "loss": 0.1607, + "step": 13490 + }, + { + "epoch": 2.7185170259923432, + "grad_norm": 0.062392767518758774, + "learning_rate": 4.691021839116432e-05, + "loss": 0.2182, + "step": 13492 + }, + { + "epoch": 2.718920008059641, + "grad_norm": 0.057378824800252914, + "learning_rate": 4.6896917455357304e-05, + "loss": 0.1561, + "step": 13494 + }, + { + "epoch": 2.7193229901269396, + "grad_norm": 0.05048033967614174, + "learning_rate": 4.68836167399883e-05, + "loss": 0.2058, + "step": 13496 + }, + { + "epoch": 2.7197259721942375, + "grad_norm": 0.06855437904596329, + "learning_rate": 4.6870316246002195e-05, + "loss": 0.1935, + "step": 13498 + }, + { + "epoch": 2.7201289542615354, + "grad_norm": 0.039874229580163956, + "learning_rate": 4.6857015974343785e-05, + "loss": 0.1621, + "step": 13500 + }, + { + "epoch": 2.7205319363288334, + "grad_norm": 0.054651062935590744, + "learning_rate": 4.684371592595798e-05, + "loss": 0.181, + "step": 13502 + }, + { + "epoch": 2.7209349183961313, + "grad_norm": 0.05894327163696289, + "learning_rate": 4.683041610178951e-05, + "loss": 0.2302, + "step": 13504 + }, + { + "epoch": 2.7213379004634293, + "grad_norm": 0.09319541603326797, + "learning_rate": 4.681711650278323e-05, + "loss": 0.1972, + "step": 13506 + }, + { + "epoch": 2.721740882530727, + "grad_norm": 0.07073593139648438, + "learning_rate": 4.68038171298839e-05, + "loss": 0.2044, + "step": 13508 + }, + { + "epoch": 2.722143864598025, + "grad_norm": 0.061639729887247086, + "learning_rate": 4.679051798403629e-05, + "loss": 0.1904, + "step": 13510 + }, + { + "epoch": 2.7225468466653235, + "grad_norm": 0.057687997817993164, + "learning_rate": 4.677721906618514e-05, + "loss": 0.1751, + "step": 13512 + }, + { + "epoch": 2.7229498287326215, + "grad_norm": 0.06481876969337463, + "learning_rate": 4.676392037727522e-05, + "loss": 0.2169, + "step": 13514 + }, + { + "epoch": 2.7233528107999194, + "grad_norm": 0.05692235007882118, + "learning_rate": 4.675062191825118e-05, + "loss": 0.21, + "step": 13516 + }, + { + "epoch": 2.7237557928672174, + "grad_norm": 0.06150791794061661, + "learning_rate": 4.673732369005779e-05, + "loss": 0.1686, + "step": 13518 + }, + { + "epoch": 2.7241587749345153, + "grad_norm": 0.05073976516723633, + "learning_rate": 4.672402569363971e-05, + "loss": 0.1632, + "step": 13520 + }, + { + "epoch": 2.7245617570018137, + "grad_norm": 0.0635070875287056, + "learning_rate": 4.67107279299416e-05, + "loss": 0.2368, + "step": 13522 + }, + { + "epoch": 2.7249647390691116, + "grad_norm": 0.07709597796201706, + "learning_rate": 4.66974303999081e-05, + "loss": 0.1828, + "step": 13524 + }, + { + "epoch": 2.7253677211364096, + "grad_norm": 0.06766793131828308, + "learning_rate": 4.66841331044839e-05, + "loss": 0.18, + "step": 13526 + }, + { + "epoch": 2.7257707032037075, + "grad_norm": 0.04549749195575714, + "learning_rate": 4.6670836044613536e-05, + "loss": 0.1761, + "step": 13528 + }, + { + "epoch": 2.7261736852710055, + "grad_norm": 0.07432392239570618, + "learning_rate": 4.6657539221241684e-05, + "loss": 0.2038, + "step": 13530 + }, + { + "epoch": 2.7265766673383034, + "grad_norm": 0.05852292478084564, + "learning_rate": 4.664424263531289e-05, + "loss": 0.2055, + "step": 13532 + }, + { + "epoch": 2.7269796494056013, + "grad_norm": 0.07217149436473846, + "learning_rate": 4.6630946287771746e-05, + "loss": 0.192, + "step": 13534 + }, + { + "epoch": 2.7273826314728993, + "grad_norm": 0.054228655993938446, + "learning_rate": 4.6617650179562774e-05, + "loss": 0.1559, + "step": 13536 + }, + { + "epoch": 2.727785613540197, + "grad_norm": 0.04575591906905174, + "learning_rate": 4.660435431163054e-05, + "loss": 0.1644, + "step": 13538 + }, + { + "epoch": 2.7281885956074956, + "grad_norm": 0.05871587619185448, + "learning_rate": 4.659105868491952e-05, + "loss": 0.2315, + "step": 13540 + }, + { + "epoch": 2.7285915776747935, + "grad_norm": 0.061794646084308624, + "learning_rate": 4.657776330037427e-05, + "loss": 0.1802, + "step": 13542 + }, + { + "epoch": 2.7289945597420915, + "grad_norm": 0.053640998899936676, + "learning_rate": 4.656446815893922e-05, + "loss": 0.206, + "step": 13544 + }, + { + "epoch": 2.7293975418093894, + "grad_norm": 0.1432095468044281, + "learning_rate": 4.655117326155887e-05, + "loss": 0.2125, + "step": 13546 + }, + { + "epoch": 2.7298005238766874, + "grad_norm": 0.04829771816730499, + "learning_rate": 4.6537878609177646e-05, + "loss": 0.1351, + "step": 13548 + }, + { + "epoch": 2.7302035059439858, + "grad_norm": 0.05013980343937874, + "learning_rate": 4.652458420274e-05, + "loss": 0.2334, + "step": 13550 + }, + { + "epoch": 2.7306064880112837, + "grad_norm": 0.06168575957417488, + "learning_rate": 4.6511290043190314e-05, + "loss": 0.1966, + "step": 13552 + }, + { + "epoch": 2.7310094700785816, + "grad_norm": 0.06013885512948036, + "learning_rate": 4.649799613147303e-05, + "loss": 0.1846, + "step": 13554 + }, + { + "epoch": 2.7314124521458796, + "grad_norm": 0.0718698501586914, + "learning_rate": 4.648470246853248e-05, + "loss": 0.2349, + "step": 13556 + }, + { + "epoch": 2.7318154342131775, + "grad_norm": 0.04924318194389343, + "learning_rate": 4.6471409055313056e-05, + "loss": 0.1665, + "step": 13558 + }, + { + "epoch": 2.7322184162804755, + "grad_norm": 0.07484610378742218, + "learning_rate": 4.645811589275909e-05, + "loss": 0.1914, + "step": 13560 + }, + { + "epoch": 2.7326213983477734, + "grad_norm": 0.055900052189826965, + "learning_rate": 4.64448229818149e-05, + "loss": 0.2099, + "step": 13562 + }, + { + "epoch": 2.7330243804150713, + "grad_norm": 0.05515626445412636, + "learning_rate": 4.643153032342479e-05, + "loss": 0.1421, + "step": 13564 + }, + { + "epoch": 2.7334273624823693, + "grad_norm": 0.06733749061822891, + "learning_rate": 4.641823791853308e-05, + "loss": 0.2094, + "step": 13566 + }, + { + "epoch": 2.7338303445496677, + "grad_norm": 0.06128699705004692, + "learning_rate": 4.6404945768084005e-05, + "loss": 0.2177, + "step": 13568 + }, + { + "epoch": 2.7342333266169656, + "grad_norm": 0.05260147899389267, + "learning_rate": 4.639165387302185e-05, + "loss": 0.1178, + "step": 13570 + }, + { + "epoch": 2.7346363086842636, + "grad_norm": 0.05988286808133125, + "learning_rate": 4.6378362234290817e-05, + "loss": 0.2061, + "step": 13572 + }, + { + "epoch": 2.7350392907515615, + "grad_norm": 0.06819775700569153, + "learning_rate": 4.636507085283515e-05, + "loss": 0.2071, + "step": 13574 + }, + { + "epoch": 2.7354422728188594, + "grad_norm": 0.04776353761553764, + "learning_rate": 4.635177972959902e-05, + "loss": 0.1735, + "step": 13576 + }, + { + "epoch": 2.735845254886158, + "grad_norm": 0.05012471228837967, + "learning_rate": 4.6338488865526655e-05, + "loss": 0.1976, + "step": 13578 + }, + { + "epoch": 2.7362482369534558, + "grad_norm": 0.07344962656497955, + "learning_rate": 4.6325198261562144e-05, + "loss": 0.1867, + "step": 13580 + }, + { + "epoch": 2.7366512190207537, + "grad_norm": 0.05135316029191017, + "learning_rate": 4.63119079186497e-05, + "loss": 0.2513, + "step": 13582 + }, + { + "epoch": 2.7370542010880516, + "grad_norm": 0.05327500030398369, + "learning_rate": 4.629861783773341e-05, + "loss": 0.1704, + "step": 13584 + }, + { + "epoch": 2.7374571831553496, + "grad_norm": 0.05641722306609154, + "learning_rate": 4.6285328019757395e-05, + "loss": 0.1633, + "step": 13586 + }, + { + "epoch": 2.7378601652226475, + "grad_norm": 0.08201052248477936, + "learning_rate": 4.627203846566572e-05, + "loss": 0.1811, + "step": 13588 + }, + { + "epoch": 2.7382631472899455, + "grad_norm": 0.04866538196802139, + "learning_rate": 4.62587491764025e-05, + "loss": 0.1921, + "step": 13590 + }, + { + "epoch": 2.7386661293572434, + "grad_norm": 0.0483061820268631, + "learning_rate": 4.624546015291172e-05, + "loss": 0.2044, + "step": 13592 + }, + { + "epoch": 2.7390691114245413, + "grad_norm": 0.06678812950849533, + "learning_rate": 4.623217139613748e-05, + "loss": 0.2455, + "step": 13594 + }, + { + "epoch": 2.7394720934918397, + "grad_norm": 0.053607940673828125, + "learning_rate": 4.6218882907023734e-05, + "loss": 0.1809, + "step": 13596 + }, + { + "epoch": 2.7398750755591377, + "grad_norm": 0.049875251948833466, + "learning_rate": 4.620559468651451e-05, + "loss": 0.2261, + "step": 13598 + }, + { + "epoch": 2.7402780576264356, + "grad_norm": 0.061627864837646484, + "learning_rate": 4.619230673555377e-05, + "loss": 0.1984, + "step": 13600 + }, + { + "epoch": 2.7406810396937336, + "grad_norm": 0.05893751606345177, + "learning_rate": 4.617901905508548e-05, + "loss": 0.2013, + "step": 13602 + }, + { + "epoch": 2.7410840217610315, + "grad_norm": 0.05172676220536232, + "learning_rate": 4.616573164605354e-05, + "loss": 0.18, + "step": 13604 + }, + { + "epoch": 2.74148700382833, + "grad_norm": 0.057852406054735184, + "learning_rate": 4.615244450940191e-05, + "loss": 0.218, + "step": 13606 + }, + { + "epoch": 2.741889985895628, + "grad_norm": 0.059165455400943756, + "learning_rate": 4.613915764607446e-05, + "loss": 0.2052, + "step": 13608 + }, + { + "epoch": 2.7422929679629258, + "grad_norm": 0.046519551426172256, + "learning_rate": 4.612587105701509e-05, + "loss": 0.2126, + "step": 13610 + }, + { + "epoch": 2.7426959500302237, + "grad_norm": 0.05582638457417488, + "learning_rate": 4.611258474316764e-05, + "loss": 0.1854, + "step": 13612 + }, + { + "epoch": 2.7430989320975216, + "grad_norm": 0.05454739183187485, + "learning_rate": 4.609929870547595e-05, + "loss": 0.19, + "step": 13614 + }, + { + "epoch": 2.7435019141648196, + "grad_norm": 0.03613164275884628, + "learning_rate": 4.6086012944883825e-05, + "loss": 0.17, + "step": 13616 + }, + { + "epoch": 2.7439048962321175, + "grad_norm": 0.05004088953137398, + "learning_rate": 4.60727274623351e-05, + "loss": 0.1799, + "step": 13618 + }, + { + "epoch": 2.7443078782994155, + "grad_norm": 0.06082676351070404, + "learning_rate": 4.6059442258773536e-05, + "loss": 0.2265, + "step": 13620 + }, + { + "epoch": 2.7447108603667134, + "grad_norm": 0.06850215792655945, + "learning_rate": 4.604615733514289e-05, + "loss": 0.2378, + "step": 13622 + }, + { + "epoch": 2.745113842434012, + "grad_norm": 0.047571759670972824, + "learning_rate": 4.60328726923869e-05, + "loss": 0.1445, + "step": 13624 + }, + { + "epoch": 2.7455168245013097, + "grad_norm": 0.039432961493730545, + "learning_rate": 4.60195883314493e-05, + "loss": 0.1593, + "step": 13626 + }, + { + "epoch": 2.7459198065686077, + "grad_norm": 0.04747028276324272, + "learning_rate": 4.600630425327375e-05, + "loss": 0.1961, + "step": 13628 + }, + { + "epoch": 2.7463227886359056, + "grad_norm": 0.051651448011398315, + "learning_rate": 4.599302045880399e-05, + "loss": 0.1608, + "step": 13630 + }, + { + "epoch": 2.7467257707032036, + "grad_norm": 0.050929635763168335, + "learning_rate": 4.597973694898363e-05, + "loss": 0.1845, + "step": 13632 + }, + { + "epoch": 2.747128752770502, + "grad_norm": 0.06035885959863663, + "learning_rate": 4.596645372475634e-05, + "loss": 0.2094, + "step": 13634 + }, + { + "epoch": 2.7475317348378, + "grad_norm": 0.04276600480079651, + "learning_rate": 4.595317078706572e-05, + "loss": 0.1647, + "step": 13636 + }, + { + "epoch": 2.747934716905098, + "grad_norm": 0.07479345053434372, + "learning_rate": 4.593988813685539e-05, + "loss": 0.2003, + "step": 13638 + }, + { + "epoch": 2.7483376989723958, + "grad_norm": 0.050298575311899185, + "learning_rate": 4.592660577506888e-05, + "loss": 0.1719, + "step": 13640 + }, + { + "epoch": 2.7487406810396937, + "grad_norm": 0.05687572807073593, + "learning_rate": 4.591332370264982e-05, + "loss": 0.2393, + "step": 13642 + }, + { + "epoch": 2.7491436631069917, + "grad_norm": 0.07088617235422134, + "learning_rate": 4.590004192054168e-05, + "loss": 0.189, + "step": 13644 + }, + { + "epoch": 2.7495466451742896, + "grad_norm": 0.03902342915534973, + "learning_rate": 4.5886760429688016e-05, + "loss": 0.1422, + "step": 13646 + }, + { + "epoch": 2.7499496272415875, + "grad_norm": 0.050030291080474854, + "learning_rate": 4.587347923103231e-05, + "loss": 0.1976, + "step": 13648 + }, + { + "epoch": 2.750352609308886, + "grad_norm": 0.05007876083254814, + "learning_rate": 4.5860198325518055e-05, + "loss": 0.179, + "step": 13650 + }, + { + "epoch": 2.750755591376184, + "grad_norm": 0.050599128007888794, + "learning_rate": 4.584691771408866e-05, + "loss": 0.1626, + "step": 13652 + }, + { + "epoch": 2.751158573443482, + "grad_norm": 0.051085565239191055, + "learning_rate": 4.583363739768763e-05, + "loss": 0.1839, + "step": 13654 + }, + { + "epoch": 2.7515615555107797, + "grad_norm": 0.07226687669754028, + "learning_rate": 4.582035737725829e-05, + "loss": 0.2334, + "step": 13656 + }, + { + "epoch": 2.7519645375780777, + "grad_norm": 0.06962387263774872, + "learning_rate": 4.5807077653744116e-05, + "loss": 0.2079, + "step": 13658 + }, + { + "epoch": 2.7523675196453756, + "grad_norm": 0.08981406688690186, + "learning_rate": 4.579379822808841e-05, + "loss": 0.1935, + "step": 13660 + }, + { + "epoch": 2.752770501712674, + "grad_norm": 0.05534931644797325, + "learning_rate": 4.578051910123458e-05, + "loss": 0.1722, + "step": 13662 + }, + { + "epoch": 2.753173483779972, + "grad_norm": 0.07691693305969238, + "learning_rate": 4.5767240274125904e-05, + "loss": 0.165, + "step": 13664 + }, + { + "epoch": 2.75357646584727, + "grad_norm": 0.05576300621032715, + "learning_rate": 4.5753961747705726e-05, + "loss": 0.2217, + "step": 13666 + }, + { + "epoch": 2.753979447914568, + "grad_norm": 0.05548671633005142, + "learning_rate": 4.574068352291729e-05, + "loss": 0.2706, + "step": 13668 + }, + { + "epoch": 2.7543824299818658, + "grad_norm": 0.04379713162779808, + "learning_rate": 4.572740560070391e-05, + "loss": 0.157, + "step": 13670 + }, + { + "epoch": 2.7547854120491637, + "grad_norm": 0.052931223064661026, + "learning_rate": 4.571412798200878e-05, + "loss": 0.2155, + "step": 13672 + }, + { + "epoch": 2.7551883941164617, + "grad_norm": 0.05508912354707718, + "learning_rate": 4.5700850667775166e-05, + "loss": 0.2049, + "step": 13674 + }, + { + "epoch": 2.7555913761837596, + "grad_norm": 0.048319194465875626, + "learning_rate": 4.568757365894623e-05, + "loss": 0.1545, + "step": 13676 + }, + { + "epoch": 2.755994358251058, + "grad_norm": 0.05623358115553856, + "learning_rate": 4.567429695646518e-05, + "loss": 0.1878, + "step": 13678 + }, + { + "epoch": 2.756397340318356, + "grad_norm": 0.04395512863993645, + "learning_rate": 4.566102056127513e-05, + "loss": 0.1332, + "step": 13680 + }, + { + "epoch": 2.756800322385654, + "grad_norm": 0.0522671602666378, + "learning_rate": 4.564774447431927e-05, + "loss": 0.2048, + "step": 13682 + }, + { + "epoch": 2.757203304452952, + "grad_norm": 0.05952902510762215, + "learning_rate": 4.563446869654066e-05, + "loss": 0.1795, + "step": 13684 + }, + { + "epoch": 2.7576062865202497, + "grad_norm": 0.04310224950313568, + "learning_rate": 4.562119322888243e-05, + "loss": 0.1426, + "step": 13686 + }, + { + "epoch": 2.7580092685875477, + "grad_norm": 0.06073429435491562, + "learning_rate": 4.560791807228761e-05, + "loss": 0.1593, + "step": 13688 + }, + { + "epoch": 2.758412250654846, + "grad_norm": 0.07317265123128891, + "learning_rate": 4.559464322769929e-05, + "loss": 0.1992, + "step": 13690 + }, + { + "epoch": 2.758815232722144, + "grad_norm": 0.07108186930418015, + "learning_rate": 4.558136869606045e-05, + "loss": 0.1988, + "step": 13692 + }, + { + "epoch": 2.759218214789442, + "grad_norm": 0.054733678698539734, + "learning_rate": 4.556809447831412e-05, + "loss": 0.1376, + "step": 13694 + }, + { + "epoch": 2.75962119685674, + "grad_norm": 0.06446659564971924, + "learning_rate": 4.5554820575403265e-05, + "loss": 0.1946, + "step": 13696 + }, + { + "epoch": 2.760024178924038, + "grad_norm": 0.05727505311369896, + "learning_rate": 4.5541546988270856e-05, + "loss": 0.1917, + "step": 13698 + }, + { + "epoch": 2.760427160991336, + "grad_norm": 0.05168221890926361, + "learning_rate": 4.552827371785981e-05, + "loss": 0.2005, + "step": 13700 + }, + { + "epoch": 2.7608301430586337, + "grad_norm": 0.06143752112984657, + "learning_rate": 4.551500076511306e-05, + "loss": 0.2234, + "step": 13702 + }, + { + "epoch": 2.7612331251259317, + "grad_norm": 0.05393059179186821, + "learning_rate": 4.550172813097346e-05, + "loss": 0.1632, + "step": 13704 + }, + { + "epoch": 2.76163610719323, + "grad_norm": 0.05636855214834213, + "learning_rate": 4.548845581638392e-05, + "loss": 0.1491, + "step": 13706 + }, + { + "epoch": 2.762039089260528, + "grad_norm": 0.04566177725791931, + "learning_rate": 4.547518382228725e-05, + "loss": 0.1682, + "step": 13708 + }, + { + "epoch": 2.762442071327826, + "grad_norm": 0.05374148488044739, + "learning_rate": 4.54619121496263e-05, + "loss": 0.2025, + "step": 13710 + }, + { + "epoch": 2.762845053395124, + "grad_norm": 0.05850568413734436, + "learning_rate": 4.544864079934385e-05, + "loss": 0.2037, + "step": 13712 + }, + { + "epoch": 2.763248035462422, + "grad_norm": 0.04666848108172417, + "learning_rate": 4.543536977238268e-05, + "loss": 0.1904, + "step": 13714 + }, + { + "epoch": 2.76365101752972, + "grad_norm": 0.0654207244515419, + "learning_rate": 4.542209906968551e-05, + "loss": 0.197, + "step": 13716 + }, + { + "epoch": 2.764053999597018, + "grad_norm": 0.050538092851638794, + "learning_rate": 4.540882869219515e-05, + "loss": 0.1964, + "step": 13718 + }, + { + "epoch": 2.764456981664316, + "grad_norm": 0.10446475446224213, + "learning_rate": 4.539555864085422e-05, + "loss": 0.2004, + "step": 13720 + }, + { + "epoch": 2.764859963731614, + "grad_norm": 0.046087007969617844, + "learning_rate": 4.538228891660546e-05, + "loss": 0.1674, + "step": 13722 + }, + { + "epoch": 2.765262945798912, + "grad_norm": 0.05671662464737892, + "learning_rate": 4.53690195203915e-05, + "loss": 0.2174, + "step": 13724 + }, + { + "epoch": 2.76566592786621, + "grad_norm": 0.0580022819340229, + "learning_rate": 4.5355750453155e-05, + "loss": 0.1821, + "step": 13726 + }, + { + "epoch": 2.766068909933508, + "grad_norm": 0.05730949342250824, + "learning_rate": 4.534248171583854e-05, + "loss": 0.2026, + "step": 13728 + }, + { + "epoch": 2.766471892000806, + "grad_norm": 0.04864330217242241, + "learning_rate": 4.532921330938476e-05, + "loss": 0.2253, + "step": 13730 + }, + { + "epoch": 2.7668748740681037, + "grad_norm": 0.06228374317288399, + "learning_rate": 4.531594523473616e-05, + "loss": 0.1941, + "step": 13732 + }, + { + "epoch": 2.767277856135402, + "grad_norm": 0.08437447249889374, + "learning_rate": 4.530267749283535e-05, + "loss": 0.2316, + "step": 13734 + }, + { + "epoch": 2.7676808382027, + "grad_norm": 0.06000857055187225, + "learning_rate": 4.52894100846248e-05, + "loss": 0.1896, + "step": 13736 + }, + { + "epoch": 2.768083820269998, + "grad_norm": 0.062360458076000214, + "learning_rate": 4.527614301104704e-05, + "loss": 0.1581, + "step": 13738 + }, + { + "epoch": 2.768486802337296, + "grad_norm": 0.04638027027249336, + "learning_rate": 4.526287627304451e-05, + "loss": 0.1657, + "step": 13740 + }, + { + "epoch": 2.768889784404594, + "grad_norm": 0.06518687307834625, + "learning_rate": 4.5249609871559693e-05, + "loss": 0.205, + "step": 13742 + }, + { + "epoch": 2.7692927664718923, + "grad_norm": 0.04653553292155266, + "learning_rate": 4.5236343807534964e-05, + "loss": 0.1813, + "step": 13744 + }, + { + "epoch": 2.76969574853919, + "grad_norm": 0.04413139447569847, + "learning_rate": 4.522307808191278e-05, + "loss": 0.1976, + "step": 13746 + }, + { + "epoch": 2.770098730606488, + "grad_norm": 0.07281026244163513, + "learning_rate": 4.520981269563548e-05, + "loss": 0.2496, + "step": 13748 + }, + { + "epoch": 2.770501712673786, + "grad_norm": 0.03691798821091652, + "learning_rate": 4.5196547649645426e-05, + "loss": 0.1752, + "step": 13750 + }, + { + "epoch": 2.770904694741084, + "grad_norm": 0.055397164076566696, + "learning_rate": 4.518328294488494e-05, + "loss": 0.1883, + "step": 13752 + }, + { + "epoch": 2.771307676808382, + "grad_norm": 0.08555757254362106, + "learning_rate": 4.517001858229634e-05, + "loss": 0.1912, + "step": 13754 + }, + { + "epoch": 2.77171065887568, + "grad_norm": 0.04360605776309967, + "learning_rate": 4.515675456282188e-05, + "loss": 0.1671, + "step": 13756 + }, + { + "epoch": 2.772113640942978, + "grad_norm": 0.06162875518202782, + "learning_rate": 4.5143490887403844e-05, + "loss": 0.189, + "step": 13758 + }, + { + "epoch": 2.772516623010276, + "grad_norm": 0.08789025247097015, + "learning_rate": 4.513022755698444e-05, + "loss": 0.1658, + "step": 13760 + }, + { + "epoch": 2.772919605077574, + "grad_norm": 0.05798583850264549, + "learning_rate": 4.51169645725059e-05, + "loss": 0.209, + "step": 13762 + }, + { + "epoch": 2.773322587144872, + "grad_norm": 0.07589727640151978, + "learning_rate": 4.510370193491037e-05, + "loss": 0.1956, + "step": 13764 + }, + { + "epoch": 2.77372556921217, + "grad_norm": 0.05734777823090553, + "learning_rate": 4.509043964514003e-05, + "loss": 0.1661, + "step": 13766 + }, + { + "epoch": 2.774128551279468, + "grad_norm": 0.07661935687065125, + "learning_rate": 4.507717770413699e-05, + "loss": 0.1956, + "step": 13768 + }, + { + "epoch": 2.774531533346766, + "grad_norm": 0.07800666242837906, + "learning_rate": 4.5063916112843394e-05, + "loss": 0.2079, + "step": 13770 + }, + { + "epoch": 2.7749345154140643, + "grad_norm": 0.05529240146279335, + "learning_rate": 4.50506548722013e-05, + "loss": 0.1785, + "step": 13772 + }, + { + "epoch": 2.7753374974813623, + "grad_norm": 0.052781157195568085, + "learning_rate": 4.503739398315277e-05, + "loss": 0.1843, + "step": 13774 + }, + { + "epoch": 2.77574047954866, + "grad_norm": 0.04509377107024193, + "learning_rate": 4.502413344663983e-05, + "loss": 0.1793, + "step": 13776 + }, + { + "epoch": 2.776143461615958, + "grad_norm": 0.04552818834781647, + "learning_rate": 4.501087326360449e-05, + "loss": 0.1808, + "step": 13778 + }, + { + "epoch": 2.776546443683256, + "grad_norm": 0.07951189577579498, + "learning_rate": 4.499761343498873e-05, + "loss": 0.2475, + "step": 13780 + }, + { + "epoch": 2.776949425750554, + "grad_norm": 0.06761814653873444, + "learning_rate": 4.498435396173453e-05, + "loss": 0.2362, + "step": 13782 + }, + { + "epoch": 2.777352407817852, + "grad_norm": 0.0560927577316761, + "learning_rate": 4.497109484478378e-05, + "loss": 0.1828, + "step": 13784 + }, + { + "epoch": 2.77775538988515, + "grad_norm": 0.05709867179393768, + "learning_rate": 4.4957836085078426e-05, + "loss": 0.1846, + "step": 13786 + }, + { + "epoch": 2.778158371952448, + "grad_norm": 0.07907452434301376, + "learning_rate": 4.4944577683560325e-05, + "loss": 0.1647, + "step": 13788 + }, + { + "epoch": 2.7785613540197462, + "grad_norm": 0.05839816480875015, + "learning_rate": 4.493131964117135e-05, + "loss": 0.149, + "step": 13790 + }, + { + "epoch": 2.778964336087044, + "grad_norm": 0.06229160353541374, + "learning_rate": 4.49180619588533e-05, + "loss": 0.2122, + "step": 13792 + }, + { + "epoch": 2.779367318154342, + "grad_norm": 0.04715651273727417, + "learning_rate": 4.490480463754804e-05, + "loss": 0.1707, + "step": 13794 + }, + { + "epoch": 2.77977030022164, + "grad_norm": 0.06249184161424637, + "learning_rate": 4.489154767819727e-05, + "loss": 0.2341, + "step": 13796 + }, + { + "epoch": 2.780173282288938, + "grad_norm": 0.05317969247698784, + "learning_rate": 4.48782910817428e-05, + "loss": 0.1957, + "step": 13798 + }, + { + "epoch": 2.7805762643562364, + "grad_norm": 0.04029626026749611, + "learning_rate": 4.4865034849126336e-05, + "loss": 0.165, + "step": 13800 + }, + { + "epoch": 2.7809792464235343, + "grad_norm": 0.04745176434516907, + "learning_rate": 4.485177898128957e-05, + "loss": 0.1561, + "step": 13802 + }, + { + "epoch": 2.7813822284908323, + "grad_norm": 0.04458378627896309, + "learning_rate": 4.483852347917423e-05, + "loss": 0.1586, + "step": 13804 + }, + { + "epoch": 2.78178521055813, + "grad_norm": 0.06560066342353821, + "learning_rate": 4.4825268343721896e-05, + "loss": 0.2025, + "step": 13806 + }, + { + "epoch": 2.782188192625428, + "grad_norm": 0.055546555668115616, + "learning_rate": 4.481201357587424e-05, + "loss": 0.1803, + "step": 13808 + }, + { + "epoch": 2.782591174692726, + "grad_norm": 0.05878676101565361, + "learning_rate": 4.479875917657284e-05, + "loss": 0.2401, + "step": 13810 + }, + { + "epoch": 2.782994156760024, + "grad_norm": 0.053903158754110336, + "learning_rate": 4.478550514675927e-05, + "loss": 0.1834, + "step": 13812 + }, + { + "epoch": 2.783397138827322, + "grad_norm": 0.05280616879463196, + "learning_rate": 4.477225148737506e-05, + "loss": 0.1777, + "step": 13814 + }, + { + "epoch": 2.78380012089462, + "grad_norm": 0.05329929664731026, + "learning_rate": 4.4758998199361765e-05, + "loss": 0.1804, + "step": 13816 + }, + { + "epoch": 2.7842031029619183, + "grad_norm": 0.06433535367250443, + "learning_rate": 4.4745745283660835e-05, + "loss": 0.2252, + "step": 13818 + }, + { + "epoch": 2.7846060850292162, + "grad_norm": 0.030351370573043823, + "learning_rate": 4.4732492741213776e-05, + "loss": 0.152, + "step": 13820 + }, + { + "epoch": 2.785009067096514, + "grad_norm": 0.055486053228378296, + "learning_rate": 4.471924057296199e-05, + "loss": 0.1758, + "step": 13822 + }, + { + "epoch": 2.785412049163812, + "grad_norm": 0.05568544939160347, + "learning_rate": 4.470598877984693e-05, + "loss": 0.1725, + "step": 13824 + }, + { + "epoch": 2.78581503123111, + "grad_norm": 0.06160859763622284, + "learning_rate": 4.469273736280994e-05, + "loss": 0.2058, + "step": 13826 + }, + { + "epoch": 2.7862180132984085, + "grad_norm": 0.0535859577357769, + "learning_rate": 4.467948632279243e-05, + "loss": 0.203, + "step": 13828 + }, + { + "epoch": 2.7866209953657064, + "grad_norm": 0.06398488581180573, + "learning_rate": 4.4666235660735665e-05, + "loss": 0.2098, + "step": 13830 + }, + { + "epoch": 2.7870239774330043, + "grad_norm": 0.06315203756093979, + "learning_rate": 4.4652985377581016e-05, + "loss": 0.188, + "step": 13832 + }, + { + "epoch": 2.7874269595003023, + "grad_norm": 0.045979224145412445, + "learning_rate": 4.463973547426972e-05, + "loss": 0.1747, + "step": 13834 + }, + { + "epoch": 2.7878299415676002, + "grad_norm": 0.04362760856747627, + "learning_rate": 4.4626485951743055e-05, + "loss": 0.1939, + "step": 13836 + }, + { + "epoch": 2.788232923634898, + "grad_norm": 0.04942842200398445, + "learning_rate": 4.461323681094223e-05, + "loss": 0.2127, + "step": 13838 + }, + { + "epoch": 2.788635905702196, + "grad_norm": 0.06796036660671234, + "learning_rate": 4.459998805280845e-05, + "loss": 0.1717, + "step": 13840 + }, + { + "epoch": 2.789038887769494, + "grad_norm": 0.05861689895391464, + "learning_rate": 4.458673967828286e-05, + "loss": 0.1831, + "step": 13842 + }, + { + "epoch": 2.789441869836792, + "grad_norm": 0.06883051246404648, + "learning_rate": 4.457349168830665e-05, + "loss": 0.1663, + "step": 13844 + }, + { + "epoch": 2.7898448519040904, + "grad_norm": 0.05223101004958153, + "learning_rate": 4.4560244083820904e-05, + "loss": 0.1898, + "step": 13846 + }, + { + "epoch": 2.7902478339713883, + "grad_norm": 0.0574951246380806, + "learning_rate": 4.454699686576673e-05, + "loss": 0.2478, + "step": 13848 + }, + { + "epoch": 2.7906508160386863, + "grad_norm": 0.05982831120491028, + "learning_rate": 4.453375003508516e-05, + "loss": 0.1523, + "step": 13850 + }, + { + "epoch": 2.791053798105984, + "grad_norm": 0.059536661952733994, + "learning_rate": 4.452050359271726e-05, + "loss": 0.1827, + "step": 13852 + }, + { + "epoch": 2.791456780173282, + "grad_norm": 0.06197162717580795, + "learning_rate": 4.4507257539604e-05, + "loss": 0.1867, + "step": 13854 + }, + { + "epoch": 2.7918597622405805, + "grad_norm": 0.057462189346551895, + "learning_rate": 4.4494011876686407e-05, + "loss": 0.1587, + "step": 13856 + }, + { + "epoch": 2.7922627443078785, + "grad_norm": 0.044567789882421494, + "learning_rate": 4.448076660490539e-05, + "loss": 0.2258, + "step": 13858 + }, + { + "epoch": 2.7926657263751764, + "grad_norm": 0.06285153329372406, + "learning_rate": 4.446752172520189e-05, + "loss": 0.1995, + "step": 13860 + }, + { + "epoch": 2.7930687084424743, + "grad_norm": 0.059338536113500595, + "learning_rate": 4.44542772385168e-05, + "loss": 0.2036, + "step": 13862 + }, + { + "epoch": 2.7934716905097723, + "grad_norm": 0.041714541614055634, + "learning_rate": 4.4441033145790986e-05, + "loss": 0.1538, + "step": 13864 + }, + { + "epoch": 2.7938746725770702, + "grad_norm": 0.04737304151058197, + "learning_rate": 4.442778944796527e-05, + "loss": 0.174, + "step": 13866 + }, + { + "epoch": 2.794277654644368, + "grad_norm": 0.07135643810033798, + "learning_rate": 4.441454614598051e-05, + "loss": 0.2362, + "step": 13868 + }, + { + "epoch": 2.794680636711666, + "grad_norm": 0.05102970823645592, + "learning_rate": 4.440130324077744e-05, + "loss": 0.1533, + "step": 13870 + }, + { + "epoch": 2.7950836187789645, + "grad_norm": 0.06961221992969513, + "learning_rate": 4.4388060733296846e-05, + "loss": 0.2393, + "step": 13872 + }, + { + "epoch": 2.7954866008462624, + "grad_norm": 0.07168768346309662, + "learning_rate": 4.437481862447943e-05, + "loss": 0.2357, + "step": 13874 + }, + { + "epoch": 2.7958895829135604, + "grad_norm": 0.05309915915131569, + "learning_rate": 4.436157691526592e-05, + "loss": 0.1884, + "step": 13876 + }, + { + "epoch": 2.7962925649808583, + "grad_norm": 0.07483027130365372, + "learning_rate": 4.434833560659694e-05, + "loss": 0.219, + "step": 13878 + }, + { + "epoch": 2.7966955470481563, + "grad_norm": 0.06671711802482605, + "learning_rate": 4.4335094699413196e-05, + "loss": 0.2298, + "step": 13880 + }, + { + "epoch": 2.797098529115454, + "grad_norm": 0.050368938595056534, + "learning_rate": 4.432185419465523e-05, + "loss": 0.1488, + "step": 13882 + }, + { + "epoch": 2.7975015111827526, + "grad_norm": 0.0435597226023674, + "learning_rate": 4.4308614093263684e-05, + "loss": 0.1721, + "step": 13884 + }, + { + "epoch": 2.7979044932500505, + "grad_norm": 0.06164487451314926, + "learning_rate": 4.429537439617908e-05, + "loss": 0.1786, + "step": 13886 + }, + { + "epoch": 2.7983074753173485, + "grad_norm": 0.07364491373300552, + "learning_rate": 4.428213510434197e-05, + "loss": 0.2253, + "step": 13888 + }, + { + "epoch": 2.7987104573846464, + "grad_norm": 0.04716651514172554, + "learning_rate": 4.426889621869281e-05, + "loss": 0.2165, + "step": 13890 + }, + { + "epoch": 2.7991134394519444, + "grad_norm": 0.046748001128435135, + "learning_rate": 4.425565774017213e-05, + "loss": 0.1213, + "step": 13892 + }, + { + "epoch": 2.7995164215192423, + "grad_norm": 0.055533722043037415, + "learning_rate": 4.424241966972031e-05, + "loss": 0.177, + "step": 13894 + }, + { + "epoch": 2.7999194035865402, + "grad_norm": 0.09642328321933746, + "learning_rate": 4.42291820082778e-05, + "loss": 0.2075, + "step": 13896 + }, + { + "epoch": 2.800322385653838, + "grad_norm": 0.04621008038520813, + "learning_rate": 4.421594475678497e-05, + "loss": 0.1543, + "step": 13898 + }, + { + "epoch": 2.8007253677211366, + "grad_norm": 0.0577290914952755, + "learning_rate": 4.4202707916182185e-05, + "loss": 0.1797, + "step": 13900 + }, + { + "epoch": 2.8011283497884345, + "grad_norm": 0.056990884244441986, + "learning_rate": 4.418947148740974e-05, + "loss": 0.1922, + "step": 13902 + }, + { + "epoch": 2.8015313318557324, + "grad_norm": 0.0492350235581398, + "learning_rate": 4.417623547140797e-05, + "loss": 0.1559, + "step": 13904 + }, + { + "epoch": 2.8019343139230304, + "grad_norm": 0.05712248757481575, + "learning_rate": 4.416299986911709e-05, + "loss": 0.1663, + "step": 13906 + }, + { + "epoch": 2.8023372959903283, + "grad_norm": 0.062008894979953766, + "learning_rate": 4.414976468147739e-05, + "loss": 0.1751, + "step": 13908 + }, + { + "epoch": 2.8027402780576267, + "grad_norm": 0.05482635647058487, + "learning_rate": 4.413652990942904e-05, + "loss": 0.1928, + "step": 13910 + }, + { + "epoch": 2.8031432601249247, + "grad_norm": 0.05846606567502022, + "learning_rate": 4.4123295553912233e-05, + "loss": 0.1886, + "step": 13912 + }, + { + "epoch": 2.8035462421922226, + "grad_norm": 0.039809294044971466, + "learning_rate": 4.41100616158671e-05, + "loss": 0.1723, + "step": 13914 + }, + { + "epoch": 2.8039492242595205, + "grad_norm": 0.0680151879787445, + "learning_rate": 4.409682809623379e-05, + "loss": 0.2079, + "step": 13916 + }, + { + "epoch": 2.8043522063268185, + "grad_norm": 0.05849350243806839, + "learning_rate": 4.408359499595234e-05, + "loss": 0.2057, + "step": 13918 + }, + { + "epoch": 2.8047551883941164, + "grad_norm": 0.07199438661336899, + "learning_rate": 4.4070362315962866e-05, + "loss": 0.1936, + "step": 13920 + }, + { + "epoch": 2.8051581704614144, + "grad_norm": 0.0518622063100338, + "learning_rate": 4.405713005720536e-05, + "loss": 0.1634, + "step": 13922 + }, + { + "epoch": 2.8055611525287123, + "grad_norm": 0.09157627820968628, + "learning_rate": 4.4043898220619836e-05, + "loss": 0.1812, + "step": 13924 + }, + { + "epoch": 2.8059641345960102, + "grad_norm": 0.06180949881672859, + "learning_rate": 4.403066680714625e-05, + "loss": 0.2044, + "step": 13926 + }, + { + "epoch": 2.8063671166633086, + "grad_norm": 0.062134623527526855, + "learning_rate": 4.401743581772456e-05, + "loss": 0.2088, + "step": 13928 + }, + { + "epoch": 2.8067700987306066, + "grad_norm": 0.06650009751319885, + "learning_rate": 4.400420525329464e-05, + "loss": 0.1917, + "step": 13930 + }, + { + "epoch": 2.8071730807979045, + "grad_norm": 0.07102461159229279, + "learning_rate": 4.399097511479641e-05, + "loss": 0.2143, + "step": 13932 + }, + { + "epoch": 2.8075760628652024, + "grad_norm": 0.058678820729255676, + "learning_rate": 4.3977745403169686e-05, + "loss": 0.2062, + "step": 13934 + }, + { + "epoch": 2.8079790449325004, + "grad_norm": 0.04873026907444, + "learning_rate": 4.396451611935431e-05, + "loss": 0.1855, + "step": 13936 + }, + { + "epoch": 2.8083820269997988, + "grad_norm": 0.051642045378685, + "learning_rate": 4.395128726429004e-05, + "loss": 0.151, + "step": 13938 + }, + { + "epoch": 2.8087850090670967, + "grad_norm": 0.06375691294670105, + "learning_rate": 4.393805883891667e-05, + "loss": 0.222, + "step": 13940 + }, + { + "epoch": 2.8091879911343947, + "grad_norm": 0.08424566686153412, + "learning_rate": 4.392483084417388e-05, + "loss": 0.1753, + "step": 13942 + }, + { + "epoch": 2.8095909732016926, + "grad_norm": 0.060530856251716614, + "learning_rate": 4.391160328100142e-05, + "loss": 0.1772, + "step": 13944 + }, + { + "epoch": 2.8099939552689905, + "grad_norm": 0.06497149169445038, + "learning_rate": 4.3898376150338896e-05, + "loss": 0.1854, + "step": 13946 + }, + { + "epoch": 2.8103969373362885, + "grad_norm": 0.12235381454229355, + "learning_rate": 4.388514945312599e-05, + "loss": 0.1871, + "step": 13948 + }, + { + "epoch": 2.8107999194035864, + "grad_norm": 0.05921998620033264, + "learning_rate": 4.387192319030229e-05, + "loss": 0.1796, + "step": 13950 + }, + { + "epoch": 2.8112029014708844, + "grad_norm": 0.04586871340870857, + "learning_rate": 4.3858697362807365e-05, + "loss": 0.152, + "step": 13952 + }, + { + "epoch": 2.8116058835381823, + "grad_norm": 0.06341541558504105, + "learning_rate": 4.384547197158074e-05, + "loss": 0.1648, + "step": 13954 + }, + { + "epoch": 2.8120088656054807, + "grad_norm": 0.08245648443698883, + "learning_rate": 4.383224701756197e-05, + "loss": 0.2385, + "step": 13956 + }, + { + "epoch": 2.8124118476727786, + "grad_norm": 0.04495750740170479, + "learning_rate": 4.381902250169048e-05, + "loss": 0.218, + "step": 13958 + }, + { + "epoch": 2.8128148297400766, + "grad_norm": 0.03344697132706642, + "learning_rate": 4.380579842490577e-05, + "loss": 0.1628, + "step": 13960 + }, + { + "epoch": 2.8132178118073745, + "grad_norm": 0.05339306592941284, + "learning_rate": 4.3792574788147224e-05, + "loss": 0.1534, + "step": 13962 + }, + { + "epoch": 2.8136207938746725, + "grad_norm": 0.06429272145032883, + "learning_rate": 4.3779351592354246e-05, + "loss": 0.2016, + "step": 13964 + }, + { + "epoch": 2.814023775941971, + "grad_norm": 0.06380239874124527, + "learning_rate": 4.376612883846617e-05, + "loss": 0.2009, + "step": 13966 + }, + { + "epoch": 2.814426758009269, + "grad_norm": 0.05741169676184654, + "learning_rate": 4.3752906527422346e-05, + "loss": 0.2213, + "step": 13968 + }, + { + "epoch": 2.8148297400765667, + "grad_norm": 0.052648428827524185, + "learning_rate": 4.373968466016202e-05, + "loss": 0.1926, + "step": 13970 + }, + { + "epoch": 2.8152327221438647, + "grad_norm": 0.053499605506658554, + "learning_rate": 4.3726463237624517e-05, + "loss": 0.2153, + "step": 13972 + }, + { + "epoch": 2.8156357042111626, + "grad_norm": 0.06318365782499313, + "learning_rate": 4.371324226074902e-05, + "loss": 0.166, + "step": 13974 + }, + { + "epoch": 2.8160386862784605, + "grad_norm": 0.07373753190040588, + "learning_rate": 4.3700021730474745e-05, + "loss": 0.1996, + "step": 13976 + }, + { + "epoch": 2.8164416683457585, + "grad_norm": 0.05533366650342941, + "learning_rate": 4.3686801647740846e-05, + "loss": 0.154, + "step": 13978 + }, + { + "epoch": 2.8168446504130564, + "grad_norm": 0.06208839640021324, + "learning_rate": 4.367358201348647e-05, + "loss": 0.1526, + "step": 13980 + }, + { + "epoch": 2.8172476324803544, + "grad_norm": 0.056060247123241425, + "learning_rate": 4.366036282865068e-05, + "loss": 0.1649, + "step": 13982 + }, + { + "epoch": 2.8176506145476528, + "grad_norm": 0.07038706541061401, + "learning_rate": 4.364714409417261e-05, + "loss": 0.2252, + "step": 13984 + }, + { + "epoch": 2.8180535966149507, + "grad_norm": 0.055569134652614594, + "learning_rate": 4.363392581099125e-05, + "loss": 0.1669, + "step": 13986 + }, + { + "epoch": 2.8184565786822486, + "grad_norm": 0.06313623487949371, + "learning_rate": 4.362070798004563e-05, + "loss": 0.1651, + "step": 13988 + }, + { + "epoch": 2.8188595607495466, + "grad_norm": 0.08299045264720917, + "learning_rate": 4.360749060227469e-05, + "loss": 0.1541, + "step": 13990 + }, + { + "epoch": 2.8192625428168445, + "grad_norm": 0.06302808970212936, + "learning_rate": 4.359427367861742e-05, + "loss": 0.1875, + "step": 13992 + }, + { + "epoch": 2.819665524884143, + "grad_norm": 0.0563676692545414, + "learning_rate": 4.3581057210012676e-05, + "loss": 0.2123, + "step": 13994 + }, + { + "epoch": 2.820068506951441, + "grad_norm": 0.06105897203087807, + "learning_rate": 4.356784119739939e-05, + "loss": 0.2102, + "step": 13996 + }, + { + "epoch": 2.820471489018739, + "grad_norm": 0.06385830044746399, + "learning_rate": 4.3554625641716355e-05, + "loss": 0.1786, + "step": 13998 + }, + { + "epoch": 2.8208744710860367, + "grad_norm": 0.04783019796013832, + "learning_rate": 4.354141054390243e-05, + "loss": 0.1909, + "step": 14000 + }, + { + "epoch": 2.8212774531533347, + "grad_norm": 0.06092913821339607, + "learning_rate": 4.352819590489635e-05, + "loss": 0.16, + "step": 14002 + }, + { + "epoch": 2.8216804352206326, + "grad_norm": 0.06487409770488739, + "learning_rate": 4.35149817256369e-05, + "loss": 0.2304, + "step": 14004 + }, + { + "epoch": 2.8220834172879306, + "grad_norm": 0.060342274606227875, + "learning_rate": 4.3501768007062754e-05, + "loss": 0.202, + "step": 14006 + }, + { + "epoch": 2.8224863993552285, + "grad_norm": 0.05749595910310745, + "learning_rate": 4.348855475011264e-05, + "loss": 0.1563, + "step": 14008 + }, + { + "epoch": 2.8228893814225264, + "grad_norm": 0.06227300316095352, + "learning_rate": 4.347534195572517e-05, + "loss": 0.1511, + "step": 14010 + }, + { + "epoch": 2.823292363489825, + "grad_norm": 0.07745243608951569, + "learning_rate": 4.3462129624838984e-05, + "loss": 0.183, + "step": 14012 + }, + { + "epoch": 2.8236953455571228, + "grad_norm": 0.06026535481214523, + "learning_rate": 4.344891775839264e-05, + "loss": 0.1633, + "step": 14014 + }, + { + "epoch": 2.8240983276244207, + "grad_norm": 0.08717188239097595, + "learning_rate": 4.3435706357324716e-05, + "loss": 0.1903, + "step": 14016 + }, + { + "epoch": 2.8245013096917186, + "grad_norm": 0.0764845758676529, + "learning_rate": 4.342249542257369e-05, + "loss": 0.2251, + "step": 14018 + }, + { + "epoch": 2.8249042917590166, + "grad_norm": 0.07623816281557083, + "learning_rate": 4.340928495507811e-05, + "loss": 0.184, + "step": 14020 + }, + { + "epoch": 2.825307273826315, + "grad_norm": 0.05312751606106758, + "learning_rate": 4.339607495577634e-05, + "loss": 0.21, + "step": 14022 + }, + { + "epoch": 2.825710255893613, + "grad_norm": 0.05700031667947769, + "learning_rate": 4.3382865425606875e-05, + "loss": 0.215, + "step": 14024 + }, + { + "epoch": 2.826113237960911, + "grad_norm": 0.07716810703277588, + "learning_rate": 4.336965636550806e-05, + "loss": 0.1729, + "step": 14026 + }, + { + "epoch": 2.826516220028209, + "grad_norm": 0.10202040523290634, + "learning_rate": 4.335644777641826e-05, + "loss": 0.1678, + "step": 14028 + }, + { + "epoch": 2.8269192020955067, + "grad_norm": 0.0468902550637722, + "learning_rate": 4.3343239659275764e-05, + "loss": 0.1965, + "step": 14030 + }, + { + "epoch": 2.8273221841628047, + "grad_norm": 0.06173230707645416, + "learning_rate": 4.3330032015018905e-05, + "loss": 0.1859, + "step": 14032 + }, + { + "epoch": 2.8277251662301026, + "grad_norm": 0.06326699256896973, + "learning_rate": 4.331682484458588e-05, + "loss": 0.176, + "step": 14034 + }, + { + "epoch": 2.8281281482974006, + "grad_norm": 0.058440111577510834, + "learning_rate": 4.3303618148914944e-05, + "loss": 0.1853, + "step": 14036 + }, + { + "epoch": 2.8285311303646985, + "grad_norm": 0.05458078905940056, + "learning_rate": 4.329041192894426e-05, + "loss": 0.1649, + "step": 14038 + }, + { + "epoch": 2.828934112431997, + "grad_norm": 0.046770691871643066, + "learning_rate": 4.3277206185611986e-05, + "loss": 0.1496, + "step": 14040 + }, + { + "epoch": 2.829337094499295, + "grad_norm": 0.057294707745313644, + "learning_rate": 4.326400091985623e-05, + "loss": 0.1922, + "step": 14042 + }, + { + "epoch": 2.8297400765665928, + "grad_norm": 0.06126628443598747, + "learning_rate": 4.325079613261508e-05, + "loss": 0.1622, + "step": 14044 + }, + { + "epoch": 2.8301430586338907, + "grad_norm": 0.047013264149427414, + "learning_rate": 4.3237591824826565e-05, + "loss": 0.1449, + "step": 14046 + }, + { + "epoch": 2.8305460407011886, + "grad_norm": 0.086411252617836, + "learning_rate": 4.3224387997428726e-05, + "loss": 0.1789, + "step": 14048 + }, + { + "epoch": 2.830949022768487, + "grad_norm": 0.049013204872608185, + "learning_rate": 4.321118465135952e-05, + "loss": 0.1821, + "step": 14050 + }, + { + "epoch": 2.831352004835785, + "grad_norm": 0.048979319632053375, + "learning_rate": 4.319798178755691e-05, + "loss": 0.1656, + "step": 14052 + }, + { + "epoch": 2.831754986903083, + "grad_norm": 0.05120906978845596, + "learning_rate": 4.3184779406958785e-05, + "loss": 0.1742, + "step": 14054 + }, + { + "epoch": 2.832157968970381, + "grad_norm": 0.05259276553988457, + "learning_rate": 4.3171577510503046e-05, + "loss": 0.1767, + "step": 14056 + }, + { + "epoch": 2.832560951037679, + "grad_norm": 0.06700893491506577, + "learning_rate": 4.31583760991275e-05, + "loss": 0.2277, + "step": 14058 + }, + { + "epoch": 2.8329639331049767, + "grad_norm": 0.05070505291223526, + "learning_rate": 4.314517517376999e-05, + "loss": 0.1628, + "step": 14060 + }, + { + "epoch": 2.8333669151722747, + "grad_norm": 0.03246789425611496, + "learning_rate": 4.313197473536826e-05, + "loss": 0.185, + "step": 14062 + }, + { + "epoch": 2.8337698972395726, + "grad_norm": 0.08361760526895523, + "learning_rate": 4.311877478486007e-05, + "loss": 0.1882, + "step": 14064 + }, + { + "epoch": 2.834172879306871, + "grad_norm": 0.04620158672332764, + "learning_rate": 4.310557532318311e-05, + "loss": 0.1459, + "step": 14066 + }, + { + "epoch": 2.834575861374169, + "grad_norm": 0.05384652316570282, + "learning_rate": 4.309237635127507e-05, + "loss": 0.2281, + "step": 14068 + }, + { + "epoch": 2.834978843441467, + "grad_norm": 0.06559676676988602, + "learning_rate": 4.307917787007353e-05, + "loss": 0.1756, + "step": 14070 + }, + { + "epoch": 2.835381825508765, + "grad_norm": 0.05740681663155556, + "learning_rate": 4.306597988051615e-05, + "loss": 0.2154, + "step": 14072 + }, + { + "epoch": 2.8357848075760628, + "grad_norm": 0.05628305301070213, + "learning_rate": 4.305278238354047e-05, + "loss": 0.2091, + "step": 14074 + }, + { + "epoch": 2.8361877896433607, + "grad_norm": 0.032225411385297775, + "learning_rate": 4.3039585380084025e-05, + "loss": 0.1544, + "step": 14076 + }, + { + "epoch": 2.836590771710659, + "grad_norm": 0.056562524288892746, + "learning_rate": 4.302638887108429e-05, + "loss": 0.1389, + "step": 14078 + }, + { + "epoch": 2.836993753777957, + "grad_norm": 0.07340873777866364, + "learning_rate": 4.301319285747875e-05, + "loss": 0.2385, + "step": 14080 + }, + { + "epoch": 2.837396735845255, + "grad_norm": 0.058820270001888275, + "learning_rate": 4.299999734020479e-05, + "loss": 0.1836, + "step": 14082 + }, + { + "epoch": 2.837799717912553, + "grad_norm": 0.08579188585281372, + "learning_rate": 4.2986802320199866e-05, + "loss": 0.1922, + "step": 14084 + }, + { + "epoch": 2.838202699979851, + "grad_norm": 0.05140161141753197, + "learning_rate": 4.297360779840125e-05, + "loss": 0.1919, + "step": 14086 + }, + { + "epoch": 2.838605682047149, + "grad_norm": 0.05750875547528267, + "learning_rate": 4.296041377574632e-05, + "loss": 0.1504, + "step": 14088 + }, + { + "epoch": 2.8390086641144467, + "grad_norm": 0.06645703315734863, + "learning_rate": 4.294722025317233e-05, + "loss": 0.1537, + "step": 14090 + }, + { + "epoch": 2.8394116461817447, + "grad_norm": 0.05591237172484398, + "learning_rate": 4.2934027231616545e-05, + "loss": 0.2056, + "step": 14092 + }, + { + "epoch": 2.839814628249043, + "grad_norm": 0.05866971239447594, + "learning_rate": 4.2920834712016136e-05, + "loss": 0.1648, + "step": 14094 + }, + { + "epoch": 2.840217610316341, + "grad_norm": 0.05585728958249092, + "learning_rate": 4.290764269530835e-05, + "loss": 0.2098, + "step": 14096 + }, + { + "epoch": 2.840620592383639, + "grad_norm": 0.05240754410624504, + "learning_rate": 4.289445118243024e-05, + "loss": 0.1762, + "step": 14098 + }, + { + "epoch": 2.841023574450937, + "grad_norm": 0.06093365326523781, + "learning_rate": 4.2881260174318984e-05, + "loss": 0.1604, + "step": 14100 + }, + { + "epoch": 2.841426556518235, + "grad_norm": 0.053919680416584015, + "learning_rate": 4.286806967191161e-05, + "loss": 0.2471, + "step": 14102 + }, + { + "epoch": 2.8418295385855328, + "grad_norm": 0.054130956530570984, + "learning_rate": 4.2854879676145166e-05, + "loss": 0.1734, + "step": 14104 + }, + { + "epoch": 2.842232520652831, + "grad_norm": 0.055720653384923935, + "learning_rate": 4.284169018795664e-05, + "loss": 0.1855, + "step": 14106 + }, + { + "epoch": 2.842635502720129, + "grad_norm": 0.06489083170890808, + "learning_rate": 4.2828501208283e-05, + "loss": 0.1979, + "step": 14108 + }, + { + "epoch": 2.843038484787427, + "grad_norm": 0.043121401220560074, + "learning_rate": 4.2815312738061145e-05, + "loss": 0.1592, + "step": 14110 + }, + { + "epoch": 2.843441466854725, + "grad_norm": 0.06269125640392303, + "learning_rate": 4.2802124778228e-05, + "loss": 0.2043, + "step": 14112 + }, + { + "epoch": 2.843844448922023, + "grad_norm": 0.04860043525695801, + "learning_rate": 4.27889373297204e-05, + "loss": 0.1608, + "step": 14114 + }, + { + "epoch": 2.844247430989321, + "grad_norm": 0.05572217330336571, + "learning_rate": 4.2775750393475164e-05, + "loss": 0.2031, + "step": 14116 + }, + { + "epoch": 2.844650413056619, + "grad_norm": 0.0398903451859951, + "learning_rate": 4.2762563970429054e-05, + "loss": 0.1928, + "step": 14118 + }, + { + "epoch": 2.8450533951239168, + "grad_norm": 0.06042037159204483, + "learning_rate": 4.274937806151884e-05, + "loss": 0.1717, + "step": 14120 + }, + { + "epoch": 2.845456377191215, + "grad_norm": 0.042064543813467026, + "learning_rate": 4.2736192667681185e-05, + "loss": 0.143, + "step": 14122 + }, + { + "epoch": 2.845859359258513, + "grad_norm": 0.03717980161309242, + "learning_rate": 4.272300778985281e-05, + "loss": 0.1289, + "step": 14124 + }, + { + "epoch": 2.846262341325811, + "grad_norm": 0.08025442063808441, + "learning_rate": 4.270982342897032e-05, + "loss": 0.1962, + "step": 14126 + }, + { + "epoch": 2.846665323393109, + "grad_norm": 0.061862844973802567, + "learning_rate": 4.269663958597032e-05, + "loss": 0.229, + "step": 14128 + }, + { + "epoch": 2.847068305460407, + "grad_norm": 0.057234328240156174, + "learning_rate": 4.268345626178935e-05, + "loss": 0.2134, + "step": 14130 + }, + { + "epoch": 2.8474712875277053, + "grad_norm": 0.0573008768260479, + "learning_rate": 4.267027345736396e-05, + "loss": 0.1752, + "step": 14132 + }, + { + "epoch": 2.8478742695950032, + "grad_norm": 0.05297653749585152, + "learning_rate": 4.26570911736306e-05, + "loss": 0.1613, + "step": 14134 + }, + { + "epoch": 2.848277251662301, + "grad_norm": 0.06393546611070633, + "learning_rate": 4.2643909411525765e-05, + "loss": 0.1694, + "step": 14136 + }, + { + "epoch": 2.848680233729599, + "grad_norm": 0.06238080561161041, + "learning_rate": 4.263072817198582e-05, + "loss": 0.1931, + "step": 14138 + }, + { + "epoch": 2.849083215796897, + "grad_norm": 0.051683563739061356, + "learning_rate": 4.261754745594718e-05, + "loss": 0.1634, + "step": 14140 + }, + { + "epoch": 2.849486197864195, + "grad_norm": 0.054612595587968826, + "learning_rate": 4.2604367264346147e-05, + "loss": 0.1973, + "step": 14142 + }, + { + "epoch": 2.849889179931493, + "grad_norm": 0.06870435923337936, + "learning_rate": 4.259118759811905e-05, + "loss": 0.2024, + "step": 14144 + }, + { + "epoch": 2.850292161998791, + "grad_norm": 0.052692923694849014, + "learning_rate": 4.2578008458202113e-05, + "loss": 0.1674, + "step": 14146 + }, + { + "epoch": 2.850695144066089, + "grad_norm": 0.05360223725438118, + "learning_rate": 4.256482984553162e-05, + "loss": 0.2175, + "step": 14148 + }, + { + "epoch": 2.851098126133387, + "grad_norm": 0.05867795646190643, + "learning_rate": 4.2551651761043694e-05, + "loss": 0.1892, + "step": 14150 + }, + { + "epoch": 2.851501108200685, + "grad_norm": 0.06195264682173729, + "learning_rate": 4.253847420567453e-05, + "loss": 0.2264, + "step": 14152 + }, + { + "epoch": 2.851904090267983, + "grad_norm": 0.07267355918884277, + "learning_rate": 4.252529718036022e-05, + "loss": 0.2061, + "step": 14154 + }, + { + "epoch": 2.852307072335281, + "grad_norm": 0.058362722396850586, + "learning_rate": 4.251212068603685e-05, + "loss": 0.1994, + "step": 14156 + }, + { + "epoch": 2.852710054402579, + "grad_norm": 0.048904869705438614, + "learning_rate": 4.2498944723640434e-05, + "loss": 0.1776, + "step": 14158 + }, + { + "epoch": 2.8531130364698774, + "grad_norm": 0.06460296362638474, + "learning_rate": 4.248576929410702e-05, + "loss": 0.2103, + "step": 14160 + }, + { + "epoch": 2.8535160185371753, + "grad_norm": 0.05771844834089279, + "learning_rate": 4.2472594398372505e-05, + "loss": 0.2026, + "step": 14162 + }, + { + "epoch": 2.8539190006044732, + "grad_norm": 0.051594674587249756, + "learning_rate": 4.245942003737287e-05, + "loss": 0.1841, + "step": 14164 + }, + { + "epoch": 2.854321982671771, + "grad_norm": 0.06020747125148773, + "learning_rate": 4.2446246212043964e-05, + "loss": 0.1857, + "step": 14166 + }, + { + "epoch": 2.854724964739069, + "grad_norm": 0.06190628930926323, + "learning_rate": 4.2433072923321656e-05, + "loss": 0.1678, + "step": 14168 + }, + { + "epoch": 2.855127946806367, + "grad_norm": 0.07561472803354263, + "learning_rate": 4.2419900172141723e-05, + "loss": 0.2247, + "step": 14170 + }, + { + "epoch": 2.855530928873665, + "grad_norm": 0.062340203672647476, + "learning_rate": 4.2406727959439995e-05, + "loss": 0.2042, + "step": 14172 + }, + { + "epoch": 2.855933910940963, + "grad_norm": 0.05466499179601669, + "learning_rate": 4.239355628615214e-05, + "loss": 0.1949, + "step": 14174 + }, + { + "epoch": 2.856336893008261, + "grad_norm": 0.06361190229654312, + "learning_rate": 4.23803851532139e-05, + "loss": 0.1932, + "step": 14176 + }, + { + "epoch": 2.8567398750755593, + "grad_norm": 0.057557422667741776, + "learning_rate": 4.23672145615609e-05, + "loss": 0.2181, + "step": 14178 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.05499812588095665, + "learning_rate": 4.235404451212878e-05, + "loss": 0.1839, + "step": 14180 + }, + { + "epoch": 2.857545839210155, + "grad_norm": 0.06741025298833847, + "learning_rate": 4.23408750058531e-05, + "loss": 0.1744, + "step": 14182 + }, + { + "epoch": 2.857948821277453, + "grad_norm": 0.06582369655370712, + "learning_rate": 4.232770604366942e-05, + "loss": 0.2026, + "step": 14184 + }, + { + "epoch": 2.858351803344751, + "grad_norm": 0.04338719695806503, + "learning_rate": 4.2314537626513216e-05, + "loss": 0.156, + "step": 14186 + }, + { + "epoch": 2.8587547854120494, + "grad_norm": 0.05401334539055824, + "learning_rate": 4.230136975531998e-05, + "loss": 0.1697, + "step": 14188 + }, + { + "epoch": 2.8591577674793474, + "grad_norm": 0.05770457908511162, + "learning_rate": 4.228820243102513e-05, + "loss": 0.1832, + "step": 14190 + }, + { + "epoch": 2.8595607495466453, + "grad_norm": 0.058899931609630585, + "learning_rate": 4.227503565456403e-05, + "loss": 0.1979, + "step": 14192 + }, + { + "epoch": 2.8599637316139432, + "grad_norm": 0.06733313202857971, + "learning_rate": 4.226186942687207e-05, + "loss": 0.211, + "step": 14194 + }, + { + "epoch": 2.860366713681241, + "grad_norm": 0.05753978341817856, + "learning_rate": 4.22487037488845e-05, + "loss": 0.1866, + "step": 14196 + }, + { + "epoch": 2.860769695748539, + "grad_norm": 0.0664329007267952, + "learning_rate": 4.223553862153664e-05, + "loss": 0.2111, + "step": 14198 + }, + { + "epoch": 2.861172677815837, + "grad_norm": 0.04582129046320915, + "learning_rate": 4.2222374045763686e-05, + "loss": 0.1512, + "step": 14200 + }, + { + "epoch": 2.861575659883135, + "grad_norm": 0.05810026824474335, + "learning_rate": 4.220921002250086e-05, + "loss": 0.1722, + "step": 14202 + }, + { + "epoch": 2.861978641950433, + "grad_norm": 0.056344956159591675, + "learning_rate": 4.219604655268328e-05, + "loss": 0.1742, + "step": 14204 + }, + { + "epoch": 2.8623816240177313, + "grad_norm": 0.05658912658691406, + "learning_rate": 4.2182883637246074e-05, + "loss": 0.2023, + "step": 14206 + }, + { + "epoch": 2.8627846060850293, + "grad_norm": 0.04890948161482811, + "learning_rate": 4.21697212771243e-05, + "loss": 0.1886, + "step": 14208 + }, + { + "epoch": 2.863187588152327, + "grad_norm": 0.06561271846294403, + "learning_rate": 4.2156559473253025e-05, + "loss": 0.1962, + "step": 14210 + }, + { + "epoch": 2.863590570219625, + "grad_norm": 0.0460575707256794, + "learning_rate": 4.214339822656721e-05, + "loss": 0.1798, + "step": 14212 + }, + { + "epoch": 2.863993552286923, + "grad_norm": 0.05941230431199074, + "learning_rate": 4.213023753800183e-05, + "loss": 0.2593, + "step": 14214 + }, + { + "epoch": 2.8643965343542215, + "grad_norm": 0.059036318212747574, + "learning_rate": 4.211707740849178e-05, + "loss": 0.2102, + "step": 14216 + }, + { + "epoch": 2.8647995164215194, + "grad_norm": 0.06053904816508293, + "learning_rate": 4.210391783897196e-05, + "loss": 0.1604, + "step": 14218 + }, + { + "epoch": 2.8652024984888174, + "grad_norm": 0.05623185262084007, + "learning_rate": 4.209075883037716e-05, + "loss": 0.235, + "step": 14220 + }, + { + "epoch": 2.8656054805561153, + "grad_norm": 0.048137541860342026, + "learning_rate": 4.207760038364223e-05, + "loss": 0.1749, + "step": 14222 + }, + { + "epoch": 2.8660084626234132, + "grad_norm": 0.0612863153219223, + "learning_rate": 4.206444249970188e-05, + "loss": 0.1812, + "step": 14224 + }, + { + "epoch": 2.866411444690711, + "grad_norm": 0.05674326792359352, + "learning_rate": 4.205128517949086e-05, + "loss": 0.1889, + "step": 14226 + }, + { + "epoch": 2.866814426758009, + "grad_norm": 0.05260700732469559, + "learning_rate": 4.2038128423943815e-05, + "loss": 0.1802, + "step": 14228 + }, + { + "epoch": 2.867217408825307, + "grad_norm": 0.056204576045274734, + "learning_rate": 4.202497223399541e-05, + "loss": 0.1513, + "step": 14230 + }, + { + "epoch": 2.867620390892605, + "grad_norm": 0.052276674658060074, + "learning_rate": 4.20118166105802e-05, + "loss": 0.1623, + "step": 14232 + }, + { + "epoch": 2.8680233729599034, + "grad_norm": 0.06562013179063797, + "learning_rate": 4.199866155463278e-05, + "loss": 0.1922, + "step": 14234 + }, + { + "epoch": 2.8684263550272013, + "grad_norm": 0.052156250923871994, + "learning_rate": 4.198550706708764e-05, + "loss": 0.1618, + "step": 14236 + }, + { + "epoch": 2.8688293370944993, + "grad_norm": 0.06406504660844803, + "learning_rate": 4.197235314887927e-05, + "loss": 0.2082, + "step": 14238 + }, + { + "epoch": 2.869232319161797, + "grad_norm": 0.052224770188331604, + "learning_rate": 4.1959199800942083e-05, + "loss": 0.1636, + "step": 14240 + }, + { + "epoch": 2.869635301229095, + "grad_norm": 0.051645223051309586, + "learning_rate": 4.1946047024210495e-05, + "loss": 0.1757, + "step": 14242 + }, + { + "epoch": 2.8700382832963935, + "grad_norm": 0.06716208904981613, + "learning_rate": 4.1932894819618824e-05, + "loss": 0.1585, + "step": 14244 + }, + { + "epoch": 2.8704412653636915, + "grad_norm": 0.06000441685318947, + "learning_rate": 4.1919743188101435e-05, + "loss": 0.1799, + "step": 14246 + }, + { + "epoch": 2.8708442474309894, + "grad_norm": 0.05317326635122299, + "learning_rate": 4.190659213059254e-05, + "loss": 0.1669, + "step": 14248 + }, + { + "epoch": 2.8712472294982874, + "grad_norm": 0.04350145906209946, + "learning_rate": 4.189344164802641e-05, + "loss": 0.2012, + "step": 14250 + }, + { + "epoch": 2.8716502115655853, + "grad_norm": 0.050959598273038864, + "learning_rate": 4.1880291741337216e-05, + "loss": 0.2116, + "step": 14252 + }, + { + "epoch": 2.8720531936328833, + "grad_norm": 0.05060587078332901, + "learning_rate": 4.1867142411459115e-05, + "loss": 0.1911, + "step": 14254 + }, + { + "epoch": 2.872456175700181, + "grad_norm": 0.05303170904517174, + "learning_rate": 4.1853993659326194e-05, + "loss": 0.1733, + "step": 14256 + }, + { + "epoch": 2.872859157767479, + "grad_norm": 0.06755878776311874, + "learning_rate": 4.184084548587257e-05, + "loss": 0.1934, + "step": 14258 + }, + { + "epoch": 2.8732621398347775, + "grad_norm": 0.059417106211185455, + "learning_rate": 4.18276978920322e-05, + "loss": 0.2123, + "step": 14260 + }, + { + "epoch": 2.8736651219020755, + "grad_norm": 0.05868794769048691, + "learning_rate": 4.181455087873912e-05, + "loss": 0.2412, + "step": 14262 + }, + { + "epoch": 2.8740681039693734, + "grad_norm": 0.05174780637025833, + "learning_rate": 4.180140444692725e-05, + "loss": 0.1739, + "step": 14264 + }, + { + "epoch": 2.8744710860366713, + "grad_norm": 0.0744800716638565, + "learning_rate": 4.178825859753051e-05, + "loss": 0.2051, + "step": 14266 + }, + { + "epoch": 2.8748740681039693, + "grad_norm": 0.0438925176858902, + "learning_rate": 4.177511333148273e-05, + "loss": 0.1976, + "step": 14268 + }, + { + "epoch": 2.8752770501712672, + "grad_norm": 0.053939249366521835, + "learning_rate": 4.176196864971775e-05, + "loss": 0.196, + "step": 14270 + }, + { + "epoch": 2.8756800322385656, + "grad_norm": 0.07452262192964554, + "learning_rate": 4.174882455316933e-05, + "loss": 0.1775, + "step": 14272 + }, + { + "epoch": 2.8760830143058635, + "grad_norm": 0.06098518148064613, + "learning_rate": 4.1735681042771236e-05, + "loss": 0.1944, + "step": 14274 + }, + { + "epoch": 2.8764859963731615, + "grad_norm": 0.046656448394060135, + "learning_rate": 4.1722538119457134e-05, + "loss": 0.1812, + "step": 14276 + }, + { + "epoch": 2.8768889784404594, + "grad_norm": 0.07880371063947678, + "learning_rate": 4.17093957841607e-05, + "loss": 0.1714, + "step": 14278 + }, + { + "epoch": 2.8772919605077574, + "grad_norm": 0.050928760319948196, + "learning_rate": 4.169625403781551e-05, + "loss": 0.1708, + "step": 14280 + }, + { + "epoch": 2.8776949425750553, + "grad_norm": 0.06175197288393974, + "learning_rate": 4.1683112881355177e-05, + "loss": 0.2171, + "step": 14282 + }, + { + "epoch": 2.8780979246423533, + "grad_norm": 0.11151041835546494, + "learning_rate": 4.166997231571317e-05, + "loss": 0.2139, + "step": 14284 + }, + { + "epoch": 2.878500906709651, + "grad_norm": 0.06008008494973183, + "learning_rate": 4.165683234182304e-05, + "loss": 0.1806, + "step": 14286 + }, + { + "epoch": 2.8789038887769496, + "grad_norm": 0.044566765427589417, + "learning_rate": 4.164369296061818e-05, + "loss": 0.1506, + "step": 14288 + }, + { + "epoch": 2.8793068708442475, + "grad_norm": 0.08086936175823212, + "learning_rate": 4.163055417303202e-05, + "loss": 0.1846, + "step": 14290 + }, + { + "epoch": 2.8797098529115455, + "grad_norm": 0.09690766781568527, + "learning_rate": 4.1617415979997896e-05, + "loss": 0.2048, + "step": 14292 + }, + { + "epoch": 2.8801128349788434, + "grad_norm": 0.05995067209005356, + "learning_rate": 4.160427838244915e-05, + "loss": 0.1783, + "step": 14294 + }, + { + "epoch": 2.8805158170461413, + "grad_norm": 0.058783888816833496, + "learning_rate": 4.159114138131901e-05, + "loss": 0.1628, + "step": 14296 + }, + { + "epoch": 2.8809187991134393, + "grad_norm": 0.04987872391939163, + "learning_rate": 4.1578004977540767e-05, + "loss": 0.1664, + "step": 14298 + }, + { + "epoch": 2.8813217811807377, + "grad_norm": 0.05188801884651184, + "learning_rate": 4.156486917204757e-05, + "loss": 0.1897, + "step": 14300 + }, + { + "epoch": 2.8817247632480356, + "grad_norm": 0.050275251269340515, + "learning_rate": 4.155173396577259e-05, + "loss": 0.1994, + "step": 14302 + }, + { + "epoch": 2.8821277453153336, + "grad_norm": 0.05531008914113045, + "learning_rate": 4.15385993596489e-05, + "loss": 0.1803, + "step": 14304 + }, + { + "epoch": 2.8825307273826315, + "grad_norm": 0.0625590980052948, + "learning_rate": 4.1525465354609596e-05, + "loss": 0.2059, + "step": 14306 + }, + { + "epoch": 2.8829337094499294, + "grad_norm": 0.0541142039000988, + "learning_rate": 4.1512331951587656e-05, + "loss": 0.1838, + "step": 14308 + }, + { + "epoch": 2.8833366915172274, + "grad_norm": 0.0530061237514019, + "learning_rate": 4.149919915151611e-05, + "loss": 0.2092, + "step": 14310 + }, + { + "epoch": 2.8837396735845253, + "grad_norm": 0.0603446289896965, + "learning_rate": 4.1486066955327834e-05, + "loss": 0.1828, + "step": 14312 + }, + { + "epoch": 2.8841426556518233, + "grad_norm": 0.08570460975170135, + "learning_rate": 4.147293536395577e-05, + "loss": 0.2143, + "step": 14314 + }, + { + "epoch": 2.8845456377191216, + "grad_norm": 0.04139425978064537, + "learning_rate": 4.1459804378332724e-05, + "loss": 0.1668, + "step": 14316 + }, + { + "epoch": 2.8849486197864196, + "grad_norm": 0.08652174472808838, + "learning_rate": 4.1446673999391525e-05, + "loss": 0.2002, + "step": 14318 + }, + { + "epoch": 2.8853516018537175, + "grad_norm": 0.06046774983406067, + "learning_rate": 4.143354422806491e-05, + "loss": 0.1494, + "step": 14320 + }, + { + "epoch": 2.8857545839210155, + "grad_norm": 0.06893909722566605, + "learning_rate": 4.142041506528564e-05, + "loss": 0.2392, + "step": 14322 + }, + { + "epoch": 2.8861575659883134, + "grad_norm": 0.05902134254574776, + "learning_rate": 4.1407286511986335e-05, + "loss": 0.2174, + "step": 14324 + }, + { + "epoch": 2.886560548055612, + "grad_norm": 0.07994405180215836, + "learning_rate": 4.139415856909968e-05, + "loss": 0.2721, + "step": 14326 + }, + { + "epoch": 2.8869635301229097, + "grad_norm": 0.04702121019363403, + "learning_rate": 4.1381031237558224e-05, + "loss": 0.2053, + "step": 14328 + }, + { + "epoch": 2.8873665121902077, + "grad_norm": 0.07409130036830902, + "learning_rate": 4.136790451829453e-05, + "loss": 0.2046, + "step": 14330 + }, + { + "epoch": 2.8877694942575056, + "grad_norm": 0.05629954859614372, + "learning_rate": 4.1354778412241075e-05, + "loss": 0.211, + "step": 14332 + }, + { + "epoch": 2.8881724763248036, + "grad_norm": 0.04987116530537605, + "learning_rate": 4.134165292033037e-05, + "loss": 0.1781, + "step": 14334 + }, + { + "epoch": 2.8885754583921015, + "grad_norm": 0.06818079948425293, + "learning_rate": 4.1328528043494755e-05, + "loss": 0.2116, + "step": 14336 + }, + { + "epoch": 2.8889784404593994, + "grad_norm": 0.07187522947788239, + "learning_rate": 4.131540378266667e-05, + "loss": 0.2035, + "step": 14338 + }, + { + "epoch": 2.8893814225266974, + "grad_norm": 0.056888628751039505, + "learning_rate": 4.13022801387784e-05, + "loss": 0.2228, + "step": 14340 + }, + { + "epoch": 2.8897844045939953, + "grad_norm": 0.06543140858411789, + "learning_rate": 4.1289157112762244e-05, + "loss": 0.223, + "step": 14342 + }, + { + "epoch": 2.8901873866612937, + "grad_norm": 0.05908466503024101, + "learning_rate": 4.1276034705550434e-05, + "loss": 0.1893, + "step": 14344 + }, + { + "epoch": 2.8905903687285917, + "grad_norm": 0.05234837904572487, + "learning_rate": 4.1262912918075166e-05, + "loss": 0.2027, + "step": 14346 + }, + { + "epoch": 2.8909933507958896, + "grad_norm": 0.05392058193683624, + "learning_rate": 4.124979175126858e-05, + "loss": 0.1785, + "step": 14348 + }, + { + "epoch": 2.8913963328631875, + "grad_norm": 0.0729118064045906, + "learning_rate": 4.123667120606281e-05, + "loss": 0.1821, + "step": 14350 + }, + { + "epoch": 2.8917993149304855, + "grad_norm": 0.04530632868409157, + "learning_rate": 4.122355128338989e-05, + "loss": 0.1548, + "step": 14352 + }, + { + "epoch": 2.892202296997784, + "grad_norm": 0.06139715015888214, + "learning_rate": 4.121043198418187e-05, + "loss": 0.1948, + "step": 14354 + }, + { + "epoch": 2.892605279065082, + "grad_norm": 0.04849937930703163, + "learning_rate": 4.119731330937069e-05, + "loss": 0.1738, + "step": 14356 + }, + { + "epoch": 2.8930082611323797, + "grad_norm": 0.0831499695777893, + "learning_rate": 4.11841952598883e-05, + "loss": 0.2124, + "step": 14358 + }, + { + "epoch": 2.8934112431996777, + "grad_norm": 0.144696444272995, + "learning_rate": 4.117107783666656e-05, + "loss": 0.2026, + "step": 14360 + }, + { + "epoch": 2.8938142252669756, + "grad_norm": 0.051758021116256714, + "learning_rate": 4.115796104063736e-05, + "loss": 0.1871, + "step": 14362 + }, + { + "epoch": 2.8942172073342736, + "grad_norm": 0.08509272336959839, + "learning_rate": 4.1144844872732455e-05, + "loss": 0.253, + "step": 14364 + }, + { + "epoch": 2.8946201894015715, + "grad_norm": 0.045031026005744934, + "learning_rate": 4.1131729333883614e-05, + "loss": 0.1607, + "step": 14366 + }, + { + "epoch": 2.8950231714688694, + "grad_norm": 0.05014210566878319, + "learning_rate": 4.111861442502253e-05, + "loss": 0.1378, + "step": 14368 + }, + { + "epoch": 2.8954261535361674, + "grad_norm": 0.06437412649393082, + "learning_rate": 4.1105500147080876e-05, + "loss": 0.2153, + "step": 14370 + }, + { + "epoch": 2.8958291356034658, + "grad_norm": 0.04823167249560356, + "learning_rate": 4.1092386500990256e-05, + "loss": 0.1839, + "step": 14372 + }, + { + "epoch": 2.8962321176707637, + "grad_norm": 0.06474374979734421, + "learning_rate": 4.107927348768227e-05, + "loss": 0.209, + "step": 14374 + }, + { + "epoch": 2.8966350997380617, + "grad_norm": 0.047769974917173386, + "learning_rate": 4.106616110808843e-05, + "loss": 0.1931, + "step": 14376 + }, + { + "epoch": 2.8970380818053596, + "grad_norm": 0.04988821968436241, + "learning_rate": 4.105304936314021e-05, + "loss": 0.1862, + "step": 14378 + }, + { + "epoch": 2.8974410638726575, + "grad_norm": 0.06389687210321426, + "learning_rate": 4.103993825376905e-05, + "loss": 0.1765, + "step": 14380 + }, + { + "epoch": 2.897844045939956, + "grad_norm": 0.06107024848461151, + "learning_rate": 4.1026827780906365e-05, + "loss": 0.1516, + "step": 14382 + }, + { + "epoch": 2.898247028007254, + "grad_norm": 0.05711763724684715, + "learning_rate": 4.1013717945483454e-05, + "loss": 0.1581, + "step": 14384 + }, + { + "epoch": 2.898650010074552, + "grad_norm": 0.06959626078605652, + "learning_rate": 4.100060874843168e-05, + "loss": 0.1497, + "step": 14386 + }, + { + "epoch": 2.8990529921418497, + "grad_norm": 0.051988981664180756, + "learning_rate": 4.098750019068225e-05, + "loss": 0.2156, + "step": 14388 + }, + { + "epoch": 2.8994559742091477, + "grad_norm": 0.05507201701402664, + "learning_rate": 4.09743922731664e-05, + "loss": 0.1684, + "step": 14390 + }, + { + "epoch": 2.8998589562764456, + "grad_norm": 0.05257626995444298, + "learning_rate": 4.096128499681529e-05, + "loss": 0.1745, + "step": 14392 + }, + { + "epoch": 2.9002619383437436, + "grad_norm": 0.05004867911338806, + "learning_rate": 4.0948178362560034e-05, + "loss": 0.1921, + "step": 14394 + }, + { + "epoch": 2.9006649204110415, + "grad_norm": 0.06818536669015884, + "learning_rate": 4.093507237133169e-05, + "loss": 0.2584, + "step": 14396 + }, + { + "epoch": 2.9010679024783395, + "grad_norm": 0.06977367401123047, + "learning_rate": 4.0921967024061355e-05, + "loss": 0.2516, + "step": 14398 + }, + { + "epoch": 2.901470884545638, + "grad_norm": 0.05165844410657883, + "learning_rate": 4.0908862321679925e-05, + "loss": 0.1735, + "step": 14400 + }, + { + "epoch": 2.901873866612936, + "grad_norm": 0.0612303651869297, + "learning_rate": 4.089575826511839e-05, + "loss": 0.2446, + "step": 14402 + }, + { + "epoch": 2.9022768486802337, + "grad_norm": 0.06506997346878052, + "learning_rate": 4.088265485530763e-05, + "loss": 0.1762, + "step": 14404 + }, + { + "epoch": 2.9026798307475317, + "grad_norm": 0.08582379668951035, + "learning_rate": 4.086955209317849e-05, + "loss": 0.1823, + "step": 14406 + }, + { + "epoch": 2.9030828128148296, + "grad_norm": 0.09465978294610977, + "learning_rate": 4.085644997966176e-05, + "loss": 0.2538, + "step": 14408 + }, + { + "epoch": 2.903485794882128, + "grad_norm": 0.06913874298334122, + "learning_rate": 4.0843348515688214e-05, + "loss": 0.2471, + "step": 14410 + }, + { + "epoch": 2.903888776949426, + "grad_norm": 0.06045586243271828, + "learning_rate": 4.083024770218852e-05, + "loss": 0.2089, + "step": 14412 + }, + { + "epoch": 2.904291759016724, + "grad_norm": 0.07784318923950195, + "learning_rate": 4.081714754009339e-05, + "loss": 0.1892, + "step": 14414 + }, + { + "epoch": 2.904694741084022, + "grad_norm": 0.06447883695363998, + "learning_rate": 4.08040480303334e-05, + "loss": 0.1931, + "step": 14416 + }, + { + "epoch": 2.9050977231513198, + "grad_norm": 0.0589727982878685, + "learning_rate": 4.079094917383914e-05, + "loss": 0.2088, + "step": 14418 + }, + { + "epoch": 2.9055007052186177, + "grad_norm": 0.06887009739875793, + "learning_rate": 4.077785097154111e-05, + "loss": 0.1967, + "step": 14420 + }, + { + "epoch": 2.9059036872859156, + "grad_norm": 0.06360602378845215, + "learning_rate": 4.076475342436981e-05, + "loss": 0.2037, + "step": 14422 + }, + { + "epoch": 2.9063066693532136, + "grad_norm": 0.05032625421881676, + "learning_rate": 4.075165653325564e-05, + "loss": 0.1425, + "step": 14424 + }, + { + "epoch": 2.9067096514205115, + "grad_norm": 0.059502359479665756, + "learning_rate": 4.073856029912902e-05, + "loss": 0.2147, + "step": 14426 + }, + { + "epoch": 2.90711263348781, + "grad_norm": 0.054628755897283554, + "learning_rate": 4.072546472292025e-05, + "loss": 0.1848, + "step": 14428 + }, + { + "epoch": 2.907515615555108, + "grad_norm": 0.051899537444114685, + "learning_rate": 4.071236980555965e-05, + "loss": 0.1782, + "step": 14430 + }, + { + "epoch": 2.907918597622406, + "grad_norm": 0.038006119430065155, + "learning_rate": 4.0699275547977425e-05, + "loss": 0.1577, + "step": 14432 + }, + { + "epoch": 2.9083215796897037, + "grad_norm": 0.05724431946873665, + "learning_rate": 4.068618195110381e-05, + "loss": 0.1784, + "step": 14434 + }, + { + "epoch": 2.9087245617570017, + "grad_norm": 0.05458924174308777, + "learning_rate": 4.067308901586892e-05, + "loss": 0.1425, + "step": 14436 + }, + { + "epoch": 2.9091275438243, + "grad_norm": 0.04887452721595764, + "learning_rate": 4.065999674320288e-05, + "loss": 0.1658, + "step": 14438 + }, + { + "epoch": 2.909530525891598, + "grad_norm": 0.05824177339673042, + "learning_rate": 4.0646905134035726e-05, + "loss": 0.1837, + "step": 14440 + }, + { + "epoch": 2.909933507958896, + "grad_norm": 0.045779384672641754, + "learning_rate": 4.063381418929748e-05, + "loss": 0.1702, + "step": 14442 + }, + { + "epoch": 2.910336490026194, + "grad_norm": 0.06375889480113983, + "learning_rate": 4.062072390991809e-05, + "loss": 0.2236, + "step": 14444 + }, + { + "epoch": 2.910739472093492, + "grad_norm": 0.06309327483177185, + "learning_rate": 4.060763429682748e-05, + "loss": 0.2208, + "step": 14446 + }, + { + "epoch": 2.9111424541607898, + "grad_norm": 0.0643438994884491, + "learning_rate": 4.059454535095549e-05, + "loss": 0.1983, + "step": 14448 + }, + { + "epoch": 2.9115454362280877, + "grad_norm": 0.0457322783768177, + "learning_rate": 4.058145707323199e-05, + "loss": 0.1695, + "step": 14450 + }, + { + "epoch": 2.9119484182953856, + "grad_norm": 0.07760033011436462, + "learning_rate": 4.056836946458668e-05, + "loss": 0.262, + "step": 14452 + }, + { + "epoch": 2.9123514003626836, + "grad_norm": 0.06333133578300476, + "learning_rate": 4.0555282525949346e-05, + "loss": 0.1888, + "step": 14454 + }, + { + "epoch": 2.912754382429982, + "grad_norm": 0.0646766647696495, + "learning_rate": 4.054219625824963e-05, + "loss": 0.2099, + "step": 14456 + }, + { + "epoch": 2.91315736449728, + "grad_norm": 0.06069457530975342, + "learning_rate": 4.052911066241717e-05, + "loss": 0.2131, + "step": 14458 + }, + { + "epoch": 2.913560346564578, + "grad_norm": 0.07926452159881592, + "learning_rate": 4.051602573938152e-05, + "loss": 0.1968, + "step": 14460 + }, + { + "epoch": 2.913963328631876, + "grad_norm": 0.060611922293901443, + "learning_rate": 4.0502941490072264e-05, + "loss": 0.199, + "step": 14462 + }, + { + "epoch": 2.9143663106991737, + "grad_norm": 0.0660584568977356, + "learning_rate": 4.0489857915418826e-05, + "loss": 0.2008, + "step": 14464 + }, + { + "epoch": 2.914769292766472, + "grad_norm": 0.07786539942026138, + "learning_rate": 4.04767750163507e-05, + "loss": 0.2281, + "step": 14466 + }, + { + "epoch": 2.91517227483377, + "grad_norm": 0.057906147092580795, + "learning_rate": 4.046369279379723e-05, + "loss": 0.1753, + "step": 14468 + }, + { + "epoch": 2.915575256901068, + "grad_norm": 0.04037892818450928, + "learning_rate": 4.045061124868779e-05, + "loss": 0.1498, + "step": 14470 + }, + { + "epoch": 2.915978238968366, + "grad_norm": 0.048774946480989456, + "learning_rate": 4.043753038195164e-05, + "loss": 0.1925, + "step": 14472 + }, + { + "epoch": 2.916381221035664, + "grad_norm": 0.04842628538608551, + "learning_rate": 4.042445019451805e-05, + "loss": 0.2065, + "step": 14474 + }, + { + "epoch": 2.916784203102962, + "grad_norm": 0.058236975222826004, + "learning_rate": 4.041137068731617e-05, + "loss": 0.2216, + "step": 14476 + }, + { + "epoch": 2.9171871851702598, + "grad_norm": 0.06977251917123795, + "learning_rate": 4.039829186127522e-05, + "loss": 0.2252, + "step": 14478 + }, + { + "epoch": 2.9175901672375577, + "grad_norm": 0.0626988634467125, + "learning_rate": 4.038521371732425e-05, + "loss": 0.2011, + "step": 14480 + }, + { + "epoch": 2.917993149304856, + "grad_norm": 0.07548090070486069, + "learning_rate": 4.0372136256392324e-05, + "loss": 0.1634, + "step": 14482 + }, + { + "epoch": 2.918396131372154, + "grad_norm": 0.05160561203956604, + "learning_rate": 4.0359059479408436e-05, + "loss": 0.203, + "step": 14484 + }, + { + "epoch": 2.918799113439452, + "grad_norm": 0.06201513111591339, + "learning_rate": 4.034598338730155e-05, + "loss": 0.2376, + "step": 14486 + }, + { + "epoch": 2.91920209550675, + "grad_norm": 0.06060298904776573, + "learning_rate": 4.0332907981000546e-05, + "loss": 0.1541, + "step": 14488 + }, + { + "epoch": 2.919605077574048, + "grad_norm": 0.060880377888679504, + "learning_rate": 4.031983326143432e-05, + "loss": 0.2237, + "step": 14490 + }, + { + "epoch": 2.920008059641346, + "grad_norm": 0.045542385429143906, + "learning_rate": 4.0306759229531644e-05, + "loss": 0.1644, + "step": 14492 + }, + { + "epoch": 2.920411041708644, + "grad_norm": 0.06396988779306412, + "learning_rate": 4.029368588622131e-05, + "loss": 0.2005, + "step": 14494 + }, + { + "epoch": 2.920814023775942, + "grad_norm": 0.06875386089086533, + "learning_rate": 4.0280613232431984e-05, + "loss": 0.1812, + "step": 14496 + }, + { + "epoch": 2.92121700584324, + "grad_norm": 0.06327049434185028, + "learning_rate": 4.026754126909237e-05, + "loss": 0.1504, + "step": 14498 + }, + { + "epoch": 2.921619987910538, + "grad_norm": 0.04754214361310005, + "learning_rate": 4.0254469997131035e-05, + "loss": 0.1549, + "step": 14500 + }, + { + "epoch": 2.922022969977836, + "grad_norm": 0.05176509916782379, + "learning_rate": 4.024139941747658e-05, + "loss": 0.1864, + "step": 14502 + }, + { + "epoch": 2.922425952045134, + "grad_norm": 0.06160098686814308, + "learning_rate": 4.0228329531057506e-05, + "loss": 0.1441, + "step": 14504 + }, + { + "epoch": 2.922828934112432, + "grad_norm": 0.050542235374450684, + "learning_rate": 4.021526033880228e-05, + "loss": 0.1799, + "step": 14506 + }, + { + "epoch": 2.9232319161797298, + "grad_norm": 0.04303761571645737, + "learning_rate": 4.02021918416393e-05, + "loss": 0.1321, + "step": 14508 + }, + { + "epoch": 2.923634898247028, + "grad_norm": 0.05790780112147331, + "learning_rate": 4.0189124040496954e-05, + "loss": 0.189, + "step": 14510 + }, + { + "epoch": 2.924037880314326, + "grad_norm": 0.07991109043359756, + "learning_rate": 4.017605693630353e-05, + "loss": 0.2345, + "step": 14512 + }, + { + "epoch": 2.924440862381624, + "grad_norm": 0.06370232254266739, + "learning_rate": 4.016299052998732e-05, + "loss": 0.1756, + "step": 14514 + }, + { + "epoch": 2.924843844448922, + "grad_norm": 0.07485152781009674, + "learning_rate": 4.0149924822476526e-05, + "loss": 0.201, + "step": 14516 + }, + { + "epoch": 2.92524682651622, + "grad_norm": 0.05705023184418678, + "learning_rate": 4.013685981469933e-05, + "loss": 0.1849, + "step": 14518 + }, + { + "epoch": 2.9256498085835183, + "grad_norm": 0.08263441175222397, + "learning_rate": 4.0123795507583826e-05, + "loss": 0.2508, + "step": 14520 + }, + { + "epoch": 2.9260527906508162, + "grad_norm": 0.05221550539135933, + "learning_rate": 4.0110731902058105e-05, + "loss": 0.1852, + "step": 14522 + }, + { + "epoch": 2.926455772718114, + "grad_norm": 0.058928411453962326, + "learning_rate": 4.009766899905016e-05, + "loss": 0.1796, + "step": 14524 + }, + { + "epoch": 2.926858754785412, + "grad_norm": 0.06618046760559082, + "learning_rate": 4.0084606799488e-05, + "loss": 0.2088, + "step": 14526 + }, + { + "epoch": 2.92726173685271, + "grad_norm": 0.05029602348804474, + "learning_rate": 4.007154530429949e-05, + "loss": 0.1485, + "step": 14528 + }, + { + "epoch": 2.927664718920008, + "grad_norm": 0.05263345316052437, + "learning_rate": 4.0058484514412534e-05, + "loss": 0.1941, + "step": 14530 + }, + { + "epoch": 2.928067700987306, + "grad_norm": 0.049153879284858704, + "learning_rate": 4.004542443075493e-05, + "loss": 0.1857, + "step": 14532 + }, + { + "epoch": 2.928470683054604, + "grad_norm": 0.05516686663031578, + "learning_rate": 4.003236505425447e-05, + "loss": 0.1917, + "step": 14534 + }, + { + "epoch": 2.928873665121902, + "grad_norm": 0.0750420019030571, + "learning_rate": 4.001930638583883e-05, + "loss": 0.1709, + "step": 14536 + }, + { + "epoch": 2.9292766471892002, + "grad_norm": 0.050966911017894745, + "learning_rate": 4.000624842643574e-05, + "loss": 0.2332, + "step": 14538 + }, + { + "epoch": 2.929679629256498, + "grad_norm": 0.05441434308886528, + "learning_rate": 3.9993191176972746e-05, + "loss": 0.2048, + "step": 14540 + }, + { + "epoch": 2.930082611323796, + "grad_norm": 0.06468924880027771, + "learning_rate": 3.998013463837747e-05, + "loss": 0.1602, + "step": 14542 + }, + { + "epoch": 2.930485593391094, + "grad_norm": 0.047111447900533676, + "learning_rate": 3.996707881157739e-05, + "loss": 0.185, + "step": 14544 + }, + { + "epoch": 2.930888575458392, + "grad_norm": 0.06189405545592308, + "learning_rate": 3.995402369749999e-05, + "loss": 0.1489, + "step": 14546 + }, + { + "epoch": 2.9312915575256904, + "grad_norm": 0.0858076959848404, + "learning_rate": 3.994096929707268e-05, + "loss": 0.2368, + "step": 14548 + }, + { + "epoch": 2.9316945395929883, + "grad_norm": 0.05392390489578247, + "learning_rate": 3.992791561122283e-05, + "loss": 0.2169, + "step": 14550 + }, + { + "epoch": 2.9320975216602863, + "grad_norm": 0.05944026634097099, + "learning_rate": 3.991486264087773e-05, + "loss": 0.1943, + "step": 14552 + }, + { + "epoch": 2.932500503727584, + "grad_norm": 0.06575492769479752, + "learning_rate": 3.9901810386964676e-05, + "loss": 0.234, + "step": 14554 + }, + { + "epoch": 2.932903485794882, + "grad_norm": 0.04996848106384277, + "learning_rate": 3.988875885041085e-05, + "loss": 0.2108, + "step": 14556 + }, + { + "epoch": 2.93330646786218, + "grad_norm": 0.04437585547566414, + "learning_rate": 3.987570803214345e-05, + "loss": 0.1708, + "step": 14558 + }, + { + "epoch": 2.933709449929478, + "grad_norm": 0.05040000379085541, + "learning_rate": 3.986265793308953e-05, + "loss": 0.2091, + "step": 14560 + }, + { + "epoch": 2.934112431996776, + "grad_norm": 0.05723320692777634, + "learning_rate": 3.98496085541762e-05, + "loss": 0.1863, + "step": 14562 + }, + { + "epoch": 2.934515414064074, + "grad_norm": 0.06218833848834038, + "learning_rate": 3.983655989633042e-05, + "loss": 0.2182, + "step": 14564 + }, + { + "epoch": 2.9349183961313723, + "grad_norm": 0.05402584373950958, + "learning_rate": 3.982351196047919e-05, + "loss": 0.1882, + "step": 14566 + }, + { + "epoch": 2.9353213781986702, + "grad_norm": 0.04149405285716057, + "learning_rate": 3.981046474754939e-05, + "loss": 0.2108, + "step": 14568 + }, + { + "epoch": 2.935724360265968, + "grad_norm": 0.05765628442168236, + "learning_rate": 3.979741825846789e-05, + "loss": 0.1995, + "step": 14570 + }, + { + "epoch": 2.936127342333266, + "grad_norm": 0.06953881680965424, + "learning_rate": 3.978437249416146e-05, + "loss": 0.2073, + "step": 14572 + }, + { + "epoch": 2.936530324400564, + "grad_norm": 0.06293442100286484, + "learning_rate": 3.9771327455556874e-05, + "loss": 0.227, + "step": 14574 + }, + { + "epoch": 2.9369333064678624, + "grad_norm": 0.03527409955859184, + "learning_rate": 3.975828314358084e-05, + "loss": 0.1351, + "step": 14576 + }, + { + "epoch": 2.9373362885351604, + "grad_norm": 0.0683702901005745, + "learning_rate": 3.9745239559159984e-05, + "loss": 0.1517, + "step": 14578 + }, + { + "epoch": 2.9377392706024583, + "grad_norm": 0.0447571687400341, + "learning_rate": 3.9732196703220916e-05, + "loss": 0.1898, + "step": 14580 + }, + { + "epoch": 2.9381422526697563, + "grad_norm": 0.06839301437139511, + "learning_rate": 3.971915457669017e-05, + "loss": 0.2024, + "step": 14582 + }, + { + "epoch": 2.938545234737054, + "grad_norm": 0.08592922985553741, + "learning_rate": 3.970611318049425e-05, + "loss": 0.2404, + "step": 14584 + }, + { + "epoch": 2.938948216804352, + "grad_norm": 0.05581474304199219, + "learning_rate": 3.969307251555956e-05, + "loss": 0.1685, + "step": 14586 + }, + { + "epoch": 2.93935119887165, + "grad_norm": 0.057193271815776825, + "learning_rate": 3.9680032582812546e-05, + "loss": 0.1826, + "step": 14588 + }, + { + "epoch": 2.939754180938948, + "grad_norm": 0.06702768057584763, + "learning_rate": 3.966699338317949e-05, + "loss": 0.1922, + "step": 14590 + }, + { + "epoch": 2.940157163006246, + "grad_norm": 0.0606420673429966, + "learning_rate": 3.965395491758672e-05, + "loss": 0.2224, + "step": 14592 + }, + { + "epoch": 2.9405601450735444, + "grad_norm": 0.0733923390507698, + "learning_rate": 3.964091718696043e-05, + "loss": 0.2127, + "step": 14594 + }, + { + "epoch": 2.9409631271408423, + "grad_norm": 0.06314688920974731, + "learning_rate": 3.962788019222683e-05, + "loss": 0.1973, + "step": 14596 + }, + { + "epoch": 2.9413661092081402, + "grad_norm": 0.056110117584466934, + "learning_rate": 3.9614843934312005e-05, + "loss": 0.2015, + "step": 14598 + }, + { + "epoch": 2.941769091275438, + "grad_norm": 0.05174252390861511, + "learning_rate": 3.960180841414209e-05, + "loss": 0.1905, + "step": 14600 + }, + { + "epoch": 2.942172073342736, + "grad_norm": 0.05452876538038254, + "learning_rate": 3.958877363264306e-05, + "loss": 0.1852, + "step": 14602 + }, + { + "epoch": 2.9425750554100345, + "grad_norm": 0.09879107773303986, + "learning_rate": 3.957573959074091e-05, + "loss": 0.2238, + "step": 14604 + }, + { + "epoch": 2.9429780374773324, + "grad_norm": 0.03528512269258499, + "learning_rate": 3.956270628936154e-05, + "loss": 0.166, + "step": 14606 + }, + { + "epoch": 2.9433810195446304, + "grad_norm": 0.08624977618455887, + "learning_rate": 3.9549673729430837e-05, + "loss": 0.2206, + "step": 14608 + }, + { + "epoch": 2.9437840016119283, + "grad_norm": 0.04483683779835701, + "learning_rate": 3.9536641911874575e-05, + "loss": 0.1809, + "step": 14610 + }, + { + "epoch": 2.9441869836792263, + "grad_norm": 0.05532107129693031, + "learning_rate": 3.9523610837618565e-05, + "loss": 0.1945, + "step": 14612 + }, + { + "epoch": 2.944589965746524, + "grad_norm": 0.04578416794538498, + "learning_rate": 3.951058050758846e-05, + "loss": 0.1713, + "step": 14614 + }, + { + "epoch": 2.944992947813822, + "grad_norm": 0.06979301571846008, + "learning_rate": 3.949755092270996e-05, + "loss": 0.1939, + "step": 14616 + }, + { + "epoch": 2.94539592988112, + "grad_norm": 0.05871470272541046, + "learning_rate": 3.948452208390864e-05, + "loss": 0.1924, + "step": 14618 + }, + { + "epoch": 2.945798911948418, + "grad_norm": 0.07979889214038849, + "learning_rate": 3.947149399211006e-05, + "loss": 0.1894, + "step": 14620 + }, + { + "epoch": 2.9462018940157164, + "grad_norm": 0.0501592755317688, + "learning_rate": 3.945846664823969e-05, + "loss": 0.1665, + "step": 14622 + }, + { + "epoch": 2.9466048760830144, + "grad_norm": 0.05370146036148071, + "learning_rate": 3.944544005322301e-05, + "loss": 0.1336, + "step": 14624 + }, + { + "epoch": 2.9470078581503123, + "grad_norm": 0.07011505961418152, + "learning_rate": 3.943241420798538e-05, + "loss": 0.1929, + "step": 14626 + }, + { + "epoch": 2.9474108402176102, + "grad_norm": 0.06304550915956497, + "learning_rate": 3.941938911345215e-05, + "loss": 0.1602, + "step": 14628 + }, + { + "epoch": 2.947813822284908, + "grad_norm": 0.05381280928850174, + "learning_rate": 3.940636477054859e-05, + "loss": 0.1834, + "step": 14630 + }, + { + "epoch": 2.9482168043522066, + "grad_norm": 0.08575271815061569, + "learning_rate": 3.9393341180199944e-05, + "loss": 0.2101, + "step": 14632 + }, + { + "epoch": 2.9486197864195045, + "grad_norm": 0.060505758970975876, + "learning_rate": 3.9380318343331357e-05, + "loss": 0.1701, + "step": 14634 + }, + { + "epoch": 2.9490227684868024, + "grad_norm": 0.05807434022426605, + "learning_rate": 3.9367296260868e-05, + "loss": 0.1929, + "step": 14636 + }, + { + "epoch": 2.9494257505541004, + "grad_norm": 0.07592857629060745, + "learning_rate": 3.935427493373489e-05, + "loss": 0.2111, + "step": 14638 + }, + { + "epoch": 2.9498287326213983, + "grad_norm": 0.04870220646262169, + "learning_rate": 3.934125436285708e-05, + "loss": 0.167, + "step": 14640 + }, + { + "epoch": 2.9502317146886963, + "grad_norm": 0.09737106412649155, + "learning_rate": 3.93282345491595e-05, + "loss": 0.1648, + "step": 14642 + }, + { + "epoch": 2.950634696755994, + "grad_norm": 0.06507037580013275, + "learning_rate": 3.931521549356708e-05, + "loss": 0.1759, + "step": 14644 + }, + { + "epoch": 2.951037678823292, + "grad_norm": 0.04871666431427002, + "learning_rate": 3.930219719700466e-05, + "loss": 0.1689, + "step": 14646 + }, + { + "epoch": 2.95144066089059, + "grad_norm": 0.07162641733884811, + "learning_rate": 3.928917966039705e-05, + "loss": 0.2247, + "step": 14648 + }, + { + "epoch": 2.9518436429578885, + "grad_norm": 0.04025919735431671, + "learning_rate": 3.927616288466896e-05, + "loss": 0.1745, + "step": 14650 + }, + { + "epoch": 2.9522466250251864, + "grad_norm": 0.05081998184323311, + "learning_rate": 3.926314687074514e-05, + "loss": 0.218, + "step": 14652 + }, + { + "epoch": 2.9526496070924844, + "grad_norm": 0.05820939689874649, + "learning_rate": 3.925013161955018e-05, + "loss": 0.1842, + "step": 14654 + }, + { + "epoch": 2.9530525891597823, + "grad_norm": 0.06399496644735336, + "learning_rate": 3.923711713200868e-05, + "loss": 0.2142, + "step": 14656 + }, + { + "epoch": 2.9534555712270802, + "grad_norm": 0.04385112598538399, + "learning_rate": 3.9224103409045164e-05, + "loss": 0.1394, + "step": 14658 + }, + { + "epoch": 2.9538585532943786, + "grad_norm": 0.053807783871889114, + "learning_rate": 3.921109045158412e-05, + "loss": 0.2103, + "step": 14660 + }, + { + "epoch": 2.9542615353616766, + "grad_norm": 0.05360929295420647, + "learning_rate": 3.9198078260549936e-05, + "loss": 0.2018, + "step": 14662 + }, + { + "epoch": 2.9546645174289745, + "grad_norm": 0.074669249355793, + "learning_rate": 3.918506683686702e-05, + "loss": 0.1858, + "step": 14664 + }, + { + "epoch": 2.9550674994962725, + "grad_norm": 0.06290856748819351, + "learning_rate": 3.917205618145964e-05, + "loss": 0.2006, + "step": 14666 + }, + { + "epoch": 2.9554704815635704, + "grad_norm": 0.06198323518037796, + "learning_rate": 3.915904629525209e-05, + "loss": 0.214, + "step": 14668 + }, + { + "epoch": 2.9558734636308683, + "grad_norm": 0.05055319517850876, + "learning_rate": 3.914603717916854e-05, + "loss": 0.1624, + "step": 14670 + }, + { + "epoch": 2.9562764456981663, + "grad_norm": 0.08773225545883179, + "learning_rate": 3.913302883413316e-05, + "loss": 0.1928, + "step": 14672 + }, + { + "epoch": 2.956679427765464, + "grad_norm": 0.053192343562841415, + "learning_rate": 3.912002126107002e-05, + "loss": 0.1594, + "step": 14674 + }, + { + "epoch": 2.9570824098327626, + "grad_norm": 0.04311097785830498, + "learning_rate": 3.910701446090318e-05, + "loss": 0.1447, + "step": 14676 + }, + { + "epoch": 2.9574853919000605, + "grad_norm": 0.048903919756412506, + "learning_rate": 3.9094008434556603e-05, + "loss": 0.1815, + "step": 14678 + }, + { + "epoch": 2.9578883739673585, + "grad_norm": 0.0562017560005188, + "learning_rate": 3.908100318295424e-05, + "loss": 0.2002, + "step": 14680 + }, + { + "epoch": 2.9582913560346564, + "grad_norm": 0.05847121402621269, + "learning_rate": 3.906799870701994e-05, + "loss": 0.1946, + "step": 14682 + }, + { + "epoch": 2.9586943381019544, + "grad_norm": 0.05873006582260132, + "learning_rate": 3.905499500767753e-05, + "loss": 0.1812, + "step": 14684 + }, + { + "epoch": 2.9590973201692523, + "grad_norm": 0.08028661459684372, + "learning_rate": 3.904199208585076e-05, + "loss": 0.2255, + "step": 14686 + }, + { + "epoch": 2.9595003022365507, + "grad_norm": 0.08308486640453339, + "learning_rate": 3.902898994246337e-05, + "loss": 0.1943, + "step": 14688 + }, + { + "epoch": 2.9599032843038486, + "grad_norm": 0.044617436826229095, + "learning_rate": 3.901598857843896e-05, + "loss": 0.1629, + "step": 14690 + }, + { + "epoch": 2.9603062663711466, + "grad_norm": 0.040800608694553375, + "learning_rate": 3.900298799470118e-05, + "loss": 0.1431, + "step": 14692 + }, + { + "epoch": 2.9607092484384445, + "grad_norm": 0.08411987870931625, + "learning_rate": 3.898998819217353e-05, + "loss": 0.2145, + "step": 14694 + }, + { + "epoch": 2.9611122305057425, + "grad_norm": 0.06618909537792206, + "learning_rate": 3.8976989171779535e-05, + "loss": 0.1955, + "step": 14696 + }, + { + "epoch": 2.9615152125730404, + "grad_norm": 0.06884243339300156, + "learning_rate": 3.896399093444256e-05, + "loss": 0.209, + "step": 14698 + }, + { + "epoch": 2.9619181946403383, + "grad_norm": 0.07660902291536331, + "learning_rate": 3.8950993481086065e-05, + "loss": 0.2158, + "step": 14700 + }, + { + "epoch": 2.9623211767076363, + "grad_norm": 0.09313687682151794, + "learning_rate": 3.893799681263328e-05, + "loss": 0.2272, + "step": 14702 + }, + { + "epoch": 2.9627241587749347, + "grad_norm": 0.05195392668247223, + "learning_rate": 3.892500093000755e-05, + "loss": 0.1562, + "step": 14704 + }, + { + "epoch": 2.9631271408422326, + "grad_norm": 0.09103037416934967, + "learning_rate": 3.891200583413201e-05, + "loss": 0.1454, + "step": 14706 + }, + { + "epoch": 2.9635301229095306, + "grad_norm": 0.056169308722019196, + "learning_rate": 3.8899011525929863e-05, + "loss": 0.2122, + "step": 14708 + }, + { + "epoch": 2.9639331049768285, + "grad_norm": 0.049981966614723206, + "learning_rate": 3.8886018006324174e-05, + "loss": 0.1725, + "step": 14710 + }, + { + "epoch": 2.9643360870441264, + "grad_norm": 0.03693721070885658, + "learning_rate": 3.8873025276238004e-05, + "loss": 0.1361, + "step": 14712 + }, + { + "epoch": 2.964739069111425, + "grad_norm": 0.06162644922733307, + "learning_rate": 3.88600333365943e-05, + "loss": 0.1606, + "step": 14714 + }, + { + "epoch": 2.9651420511787228, + "grad_norm": 0.0703161209821701, + "learning_rate": 3.884704218831603e-05, + "loss": 0.1939, + "step": 14716 + }, + { + "epoch": 2.9655450332460207, + "grad_norm": 0.03999608755111694, + "learning_rate": 3.883405183232604e-05, + "loss": 0.1514, + "step": 14718 + }, + { + "epoch": 2.9659480153133186, + "grad_norm": 0.06137223541736603, + "learning_rate": 3.882106226954716e-05, + "loss": 0.1938, + "step": 14720 + }, + { + "epoch": 2.9663509973806166, + "grad_norm": 0.061834823340177536, + "learning_rate": 3.880807350090213e-05, + "loss": 0.1996, + "step": 14722 + }, + { + "epoch": 2.9667539794479145, + "grad_norm": 0.07341071218252182, + "learning_rate": 3.879508552731366e-05, + "loss": 0.1703, + "step": 14724 + }, + { + "epoch": 2.9671569615152125, + "grad_norm": 0.04776541888713837, + "learning_rate": 3.878209834970438e-05, + "loss": 0.1597, + "step": 14726 + }, + { + "epoch": 2.9675599435825104, + "grad_norm": 0.04152734950184822, + "learning_rate": 3.876911196899692e-05, + "loss": 0.1617, + "step": 14728 + }, + { + "epoch": 2.9679629256498083, + "grad_norm": 0.05166812613606453, + "learning_rate": 3.8756126386113766e-05, + "loss": 0.1923, + "step": 14730 + }, + { + "epoch": 2.9683659077171067, + "grad_norm": 0.058748915791511536, + "learning_rate": 3.874314160197743e-05, + "loss": 0.1995, + "step": 14732 + }, + { + "epoch": 2.9687688897844047, + "grad_norm": 0.07406263053417206, + "learning_rate": 3.8730157617510295e-05, + "loss": 0.2137, + "step": 14734 + }, + { + "epoch": 2.9691718718517026, + "grad_norm": 0.06733974814414978, + "learning_rate": 3.871717443363475e-05, + "loss": 0.1814, + "step": 14736 + }, + { + "epoch": 2.9695748539190006, + "grad_norm": 0.0653805136680603, + "learning_rate": 3.870419205127307e-05, + "loss": 0.1673, + "step": 14738 + }, + { + "epoch": 2.9699778359862985, + "grad_norm": 0.06307506561279297, + "learning_rate": 3.869121047134754e-05, + "loss": 0.2257, + "step": 14740 + }, + { + "epoch": 2.970380818053597, + "grad_norm": 0.0746563971042633, + "learning_rate": 3.8678229694780324e-05, + "loss": 0.169, + "step": 14742 + }, + { + "epoch": 2.970783800120895, + "grad_norm": 0.06383204460144043, + "learning_rate": 3.8665249722493576e-05, + "loss": 0.2629, + "step": 14744 + }, + { + "epoch": 2.9711867821881928, + "grad_norm": 0.0686652734875679, + "learning_rate": 3.865227055540935e-05, + "loss": 0.1795, + "step": 14746 + }, + { + "epoch": 2.9715897642554907, + "grad_norm": 0.07736478000879288, + "learning_rate": 3.863929219444968e-05, + "loss": 0.1945, + "step": 14748 + }, + { + "epoch": 2.9719927463227886, + "grad_norm": 0.0649065375328064, + "learning_rate": 3.862631464053651e-05, + "loss": 0.1543, + "step": 14750 + }, + { + "epoch": 2.9723957283900866, + "grad_norm": 0.06521575897932053, + "learning_rate": 3.861333789459178e-05, + "loss": 0.2298, + "step": 14752 + }, + { + "epoch": 2.9727987104573845, + "grad_norm": 0.057148393243551254, + "learning_rate": 3.8600361957537296e-05, + "loss": 0.2069, + "step": 14754 + }, + { + "epoch": 2.9732016925246825, + "grad_norm": 0.07252558320760727, + "learning_rate": 3.858738683029489e-05, + "loss": 0.2158, + "step": 14756 + }, + { + "epoch": 2.9736046745919804, + "grad_norm": 0.055180639028549194, + "learning_rate": 3.857441251378625e-05, + "loss": 0.224, + "step": 14758 + }, + { + "epoch": 2.974007656659279, + "grad_norm": 0.05758465826511383, + "learning_rate": 3.856143900893309e-05, + "loss": 0.1997, + "step": 14760 + }, + { + "epoch": 2.9744106387265767, + "grad_norm": 0.0825326144695282, + "learning_rate": 3.8548466316656985e-05, + "loss": 0.1782, + "step": 14762 + }, + { + "epoch": 2.9748136207938747, + "grad_norm": 0.0534152127802372, + "learning_rate": 3.853549443787955e-05, + "loss": 0.1905, + "step": 14764 + }, + { + "epoch": 2.9752166028611726, + "grad_norm": 0.049097124487161636, + "learning_rate": 3.852252337352223e-05, + "loss": 0.2191, + "step": 14766 + }, + { + "epoch": 2.9756195849284706, + "grad_norm": 0.04427768290042877, + "learning_rate": 3.850955312450651e-05, + "loss": 0.1987, + "step": 14768 + }, + { + "epoch": 2.976022566995769, + "grad_norm": 0.05539549142122269, + "learning_rate": 3.849658369175375e-05, + "loss": 0.1895, + "step": 14770 + }, + { + "epoch": 2.976425549063067, + "grad_norm": 0.05405467003583908, + "learning_rate": 3.84836150761853e-05, + "loss": 0.1869, + "step": 14772 + }, + { + "epoch": 2.976828531130365, + "grad_norm": 0.05085950717329979, + "learning_rate": 3.8470647278722404e-05, + "loss": 0.1758, + "step": 14774 + }, + { + "epoch": 2.9772315131976628, + "grad_norm": 0.061146095395088196, + "learning_rate": 3.84576803002863e-05, + "loss": 0.1606, + "step": 14776 + }, + { + "epoch": 2.9776344952649607, + "grad_norm": 0.046381138265132904, + "learning_rate": 3.8444714141798106e-05, + "loss": 0.1816, + "step": 14778 + }, + { + "epoch": 2.9780374773322587, + "grad_norm": 0.06234044209122658, + "learning_rate": 3.843174880417896e-05, + "loss": 0.199, + "step": 14780 + }, + { + "epoch": 2.9784404593995566, + "grad_norm": 0.05754299834370613, + "learning_rate": 3.8418784288349865e-05, + "loss": 0.1671, + "step": 14782 + }, + { + "epoch": 2.9788434414668545, + "grad_norm": 0.07325414568185806, + "learning_rate": 3.840582059523182e-05, + "loss": 0.2082, + "step": 14784 + }, + { + "epoch": 2.9792464235341525, + "grad_norm": 0.041347935795784, + "learning_rate": 3.839285772574574e-05, + "loss": 0.1734, + "step": 14786 + }, + { + "epoch": 2.979649405601451, + "grad_norm": 0.07269836962223053, + "learning_rate": 3.837989568081249e-05, + "loss": 0.2257, + "step": 14788 + }, + { + "epoch": 2.980052387668749, + "grad_norm": 0.060071539133787155, + "learning_rate": 3.8366934461352846e-05, + "loss": 0.1823, + "step": 14790 + }, + { + "epoch": 2.9804553697360467, + "grad_norm": 0.0773068517446518, + "learning_rate": 3.835397406828759e-05, + "loss": 0.1749, + "step": 14792 + }, + { + "epoch": 2.9808583518033447, + "grad_norm": 0.048480074852705, + "learning_rate": 3.834101450253738e-05, + "loss": 0.1994, + "step": 14794 + }, + { + "epoch": 2.9812613338706426, + "grad_norm": 0.05097859725356102, + "learning_rate": 3.832805576502287e-05, + "loss": 0.2009, + "step": 14796 + }, + { + "epoch": 2.981664315937941, + "grad_norm": 0.05600469186902046, + "learning_rate": 3.83150978566646e-05, + "loss": 0.2094, + "step": 14798 + }, + { + "epoch": 2.982067298005239, + "grad_norm": 0.05589543282985687, + "learning_rate": 3.83021407783831e-05, + "loss": 0.1738, + "step": 14800 + }, + { + "epoch": 2.982470280072537, + "grad_norm": 0.04236073046922684, + "learning_rate": 3.8289184531098795e-05, + "loss": 0.1333, + "step": 14802 + }, + { + "epoch": 2.982873262139835, + "grad_norm": 0.04489387571811676, + "learning_rate": 3.82762291157321e-05, + "loss": 0.1528, + "step": 14804 + }, + { + "epoch": 2.9832762442071328, + "grad_norm": 0.05915519595146179, + "learning_rate": 3.826327453320334e-05, + "loss": 0.2527, + "step": 14806 + }, + { + "epoch": 2.9836792262744307, + "grad_norm": 0.0606289803981781, + "learning_rate": 3.8250320784432805e-05, + "loss": 0.1475, + "step": 14808 + }, + { + "epoch": 2.9840822083417287, + "grad_norm": 0.06469710171222687, + "learning_rate": 3.823736787034067e-05, + "loss": 0.2019, + "step": 14810 + }, + { + "epoch": 2.9844851904090266, + "grad_norm": 0.058138661086559296, + "learning_rate": 3.822441579184712e-05, + "loss": 0.2144, + "step": 14812 + }, + { + "epoch": 2.9848881724763245, + "grad_norm": 0.05327381566166878, + "learning_rate": 3.8211464549872214e-05, + "loss": 0.1871, + "step": 14814 + }, + { + "epoch": 2.985291154543623, + "grad_norm": 0.05329843983054161, + "learning_rate": 3.819851414533604e-05, + "loss": 0.2019, + "step": 14816 + }, + { + "epoch": 2.985694136610921, + "grad_norm": 0.07764440774917603, + "learning_rate": 3.818556457915854e-05, + "loss": 0.1886, + "step": 14818 + }, + { + "epoch": 2.986097118678219, + "grad_norm": 0.05114993825554848, + "learning_rate": 3.8172615852259644e-05, + "loss": 0.225, + "step": 14820 + }, + { + "epoch": 2.9865001007455168, + "grad_norm": 0.059108905494213104, + "learning_rate": 3.81596679655592e-05, + "loss": 0.2137, + "step": 14822 + }, + { + "epoch": 2.9869030828128147, + "grad_norm": 0.060795605182647705, + "learning_rate": 3.8146720919977005e-05, + "loss": 0.2168, + "step": 14824 + }, + { + "epoch": 2.987306064880113, + "grad_norm": 0.0726071447134018, + "learning_rate": 3.813377471643279e-05, + "loss": 0.1937, + "step": 14826 + }, + { + "epoch": 2.987709046947411, + "grad_norm": 0.06462717056274414, + "learning_rate": 3.812082935584627e-05, + "loss": 0.2359, + "step": 14828 + }, + { + "epoch": 2.988112029014709, + "grad_norm": 0.0547826811671257, + "learning_rate": 3.8107884839137e-05, + "loss": 0.1894, + "step": 14830 + }, + { + "epoch": 2.988515011082007, + "grad_norm": 0.047724831849336624, + "learning_rate": 3.80949411672246e-05, + "loss": 0.2132, + "step": 14832 + }, + { + "epoch": 2.988917993149305, + "grad_norm": 0.04862813651561737, + "learning_rate": 3.808199834102852e-05, + "loss": 0.2272, + "step": 14834 + }, + { + "epoch": 2.989320975216603, + "grad_norm": 0.05972544103860855, + "learning_rate": 3.806905636146824e-05, + "loss": 0.2171, + "step": 14836 + }, + { + "epoch": 2.9897239572839007, + "grad_norm": 0.06059863418340683, + "learning_rate": 3.8056115229463086e-05, + "loss": 0.2395, + "step": 14838 + }, + { + "epoch": 2.9901269393511987, + "grad_norm": 0.061624523252248764, + "learning_rate": 3.804317494593244e-05, + "loss": 0.2317, + "step": 14840 + }, + { + "epoch": 2.9905299214184966, + "grad_norm": 0.05651069059967995, + "learning_rate": 3.8030235511795484e-05, + "loss": 0.1882, + "step": 14842 + }, + { + "epoch": 2.990932903485795, + "grad_norm": 0.047117821872234344, + "learning_rate": 3.801729692797149e-05, + "loss": 0.1767, + "step": 14844 + }, + { + "epoch": 2.991335885553093, + "grad_norm": 0.058522630482912064, + "learning_rate": 3.800435919537953e-05, + "loss": 0.227, + "step": 14846 + }, + { + "epoch": 2.991738867620391, + "grad_norm": 0.06430003046989441, + "learning_rate": 3.799142231493873e-05, + "loss": 0.2436, + "step": 14848 + }, + { + "epoch": 2.992141849687689, + "grad_norm": 0.03316927328705788, + "learning_rate": 3.7978486287568076e-05, + "loss": 0.1492, + "step": 14850 + }, + { + "epoch": 2.9925448317549868, + "grad_norm": 0.07100309431552887, + "learning_rate": 3.796555111418654e-05, + "loss": 0.1636, + "step": 14852 + }, + { + "epoch": 2.992947813822285, + "grad_norm": 0.05562622845172882, + "learning_rate": 3.795261679571298e-05, + "loss": 0.2164, + "step": 14854 + }, + { + "epoch": 2.993350795889583, + "grad_norm": 0.06413529068231583, + "learning_rate": 3.7939683333066276e-05, + "loss": 0.1957, + "step": 14856 + }, + { + "epoch": 2.993753777956881, + "grad_norm": 0.07106629014015198, + "learning_rate": 3.7926750727165175e-05, + "loss": 0.1811, + "step": 14858 + }, + { + "epoch": 2.994156760024179, + "grad_norm": 0.06832096725702286, + "learning_rate": 3.7913818978928403e-05, + "loss": 0.1592, + "step": 14860 + }, + { + "epoch": 2.994559742091477, + "grad_norm": 0.04877585172653198, + "learning_rate": 3.790088808927459e-05, + "loss": 0.2011, + "step": 14862 + }, + { + "epoch": 2.994962724158775, + "grad_norm": 0.08021704107522964, + "learning_rate": 3.788795805912235e-05, + "loss": 0.2259, + "step": 14864 + }, + { + "epoch": 2.995365706226073, + "grad_norm": 0.05000881478190422, + "learning_rate": 3.787502888939019e-05, + "loss": 0.1734, + "step": 14866 + }, + { + "epoch": 2.9957686882933707, + "grad_norm": 0.06632080674171448, + "learning_rate": 3.786210058099659e-05, + "loss": 0.179, + "step": 14868 + }, + { + "epoch": 2.996171670360669, + "grad_norm": 0.05244016274809837, + "learning_rate": 3.784917313485995e-05, + "loss": 0.2026, + "step": 14870 + }, + { + "epoch": 2.996574652427967, + "grad_norm": 0.07437470555305481, + "learning_rate": 3.783624655189862e-05, + "loss": 0.2714, + "step": 14872 + }, + { + "epoch": 2.996977634495265, + "grad_norm": 0.053574662655591965, + "learning_rate": 3.7823320833030885e-05, + "loss": 0.1724, + "step": 14874 + }, + { + "epoch": 2.997380616562563, + "grad_norm": 0.055552784353494644, + "learning_rate": 3.781039597917496e-05, + "loss": 0.1951, + "step": 14876 + }, + { + "epoch": 2.997783598629861, + "grad_norm": 0.04839596152305603, + "learning_rate": 3.7797471991249e-05, + "loss": 0.1626, + "step": 14878 + }, + { + "epoch": 2.998186580697159, + "grad_norm": 0.06522393971681595, + "learning_rate": 3.778454887017113e-05, + "loss": 0.2036, + "step": 14880 + }, + { + "epoch": 2.998589562764457, + "grad_norm": 0.07289113104343414, + "learning_rate": 3.777162661685937e-05, + "loss": 0.2254, + "step": 14882 + }, + { + "epoch": 2.998992544831755, + "grad_norm": 0.05435548722743988, + "learning_rate": 3.77587052322317e-05, + "loss": 0.166, + "step": 14884 + }, + { + "epoch": 2.999395526899053, + "grad_norm": 0.062487825751304626, + "learning_rate": 3.774578471720603e-05, + "loss": 0.2084, + "step": 14886 + }, + { + "epoch": 2.999798508966351, + "grad_norm": 0.05912657454609871, + "learning_rate": 3.7732865072700225e-05, + "loss": 0.1823, + "step": 14888 + } + ], + "logging_steps": 2, + "max_steps": 24815, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 1.266637847050322e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}